## 0. 라이브러리

In [44]:
import os
import pandas as pd
import tensorflow as tf
#from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
import json
from soynlp.normalizer import *
from tqdm import tqdm
import re
import tensorflow as tf
import matplotlib.pyplot as plt
import random
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input




## 1. 데이터 불러오기

In [45]:
train_data_path ="~/aiffel/dktc/data2/train0.csv"
train_data = pd.read_csv(train_data_path,index_col=0)
train_data

Unnamed: 0_level_0,class,conversation
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
1,일반 대화,야 여행 가기 너무 좋은 날씨다\n날씨 진짜 너무 좋아 **이는 일하기 싫을 날씨야...
2,갈취 대화,사정이 있었어요 . 살려주세요 . 뒤진 사람들 중에 사정 없는 사람 없어 맞아 . ...
3,갈취 대화,배고프다 그러게 야 만원만 빌려줘봐 오늘도 ? 저번에도 만원 빌려가고 안 돌려 줬잖...
4,갈취 대화,저기 실례지만 제가 휠체어를 타는 장애인이라서 아 혼자 휠체어 운전하시는게 어렵다고...
5,직장 내 괴롭힘 대화,김 사원 낙하산이라는 소문이 있던데 ? 요즘 시대에도 낙하산이 있어요 ? 요즘 더 ...
...,...,...
4826,일반 대화,티비는 잘 안 보니?\n티비 매일 보지\n너는?\n키키 요즘 핫한 스우파 안 보니?...
4827,갈취 대화,이거 나이키 신상이네 ? 응 아빠가 사주셨어 우와 멋지네 근데 너랑은 별로 안어울린...
4828,직장 내 괴롭힘 대화,일처리를 어떻게 한거야 ! 기간 내에 상품이 못오면 그사이 손실은 어떻게 할거냐고 ...
4829,직장 내 괴롭힘 대화,어우 정대리 이번에도 참 글래머 스럽게 입고왔네 네 ? 지금 저한태 하신말이세요 ?...


## 2. 데이터 준비 (Data preparation)
### 2.1-1 전처리 함수 정의

In [46]:
def preprocess_sentence(sentence):
    # synolp
    emoticon_normalize(sentence)
    repeat_normalize(sentence)
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # base preprocess
    sentence = re.sub(r'([^a-zA-Zㄱ-ㅎ가-힣?.!,])', " ", sentence)
    sentence = re.sub(r'!+', '!', sentence)
    sentence = re.sub(r'\?+', '?', sentence)
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # 엔터 구분 (\n)
    sentence = sentence.replace("\n", " ")
    sentence = sentence.strip()
    return sentence

### 2.1-2 전처리 함수 적용

In [47]:
sentences = [preprocess_sentence(val) for val in tqdm(train_data['conversation'])]

100%|██████████| 4830/4830 [00:01<00:00, 3255.64it/s]


### 2.2 최대 길이 지정

In [48]:
MAX_LEN = 300

### 2.3 class(label) 인코딩

In [49]:
from sklearn.preprocessing import LabelEncoder

CLASS_NAMES = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화', '일반 대화']
encoder = LabelEncoder()
encoder.fit(CLASS_NAMES)
train_data['class'] = encoder.transform(train_data['class'])
labels = train_data['class']



In [50]:
class_mapping = {class_name: encoder.transform([class_name])[0] for class_name in CLASS_NAMES}
print("Class mapping:", class_mapping)

Class mapping: {'협박 대화': 4, '갈취 대화': 0, '직장 내 괴롭힘 대화': 3, '기타 괴롭힘 대화': 1, '일반 대화': 2}


### 2.4 train-val

In [51]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42)

## 3. 모델
### 3.1-1 토크나이저 정의

In [52]:
# BERT 토크나이저와 모델 준비
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

### 3.1-2 토크나이저 적용

In [53]:
# 데이터셋을 BERT 입력 형식으로 변환
train_encodings = tokenizer(train_sentences, truncation=True, padding=True, max_length=MAX_LEN)
val_encodings = tokenizer(val_sentences, truncation=True, padding=True, max_length=MAX_LEN)

### 3.2 모델 준비

In [54]:
#model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base', num_labels=5)

In [55]:
input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
roberta_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [56]:
# Get the output of the XLM-RoBERTa model
sequence_output = roberta_model(input_ids, attention_mask=attention_mask)[0]

# Extract the CLS token (the first token in the sequence)
cls_token = sequence_output[:, 0, :]

# Add a dense layer with softmax activation for classification
output_layer = Dense(300, activation='softmax')(cls_token)

# Define the model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output_layer)

### 3.3 파라미터

In [57]:
BATCH_SIZE = 8
lr = 5e-5
EPOCH = 10

### 3.4 TF 데이터셋 생성

In [58]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    train_labels
)).shuffle(100).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': val_encodings['input_ids'], 'attention_mask': val_encodings['attention_mask']},
    val_labels
)).batch(BATCH_SIZE)

### 3.5 모델 컴파일

In [59]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


### 3.6 모델 훈련

### 3.6-1 콜백 설정

In [60]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    filepath='best_model_weights.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    mode='min',
    verbose=1
)

### 3.6-2 모델 훈련

In [61]:
#풀러 레이어가 고정되지 않았으며 올바르게 연결되었는지 확인하세요. 모든 레이어를 고정 해제하는 방법은 다음과 같습니다.
for layer in model.layers:
    layer.trainable = True

In [62]:
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCH,
    callbacks=[early_stopping, checkpoint]
)

Epoch 1/10





Epoch 00001: val_loss improved from inf to 0.69331, saving model to best_model_weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.69331 to 0.56669, saving model to best_model_weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.56669 to 0.43556, saving model to best_model_weights.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.43556
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.43556


<keras.callbacks.History at 0x7f758d21a340>

### 3.7 모델 평가

In [63]:
# 모델 평가
evaluation = model.evaluate(val_dataset)
print("평가 결과:", evaluation)

평가 결과: [0.43555948138237, 0.8716356158256531]


In [64]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score

def score(model, val_dataset):
    X, y = [], []
    for batch in val_dataset:
        inputs, labels = batch
        X.append(inputs)
        y.append(labels)
    X = {key: np.concatenate([d[key].numpy() for d in X], axis=0) for key in X[0].keys()}
    y = np.concatenate(y, axis=0)
    
    real_predictions = model.predict(X)
    logits = real_predictions

    if logits.ndim > 1:
        real_predicted_labels = np.argmax(logits, axis=1)
    else:
        real_predicted_labels = (logits > 0.5).astype(int)
    
    real_accuracy = accuracy_score(y, real_predicted_labels)
    print(f"Real Accuracy: {real_accuracy:.4f}")

    real_report = classification_report(y, real_predicted_labels, target_names=[f"Class {i}" for i in range(5)])
    print(real_report)

    real_f1 = f1_score(y, real_predicted_labels, average='weighted')
    print(f"\nWeighted F1 Score (based on real predictions): {real_f1:.4f}")



In [65]:
score(model, val_dataset)

Real Accuracy: 0.8716
              precision    recall  f1-score   support

     Class 0       0.85      0.81      0.83       175
     Class 1       0.73      0.89      0.80       215
     Class 2       0.99      0.96      0.97       178
     Class 3       0.93      0.94      0.93       201
     Class 4       0.93      0.76      0.84       197

    accuracy                           0.87       966
   macro avg       0.89      0.87      0.87       966
weighted avg       0.88      0.87      0.87       966


Weighted F1 Score (based on real predictions): 0.8729


## 4. 모델 적용

In [66]:
import json

test_data_path = "/aiffel/aiffel/dktc/data/test.json"
test = pd.read_json(test_data_path).transpose()

In [67]:
import numpy as np

test_predict = []

for idx, value in test.iterrows():

    test_sentence = value["text"]
    
    test_encodings = tokenizer(test_sentence, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="tf")

    test_predictions = model.predict(
        (test_encodings["input_ids"],
         test_encodings["attention_mask"])
    )
    test_class_probabilities = tf.nn.softmax(test_predictions, axis=-1).numpy() # [[0.13297564 0.8358507  0.00801584 0.02315779]]
    test_predicted_class = np.argmax(test_class_probabilities, axis=1) # [ 1 ]
    test_predict.append(test_predicted_class[0])

In [68]:
# {'협박 대화': 4, '갈취 대화': 0, '직장 내 괴롭힘 대화': 3, '기타 괴롭힘 대화': 1, '일반 대화': 2}
#   협박 대화 : 0,  갈취 대화 : 1,  직장 내 괴롭힘 대화 : 2,  기타 괴롭힘 대화 : 3,  일반 대화 : 4
def labelnum_to_text(x):
    if x == 0:
        return '01'
    if x == 1:
        return '03'
    if x == 2:
        return '04'
    if x == 3:
        return '02'
    if x == 4:
        return '00'

import datetime
    
submission = pd.read_csv("/aiffel/aiffel/dktc/data/new_submission.csv")
submission["class"] = [ labelnum_to_text(pred) for pred in test_predict ]

now = datetime.datetime.now()
filename = now.strftime("submission %y-%m-%d %H:%M.csv")

submission.to_csv(filename, index=False)
submit_file = pd.read_csv(filename)

print(submit_file.shape)
print(submit_file.head())

(500, 2)
  file_name  class
0     t_000      1
1     t_001      2
2     t_002      3
3     t_003      3
4     t_004      3


In [69]:
submission.to_csv('~/aiffel/dktc/data2/submission_0.csv')