## 0. 라이브러리

In [108]:
import os
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, TFDistilBertForSequenceClassification, DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
import json
from soynlp.normalizer import *
from tqdm import tqdm
import re
import tensorflow as tf
import matplotlib.pyplot as plt
import random
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [109]:
def aug(x,y,classs):
    def random_deletion(words, p=0.1):
        if len(words) == 1:
            return words

        new_words = []
        for word in words:
            r = random.uniform(0, 1)
            if r > p:
                new_words.append(word)

        if len(new_words) == 0:
            rand_int = random.randint(0, len(words)-1)
            return [words[rand_int]]

        return ''.join(new_words)

    def swap_word(new_words):
        n = 5
        for _ in range(n):
            random_idx_1 = random.randint(0, len(new_words)-1)
            random_idx_2 = random_idx_1
            counter = 0

            while random_idx_2 == random_idx_1:
                random_idx_2 = random.randint(0, len(new_words)-1)
                counter += 1
                if counter > 3:
                    return new_words

            new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
        return ' '.join(new_words)

    def random_swap(words):
        new_words = list()
        for word in words:
            new_words.append(swap_word(word.split()))

        return new_words
    df = pd.concat([x,y],axis=1).reset_index(drop=True)
    df_rd = df[df['class']==classs].copy()
    df_rd['conversation'] = df_rd['conversation'].apply(random_deletion)
    df_rs = df[df['class']==classs].copy()
    df_rs['conversation'] = random_swap(df_rs['conversation'].values)
    
    df_concated = pd.concat([df, df_rs])
    return df_concated.loc[:,['conversation']] , df_concated['class']


## 1. 데이터 불러오기

In [110]:
train_data_path ="~/aiffel/dktc/data2/train0.csv"
train_data = pd.read_csv(train_data_path,index_col=0)
train_data


Unnamed: 0_level_0,class,conversation
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
1,일반 대화,야 여행 가기 너무 좋은 날씨다\n날씨 진짜 너무 좋아 **이는 일하기 싫을 날씨야...
2,갈취 대화,사정이 있었어요 . 살려주세요 . 뒤진 사람들 중에 사정 없는 사람 없어 맞아 . ...
3,갈취 대화,배고프다 그러게 야 만원만 빌려줘봐 오늘도 ? 저번에도 만원 빌려가고 안 돌려 줬잖...
4,갈취 대화,저기 실례지만 제가 휠체어를 타는 장애인이라서 아 혼자 휠체어 운전하시는게 어렵다고...
5,직장 내 괴롭힘 대화,김 사원 낙하산이라는 소문이 있던데 ? 요즘 시대에도 낙하산이 있어요 ? 요즘 더 ...
...,...,...
4826,일반 대화,티비는 잘 안 보니?\n티비 매일 보지\n너는?\n키키 요즘 핫한 스우파 안 보니?...
4827,갈취 대화,이거 나이키 신상이네 ? 응 아빠가 사주셨어 우와 멋지네 근데 너랑은 별로 안어울린...
4828,직장 내 괴롭힘 대화,일처리를 어떻게 한거야 ! 기간 내에 상품이 못오면 그사이 손실은 어떻게 할거냐고 ...
4829,직장 내 괴롭힘 대화,어우 정대리 이번에도 참 글래머 스럽게 입고왔네 네 ? 지금 저한태 하신말이세요 ?...


## 2. 데이터 준비 (Data preparation)
### 2.1-1 전처리 함수 정의

In [111]:
def preprocess_sentence(sentence):
    # synolp
    emoticon_normalize(sentence)
    repeat_normalize(sentence)
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # base preprocess
    sentence = re.sub(r'([^a-zA-Zㄱ-ㅎ가-힣?.!,])', " ", sentence)
    sentence = re.sub(r'!+', '!', sentence)
    sentence = re.sub(r'\?+', '?', sentence)
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # 엔터 구분 (\n)
    sentence = sentence.replace("\n", " ")
    sentence = sentence.strip()
    return sentence


### 2.1-2 전처리 함수 적용

In [112]:
# 학습할 문장이 담길 배열
sentences = []

for val in tqdm(train_data['conversation']):
    sentences.append(preprocess_sentence(val))


100%|██████████| 4830/4830 [00:01<00:00, 3205.63it/s]


### 2.2 최대 길이 지정

In [113]:
MAX_LEN = 300


### 2.3 class(label) 인코딩

In [114]:
from sklearn.preprocessing import LabelEncoder

CLASS_NAMES = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화','일반 대화']

encoder = LabelEncoder()
encoder.fit(CLASS_NAMES)

train_data['class'] = encoder.transform(train_data['class'])
labels = train_data['class']

len(labels)


4830

In [115]:
class_mapping = {class_name: encoder.transform([class_name])[0] for class_name in CLASS_NAMES}
print("Class mapping:", class_mapping)


Class mapping: {'협박 대화': 4, '갈취 대화': 0, '직장 내 괴롭힘 대화': 3, '기타 괴롭힘 대화': 1, '일반 대화': 2}


### 2.4 train-val

In [116]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42)


### 2.5 데이터 증강

In [117]:
# train_sentences, train_labels = aug(train_sentences, train_labels, 0)
# train_sentences, train_labels = aug(train_sentences, train_labels, 1)
# train_sentences, train_labels = aug(train_sentences, train_labels, 2)
# train_sentences, train_labels = aug(train_sentences, train_labels, 3)
# train_sentences, train_labels = aug(train_sentences, train_labels, 4)

## 3. 모델
### 3.1-1 토크나이저 정의

In [118]:
# BERT 토크나이저와 모델 준비


# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

### 3.1-2 토크나이저 적용

In [119]:
# 데이터셋을 BERT 입력 형식으로 변환
train_encodings = tokenizer(train_sentences, truncation=True, padding=True, max_length=MAX_LEN) # 뒤쪽에 패딩
val_encodings = tokenizer(val_sentences, truncation=True, padding=True, max_length=MAX_LEN)


### 3.2 모델 준비

In [120]:
#model = TFBertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=5)

#model = TFDistilBertModel.from_pretrained("distilbert-base-multilingual-cased", num_labels=5)
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=5)


Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_96']
You should probably TRAIN this model on a down-stream ta

### 3.3 파라미터

In [121]:
BATCH_SIZE = 16
lr = 5e-5
EPOCH = 10


### 3.4 TF 데이터셋 생성

In [122]:
# TensorFlow 데이터셋 생성
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(100).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(BATCH_SIZE)


### 3.5 모델 컴파일

In [123]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


### 3.6 모델 훈련

### 3.6-1 콜백 설정

In [124]:
early_stopping = EarlyStopping(
    monitor='val_loss',    # 검증 손실을 모니터링
    patience=2,            # 3 에포크 동안 개선되지 않으면 중지
    restore_best_weights=True  # 최상의 가중치를 복원
)

checkpoint = ModelCheckpoint(
    filepath='best_model_weights.h5',  # 모델 가중치를 저장할 파일 경로
    monitor='val_loss',        # 검증 손실을 모니터링
    save_best_only=True,       # 최상의 모델만 저장
    save_weights_only=True,   # 저장 (가중치)
    mode='min',                # 'val_loss'가 최소일 때 저장
    verbose=1                  # 저장 시 로그 출력
)


### 3.6-2 모델 훈련

In [125]:
model.fit(
    train_dataset, 
    validation_data=val_dataset,
    epochs=EPOCH,
#     callbacks=[early_stopping, checkpoint]
    callbacks=[checkpoint]
)


Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.73355, saving model to best_model_weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.73355 to 0.43076, saving model to best_model_weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.43076 to 0.43061, saving model to best_model_weights.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.43061
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.43061
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.43061
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.43061
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.43061
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.43061
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.43061


<keras.callbacks.History at 0x78b2f4e6d730>

### 3.7 모델 평가

In [127]:
# 모델 평가
evaluation = model.evaluate(val_dataset)
print("평가 결과:", evaluation)


평가 결과: [0.6302611827850342, 0.8602484464645386]


In [128]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score
import numpy as np

def score(model, val):
    X, y = [], []
    for batch in val_dataset:
        inputs, labels = batch
        X.append(inputs)
        y.append(labels)
    # 각 입력 키에 대해 데이터를 결합하여 numpy 배열로 변환
    X = {key: np.concatenate([d[key].numpy() for d in X], axis=0) for key in X[0].keys()}
    y = np.concatenate(y, axis=0)
    
    # 실제 예측값 생성
    real_predictions = model.predict(X)
    logits = real_predictions.logits

    # 예측값을 레이블로 변환
    if logits.ndim > 1:
        real_predicted_labels = np.argmax(logits, axis=1)
    else:
        real_predicted_labels = (logits > 0.5).astype(int)
    

    # 정확도 계산
    real_accuracy = accuracy_score(y, real_predicted_labels)
    print(f"Real Accuracy: {real_accuracy:.4f}")

    # 분류 보고서 생성
    real_report = classification_report(y, real_predicted_labels, target_names=[f"Class {i}" for i in range(logits.shape[1])])
    print(real_report)

    # F1 스코어 계산
    real_f1 = f1_score(y, real_predicted_labels, average='weighted')
    print(f"\nWeighted F1 Score (based on real predictions): {real_f1:.4f}")


In [129]:
score(model, val_dataset)


Real Accuracy: 0.8602
              precision    recall  f1-score   support

     Class 0       0.73      0.84      0.78       175
     Class 1       0.76      0.87      0.81       215
     Class 2       1.00      0.94      0.97       178
     Class 3       0.93      0.93      0.93       201
     Class 4       0.95      0.72      0.82       197

    accuracy                           0.86       966
   macro avg       0.87      0.86      0.86       966
weighted avg       0.87      0.86      0.86       966


Weighted F1 Score (based on real predictions): 0.8616


## 4. 모델 적용

In [130]:
test_data_path ="/aiffel/aiffel/dktc/data2/test.json"

with open(test_data_path, "r", encoding="utf-8") as json_file:
    test = json.load(json_file)


In [131]:
import numpy as np

test_predicst = list()

for key in test:
    
    test_sentence = test[key]['text']
    inputs = tokenizer(test_sentence, truncation=True, padding=True, max_length=300, return_tensors="tf")
    test_predictions = model.predict(inputs.data) 
    test_class_probabilities = tf.nn.softmax(test_predictions.logits, axis=-1).numpy()
    test_predicted_class = np.argmax(test_class_probabilities, axis=1)
    test_predicst.append(test_predicted_class[0])


In [132]:
test_predicst


[0,
 3,
 3,
 1,
 1,
 0,
 4,
 0,
 1,
 0,
 4,
 1,
 3,
 3,
 3,
 1,
 1,
 1,
 4,
 1,
 4,
 1,
 1,
 0,
 0,
 3,
 0,
 1,
 3,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 4,
 0,
 4,
 0,
 3,
 0,
 1,
 3,
 0,
 1,
 1,
 4,
 0,
 1,
 0,
 3,
 3,
 1,
 1,
 1,
 1,
 3,
 0,
 1,
 4,
 1,
 3,
 4,
 1,
 1,
 0,
 0,
 1,
 3,
 1,
 3,
 1,
 3,
 3,
 0,
 3,
 1,
 0,
 4,
 0,
 0,
 1,
 3,
 4,
 1,
 0,
 3,
 3,
 3,
 4,
 2,
 1,
 3,
 3,
 1,
 0,
 1,
 0,
 0,
 1,
 4,
 1,
 3,
 3,
 1,
 1,
 0,
 0,
 4,
 0,
 3,
 0,
 0,
 1,
 3,
 3,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 1,
 1,
 3,
 1,
 0,
 4,
 4,
 4,
 1,
 1,
 1,
 1,
 0,
 1,
 3,
 4,
 4,
 4,
 4,
 0,
 4,
 1,
 3,
 0,
 0,
 0,
 4,
 3,
 2,
 1,
 4,
 1,
 3,
 4,
 3,
 1,
 3,
 2,
 1,
 0,
 4,
 3,
 3,
 1,
 1,
 1,
 0,
 3,
 0,
 0,
 3,
 1,
 3,
 3,
 4,
 3,
 3,
 4,
 0,
 4,
 4,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 3,
 0,
 1,
 1,
 4,
 1,
 3,
 3,
 3,
 3,
 1,
 0,
 1,
 3,
 0,
 1,
 1,
 0,
 4,
 4,
 0,
 3,
 1,
 0,
 4,
 1,
 0,
 4,
 0,
 0,
 3,
 4,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 3,
 0,
 0,
 3,
 0,
 3,
 0,


In [133]:
def labelnum_to_text(x):
    if x == 1 : # '기타 괴롭힘 대화'
        return '03'
    if x == 2 : # '일반 대화'
        return '04'
    if x == 3 : # '직장 내 괴롭힘 대화
        return '02'
    if x == 0 : # '갈취 대화' 
        return '01'
    if x == 4 : # '협박 대화'
        return "00"
    
submission = pd.DataFrame({'class':test_predicst}, index=list(test.keys()))
submission['class'] = submission['class'].apply(labelnum_to_text)
submission


Unnamed: 0,class
t_000,01
t_001,02
t_002,02
t_003,03
t_004,03
...,...
t_495,02
t_496,01
t_497,01
t_498,00


In [134]:
submission.to_csv('~/aiffel/dktc/data2/submission_0.csv')
