## [Settings] 라이브러리 임포트 및 다운로드

In [194]:
!pip install transformers



In [195]:
!pip install tensorflow_addons



In [196]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
import torch
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss

## [Settings] 구글 드라이브 연동

In [197]:
from google.colab import drive
import os

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [198]:
os.listdir('drive/MyDrive')

FileNotFoundError: ignored

In [13]:
os.chdir('drive/MyDrive/주분2주차')

## [Settings] GPU 설정

In [199]:
n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

1
Tesla V100-SXM2-16GB


## 0. 데이터 로드

In [200]:
dataset = pd.read_csv("최종데이터셋.csv")
len(dataset)

2522

## 1. 언어모델 및 tokenizer 불러오기

In [201]:
MODEL_NAME = "klue/bert-base"
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3, from_pt=True)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 2. 데이터 분리

In [202]:
# 입출력 데이터 분리

X_data = dataset['comments']
y_data = dataset['label']

TEST_SIZE = 0.2 # Train: Test = 9 : 1 분리
RANDOM_STATE = 42
# strtify = True 일 경우, 데이터 분리 이전의 라벨별 분포 고려
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                    test_size = TEST_SIZE,
                                                    random_state = RANDOM_STATE,
                                                    stratify = y_data)

In [203]:
print(f"훈련 입력 데이터 개수: {len(X_train)}")
print(f"테스트 입력 데이터 개수: {len(X_test)}")

훈련 입력 데이터 개수: 2017
테스트 입력 데이터 개수: 505


In [204]:
# 훈련 데이터 라벨별 비율
y_train.value_counts(normalize = True)

1    0.500248
0    0.499752
Name: label, dtype: float64

In [205]:
# 테스트 데이터 라벨별 비율
y_test.value_counts(normalize = True)

0    0.50099
1    0.49901
Name: label, dtype: float64

## 3. BERT 입력용 데이터 포맷으로 변경

In [206]:
# 입력 데이터(문장) 길이 제한
MAX_SEQ_LEN = 128

In [207]:
def convert_data(X_data, y_data):
    # BERT 입력으로 들어가는 token, mask, segment, target 저장용 리스트
    tokens, masks, segments, targets = [], [], [], []

    for X, y in tqdm(zip(X_data, y_data)):
        # token: 입력 문장 토큰화
        token = tokenizer.encode(X, truncation = True, padding = 'max_length', max_length = MAX_SEQ_LEN)

        # Mask: 토큰화한 문장 내 패딩이 아닌 경우 1, 패딩인 경우 0으로 초기화
        num_zeros = token.count(0)
        mask = [1] * (MAX_SEQ_LEN - num_zeros) + [0] * num_zeros

        # segment: 문장 전후관계 구분: 오직 한 문장이므로 모두 0으로 초기화
        segment = [0]*MAX_SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
        targets.append(y)

    # numpy array로 저장
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

In [208]:
# train 데이터를 Bert의 Input 타입에 맞게 변환

train_x, train_y = convert_data(X_train, y_train)

2017it [00:02, 989.76it/s] 


In [209]:
# test 데이터를 Bert의 Input 타입에 맞게 변환

test_x, test_y = convert_data(X_test, y_test)

505it [00:00, 1084.51it/s]


## 4. BERT를 활용한 파인튜닝

In [210]:
# token, mask, segment 입력 정의

token_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_word_ids')
mask_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_masks')
segment_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_segment')
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

In [211]:
bert_outputs

TFSequenceClassifierOutput(loss=None, logits=<KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'tf_bert_for_sequence_classification_3')>, hidden_states=None, attentions=None)

In [212]:
bert_output = bert_outputs[0]

## 5. 모델 컴파일

In [213]:
DROPOUT_RATE = 0.5
NUM_CLASS = 2
dropout = tf.keras.layers.Dropout(DROPOUT_RATE)(bert_output)
# Multi-class classification 문제이므로 activation function은 softmax로 설정

sentiment_layer = tf.keras.layers.Dense(NUM_CLASS, activation='softmax', kernel_initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02))(dropout)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_layer)

In [214]:
# 옵티마이저 Rectified Adam 하이퍼파리미터 조정
OPTIMIZER_NAME = 'RAdam'
LEARNING_RATE = 5e-5
TOTAL_STEPS = 10000
MIN_LR = 1e-5
WARMUP_PROPORTION = 0.1
EPSILON = 1e-8
CLIPNORM = 1.0
optimizer = tfa.optimizers.RectifiedAdam(learning_rate = LEARNING_RATE,
                                          total_steps = TOTAL_STEPS,
                                          warmup_proportion = WARMUP_PROPORTION,
                                          min_lr = MIN_LR,
                                          epsilon = EPSILON,
                                          clipnorm = CLIPNORM)

In [215]:
# 감정분류 모델 컴파일
sentiment_model.compile(optimizer = optimizer,
                        loss = tf.keras.losses.SparseCategoricalCrossentropy(),
                        metrics = ['accuracy'])

## 6. EarlyStopping 설정

In [216]:
MIN_DELTA = 1e-3
PATIENCE = 5

early_stopping = EarlyStopping(
    monitor = "val_accuracy",
    min_delta = MIN_DELTA,
    patience = PATIENCE)

## 7. 최고 성능 모델 저장

In [217]:
# 최고 성능의 모델 파일을 저장할 이름과 경로 설정
BEST_MODEL_NAME = './model/best_model.h5'

In [219]:
model_checkpoint = ModelCheckpoint(
    filepath = BEST_MODEL_NAME,
    monitor = "val_loss",
    mode = "min",
    save_best_only = True, # 성능 향상 시에만 모델 저장
    verbose = 1
)

In [220]:
callbacks = [early_stopping, model_checkpoint]

## 8. 감정 분류 모델 학습

In [221]:
EPOCHS = 10
BATCH_SIZE = 32

In [222]:
len(train_x), type(train_x[0])

(3, numpy.ndarray)

In [223]:
sentiment_model.fit(train_x, train_y,
                    epochs = EPOCHS,
                    shuffle = True,
                    batch_size = BATCH_SIZE,
                    validation_data = (test_x, test_y),
                    callbacks = callbacks
                    )

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.69198, saving model to ./model/best_model.h5


  saving_api.save_model(


Epoch 2/10
Epoch 2: val_loss improved from 0.69198 to 0.68875, saving model to ./model/best_model.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.68875 to 0.67827, saving model to ./model/best_model.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.67827 to 0.63749, saving model to ./model/best_model.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.63749 to 0.57670, saving model to ./model/best_model.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.57670 to 0.51522, saving model to ./model/best_model.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.51522 to 0.46682, saving model to ./model/best_model.h5
Epoch 8/10
Epoch 8: val_loss improved from 0.46682 to 0.42867, saving model to ./model/best_model.h5
Epoch 9/10
Epoch 9: val_loss improved from 0.42867 to 0.38557, saving model to ./model/best_model.h5
Epoch 10/10
Epoch 10: val_loss improved from 0.38557 to 0.36391, saving model to ./model/best_model.h5


<keras.src.callbacks.History at 0x7bdffb9abbb0>

## 9. 감정 분류의 예측값 계산

In [340]:
# 최고 성능의 모델 불러오기
sentiment_model_best = tf.keras.models.load_model(BEST_MODEL_NAME,
                                                  custom_objects={'TFBertForSequenceClassification': TFBertForSequenceClassification})

In [341]:
def convert_data2(X_data):
    # BERT 입력으로 들어가는 token, mask, segment, target 저장용 리스트
    tokens, masks, segments = [], [], []

    for X in tqdm(X_data):
        X = '[CLS] ' + str(X) + ' [SEP]'
        # token: 입력 문장 토큰화
        token = tokenizer.encode(X, truncation = True, padding = 'max_length', max_length = MAX_SEQ_LEN)

        # Mask: 토큰화한 문장 내 패딩이 아닌 경우 1, 패딩인 경우 0으로 초기화
        num_zeros = token.count(0)
        mask = [1] * (MAX_SEQ_LEN - num_zeros) + [0] * num_zeros

        # segment: 문장 전후관계 구분: 오직 한 문장이므로 모두 0으로 초기화
        segment = [0]*MAX_SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

    # numpy array로 저장
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)

    return [tokens, masks, segments]

## 10. 파이프라이닝

In [342]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

pipe = Pipeline([('preprocess', FunctionTransformer(convert_data2)),
                 ('model', sentiment_model_best)])

In [343]:
sent = ["야, 이 씨발 새끼야, 개새끼야, 좆같은 새끼야, 너 때문에 분위기가 이렇게 좆같이 된 거야 사랑해 쪽쪽 강아지 엘지 너무 좋아", "우리집 강아지 너무 귀여워 사랑해 밥 많이 먹어", "배고파"]
pipe.predict(sent)

100%|██████████| 3/3 [00:00<00:00, 1961.79it/s]




array([[0.2420646 , 0.7579354 ],
       [0.75851274, 0.24148725],
       [0.7566653 , 0.2433347 ]], dtype=float32)

## 11. LIME

In [344]:
!pip install lime



In [345]:
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=[0, 1])

In [346]:
pipe.predict([sent[0]])

100%|██████████| 1/1 [00:00<00:00, 1297.74it/s]






array([[0.2420646, 0.7579354]], dtype=float32)

In [347]:
exp = explainer.explain_instance(sent[0], pipe.predict, top_labels=1)

100%|██████████| 5000/5000 [00:01<00:00, 2829.72it/s]




In [348]:
exp.available_labels()[0]

1

In [349]:
temp = exp.as_list(label=exp.available_labels()[0])

In [350]:
temp2 = np.array([[x[0], x[1]] for x in temp])
temp3 = temp2.T
temp3

array([['개새끼야', '새끼야', '씨발', '사랑해', '너무', '너', '강아지', '좋아', '분위기가', '거야'],
       ['0.30429028921363177', '0.1937487470789795',
        '0.18560275543508428', '-0.07659260415906342',
        '-0.0646635506252211', '0.061493085650806356',
        '-0.048169215880460324', '-0.04806852268828535',
        '-0.04518054335651558', '0.03656291012099749']], dtype='<U32')

In [None]:
exp.show_in_notebook()