In [1]:
import os
import re
import tensorflow as tf
import pandas as pd

from transformers import ElectraTokenizer
from transformers import TFElectraModel

In [2]:
# 데이터 불러오기
train_data_path ="train.csv"
train_data = pd.read_csv(train_data_path)
train_data.head()

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [3]:
# 토크나이저
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/458 [00:00<?, ?B/s]

In [4]:
# 토큰화
START_TOKEN = '[CLS]'
END_TOKEN = '[SEP]'

def tokenize(conversations):  
    res = []
    for conversation in conversations:
        #tokens = [tokenizer.bos_token] + tokenizer.tokenize(conversation) + [tokenizer.eos_token]
        tokens = [START_TOKEN] + tokenizer.tokenize(conversation) + [END_TOKEN]
        res.append(tokenizer.convert_tokens_to_ids(tokens))

    return res

In [5]:
# 토크나이저 테스트
print(tokenizer.tokenize(train_data['conversation'][0]))

['지금', '너', '스스로', '##를', '죽여', '##달라', '##고', '애원', '##하', '##는', '것', '##인', '##가', '?', '아닙니다', '.', '죄송', '##합니다', '.', '죽', '##을', '거', '##면', '혼자', '죽', '##지', '우리', '##까', '##지', '사건', '##에', '휘말리', '##게', '해', '?', '진짜', '죽여', '##버리', '##고', '싶', '##게', '.', '정말', '잘못', '##했', '##습', '##니다', '.', '너', '##가', '선택', '##해', '.', '너', '##가', '죽', '##을', '##래', '네', '가족', '##을', '죽여', '##줄', '##까', '.', '죄송', '##합니다', '.', '정말', '잘못', '##했', '##습', '##니다', '.', '너', '##에', '##게', '##는', '선택', '##권', '##이', '없', '##어', '.', '선택', '못', '##한다', '##면', '너', '##와', '네', '가족', '##까', '##지', '모조리', '죽여', '##버릴', '##거', '##야', '.', '선택', '못하', '##겠', '##습', '##니다', '.', '한', '##번', '##만', '도와', '##주', '##세요', '.', '그냥', '다', '죽여', '##버려', '##야', '##겠', '##군', '.', '이의', '없', '##지', '?', '제발', '도와', '##주', '##세요', '.']


In [6]:
# 학습 데이터 토큰화
tokenized = tokenize(train_data['conversation'])

In [7]:
print(tokenized[0], len(tokenized))

[2, 6292, 2267, 6926, 4110, 13647, 28485, 4219, 26201, 4279, 4034, 2048, 4139, 4070, 35, 9312, 18, 11946, 17788, 18, 3324, 4292, 2041, 4181, 7422, 3324, 4200, 6233, 4149, 4200, 6388, 4073, 28110, 4325, 3764, 35, 7082, 13647, 13864, 4219, 3018, 4325, 18, 6595, 6997, 4398, 4576, 6216, 18, 2267, 4070, 6634, 4151, 18, 2267, 4070, 3324, 4292, 4395, 2279, 6507, 4292, 13647, 4612, 4149, 18, 11946, 17788, 18, 6595, 6997, 4398, 4576, 6216, 18, 2267, 4073, 4325, 4034, 6634, 4046, 4007, 3123, 4025, 18, 6634, 2684, 7796, 4181, 2267, 4192, 2279, 6507, 4149, 4200, 16285, 13647, 25748, 4216, 4474, 18, 6634, 31397, 5012, 4576, 6216, 18, 3757, 4467, 4172, 7733, 4076, 8553, 18, 6848, 2348, 13647, 15746, 4474, 5012, 4397, 18, 7818, 3123, 4200, 35, 11777, 7733, 4076, 8553, 18, 3] 3950


In [8]:
# 최대 길이 구하기
def max_token_length(tokenized):
    max_length = 0
    #len_tokens = []
    for t in tokenized:
        if (len(t) > max_length):
            max_length = len(t)
        #len_tokens.append(len(t))
    return max_length

In [9]:
MAX_LENGTH = max_token_length(tokenized)

In [10]:
# 텍스트 레이블을 숫자 레이블로 변환
def labelize(text):
    res = []
    for element in text:
        if element == "협박 대화":
            res.append(0)
        elif element == "갈취 대화":
            res.append(1)
        elif element == "직장 내 괴롭힘 대화":
            res.append(2)
        elif element == "기타 괴롭힘 대화":
            res.append(3)
        else:
            print(element)
            break
    return res

In [11]:
# padding
def padding(data, pad_len):
    res = []
    for tokens in data:
        if len(tokens) <= pad_len:
            res.append(tokens)

    res = tf.keras.preprocessing.sequence.pad_sequences(res, maxlen=pad_len, padding='post')

    return res

In [12]:
tokenized = padding(tokenized, MAX_LENGTH)

In [13]:
print(tokenized[0], tokenized.shape)

[    2  6292  2267  6926  4110 13647 28485  4219 26201  4279  4034  2048
  4139  4070    35  9312    18 11946 17788    18  3324  4292  2041  4181
  7422  3324  4200  6233  4149  4200  6388  4073 28110  4325  3764    35
  7082 13647 13864  4219  3018  4325    18  6595  6997  4398  4576  6216
    18  2267  4070  6634  4151    18  2267  4070  3324  4292  4395  2279
  6507  4292 13647  4612  4149    18 11946 17788    18  6595  6997  4398
  4576  6216    18  2267  4073  4325  4034  6634  4046  4007  3123  4025
    18  6634  2684  7796  4181  2267  4192  2279  6507  4149  4200 16285
 13647 25748  4216  4474    18  6634 31397  5012  4576  6216    18  3757
  4467  4172  7733  4076  8553    18  6848  2348 13647 15746  4474  5012
  4397    18  7818  3123  4200    35 11777  7733  4076  8553    18     3
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [14]:
label = labelize(train_data['class'])

In [15]:
print(label[0], len(label))

0 3950


In [16]:
# 모델 생성
pre_trained_model = TFElectraModel.from_pretrained("monologg/koelectra-small-v3-discriminator", from_pt=True)

Downloading:   0%|          | 0.00/54.0M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['electra.embeddings.position_ids', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

In [17]:
# 모델 생성
NUM_CLASS = 5

inputs = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
hidden_1 = pre_trained_model([inputs])
outputs = tf.keras.layers.Dense(NUM_CLASS, activation="softmax")(hidden_1['last_hidden_state'][:,-1])

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 469)]             0         
_________________________________________________________________
tf_electra_model (TFElectraM TFBaseModelOutput(last_hi 14056192  
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 1285      
Total params: 14,057,477
Trainable params: 14,057,477
Non-trainable params: 0
_________________________________________________________________


In [18]:
import numpy as np

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

train_x = np.array(tokenized)
train_y = np.array(label)

history = model.fit(x=train_x, y=train_y, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# 모델 2 - Dropout 추가
inputs = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
hidden_1 = pre_trained_model([inputs])
hidden_1 = hidden_1['last_hidden_state'][:,-1]
hidden_2 = tf.keras.layers.Dropout(0.2)(hidden_1)
outputs = tf.keras.layers.Dense(NUM_CLASS, activation="softmax")(hidden_2)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 469)]             0         
_________________________________________________________________
tf_electra_model (TFElectraM TFBaseModelOutput(last_hi 14056192  
_________________________________________________________________
tf.__operators__.getitem_1 ( (None, 256)               0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1285      
Total params: 14,057,477
Trainable params: 14,057,477
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(x=train_x, y=train_y, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
 28/124 [=====>........................] - ETA: 1:11 - loss: 1.4082 - accuracy: 0.2533

KeyboardInterrupt: 

In [23]:
# 모델 3 - Dense 층 추가
inputs = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
hidden_1 = pre_trained_model([inputs])
hidden_1 = hidden_1['last_hidden_state'][:,-1]
hidden_2 = tf.keras.layers.Dense(64, activation="relu")(hidden_1)
hidden_3 = tf.keras.layers.Dropout(0.5)(hidden_2)
outputs = tf.keras.layers.Dense(4, activation="softmax")(hidden_3)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 469)]             0         
_________________________________________________________________
tf_electra_model (TFElectraM TFBaseModelOutput(last_hi 14056192  
_________________________________________________________________
tf.__operators__.getitem_3 ( (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_39 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 260       
Total params: 14,072,900
Trainable params: 14,072,900
Non-trainable params: 0
_______________________________________________

In [24]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(x=train_x, y=train_y, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
