# 0. 라이브러리 가져오기

In [13]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import numpy as np

# 1. 파일 불러오기

In [14]:
# desc, label이 컬럼으로 오게 데이터 정제해주기

data = pd.read_csv('테스트용.csv')
data.drop(labels='Unnamed: 0', axis=1, inplace=True)
data.columns = ['desc', 'label']
data

Unnamed: 0,desc,label
0,아예 스티커도 떼지 않은 미개봉 새상품 입니다,0
1,배터리는 공식서비스센터에서 갈아서 100%이구요,2
2,개통 후 미사용 입니다,0
3,배터리 성능 82퍼고요 후면 카메라 사용불가 화면에 점있어요,4
4,상태는 화면과 같고 잔상 조금 있습니다,4
...,...,...
1638,배터리 효율 83% 액정 기스 X,3
1639,갤럭시탭 A7 lite 32g 미개봉입니다,0
1640,하자 없는 극미중고 제품입니다 생각보다 사용할 일이 없어서 판매 합니다,1
1641,외에 장치 기능 관련 하자 전혀 없습니다 뒷유리 깨짐 사이드 기스있어요,4


In [15]:
data = data.sample(frac=1).reset_index(drop=True)  # 데이터 프레임 섞는 작업. 필요없으면 빼도 됨

In [16]:
train_data = data

# 2. 학습, 검증 데이터 분리

In [17]:
train_texts = train_data['desc'].to_list()
train_labels = train_data['label'].to_list()

In [18]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=0, stratify=train_labels)

# 3. 텍스트 토큰화
- 토크나이저를 거치면 input_ids, token_type_ids, attetion_mask가 생성됨

In [19]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')

# Tokenizing
train_encodings = tokenizer(train_texts, return_tensors='pt', truncation=True, padding=True)
val_encodings = tokenizer(val_texts, return_tensors='pt', truncation=True, padding=True)

# 4. 데이터셋 만들기

In [20]:
# trainset-set
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# validation-set
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# 5. 모델 생성

In [21]:
model = TFBertForSequenceClassification.from_pretrained('kykim/bert-kor-base', num_labels=6, from_pt=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 6. 콜백 함수 지정 + 모델 학습

In [22]:
# 컴퓨터 사양에 맞게 배치 사이즈 조절

callback_earlystop = EarlyStopping(
    monitor="val_loss", 
    min_delta=0.001, # the threshold that triggers the termination (acc should at least improve 0.001)
    patience=3)

model.fit(
    train_dataset.shuffle(1000).batch(32), epochs=7, batch_size=32,
    validation_data=val_dataset.shuffle(1000).batch(32),
    callbacks = [callback_earlystop]
)

Epoch 1/7


  return py_builtins.overload_of(f)(*args)


Epoch 2/7

KeyboardInterrupt: 

# 7. 모델 저장

In [65]:
import os

MODEL_NAME = 'fine-tuned-kykim-bert-base'
MODEL_SAVE_PATH = os.path.join("_model", MODEL_NAME) # change this to your preferred location

if os.path.exists(MODEL_SAVE_PATH):
    print(f"{MODEL_SAVE_PATH} -- Folder already exists \n")
else:
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    print(f"{MODEL_SAVE_PATH} -- Folder create complete \n")

# save tokenizer, model
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

_model\fine-tuned-kykim-bert-base -- Folder already exists 



('_model\\fine-tuned-kykim-bert-base\\tokenizer_config.json',
 '_model\\fine-tuned-kykim-bert-base\\special_tokens_map.json',
 '_model\\fine-tuned-kykim-bert-base\\vocab.txt',
 '_model\\fine-tuned-kykim-bert-base\\added_tokens.json')

# 8. 테스트

- 테스트 코드 돌리기 전에 테스트로 사용할 데이터를 불러와야함

In [66]:
from transformers import TextClassificationPipeline

# Load Fine-tuning model
loaded_tokenizer = BertTokenizer.from_pretrained(MODEL_SAVE_PATH)
loaded_model = TFBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH, id2label={0: 0 , 1: 1, 2: 2, 3: 3, 4: 4, 5: 5})

text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer, 
    model=loaded_model, 
    framework='tf',
    return_all_scores=True
)

Some layers from the model checkpoint at _model\fine-tuned-kykim-bert-base were not used when initializing TFBertForSequenceClassification: ['dropout_531']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at _model\fine-tuned-kykim-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [69]:
predicted_label_list = []
predicted_score_list = []

for text in test_data['desc']:
    # predict
    preds_list = text_classifier(text)[0]

    sorted_preds_list = sorted(preds_list, key=lambda x: x['score'], reverse=True)
    predicted_label_list.append(sorted_preds_list[0]['label']) # label
    predicted_score_list.append(sorted_preds_list[1]['score']) # score
test_data['pred'] = predicted_label_list
test_data['score'] = predicted_score_list
test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pred'] = predicted_label_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['score'] = predicted_score_list


Unnamed: 0,desc,label,pred,score
400,상태입니다게 임용 서브파손 있고 작은 점습이지 우레탄,2,2,0.009005
401,기스찍힘 없이 새것 같습니다,1,1,0.006330
402,파손 있지 실사용 기능 문제,2,2,0.005093
403,액정 파손워 부품용 사하실,2,2,0.004187
404,거의 하지 않아 새 상품 태원시면 케이스도 드림,1,1,0.007315
...,...,...,...,...
470,흠집 있습니다 인천 서울직거래,1,2,0.008882
471,기스 없고 되,1,1,0.006262
472,액정 파손 판매합니다 선택점 있습 미세해서,2,2,0.004329
473,파손 있고,2,2,0.004162


In [70]:
from sklearn.metrics import classification_report

print(classification_report(y_true=test_data['label'], y_pred=test_data['pred']))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        26
           1       0.88      0.93      0.90        15
           2       0.94      0.94      0.94        34

    accuracy                           0.95        75
   macro avg       0.94      0.95      0.94        75
weighted avg       0.95      0.95      0.95        75

