# 0. 라이브러리 가져오기

In [1]:
import torch

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from transformers import TFTrainer, TFTrainingArguments
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
import tensorflow as tf

import pandas as pd
import numpy as np

# 1. 파일 불러오기

In [18]:
data = pd.read_csv('아이패드 프로 12.9 5세대_크롤링_전처리_ver3.0.csv')
data.drop(labels='Unnamed: 0', axis=1, inplace=True)
data

Unnamed: 0,desc,label
0,풀박승 거의 사용 안 함 액정 본체 전부 깨끗합니다,1
1,주말에 시간씩 영상 시청 용도로 사용했습니다 없이 새것 같습니다,1
2,거의 사용을 안 해서 상태가 매우 좋습니다,1
3,미개봉 새 상품입니다 기가 미개봉 상품입니다 미개봉 상품입니다 기가 미개봉 새 상품...,0
4,최초 실행 배터리 진단 결과 배터리 성능 흠집 찍힘,1
...,...,...
406,사진 보시면 정말 미개봉인 거 확인 가능하십니다,0
407,새 스토어 픽업 미개봉 가격 원 문자 카톡 전화 주세요,0
408,미개봉 애 커플까지 와이프 주려 샀는데 안 한 데서,0
409,비싼 기기인 만큼 미개봉 그대로 로만 거래할 예정이 궁금하신 거는 언제든지 문자 주세요,0


In [19]:
data = data.sample(frac=1).reset_index(drop=True)

In [20]:
train_data = data[:360]
test_data = data[360:]

# 2. 학습, 검증 데이터 분리

In [21]:
train_texts = train_data['desc'].to_list()
train_labels = train_data['label'].to_list()

In [22]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=0, stratify=train_labels)

# 3. 텍스트 토큰화
- 토크나이저를 거치면 input_ids, token_type_ids, attetion_mask가 생성됨

In [23]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')

# Tokenizing
train_encodings = tokenizer(train_texts, return_tensors='pt', truncation=True, padding=True)
val_encodings = tokenizer(val_texts, return_tensors='pt', truncation=True, padding=True)

# 3-1. SMOTE를 활용한 오버 샘플링
- 토크나이저에서 나온 input_ids를 train_labels와 함께 오버샘플링
- 오버샘플링되어 나온 결과로 다시 새로운 어텐션 마스크 생성

In [24]:
smote = SMOTE()
train_inputs_over, train_labels_over = smote.fit_resample(train_encodings.input_ids, train_labels)

In [25]:
train_attention_masks = []

for seq in train_inputs_over:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)

In [26]:
train_encodings_over = {
    'input_ids': train_inputs_over,
    'attention_mask': train_attention_masks,
}

# 4. 데이터셋 만들기

In [27]:
# trainset-set
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings_over),
    train_labels_over
))

# validation-set
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# 5. 모델 생성

In [28]:
model = TFBertForSequenceClassification.from_pretrained('kykim/bert-kor-base', num_labels=3, from_pt=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 6. 콜백 함수 지정 + 모델 학습

In [29]:
callback_earlystop = EarlyStopping(
    monitor="val_loss", 
    min_delta=0.001, # the threshold that triggers the termination (acc should at least improve 0.001)
    patience=2)

model.fit(
    train_dataset.shuffle(1000).batch(16), epochs=5, batch_size=16,
    validation_data=val_dataset.shuffle(1000).batch(16),
    callbacks = [callback_earlystop]
)

Epoch 1/5


  return py_builtins.overload_of(f)(*args)


Epoch 2/5
Epoch 3/5


<tensorflow.python.keras.callbacks.History at 0x243341bf5e0>

# 7. 모델 저장

In [30]:
import os

MODEL_NAME = 'fine-tuned-kykim-bert-base'
MODEL_SAVE_PATH = os.path.join("_model", MODEL_NAME) # change this to your preferred location

if os.path.exists(MODEL_SAVE_PATH):
    print(f"{MODEL_SAVE_PATH} -- Folder already exists \n")
else:
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    print(f"{MODEL_SAVE_PATH} -- Folder create complete \n")

# save tokenizer, model
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

_model\fine-tuned-kykim-bert-base -- Folder already exists 



('_model\\fine-tuned-kykim-bert-base\\tokenizer_config.json',
 '_model\\fine-tuned-kykim-bert-base\\special_tokens_map.json',
 '_model\\fine-tuned-kykim-bert-base\\vocab.txt',
 '_model\\fine-tuned-kykim-bert-base\\added_tokens.json')

# 8. 테스트

In [31]:
from transformers import TextClassificationPipeline

# Load Fine-tuning model
loaded_tokenizer = BertTokenizer.from_pretrained(MODEL_SAVE_PATH)
loaded_model = TFBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH, id2label={0: 0 , 1: 1, 2: 2})

text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer, 
    model=loaded_model, 
    framework='tf',
    return_all_scores=True
)

Some layers from the model checkpoint at _model\fine-tuned-kykim-bert-base were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at _model\fine-tuned-kykim-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [32]:
predicted_label_list = []
predicted_score_list = []

for text in test_data['desc']:
    # predict
    preds_list = text_classifier(text)[0]

    sorted_preds_list = sorted(preds_list, key=lambda x: x['score'], reverse=True)
    predicted_label_list.append(sorted_preds_list[0]['label']) # label
    predicted_score_list.append(sorted_preds_list[1]['score']) # score
test_data['pred'] = predicted_label_list
test_data['score'] = predicted_score_list
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pred'] = predicted_label_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['score'] = predicted_score_list


Unnamed: 0,desc,label,pred,score
360,서 그 색상 미개봉 합니다,0,0,0.003629
361,올 한 거의 새 상품입니다,1,1,0.030661
362,색상 둘 다 미개봉 신품입니다,0,0,0.00223
363,유심 넣 필요한 앱 몇 개 깔 한 달 출장 다녀와서 거의 만져 보지 않았을 정도네요,1,1,0.01243
364,실 사용이 없어서 급여하려 합니다 배터리 효율입니다,1,1,0.029144


In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_true=test_data['label'], y_pred=test_data['pred']))

              precision    recall  f1-score   support

           0       0.90      0.95      0.93        20
           1       0.87      0.93      0.90        28
           2       0.00      0.00      0.00         3

    accuracy                           0.88        51
   macro avg       0.59      0.63      0.61        51
weighted avg       0.83      0.88      0.86        51



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
