https://breezymind.com/kcbert-find-tuning/

In [10]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

In [67]:
rawdata = pd.read_csv('./preprocess_PCIT_09152023.csv')
rawdata.head()

Unnamed: 0.1,Unnamed: 0,label,parent
0,0,중립적 말,오늘 우리 마당에 귀여운 토끼가 있었어
1,1,부정적 말,날 귀찮게 하고있어
2,2,반영,포테이토 헤드를 위한 모자이구나
3,3,명령,그림에 말을 그리렴
4,4,행동 묘사,우리는 색칠을 하고 있어


In [68]:
rawdata.drop(['Unnamed: 0'], axis=1, inplace=True)
rawdata

Unnamed: 0,label,parent
0,중립적 말,오늘 우리 마당에 귀여운 토끼가 있었어
1,부정적 말,날 귀찮게 하고있어
2,반영,포테이토 헤드를 위한 모자이구나
3,명령,그림에 말을 그리렴
4,행동 묘사,우리는 색칠을 하고 있어
...,...,...
215,질문,어떻게
216,질문,빨간색을 원하니
217,중립적 말,블럭이 쌓여있어
218,질문,아니


In [69]:
rawdata['label'].value_counts()

중립적 말      58
질문         32
부정적 말      28
행동 묘사      27
명령         26
반영         22
구체적 칭찬     20
비구체적 칭찬     7
Name: label, dtype: int64

In [70]:
labels = rawdata.label.unique()
labels

array(['중립적 말', '부정적 말', '반영', '명령', '행동 묘사', '질문', '비구체적 칭찬', '구체적 칭찬'],
      dtype=object)

In [71]:
label_dict = {}
for index, label in enumerate(labels):
    label_dict[label] = index
label_dict

{'중립적 말': 0,
 '부정적 말': 1,
 '반영': 2,
 '명령': 3,
 '행동 묘사': 4,
 '질문': 5,
 '비구체적 칭찬': 6,
 '구체적 칭찬': 7}

In [72]:
len(label_dict)

8

In [73]:
rawdata['label_ct'] = rawdata.label.replace(label_dict)

In [74]:
rawdata.tail(50)

Unnamed: 0,label,parent,label_ct
170,부정적 말,아니야 그냥 말이야,1
171,중립적 말,다른 사람들과 노는 게 좋구나,0
172,행동 묘사,차를 운전하고 있구나,4
173,반영,네 말이 노란색 이랑 빨간색이구나,2
174,명령,블럭을 주면 너랑 놀게,3
175,중립적 말,이거 가지고 놀거야,0
176,중립적 말,저녁은 집에서 먹을거야,0
177,중립적 말,저 소들은 배가 고프구나,0
178,중립적 말,나는 보라색을 좋아해,0
179,행동 묘사,점프하고 있구나,4


In [75]:
texts = []
labels = []

for idx, label in rawdata.iterrows():
  texts.append(label['parent'])
  labels.append(label['label_ct'])


In [76]:
len(labels)

220

In [78]:
# KcBERT 모델과 토크나이저 로드
model_name = "JuneKo/kcbert_family"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_dict))

# 원하는 최대 시퀀스 길이
max_length = 128

labels = torch.tensor(labels, dtype=torch.long)
dataset = CustomDataset(texts, labels, tokenizer, max_length)

# 데이터 로더 생성
batch_size = 11

# Train / Test set 분리
from sklearn.model_selection import train_test_split
train, test = train_test_split(dataset, test_size=0.15, random_state=42)

train_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(test, batch_size=batch_size, shuffle=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JuneKo/kcbert_family and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(labels, dtype=torch.long)


In [79]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 하이퍼파라미터 설정
learning_rate = 1e-5
epochs = 5

# 옵티마이저 및 손실 함수 설정
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# 모델 재학습
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # 그래디언트 초기화
        optimizer.zero_grad()
        # 모델에 입력을 주어 예측을 생성합니다.
        outputs = model(input_ids, attention_mask=attention_mask)
        # 모델 출력에서 로짓(분류에 대한 점수)을 얻습니다.
        logits = outputs.logits
        # 손실을 계산합니다.
        loss = criterion(logits, labels)
        # 역전파를 통해 그래디언트 계산
        loss.backward()
        # 옵티마이저를 사용해 가중치를 업데이트
        optimizer.step()
        # 에포크 전체 손실을 누적합니다.
        total_loss += loss.item()

    # 에포크 평균 손실 계산
    avg_loss = total_loss / len(train_dataloader)
    # 에포크별 손실 출력
    print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f}")

    # 모델 평가
    model.eval()
    val_total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for val_batch in valid_dataloader:
            # Validation 데이터 가져오기
            val_input_ids = val_batch['input_ids']
            val_attention_mask = val_batch['attention_mask']
            val_labels = val_batch['label']

            val_input_ids = val_input_ids.to(device)
            val_attention_mask = val_attention_mask.to(device)
            val_labels = val_labels.to(device)

            # 모델 예측
            val_outputs = model(val_input_ids, attention_mask=val_attention_mask)
            val_logits = val_outputs.logits

            # 손실 계산
            val_loss = criterion(val_logits, val_labels)
            val_total_loss += val_loss.item()

            # 정확도 계산
            val_preds = val_logits.argmax(dim=1)
            correct += (val_preds == val_labels).sum().item()
            total += val_labels.size(0)

    val_avg_loss = val_total_loss / len(valid_dataloader)
    val_accuracy = correct / total
    print(f"Validation Loss: {val_avg_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")



Epoch 1/5 - Avg Loss: 1.9988
Validation Loss: 1.8959 - Validation Accuracy: 0.1818
Epoch 2/5 - Avg Loss: 1.6800
Validation Loss: 1.6905 - Validation Accuracy: 0.2424
Epoch 3/5 - Avg Loss: 1.4816
Validation Loss: 1.4821 - Validation Accuracy: 0.5152
Epoch 4/5 - Avg Loss: 1.1983
Validation Loss: 1.3415 - Validation Accuracy: 0.5152
Epoch 5/5 - Avg Loss: 0.9573
Validation Loss: 1.1314 - Validation Accuracy: 0.6667


In [80]:
!pwd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/data/pcit


In [81]:
# 모델 저장
model_save_path = "./kc_bert_classifier_09152023.pth"
torch.save(model.state_dict(), model_save_path)

# 모델 아키텍처 생성
loaded_model = AutoModelForSequenceClassification.from_pretrained("JuneKo/kcbert_family", num_labels=8)

# 저장된 가중치 불러오기
loaded_model.load_state_dict(torch.load(model_save_path))

# 모델을 평가 모드로 설정
loaded_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JuneKo/kcbert_family and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from google.colab import files
files.download("kc_bert_emotion_classifier.pth")

# predict

In [84]:
label_dict
reverse_label_dict = dict(map(reversed,label_dict.items()))
reverse_label_dict

{0: '중립적 말',
 1: '부정적 말',
 2: '반영',
 3: '명령',
 4: '행동 묘사',
 5: '질문',
 6: '비구체적 칭찬',
 7: '구체적 칭찬'}

In [85]:
 def valid_label(label):
  if label == 0:
    return '여성/가족'
  elif label == 1:
    return '남성'
  elif label == 2:
    return '성소수자'
  elif label == 3:
    return '인종/국적'
  elif label == 4:
    return '연령'
  elif label == 5:
    return '지역'
  elif label == 6:
    return '종교'
  elif label == 7:
    return '기타 혐오'
  elif label == 8:
    return '악플/욕설'
  elif label == 9:
    return '개인지칭'
  else:
    return '일반문장'

# 입력 데이터 준비 (위 예제와 유사한 방법으로)
input_data = [
    "잘 하네",
    "이거는 여기 위에 올려두자",
    "책을 다 읽었구나",
    "신발 벗어",
    "더럽히면 안돼",
    ]
input_encodings = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")

# 모델에 입력 데이터 전달
with torch.no_grad():
    output = loaded_model(**input_encodings)

# 예측 결과 확인
logits = output.logits
predicted_labels = logits.argmax(dim=1)


# 예측 결과 출력
for i, input_text in enumerate(input_data):
    predicted_label = predicted_labels[i].item()
    print(f"Input: {input_text} - Predicted Label: {reverse_label_dict[predicted_label]}")

Input: 잘 하네 - Predicted Label: 구체적 칭찬
Input: 이거는 여기 위에 올려두자 - Predicted Label: 명령
Input: 책을 다 읽었구나 - Predicted Label: 반영
Input: 신발 벗어 - Predicted Label: 명령
Input: 더럽히면 안돼 - Predicted Label: 부정적 말
