In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [4]:
import torch
torch.cuda.is_available()

True

In [5]:
#GPU
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
df=pd.read_csv("/content/drive/MyDrive/병합데이터셋-v2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,emotion
0,0,일은 왜 해도 해도 끝이 없을까? 화가 난다.,분노
1,1,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나.,분노
2,2,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...,분노
3,3,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...,분노
4,4,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나.,분노


In [7]:
#감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5 }
df['emotion'] = df.emotion.map(emotions)
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,sentence,emotion
91742,91742,아... 제가... 다른 일이 많아서 아직...,5
85399,85399,서래마을 부부 너무 인품도 좋으시고 교양도 넘치시고 보는내내 부럽고 행복해보여서 좋...,0
37211,37211,남편이 세상을 떠난 지 사 년째인데 여전히 못 해준 것만 생각나.,2
18471,18471,남자친구가 이해력이 떨어지나 봐. 무엇을 설명해도 말귀를 못 알아들어.,4
112214,112214,"그런데, 하라는 자료분석은 안하고 여긴 왜 온 거야?",5


In [8]:
train_data=df.sample(frac=0.8, random_state=234)
test_data=df.drop(train_data.index)

In [9]:
MODEL_NAME="beomi/KcELECTRA-base"
tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
tokenized_train_sentences=tokenizer(
    list(train_data["sentence"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [11]:
print(tokenized_train_sentences[0])
print(tokenized_train_sentences[0].tokens)
print(tokenized_train_sentences[0].ids)
print(tokenized_train_sentences[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['Ġê²°íĺ¼íķĺê³ł', 'ĠëĤ´', 'Ġíİ¸ìĿ´', 'ĠìĥĿê²¼', 'ëĭ¤ëĬĶ', 'ĠìĥĿê°ģìĿ´', 'Ġëĵ¤ìĸ´ìĦľ', 'Ġê·¸ëŁ°', 'Ġê±´ì§Ģ', 'ĠìĤ¶ìĿ´', 'ĠìķĦì£¼', 'ĠëĬĲ', 'ê¸ĭ', 'íķ´ì¡Į', 'ìĸ´', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[

In [12]:
tokenized_test_sentences=tokenizer(
    list(test_data["sentence"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [13]:
class CurseDataset(torch.utils.data.Dataset):
  def __init__(self,encodings,labels):
    self.encodings=encodings
    self.labels=labels
  
  def __getitem__(self,idx):
    #item={key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item={key: val[idx].clone().detach() for key, val in self.encodings.items()}
    item["labels"]=torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [14]:
train_label=train_data["emotion"].values
test_label=test_data["emotion"].values

train_dataset=CurseDataset(tokenized_train_sentences,train_label)
test_dataset=CurseDataset(tokenized_test_sentences,test_label)

In [15]:
model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,num_labels=6)
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.weight', 'classifi

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [16]:
#!pip uninstall -y transformers accelerate
#!pip install transformers accelerate

In [17]:
training_args=TrainingArguments(
    output_dir='./',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2
)

In [18]:
def compute_metrics(pred):
  labels=pred.label_ids
  preds=pred.predictions.argmax(-1)
  precision,recall,f1,_=precision_recall_fscore_support(labels,preds,average='weighted')
  acc=accuracy_score(labels,preds)
  return {
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall
  }

In [19]:
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [20]:
#def change_transformers_dataset_2_right_format(dataset, label_name):
#  return dataset.map(lambda example: {'label': example[label_name]}, remove_columns=[label_name])

In [21]:
trainer.train()



Step,Training Loss
500,1.429
1000,1.2282
1500,1.1089
2000,1.0719
2500,1.0629
3000,1.0412
3500,1.0089
4000,1.003
4500,0.9992
5000,0.9717


Step,Training Loss
500,1.429
1000,1.2282
1500,1.1089
2000,1.0719
2500,1.0629
3000,1.0412
3500,1.0089
4000,1.003
4500,0.9992
5000,0.9717


TrainOutput(global_step=68160, training_loss=0.6231161126508399, metrics={'train_runtime': 13634.3745, 'train_samples_per_second': 39.992, 'train_steps_per_second': 4.999, 'total_flos': 3.586792941743616e+16, 'train_loss': 0.6231161126508399, 'epoch': 5.0})

In [22]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 1.7083837985992432,
 'eval_accuracy': 0.6790155155338737,
 'eval_f1': 0.6772304889086543,
 'eval_precision': 0.675982262722972,
 'eval_recall': 0.6790155155338737,
 'eval_runtime': 180.1663,
 'eval_samples_per_second': 151.321,
 'eval_steps_per_second': 2.364,
 'epoch': 5.0}

In [24]:
torch.save(model,'/content/drive/MyDrive/kcelectra-v3.pt')

In [26]:
# 모델 사이즈 확인(파라미터는 v1과 동일)
import os

model_path = '/content/drive/MyDrive/kcelectra-v3.pt'
size2 = os.path.getsize(model_path) / (1024*1024) # mb 단위
print(f"Model size: {size2:.2f} MB")

Model size: 416.25 MB
