In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [3]:
# Check whether PyTorch can see a CUDA device (the recorded run displayed True).
# `torch` is already imported in the imports cell, so this repeat import is a no-op.
import torch
torch.cuda.is_available()

True

In [4]:
# Pick the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [5]:
# Load the merged emotional-dialogue corpus from the mounted Google Drive and preview
# the first rows (columns seen in the preview: two leftover index columns, `sentence`, `emotion`).
DATA_PATH = "/content/drive/MyDrive/sentiment/floread/sentiment-analysis/data/감성대화말뭉치(병합)2.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,emotion
0,0,일은 왜 해도 해도 끝이 없을까? 화가 난다.,분노
1,1,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나.,분노
2,2,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...,분노
3,3,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...,분노
4,4,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나.,분노


In [6]:
# Convert the emotion names into integer class ids.
emotions = {'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5}
# Guard so that re-running this cell is a no-op: pushing an already-integer column
# through `emotions` again would turn every value into NaN.
if not pd.api.types.is_integer_dtype(df['emotion']):
    emotion_ids = df['emotion'].map(emotions)
    # Series.map yields NaN for any value missing from the dict; fail loudly here
    # instead of letting NaN labels reach training.
    unknown = df.loc[emotion_ids.isna(), 'emotion'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped emotion labels: {list(unknown)}")
    df['emotion'] = emotion_ids.astype('int64')
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,sentence,emotion
39609,39620,한때는 참 건강했는데 이제는 내 몸이 말을 듣지 않아.,3
55764,4164,아내가 일이 바빠서 함께 할 시간이 별로 없는데 며칠 뒤에 다시 장기 출장을 간다고...,2
20650,20650,술자리에서 남자친구가 나에게 집에 빚이 많다는 사실을 숨긴 걸 알아서 놀랬어.,5
55251,3651,노후 준비와 관련한 프로그램을 찾고 있는데 아무런 자료가 없다는 게 어이없네.,4
18015,18015,당뇨 때문에 집에 식단을 관리하니 손자들이 놀러 오면 맛있는 게 없다고 투덜거려.,2


In [7]:
# Hold out 20% of the rows: draw a reproducible 80% sample for training
# (fixed random_state) and keep every row whose index was not drawn as the test set.
train_data = df.sample(frac=0.8, random_state=7)
in_train = df.index.isin(train_data.index)
test_data = df.loc[~in_train]

In [8]:
# Checkpoint identifier shared by the tokenizer and the classification model.
MODEL_NAME = "beomi/KcELECTRA-base"

# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [9]:
# Tokenize all training sentences in a single call: special tokens are added,
# sequences are truncated to at most 128 tokens and padded to a common length,
# and the result is returned as PyTorch tensors.
tokenized_train_sentences = tokenizer(
    train_data["sentence"].tolist(),
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding=True,
    return_tensors="pt",
)

In [10]:
# Inspect the first encoded training example: the Encoding object itself,
# then its tokens, its ids and its attention mask.
first_encoding = tokenized_train_sentences[0]
for view in (
    first_encoding,
    first_encoding.tokens,
    first_encoding.ids,
    first_encoding.attention_mask,
):
    print(view)

Encoding(num_tokens=64, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['ĠëĤĺ', 'Ġê°Ħ', 'ìķĶ', 'ìĿ´ëŀĺ', '.', 'Ġì£¼ë', '¶Ģ', 'ìļ°', 'ìļ¸', 'ì¦Ŀ', 'ìĿ´ëŀĢ', 'ĠìĿ´ìľłë¡ľ', 'ĠìĪłìĿĦ', 'ĠëĦĪë¬´', 'Ġë§İìĿ´', 'Ġë§Ī', 'ìħ¨', 'ëįĺ', 'Ġê²Į', 'ĠíĽĦíļĮ', 'ëĲĺê³ł', 'ĠìŀĲ', 'ì±ħ', 'ê°ĲìĿ´', 'Ġëĵ¤ìĸ´', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[349, 1083, 4409, 4871, 18, 1053, 311, 610, 957, 1051, 1808, 7013, 15650, 977, 1159, 634, 1633, 691, 1337, 5388, 2555, 384, 980, 4413, 1214, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [11]:
# Tokenize the held-out sentences with the same settings used for the training set
# (special tokens, truncation to 128 tokens, padding, PyTorch tensors).
tokenized_test_sentences = tokenizer(
    test_data["sentence"].tolist(),
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding=True,
    return_tensors="pt",
)

In [12]:
class CurseDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing pre-tokenized encodings with integer class labels.

  Args:
    encodings: Mapping (anything exposing ``.items()``) from field name to a
      tensor whose first dimension indexes the samples.
    labels: Indexable sequence holding one class id per sample.
  """

  def __init__(self,encodings,labels):
    self.encodings=encodings
    self.labels=labels

  def __getitem__(self,idx):
    """Return the ``idx``-th sample as a dict of tensors plus a ``labels`` entry."""
    # Copy each field so that callers cannot mutate the shared encoding tensors.
    item={key: val[idx].clone().detach() for key, val in self.encodings.items()}
    # Class ids must be integer (long) tensors; without an explicit dtype a float
    # label array would silently produce float targets for the classification head.
    item["labels"]=torch.tensor(self.labels[idx],dtype=torch.long)
    return item

  def __len__(self):
    """Number of samples, taken from the label sequence."""
    return len(self.labels)

In [13]:
# Pull the integer emotion ids out of each split as NumPy arrays ...
train_label = train_data["emotion"].to_numpy()
test_label = test_data["emotion"].to_numpy()

# ... and pair them with the matching tokenized sentences.
train_dataset = CurseDataset(encodings=tokenized_train_sentences, labels=train_label)
test_dataset = CurseDataset(encodings=tokenized_test_sentences, labels=test_label)

In [14]:
# Load the pretrained checkpoint with a 6-way classification head (one output per
# emotion class) and move it to the selected device; the last expression displays the model.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=6,
)
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [15]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.29.1
Uninstalling transformers-4.29.1:
  Successfully uninstalled transformers-4.29.1
Found existing installation: accelerate 0.19.0
Uninstalling accelerate-0.19.0:
  Successfully uninstalled accelerate-0.19.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.29.1-py3-none-any.whl (7.1 MB)
Collecting accelerate
  Using cached accelerate-0.19.0-py3-none-any.whl (219 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.19.0 transformers-4.29.1


In [16]:
# Fine-tuning configuration: 5 epochs, batch size 8 for training and 64 for evaluation,
# loss logged every 500 steps, outputs written to the current directory with logs in
# ./logs, and save_total_limit set to 2.
training_args = TrainingArguments(
    output_dir='./',
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
)

In [17]:
def compute_metrics(pred):
  """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

  Args:
    pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
      (per-class scores); the arg-max over the last axis is the predicted class.

  Returns:
    dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
  """
  y_true=pred.label_ids
  y_pred=pred.predictions.argmax(-1)
  # Index 0/1/2 of the result are precision/recall/F1; the 4th entry (support) is unused.
  scores=precision_recall_fscore_support(y_true,y_pred,average='weighted')
  return {
      'accuracy': accuracy_score(y_true,y_pred),
      'f1': scores[2],
      'precision': scores[0],
      'recall': scores[1]
  }

In [18]:
# Assemble the Trainer: model and hyper-parameters, the metric callback, and the
# training / evaluation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [19]:
#def change_transformers_dataset_2_right_format(dataset, label_name):
#  return dataset.map(lambda example: {'label': example[label_name]}, remove_columns=[label_name])

In [20]:
# Run fine-tuning with the Trainer built earlier in the notebook; the returned
# TrainOutput (global step, training loss, runtime metrics) is shown as the cell result.
trainer.train()



Step,Training Loss
500,1.6442
1000,1.4864
1500,1.4033
2000,1.3842
2500,1.3624
3000,1.3115
3500,1.341
4000,1.2886
4500,1.2249
5000,1.2512


Step,Training Loss
500,1.6442
1000,1.4864
1500,1.4033
2000,1.3842
2500,1.3624
3000,1.3115
3500,1.341
4000,1.2886
4500,1.2249
5000,1.2512


TrainOutput(global_step=29120, training_loss=1.0567240961305389, metrics={'train_runtime': 3519.7426, 'train_samples_per_second': 66.187, 'train_steps_per_second': 8.273, 'total_flos': 7662069100707840.0, 'train_loss': 1.0567240961305389, 'epoch': 5.0})

In [21]:
# Score the fine-tuned model on the held-out split; the result is a dict of eval_*
# entries (loss, the compute_metrics values, and runtime statistics).
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 1.3016859292984009,
 'eval_accuracy': 0.5843063186813187,
 'eval_f1': 0.5844650337903026,
 'eval_precision': 0.5850077568894553,
 'eval_recall': 0.5843063186813187,
 'eval_runtime': 29.3665,
 'eval_samples_per_second': 396.642,
 'eval_steps_per_second': 6.198,
 'epoch': 5.0}