In [2]:
!pip install transformers
import pandas as pd

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
#%cd /content/drive/MyDrive/Github/floread/sentiment-analysis
%cd /content/drive/MyDrive/Colab Notebooks

train_set = pd.read_csv('병합데이터셋-v3.csv', index_col=0)
train_set.sample(n=5)

/content/drive/MyDrive/Colab Notebooks


Unnamed: 0,sentence,emotion
120022,앉아. 이거라도 좀 먹어.,중립
117461,동영상 저작권 걸렸자나!!,분노
49904,고객님!! 너무 순진하시다. 소녀두 아니고.... 중매결혼에 웬 사랑이요? 주변에서...,중립
57974,애인 만나려고 기껏 꾸몄는데 애인이랑 싸우고 돌아왔어. 속상해.,슬픔
113592,"네, 손님은 얼굴이 작아서 짧은 머리가 잘 어울릴 거예요. 그런데 이런 머리 모양은...",중립


In [4]:
# 감정을 정수 라벨로 변경
emotions = {'행복': 0, '불안': 1, '놀람': 2, '슬픔': 3, '분노': 4, '중립': 5}
train_set['emotion'] = train_set.emotion.map(emotions)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
25399,이름값 하는거 보소ㄷㄷ,2
40672,그냥 칠레에서 알아서 하세요.,5
23692,얼마 전에 뇌에 문제가 있다는 진단을 받았어. 난 왜 이렇게 불행한 건지 모르겠어.,3
66201,아내가 나를 휠체어에서 떨어뜨려서 화를 내고 말았어. 이러기 싫은데 너무 슬퍼.,3
9990,아니. 느이 담임선생님께서 할미 일하는 델 찾아오셨지 뭐냐.,5


In [5]:
# torch.utils.data.Dataset 상속
class mscDataset(Dataset):  #multi-label sentiment classification
  def __init__(self, csv_file):
    # 결측치 제거
    self.dataset = csv_file
    # 중복값 제거
    self.dataset.drop_duplicates(subset=['sentence'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")

    print(self.dataset.describe())

#   def __init__(self, csv_file):
#     self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
#     print(self.dataset.describe())

  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [6]:
train_data, test_data = train_test_split(train_set, test_size=0.2, random_state=0)
train_dataset = mscDataset(train_data)
test_dataset = mscDataset(test_data)

Downloading (…)okenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

             emotion
count  108617.000000
mean        3.113343
std         1.738215
min         0.000000
25%         2.000000
50%         3.000000
75%         5.000000
max         5.000000
            emotion
count  27159.000000
mean       3.115800
std        1.739844
min        0.000000
25%        2.000000
50%        3.000000
75%        5.000000
max        5.000000


In [7]:
#모델 불러오기
device = torch.device("cuda:0") #(colab)그냥 "cuda"로 하면 애러
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator", num_labels=6).to(device)

# 하이퍼 파라미터
epochs = 6
batch_size = 16
optimizer = AdamW(model.parameters(), lr=5e-6)

#데이터 로더
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

Downloading pytorch_model.bin:   0%|          | 0.00/56.6M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [8]:
# 학습
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/6789 [00:00<?, ?it/s]



Batch Loss: 177.2447907924652 Accuracy: tensor(0.3431, device='cuda:0')
Batch Loss: 351.91840982437134 Accuracy: tensor(0.3384, device='cuda:0')
Batch Loss: 525.0809745788574 Accuracy: tensor(0.3346, device='cuda:0')
Batch Loss: 697.1054005622864 Accuracy: tensor(0.3328, device='cuda:0')
Batch Loss: 867.604327082634 Accuracy: tensor(0.3316, device='cuda:0')
Batch Loss: 1035.412055015564 Accuracy: tensor(0.3324, device='cuda:0')
Batch Loss: 1199.7841721773148 Accuracy: tensor(0.3420, device='cuda:0')
Batch Loss: 1360.396688580513 Accuracy: tensor(0.3523, device='cuda:0')
Batch Loss: 1518.782312989235 Accuracy: tensor(0.3601, device='cuda:0')
Batch Loss: 1677.198271870613 Accuracy: tensor(0.3643, device='cuda:0')
Batch Loss: 1833.8944092988968 Accuracy: tensor(0.3686, device='cuda:0')
Batch Loss: 1988.9194014072418 Accuracy: tensor(0.3722, device='cuda:0')
Batch Loss: 2140.0555357933044 Accuracy: tensor(0.3762, device='cuda:0')
Batch Loss: 2288.755019426346 Accuracy: tensor(0.3812, devic

  0%|          | 0/6789 [00:00<?, ?it/s]

Batch Loss: 128.81978332996368 Accuracy: tensor(0.4975, device='cuda:0')
Batch Loss: 259.7958554029465 Accuracy: tensor(0.4959, device='cuda:0')
Batch Loss: 393.83743262290955 Accuracy: tensor(0.4952, device='cuda:0')
Batch Loss: 521.2827425599098 Accuracy: tensor(0.5016, device='cuda:0')
Batch Loss: 651.1576960086823 Accuracy: tensor(0.5046, device='cuda:0')
Batch Loss: 780.3341899514198 Accuracy: tensor(0.5034, device='cuda:0')
Batch Loss: 902.5727220773697 Accuracy: tensor(0.5090, device='cuda:0')
Batch Loss: 1031.0436394810677 Accuracy: tensor(0.5094, device='cuda:0')
Batch Loss: 1155.7440653443336 Accuracy: tensor(0.5136, device='cuda:0')
Batch Loss: 1280.1459568738937 Accuracy: tensor(0.5160, device='cuda:0')
Batch Loss: 1408.2818250656128 Accuracy: tensor(0.5157, device='cuda:0')
Batch Loss: 1532.4842338562012 Accuracy: tensor(0.5153, device='cuda:0')
Batch Loss: 1659.4133672714233 Accuracy: tensor(0.5155, device='cuda:0')
Batch Loss: 1782.9153820872307 Accuracy: tensor(0.5161, 

  0%|          | 0/6789 [00:00<?, ?it/s]

Batch Loss: 112.24206179380417 Accuracy: tensor(0.5475, device='cuda:0')
Batch Loss: 226.15706479549408 Accuracy: tensor(0.5512, device='cuda:0')
Batch Loss: 337.8583177924156 Accuracy: tensor(0.5577, device='cuda:0')
Batch Loss: 452.9446815252304 Accuracy: tensor(0.5570, device='cuda:0')
Batch Loss: 564.6462891101837 Accuracy: tensor(0.5611, device='cuda:0')
Batch Loss: 672.2960765957832 Accuracy: tensor(0.5659, device='cuda:0')
Batch Loss: 785.8252856731415 Accuracy: tensor(0.5661, device='cuda:0')
Batch Loss: 898.1522571444511 Accuracy: tensor(0.5670, device='cuda:0')
Batch Loss: 1008.4352394342422 Accuracy: tensor(0.5668, device='cuda:0')
Batch Loss: 1115.8415058255196 Accuracy: tensor(0.5679, device='cuda:0')
Batch Loss: 1222.6402948200703 Accuracy: tensor(0.5706, device='cuda:0')
Batch Loss: 1332.2900922596455 Accuracy: tensor(0.5702, device='cuda:0')
Batch Loss: 1443.8468291461468 Accuracy: tensor(0.5697, device='cuda:0')
Batch Loss: 1554.6155144870281 Accuracy: tensor(0.5709, d

  0%|          | 0/6789 [00:00<?, ?it/s]

Batch Loss: 106.76666921377182 Accuracy: tensor(0.5881, device='cuda:0')
Batch Loss: 209.59215658903122 Accuracy: tensor(0.5972, device='cuda:0')
Batch Loss: 313.0338616967201 Accuracy: tensor(0.5969, device='cuda:0')
Batch Loss: 419.5221570134163 Accuracy: tensor(0.5980, device='cuda:0')
Batch Loss: 522.183384925127 Accuracy: tensor(0.5984, device='cuda:0')
Batch Loss: 628.4996302425861 Accuracy: tensor(0.5976, device='cuda:0')
Batch Loss: 731.3651430308819 Accuracy: tensor(0.5971, device='cuda:0')
Batch Loss: 834.4447713494301 Accuracy: tensor(0.5993, device='cuda:0')
Batch Loss: 938.744508266449 Accuracy: tensor(0.5994, device='cuda:0')
Batch Loss: 1044.4386338591576 Accuracy: tensor(0.5988, device='cuda:0')
Batch Loss: 1148.2625023722649 Accuracy: tensor(0.5987, device='cuda:0')
Batch Loss: 1253.5129329562187 Accuracy: tensor(0.5990, device='cuda:0')
Batch Loss: 1357.858092725277 Accuracy: tensor(0.5996, device='cuda:0')
Batch Loss: 1465.0807394385338 Accuracy: tensor(0.5983, devic

  0%|          | 0/6789 [00:00<?, ?it/s]

Batch Loss: 95.2673749923706 Accuracy: tensor(0.6400, device='cuda:0')
Batch Loss: 191.19394081830978 Accuracy: tensor(0.6422, device='cuda:0')
Batch Loss: 286.32200062274933 Accuracy: tensor(0.6444, device='cuda:0')
Batch Loss: 380.57874223589897 Accuracy: tensor(0.6498, device='cuda:0')
Batch Loss: 475.56078347563744 Accuracy: tensor(0.6478, device='cuda:0')
Batch Loss: 572.6314339637756 Accuracy: tensor(0.6490, device='cuda:0')
Batch Loss: 669.6362198889256 Accuracy: tensor(0.6479, device='cuda:0')
Batch Loss: 764.6736714243889 Accuracy: tensor(0.6494, device='cuda:0')
Batch Loss: 860.0752490460873 Accuracy: tensor(0.6487, device='cuda:0')
Batch Loss: 955.7210849821568 Accuracy: tensor(0.6494, device='cuda:0')
Batch Loss: 1047.9383464157581 Accuracy: tensor(0.6507, device='cuda:0')
Batch Loss: 1143.034412920475 Accuracy: tensor(0.6509, device='cuda:0')
Batch Loss: 1238.0285795927048 Accuracy: tensor(0.6512, device='cuda:0')
Batch Loss: 1335.7228624522686 Accuracy: tensor(0.6502, dev

  0%|          | 0/6789 [00:00<?, ?it/s]

Batch Loss: 89.72361528873444 Accuracy: tensor(0.6875, device='cuda:0')
Batch Loss: 180.0715980231762 Accuracy: tensor(0.6841, device='cuda:0')
Batch Loss: 270.52556559443474 Accuracy: tensor(0.6775, device='cuda:0')
Batch Loss: 360.02367463707924 Accuracy: tensor(0.6728, device='cuda:0')
Batch Loss: 450.95584177970886 Accuracy: tensor(0.6728, device='cuda:0')
Batch Loss: 542.763541162014 Accuracy: tensor(0.6721, device='cuda:0')
Batch Loss: 632.9038734734058 Accuracy: tensor(0.6730, device='cuda:0')
Batch Loss: 724.5054668188095 Accuracy: tensor(0.6715, device='cuda:0')
Batch Loss: 822.018376648426 Accuracy: tensor(0.6679, device='cuda:0')
Batch Loss: 912.6683161556721 Accuracy: tensor(0.6691, device='cuda:0')
Batch Loss: 1002.96099999547 Accuracy: tensor(0.6697, device='cuda:0')
Batch Loss: 1093.5077257454395 Accuracy: tensor(0.6692, device='cuda:0')
Batch Loss: 1183.832862406969 Accuracy: tensor(0.6686, device='cuda:0')
Batch Loss: 1276.7171142697334 Accuracy: tensor(0.6680, device=

In [9]:
losses, accuracies

([9725.681244909763,
  8104.095777839422,
  7391.026688337326,
  6876.541300296783,
  6445.992814809084,
  6195.132044821978],
 [tensor(0.4466, device='cuda:0'),
  tensor(0.5426, device='cuda:0'),
  tensor(0.5787, device='cuda:0'),
  tensor(0.6227, device='cuda:0'),
  tensor(0.6519, device='cuda:0'),
  tensor(0.6652, device='cuda:0')])

In [10]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/1698 [00:00<?, ?it/s]

Accuracy: tensor(0.6657, device='cuda:0')


In [11]:
# torch.save(model, 'model/koelectra-v2.pt')
torch.save(model, 'koelectra-v2.pt')

In [12]:
import os

# size = os.path.getsize('model/koelectra-v2.pt') / (1024*1024) # mb 단위
size = os.path.getsize('koelectra-v2.pt') / (1024*1024) # mb 단위
print(f"Model size: {size:.2f} MB")

Model size: 54.00 MB
