## **데이터 불러오기**

코랩

In [None]:
# from google.colab import drive  # Colab only
# drive.mount('/content/drive')

In [None]:
# If the relative paths used below do not resolve, change into the sentiment-analysis directory first
# %cd /content/drive/MyDrive/Github/floread/sentiment-analysis  # Colab
%cd sentiment-analysis

In [2]:
import pandas as pd

# index_col=0: use the first CSV column as the index (original note: this avoids an 'Unnamed: 0' column)
train_set = pd.read_csv('data/감성대화말뭉치(병합).csv', index_col=0)
# Display 5 randomly sampled rows as a sanity check
train_set.sample(n=5)

Unnamed: 0,sentence,emotion
42271,맞벌이하는데 아이를 유치원에 데려다주고 다시 집으로 데리고 오는 게 쉽지가 않아.,불안
1093,나는 내 주위 사람들 중에 내가 제일 노래를 잘한다고 생각해.,기쁨
4792,이번에 청약이 또 떨어졌어.,슬픔
7408,부서원 사람들과 대화가 잘 안 되는 것 같아서 걱정이야.,상처
49823,사고로 얼굴에 흉이 진 후로 밖으로 나가질 못하겠어.,슬픔


**감정을 정수 라벨로 변경**

In [3]:
# Encode each emotion name as an integer class id (its position in this list).
emotion_names = ['기쁨', '불안', '당황', '슬픔', '분노', '상처']
emotions = {name: class_id for class_id, name in enumerate(emotion_names)}

# Replace the text labels in the 'emotion' column with their integer ids.
train_set['emotion'] = train_set['emotion'].map(emotions)

train_set.sample(n=5)

Unnamed: 0,sentence,emotion
3479,내 잘못도 아닌데 정직 처분을 받아 너무 속상해.,5
43386,아내와 사별한 이후 아들과 어떤 교류도 없이 지내고 있어. 내가 왜 사는 건지 모르겠어.,5
34020,친구들한테 무시당하는 게 일상이 되어버려서 서러워. 오늘도 말을 걸었는데 무시당했어.,4
19368,난 알코올 중독을 앓고 있어. 그런데도 친구들은 나에게 술을 권해서 이따금 화가 나.,4
8272,내가 원하던 일은 이게 아닌데.,1


__패키지 설치: Korean BERT pre-trained cased (KoBERT) for Huggingface Transformers__
https://github.com/SKTBrain/KoBERT/blob/master/kobert_hf/requirements.txt


-> 가상환경 설정은 `ve_kobert.ipynb` 참고

(in colab)

```python
%pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
%pip install tqdm   
%pip install sentencepiece transformers
%pip install torch

%pip install mxnet
# !pip install -U --pre "mxnet-cu118>=2.0.0a"   # cu118이랑 맞는 버전 없음
# !pip install -U --pre "mxnet>=2.0.0a"   #cpu only

%pip install gluonnlp
```

In [4]:
# 라이브러리 불러오기
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
# from tqdm import tqdm, tqdm_notebook  //
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

#kobert
from kobert_tokenizer import KoBERTTokenizer

# transformers
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Pick the compute device: use the GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**토크나이저, pretrained 모델, vocabulary 로드**

In [5]:
# Load the KoBERT tokenizer and the pretrained BERT weights from 'skt/kobert-base-v1'.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# return_dict=False: the classifier's forward() unpacks the encoder output as a tuple.
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Build the gluonnlp vocabulary from the tokenizer's sentencepiece vocab file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Short alias for the tokenizer's tokenize method (passed to BERTDataset below).
tok = tokenizer.tokenize

# Hyperparameters (based on the KoBERT fine-tuning baseline)
max_len = 32    # baseline: 64
batch_size = 32 # baseline: 64
warmup_ratio = 0.1
num_epochs = 5  # number of epochs
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

**KoBERT 클래스 정의**

In [6]:
# Dataset class that feeds the model
class BERTDataset(Dataset):
    """Dataset that converts every row into BERT inputs once, at construction.

    Each row of ``dataset`` is indexed with ``sent_idx`` to obtain the sentence
    and with ``label_idx`` to obtain its label. Sentences go through a
    ``gluonnlp`` ``BERTSentenceTransform``; labels are converted with
    ``np.int32``.

    Attributes:
        sentences: One transform output per row. Each is expected to be a
            tuple, because ``__getitem__`` concatenates it with a 1-tuple.
        labels: One ``np.int32`` label per row.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # First pass over the rows: transform the sentences.
        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

        # Second pass over the rows: convert the labels.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with its label appended."""
        features = self.sentences[i]
        label = (self.labels[i],)
        return features + label

    def __len__(self):
        """Return the number of rows (one label per row)."""
        return len(self.labels)

In [7]:
# Emotion classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a 2-tuple whose second element is the pooled output that
            is fed to the classifier (for a Hugging Face ``BertModel`` this
            requires ``return_dict=False``).
        hidden_size: Size of the pooled output (input width of the head).
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids, one row per example.
            valid_length: Per-row count of real (non-padding) tokens; a
                tensor or a sequence of ints.

        Returns:
            Float tensor with the same shape and device as ``token_ids``.
        """
        # One comparison for the whole batch instead of a Python loop per row.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: Per-row count of real tokens (see ``gen_attention_mask``).
            segment_ids: Token type ids; cast to ``long`` before the encoder call.

        Returns:
            Output of the linear head applied to the (optionally dropped-out)
            pooled encoder output.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        # Without dropout the pooled output goes straight to the head
        # (previously `out` was undefined in that case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

**데이터 셋 피팅**

In [8]:
# Pair every sentence with its label (as a string) in [sentence, label] rows.
labelled_rows = [
    [sentence, str(label)]
    for sentence, label in zip(train_set['sentence'], train_set['emotion'])
]

# Hold out 20% of the rows for evaluation (4:1 split) with a fixed seed.
train_rows, test_rows = train_test_split(labelled_rows, test_size=0.2, random_state=4)

# Convert both splits into model inputs: sentence at index 0, label at index 1,
# pad=True, pair=False.
train_set_data = BERTDataset(train_rows, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_rows, 0, 1, tok, vocab, max_len, True, False)

# Batch loaders. num_workers is the number of data-loading worker processes;
# 0 loads the data in the main process.
train_dataloader = torch.utils.data.DataLoader(
    train_set_data, batch_size=batch_size, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(
    test_set_data, batch_size=batch_size, num_workers=0)

모델 선언

In [9]:
# Build the classifier and move it to the selected device.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps over all epochs; the first `warmup_ratio`
# share of them is used as warm-up for the cosine schedule.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

**정확도 계산 함수**

In [10]:
def calc_accuracy(X, Y):
    """Return the fraction of rows of ``X`` whose arg-max column equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per example.
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        Number of matching rows divided by the number of rows, as a NumPy value.
    """
    _, predicted = torch.max(X, 1)
    hits = (predicted == Y).sum().data.cpu().numpy()
    batch_len = predicted.size()[0]
    return hits / batch_len

In [11]:
# Return the prediction for one input sentence
def predict(sentence):
    """Classify a single sentence with the notebook's global ``model``.

    The sentence is wrapped in a one-row dataset with a dummy label ``'0'`` so
    it can go through the same ``BERTDataset`` pipeline as the training data.
    The model is switched to eval mode and left in that mode.

    Args:
        sentence: Raw input text.

    Returns:
        The index of the largest logit (as returned by ``np.argmax``) for the
        last row processed, or ``0`` if the loader yields no batches.
    """
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # num_workers=0: load in the main process (the default; used for local runs).
    loader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            for logits in out:
                answer = np.argmax(logits.cpu().numpy())
    return answer

In [12]:
import os
import psutil

# Query the host's CPU count and the memory that is currently available.
cpu_count = os.cpu_count()
available_memory = psutil.virtual_memory().available  # presumably bytes — confirm against the psutil docs

print("CPU 코어 수:", cpu_count)
print("사용 가능한 메모리:", available_memory)

CPU 코어 수: 6
사용 가능한 메모리: 9935974400


로컬에서 RuntimeError 방지

In [1]:
import gc
import torch

# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
# NOTE(review): the notebook heading says this is to avoid a RuntimeError on local
# runs — presumably CUDA out-of-memory; confirm.
gc.collect()
torch.cuda.empty_cache()

학습

In [13]:
# Fine-tune for `num_epochs` epochs; after each epoch measure accuracy on the
# held-out loader. Accuracies are running means of the per-batch accuracy.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass on the held-out batches built earlier ----
    model.eval()
    # No gradients are needed here; disabling autograd avoids holding
    # activation graphs in memory during evaluation.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/1457 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.7916381359100342 train acc 0.125
epoch 1 batch id 201 loss 1.858939528465271 train acc 0.1982276119402985
epoch 1 batch id 401 loss 1.2915337085723877 train acc 0.30860349127182046
epoch 1 batch id 601 loss 1.2887184619903564 train acc 0.37583194675540765
epoch 1 batch id 801 loss 1.2145276069641113 train acc 0.4165106117353308
epoch 1 batch id 1001 loss 1.274234652519226 train acc 0.4406843156843157
epoch 1 batch id 1201 loss 1.4031423330307007 train acc 0.45691090757701913
epoch 1 batch id 1401 loss 0.9159780740737915 train acc 0.4697760528194147
epoch 1 train acc 0.47312542896362386


  0%|          | 0/365 [00:00<?, ?it/s]

epoch 1 test acc 0.5810053816046966


  0%|          | 0/1457 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.172863245010376 train acc 0.5
epoch 2 batch id 201 loss 1.3949519395828247 train acc 0.5558146766169154
epoch 2 batch id 401 loss 0.9046579003334045 train acc 0.5660847880299252
epoch 2 batch id 601 loss 1.2276753187179565 train acc 0.5680636439267887
epoch 2 batch id 801 loss 1.1491986513137817 train acc 0.5765059300873908
epoch 2 batch id 1001 loss 1.0736383199691772 train acc 0.5805444555444556
epoch 2 batch id 1201 loss 1.3014317750930786 train acc 0.5839664862614488
epoch 2 batch id 1401 loss 0.7532556056976318 train acc 0.5865899357601713
epoch 2 train acc 0.588244966826813


  0%|          | 0/365 [00:00<?, ?it/s]

epoch 2 test acc 0.5921844422700586


  0%|          | 0/1457 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.026536464691162 train acc 0.65625
epoch 3 batch id 201 loss 1.434572696685791 train acc 0.6005907960199005
epoch 3 batch id 401 loss 0.8356627225875854 train acc 0.6161159600997507
epoch 3 batch id 601 loss 1.1390113830566406 train acc 0.6206322795341098
epoch 3 batch id 801 loss 1.0710867643356323 train acc 0.6321395131086143
epoch 3 batch id 1001 loss 1.0520281791687012 train acc 0.6391733266733267
epoch 3 batch id 1201 loss 1.225669026374817 train acc 0.644020607826811
epoch 3 batch id 1401 loss 0.5863918662071228 train acc 0.6477962169878658
epoch 3 train acc 0.6495224204987416


  0%|          | 0/365 [00:00<?, ?it/s]

epoch 3 test acc 0.594924168297456


  0%|          | 0/1457 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.9479301571846008 train acc 0.65625
epoch 4 batch id 201 loss 1.2953827381134033 train acc 0.6764614427860697
epoch 4 batch id 401 loss 0.7716375589370728 train acc 0.6931889027431422
epoch 4 batch id 601 loss 0.9818992018699646 train acc 0.6979513311148087
epoch 4 batch id 801 loss 0.9610339999198914 train acc 0.7094647315855181
epoch 4 batch id 1001 loss 0.8501701354980469 train acc 0.7148476523476524
epoch 4 batch id 1201 loss 1.1831800937652588 train acc 0.7172928809325562
epoch 4 batch id 1401 loss 0.5624518394470215 train acc 0.7204452177016417
epoch 4 train acc 0.7217241477922671


  0%|          | 0/365 [00:00<?, ?it/s]

epoch 4 test acc 0.5960738747553815


  0%|          | 0/1457 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.8474833965301514 train acc 0.75
epoch 5 batch id 201 loss 1.1202601194381714 train acc 0.7366293532338308
epoch 5 batch id 401 loss 0.7263317704200745 train acc 0.7520261845386533
epoch 5 batch id 601 loss 0.7553805708885193 train acc 0.7566035773710482
epoch 5 batch id 801 loss 0.7687376737594604 train acc 0.7625624219725343
epoch 5 batch id 1001 loss 0.8911058306694031 train acc 0.7657030469530469
epoch 5 batch id 1201 loss 1.0076465606689453 train acc 0.7654818900915903
epoch 5 batch id 1401 loss 0.5187050104141235 train acc 0.7662607066381156
epoch 5 train acc 0.7671227979867308


  0%|          | 0/365 [00:00<?, ?it/s]

epoch 5 test acc 0.5915362035225049


**모델 정보**

In [14]:
# Model info: print the model's type, then its module structure
print(type(model), model, sep="\n\n")

<class '__main__.BERTClassifier'>

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

학습 모델 저장

In [None]:
# Colab

# torch.save(model, './model/sa-tutorial1.pt')
# torch.save(model.state_dict(), './model/sa-tutorial1_StateDict.pt')

# Save the whole model object, then only its state_dict, to Google Drive paths.
torch.save(model, f'/content/drive/MyDrive/Github/sentiment-analysis/model/sa-tutorial1.pt')
torch.save(model.state_dict(), f'/content/drive/MyDrive/Github/sentiment-analysis/model/sa-tutorial1_StateDict.pt')

In [15]:
# Local: save the whole model object under model/
torch.save(model, 'model/kobert-v1.pt')

In [16]:
# Report the on-disk size of the saved model file
import os

model_path1 = 'model/kobert-v1.pt'

# os.path.getsize returns bytes; divide by 1024*1024 to express it in MB.
bytes_per_mb = 1024 * 1024
size1 = os.path.getsize(model_path1) / bytes_per_mb
#size2 = os.path.getsize(model_path2) / (1024*1024)
print(f"Model size: {size1:.2f} MB")
#print(f"Model_StateDict size: {size2:.2f} MB")

Model size: 351.79 MB
