In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch



In [3]:
#깃허브에서 KoBERT 파일 로드

# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master -> 오류 날 수 있음
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf' # hugging face를 통한 모델 다운로드 방식

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-m9rx4jkh/kobert-tokenizer_574c2d64c5f445749164cccf316ec7f8
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-m9rx4jkh/kobert-tokenizer_574c2d64c5f445749164cccf316ec7f8
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import os
import sys
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import gluonnlp as nlp
from tqdm.notebook import tqdm
from tqdm import tqdm, tqdm_notebook

In [5]:
# Hugging Face를 통한 모델 및 토크나이저 Import
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [6]:
#GPU 사용
device = torch.device("cuda:0")

In [7]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


### 말뭉치 데이터

In [8]:
train_dataset = pd.read_csv('/content/drive/MyDrive/YBIGTA/YBIGTA 투게더 프로젝트/대화 데이터/감성대화말뭉치(전처리)_Training.csv')
test_dataset = pd.read_csv('/content/drive/MyDrive/YBIGTA/YBIGTA 투게더 프로젝트/대화 데이터/감성대화말뭉치(전처리)_Validation.csv')

In [9]:
train_dataset.loc[(train_dataset['감정_대분류'] == "기쁨"), 'Emotion'] = 0
train_dataset.loc[(train_dataset['감정_대분류'] == "슬픔"), 'Emotion'] = 1
train_dataset.loc[(train_dataset['감정_대분류'] == "분노"), 'Emotion'] = 2
train_dataset.loc[(train_dataset['감정_대분류'] == "당황/불안"), 'Emotion'] = 3

In [10]:
test_dataset.loc[(test_dataset['감정_대분류'] == "기쁨"), 'Emotion'] = 0
test_dataset.loc[(test_dataset['감정_대분류'] == "슬픔"), 'Emotion'] = 1
test_dataset.loc[(test_dataset['감정_대분류'] == "분노"), 'Emotion'] = 2
test_dataset.loc[(test_dataset['감정_대분류'] == "당황/불안"), 'Emotion'] = 3

In [20]:
train_dataset['Emotion'] = train_dataset['Emotion'].astype(int)
test_dataset['Emotion'] = test_dataset['Emotion'].astype(int)

In [21]:
# 텍스트, 레이블 형태의 리스트로 변환

train_data = []
for q, label in zip(train_dataset['사람문장'], train_dataset['Emotion']):
    data = [q, str(label)]
    train_data.append(data)

test_data = []
for q, label in zip(test_dataset['사람문장'], test_dataset['Emotion']):
    data = [q, str(label)]
    test_data.append(data)

In [22]:
train_data[0]

['일은 왜 해도 해도 끝이 없을까? 화가 난다. 그냥 내가 해결하는 게 나아. 남들한테 부담 주고 싶지도 않고. ', '2']

In [36]:
len(train_data), len(test_data)

(51630, 6641)

In [23]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [24]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 15
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [25]:
tok = tokenizer.tokenize

train_data = BERTDataset(train_data, 0, 1, tok, vocab, max_len, True, False)
test_data = BERTDataset(test_data, 0, 1, tok, vocab, max_len, True, False)

In [26]:
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=2)

### BERT 모델

In [27]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [28]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [29]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [30]:
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, weight_decay=0.01)  # weight_decay 설정 추가

loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [31]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [35]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/807 [00:00<?, ?it/s]


RuntimeError: ignored

In [34]:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = True
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x795ec7287010>