# KoBERT finetuning

In [1]:
!pip install ipywidgets  # for vscode
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 6.9 MB/s 
Installing collected packages: jedi
Successfully installed jedi-0.18.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-ch9gnku9
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-ch9gnku9
Collecting boto3<=1.15.18
  Downloading boto3-1.15.18-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 6.6 MB/s 
[?25hCollecting gluonnlp<=0.10.0,>=0.6.0
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 71.0 MB/s 
[?25hCollecting mxnet<

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

In [3]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [4]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [5]:
## CPU
# device = torch.device("cpu")

## GPU
device = torch.device("cuda:0")

In [6]:
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


# KOR PARAMETERS

In [7]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

frac = 1
test_size = 0.2
dr_rate = 0.7

# 데이터 불러오기

In [None]:
# !wget -O .cache/ratings_train.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
# !wget -O .cache/ratings_test.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt

In [None]:
# dataset_train = nlp.data.TSVDataset(".cache/ratings_train.txt", field_indices=[1,2], num_discard_samples=1)
# dataset_test = nlp.data.TSVDataset(".cache/ratings_test.txt", field_indices=[1,2], num_discard_samples=1)

In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
#데이터 가공
#청구기호 숫자 앞 2개만 따와서 각각 매핑. 65~70은 예외적으로 경영학
import pandas as pd
import re 
data = pd.read_csv("/content/gdrive/MyDrive/LibraryCsv/hapbon.csv", encoding = "cp949")
data = data.loc[:,['제목', '청구번호']]
data['청구번호'] = data['청구번호'].apply(str)

data['청구번호'] = data['청구번호'].replace({r'(.*?)(\d{2})\d.*' : r'\2'}, regex=True)

data['청구번호'] = pd.to_numeric(data['청구번호'])
data.loc[(data['청구번호'] < 10), ['청구번호']] = 1000
data.loc[(data['청구번호'] < 20), ['청구번호']] = 1001
data.loc[(data['청구번호'] < 30), ['청구번호']] = 1002
data.loc[(data['청구번호'] < 40), ['청구번호']] = 1003
data.loc[(data['청구번호'] < 50), ['청구번호']] = 1004
data.loc[(data['청구번호'] < 60), ['청구번호']] = 1005
data.loc[(data['청구번호'] < 65), ['청구번호']] = 1006 #경영학쪽은 따로 분류이기 떄문에 65 사용
data.loc[(data['청구번호'] < 70), ['청구번호']] = 1007 #즉, 1007 쪽이 경영학책
data.loc[(data['청구번호'] < 80), ['청구번호']] = 1008
data.loc[(data['청구번호'] < 90), ['청구번호']] = 1009
data.loc[(data['청구번호'] <  100), ['청구번호']] = 1010

labels = {'0':'총류',
          '1':'철학',
          '2':'종교',
          '3':'사회학',
          '4':'언어',
          '5':'자연과학',
          '6':'기술과학',
          '7':'경영학',
          '8':'예술',
          '9':'문학',
          '10':'역사'
          }

data['청구번호'] = data['청구번호']%1000
data['청구번호'] = data['청구번호'].astype(int)
# data['청구번호'] = data['청구번호'].apply(str)
# data['청구번호'] = data['청구번호'].replace(labels)

#input_string = "Peace-building and development in Guatemala and Northern Ireland"
regex = '[0-9|A-Z|a-z|ㄱ-ㅎ|ㅏ-ㅣ|가-힣|\s]*[ㄱ-ㅎ|ㅏ-ㅣ|가-힣][0-9|A-Z|a-z|ㄱ-ㅎ|ㅏ-ㅣ|가-힣|\s]*'
kor_data = data[data.제목.str.fullmatch(regex)]                                 # 한글이 있다면 따로 빼내기
eng_data = pd.concat([data, kor_data, kor_data]).drop_duplicates(keep=False)

kor_data

Unnamed: 0,제목,청구번호
1,대학수학능력시험의 영어 독해문제 분석 및 개선방안,0
5,위험한 생각들,0
34,무지의 사전,0
38,21세기 지구에 등장한 새로운 지식,0
42,두 문화,0
...,...,...
436189,독도연감,10
436250,중국현대사사전,10
436276,이슬람 사전,10
436281,싱가포르 편람,10


In [None]:
# en_labels = {'0':'totals',
#            '1':'philosophy',
#            '2':'religion',
#            '3':'sociology',
#            '4':'language',
#            '5':'Natural science',
#            '6':'Technology',
#            '7':'Business Administration',
#            '8':'art',
#            '9':'literature',
#            '10': 'history'
#            }

# sum1 = 0
# sum2 = 0
# for i in range(0,11):
#   nameparsed = data[data['제목'].str.contains(labels[str(i)])]
#   namecount = len(nameparsed)
#   numberparsed = nameparsed[nameparsed['청구번호']==i]
#   numbercount = len(numberparsed)
#   print(labels[str(i)], namecount, numbercount, numbercount*100/namecount)
#   # print(nameparsed)
#   sum1 += namecount
#   sum2 += numbercount

# print(sum1)
# print(sum2)
# print(sum2*100/sum1)

In [None]:
# eng_data['청구번호'].value_counts(sort=False)

In [19]:
kor_data = kor_data.sample(frac = frac, random_state = 1)
# kor_data.to_csv("/content/gdrive/MyDrive/LibraryCsv/hapbonkordata.txt", sep="\t")
from sklearn.model_selection import train_test_split

train, test = train_test_split(kor_data, test_size=test_size, stratify = kor_data['청구번호'])
train.to_csv("/content/gdrive/MyDrive/LibraryCsv/train.txt", sep="\t")
test.to_csv("/content/gdrive/MyDrive/LibraryCsv/test.txt", sep="\t")

In [20]:
dataset_train = nlp.data.TSVDataset("/content/gdrive/MyDrive/LibraryCsv/train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset("/content/gdrive/MyDrive/LibraryCsv/test.txt", field_indices=[1,2], num_discard_samples=1)

# KOR

In [12]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [13]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [None]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [None]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [14]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=11,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)

        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [15]:
model = BERTClassifier(bertmodel, dr_rate=dr_rate).to(device)

In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [None]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

# SAVE

In [10]:
def save_checkpoint(save_path, model, valid_loss):# ------ 모델 평가를 위해 훈련 과정을 저장
    if save_path == None:
        return
    state_dict = {'model_state_dict': model.state_dict(), 'valid_loss': valid_loss}
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):# ------ save_checkpoint 함수에서 저장된 모델을 가져옵니다.
    if load_path == None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

In [None]:
save_checkpoint("/content/gdrive/MyDrive/learned.multiLangModel", model, 0.325)

# TEST

In [16]:
load_checkpoint("/content/gdrive/MyDrive/final.koBertModel", model)

Model loaded from <== /content/gdrive/MyDrive/final.koBertModel


0.325

In [27]:
from pandas import DataFrame

# print(test['제목'].iloc(0))
toselftest = ["넛지"]

labels = {0:'총류',
          1:'철학',
          2:'종교',
          3:'사회학',
          4:'언어',
          5:'자연과학',
          6:'기술과학',
          7:'경영학',
          8:'예술',
          9: '문학',
          10:'역사'
}

selftest = DataFrame({
  'value': toselftest,
  'null': [0 for x in toselftest]
})

selftest.to_csv("/content/gdrive/MyDrive/LibraryCsv/selftest.txt", sep="\t")
dataset_selftest = nlp.data.TSVDataset("/content/gdrive/MyDrive/LibraryCsv/selftest.txt", field_indices=[1,2], num_discard_samples=1)
data_selftest = BERTDataset(dataset_selftest, 0, 1, tok, max_len, True, False)
selftest_dataloader = torch.utils.data.DataLoader(data_selftest, batch_size=1, num_workers=1)

# test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=1, num_workers=5)

# print(len(test))
# print(len(test_dataloader))

correct = 0
for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(selftest_dataloader), total=len(selftest_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      valid_length= valid_length
      label = label.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      output = F.softmax(out[0],dim=-1)
      answer = int(test['청구번호'].iloc[batch_id])
      prediction = out.cpu().detach().numpy()[0]
      minval = prediction.min()
      prediction = [x-minval for x in prediction]
      sumval = sum(prediction)

      # answer = int(test['청구번호'].iloc[batch_id])
      # print("{} => {}".format(test['제목'].iloc[batch_id], answer))
      print("{}".format(toselftest[batch_id]))
      print(torch.topk(output,3))
      top3 = 0
      # for index, score in sorted(enumerate(prediction), reverse = True, key = lambda prediction:prediction[1]):
      #   # print("{} {}%".format(labels[index], score*100/sumval))
      #   print(torch.topk(output,3))
      #   if index == answer:
      #     correct += 1
      #     # print("Success")
      #     break
      #   else:
      #     top3 += 1
      #     if top3 == 3: 
      #       # print("Failed")
      #       break
# print(correct)
# print(len(test_dataloader))
# print(correct*100/len(test_dataloader))

  0%|          | 0/1 [00:00<?, ?it/s]

인간의 품격
torch.return_types.topk(
values=tensor([0.8600, 0.0597, 0.0285], device='cuda:0', grad_fn=<TopkBackward0>),
indices=tensor([ 1, 10,  3], device='cuda:0'))
