In [1]:
import os
import json
import pandas as pd
import numpy as np
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd 
import mxnet as mx
import time
import itertools
import random
import csv

from sklearn.model_selection import train_test_split
from kobert.mxnet_kobert import get_mxnet_kobert_model
from kobert.utils import get_tokenizer

# Loading KoBERT

In [2]:
devices = [mx.gpu(i) for i in range(8)]
bert_base, vocab = get_mxnet_kobert_model(use_decoder=False, use_classifier=False, ctx=devices)
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model
using cached model
using cached model


# Loading Data

### preprocessing my data

In [3]:
## dataset
dataset = pd.read_csv('data/voc-web-train.csv', sep='\t')

In [4]:
## label_dict
utils_config = dict()
label = sorted(dataset['class'].unique())
utils_config['label2idx'] = {c:i for i, c in enumerate(label)}
utils_config['idx2label'] = {i:c for i, c in enumerate(label)}
with open('config.json','w') as f:
    json.dump(utils_config,f)

In [5]:
#### for train test
dataset.drop(['class_1','class_2'],axis=1, inplace=True) # drop column
dataset = dataset.loc[dataset['text'].isna().apply(lambda elm: not elm), :]
train, validation = train_test_split(dataset, test_size=0.1, random_state=777)
train['class'] = train['class'].apply(lambda x:utils_config['label2idx'][x])
validation['class'] = validation['class'].apply(lambda x:utils_config['label2idx'][x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
np.save('train.npy',train.to_numpy()[:,1:])
np.save('validation.npy',validation.to_numpy()[:,1:])

In [7]:
dataset_train = nlp.data.NumpyDataset('train.npy',allow_pickle=True)
dataset_validation = nlp.data.NumpyDataset('validation.npy',allow_pickle=True)

# Loading Data

In [8]:
class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset([[
            i[sent_idx],
        ] for i in dataset])
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [9]:
max_len = 512

In [10]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_validation = BERTDataset(dataset_validation, 0, 1, tok, max_len, True, False)

# Build Bert fine-tuner

In [11]:
class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))

    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)
                                           

In [12]:
model = BERTClassifier(bert_base, num_classes=63, dropout=0.1)
# 분류 레이어만 초기화 한다. 
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=devices)

In [None]:
model.summary()

In [None]:
model.hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.TopKAccuracy(top_k=3)

In [13]:
batch_size = 32
lr = 5e-5

train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, num_workers=16)
validation_dataloader = mx.gluon.data.DataLoader(data_validation, batch_size=int(batch_size/2), num_workers=16)

In [14]:
trainer = gluon.Trainer(model.collect_params(), 'bertadam',{'learning_rate': lr, 'epsilon': 1e-9, 'wd':0.01}, update_on_kvstore=False)
log_interval = 4
num_epochs = 20

In [15]:
# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
    v.wd_mult = 0.0
params = [p for p in model.collect_params().values() if p.grad_req != 'null']

In [16]:
def evaluate_accuracy(model, data_iter, ctx):
    acc = mx.metric.TopKAccuracy(top_k=3)
    i = 0
    for i, (t,v,s, label) in enumerate(data_iter):
        token_ids = gluon.utils.split_and_load(t, ctx,even_split=False)
        valid_length = gluon.utils.split_and_load(v, ctx,even_split=False)
        segment_ids = gluon.utils.split_and_load(s, ctx,even_split=False)
        label = gluon.utils.split_and_load(label, ctx,even_split=False)
    
        output = [model(ti, si, vl.astype('float32')) for ti, si, vl in zip(token_ids, segment_ids, valid_length)]
        acc.update(preds=output, labels=label)
        if i > 1000:
            break
        i += 1
    return(acc.get()[1])

In [17]:
#learning rate warmup을 위한 준비 
accumulate = 4
step_size = batch_size * accumulate if accumulate else batch_size
num_train_examples = len(data_train)
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0
all_model_params = model.collect_params()

In [18]:
# Set grad_req if gradient accumulation is required
if accumulate and accumulate > 1:
    for p in params:
        p.grad_req = 'add'

# Train

In [19]:
tic = time.time()
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            non_warmup_steps = step_num - num_warmup_steps
            offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
            new_lr = lr - offset * lr
        trainer.set_learning_rate(new_lr)
        
        with mx.autograd.record():
            # load data to Multi-GPU
            token_ids = gluon.utils.split_and_load(token_ids, devices)
            valid_length = gluon.utils.split_and_load(valid_length, devices)
            segment_ids = gluon.utils.split_and_load(segment_ids, devices)
            label = gluon.utils.split_and_load(label, devices)
            
            # forward computation
            out = [model(ti, si, vl.astype('float32')) for ti, si, vl in zip(token_ids, segment_ids, valid_length)]
            losses = [loss_function(o,l).mean() for o, l in zip(out, label)]

        # backward computation
        for l in losses:
            l.backward()
    
        if not accumulate or (batch_id + 1) % accumulate == 0:
            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.update(accumulate if accumulate else 1)
            step_num += 1
            if accumulate and accumulate > 1:
                # set grad to zero for gradient accumulation
                all_model_params.zero_grad()
                
        step_loss += sum([l.sum().asscalar() for l in losses])/len(devices)
        metric.update(label, out)
        if (batch_id + 1) % (50) == 0:
            print(f'[Epoch {epoch_id + 1} Batch {batch_id + 1}/{len(train_dataloader)}] loss={step_loss / log_interval:.4f}, lr={trainer.learning_rate:.10f}, Top3-acc={metric.get()[1]:.3f}')
            step_loss = 0
    validation_acc = evaluate_accuracy(model, validation_dataloader, devices)
    print('validation Top3-Acc : {}'.format(validation_acc))
toc = time.time()

[Epoch 1 Batch 50/2581] loss=52.2822, lr=0.0000004651, Top3-acc=0.034
[Epoch 1 Batch 100/2581] loss=51.6737, lr=0.0000009302, Top3-acc=0.046
[Epoch 1 Batch 150/2581] loss=50.4763, lr=0.0000014341, Top3-acc=0.083
[Epoch 1 Batch 200/2581] loss=49.7722, lr=0.0000018992, Top3-acc=0.115
[Epoch 1 Batch 250/2581] loss=48.8493, lr=0.0000024031, Top3-acc=0.142
[Epoch 1 Batch 300/2581] loss=47.9204, lr=0.0000028682, Top3-acc=0.160
[Epoch 1 Batch 350/2581] loss=47.0147, lr=0.0000033721, Top3-acc=0.178
[Epoch 1 Batch 400/2581] loss=46.0174, lr=0.0000038372, Top3-acc=0.194
[Epoch 1 Batch 450/2581] loss=44.8868, lr=0.0000043411, Top3-acc=0.211
[Epoch 1 Batch 500/2581] loss=44.0108, lr=0.0000048062, Top3-acc=0.225
[Epoch 1 Batch 550/2581] loss=42.7425, lr=0.0000053101, Top3-acc=0.241
[Epoch 1 Batch 600/2581] loss=40.9505, lr=0.0000057752, Top3-acc=0.258
[Epoch 1 Batch 650/2581] loss=39.1099, lr=0.0000062791, Top3-acc=0.278
[Epoch 1 Batch 700/2581] loss=37.6074, lr=0.0000067442, Top3-acc=0.298
[Epoch 

---

# Predict

In [34]:
test_set = pd.read_csv('data/voc-web-test.csv', sep ='\t').loc[:,['text','class']]

In [36]:
test_set = test_set.loc[test_set['text'].isna().apply(lambda elm: not elm), :]

In [37]:
test_set['class'] = test_set['class'].apply(lambda x:utils_config['label2idx'][x])

In [41]:
np.save('test.npy',test_set.to_numpy())

In [42]:
dataset_test = nlp.data.NumpyDataset('test.npy',allow_pickle=True)

In [43]:
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [45]:
validation_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=int(batch_size/2), num_workers=16)

In [46]:
evaluate_accuracy(model, validation_dataloader, devices)

0.9143865842894969

In [50]:
print(f'{num_epochs} 에폭의 학습 시간 : {(toc-tic)/(3600)} 시간')

20 에폭의 학습 시간 : 7.897514385249879 시간
