# 1. Load dataset

In [1]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


# 2. Load modules

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
import torch.multiprocessing as mp

import pandas as pd
import time
import datetime
import os
from IPython.display import clear_output
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup 

In [3]:
class Args():
    """Run settings for single-GPU BERT fine-tuning on NSMC.

    Every setting defaults to the value the notebook originally hard-coded, so
    ``Args()`` behaves exactly as before; pass keyword arguments to override
    individual settings (e.g. ``Args(epochs=1)`` for a quick smoke run).

    Attributes:
        epochs: Number of passes over the training loader.
        arch: Run name; used for the TensorBoard log directory and the
            checkpoint directory.
        lr: Learning rate handed to the optimizer.
        batch_size: Batch size used by both data loaders.
        gpu: Index of the CUDA device the model and batches are moved to.
    """

    def __init__(self, epochs=10, arch='BERT_NSMC_singleGPU', lr=5e-5,
                 batch_size=32, gpu=7):
        self.epochs = epochs
        self.arch = arch
        self.lr = lr
        self.batch_size = batch_size
        self.gpu = gpu

In [4]:
# Instantiate the run configuration read by the cells below (epochs, lr,
# batch size, GPU index, run name).
args = Args()

# 3. Load model and tokenizer

In [5]:
# Load the fast tokenizer and the sequence-classification model from the same
# pretrained checkpoint.
checkpoint = "kykim/bert-kor-base"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

# 4. Load dataset

In [6]:
class NSMCDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample.
        labels: Per-sample labels (list, array or pandas Series) in the same
            order as the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a plain list so ``labels[idx]`` is always a
        # positional lookup. Indexing a pandas Series with an int is
        # label-based and raises KeyError (or returns the wrong row) as soon as
        # the frame was filtered, shuffled or sliced away from a 0..n-1 index.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Load the NSMC train/test splits (tab-separated, UTF-8).
nsmc_train = pd.read_csv('./nsmc/ratings_train.txt', sep='\t', encoding='utf-8')
nsmc_test = pd.read_csv('./nsmc/ratings_test.txt', sep='\t', encoding='utf-8')

# Keep the first 100,000 training rows; the test split is used in full.
# The explicit .copy() detaches the subset from the frame read from disk, so
# the column assignment below does not write into a slice
# (SettingWithCopyWarning).
nsmc_train = nsmc_train.iloc[:100000].copy()

# The tokenizer needs plain strings. str() also turns a missing review (NaN)
# into the literal text 'nan' instead of failing.
nsmc_train['document'] = nsmc_train['document'].apply(str)
nsmc_test['document'] = nsmc_test['document'].apply(str)


# Tokenize each split with truncation and padding enabled.
train_encodings = tokenizer(nsmc_train['document'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(nsmc_test['document'].tolist(), truncation=True, padding=True)

# Hand the labels over as plain lists so the dataset indexes them by position
# rather than by DataFrame index label.
train_dataset = NSMCDataset(train_encodings, nsmc_train['label'].tolist())
test_dataset = NSMCDataset(test_encodings, nsmc_test['label'].tolist())

In [7]:
# Batch both splits with the configured batch size; only the training loader
# shuffles its samples.
loader_kwargs = {'batch_size': args.batch_size}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [20]:
# Upper bound on the number of training samples (batches x batch size). It
# over-counts whenever the last batch is partial; len(train_loader.dataset)
# gives the exact figure.
len(train_loader) * args.batch_size

100000

In [19]:
# Upper bound on the number of test samples (batches x batch size). It
# over-counts whenever the last batch is partial; len(test_loader.dataset)
# gives the exact figure.
len(test_loader) * args.batch_size

50016

In [22]:
# Pull a single batch from the training loader and keep its 'input_ids' entry
# for inspection.
x = next(iter(train_loader))['input_ids']

# 5. Train and test

In [9]:
# Optimizer and learning-rate schedule.

# Bias and LayerNorm weights are still trained, but they are exempt from
# weight decay; every other parameter is decayed with coefficient 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    exempt = any(marker in param_name for marker in no_decay)
    (undecayed_params if exempt else decayed_params).append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

# Scheduler reference: https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_linear_schedule_with_warmup
# Warm-up covers the first eighth of all optimizer steps; the schedule spans
# every step of every epoch.
optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
total_steps = len(train_loader) * args.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, total_steps // 8, total_steps)

In [10]:
# Number of batches in one pass over the test loader.
len(test_loader)

1563

In [11]:
# training
# Single GPU
#
# Fine-tunes `model` for args.epochs epochs on GPU args.gpu. After every epoch
# the whole test loader is evaluated, metrics are written to TensorBoard and a
# checkpoint is saved under ./<args.arch>/; the path of the most accurate
# checkpoint is kept in best_model_name for the cells below.

torch.cuda.empty_cache()

writer = SummaryWriter(f'./runs/{args.arch}')
steps_per_epoch = len(train_loader)
# Print roughly ten progress reports per epoch. Clamp to 1 so `idx % print_train`
# cannot divide by zero when an epoch has fewer than ten batches.
print_train = max(1, steps_per_epoch // 10)
total_epoch_start = time.time()

# for load model
best_acc = 0
best_model_name = ''

# for plot train and test loss
train_iter_list = []
train_loss_list = []
train_acc_list = []
test_iter_list = []
test_loss_list = []
test_acc_list = []

# Moving the model to the GPU is loop-invariant, so do it once up front.
model.cuda(args.gpu)

try:
    for epoch in range(args.epochs):
        model.train()

        train_loss = 0
        correct = 0
        total = 0
        print('====================================================')
        print('=================== Training =======================')
        print('====================================================')

        epoch_start = time.time()
        for idx, batch in enumerate(train_loader):
            start = time.time()
            # Step counter across epochs; shared by the logs and TensorBoard.
            global_step = epoch * steps_per_epoch + idx
            inputs = {k: v.cuda(args.gpu) for k, v in batch.items()}
            outputs = model(**inputs)

            optimizer.zero_grad()
            outputs.loss.backward()
            optimizer.step()
            scheduler.step()

            # Read the loss back once; every .item() call blocks on a
            # GPU -> CPU transfer.
            batch_loss = outputs.loss.item()
            train_loss += batch_loss
            total += inputs['labels'].size(0)
            correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()

            # Running accuracy (percent) over the batches seen so far this epoch.
            acc = 100 * correct / total
            batch_time = time.time() - start

            train_iter_list.append(global_step)
            train_loss_list.append(batch_loss)
            train_acc_list.append(acc)
            if idx % print_train == 0:
                print(f'Epoch: {epoch} \n'
                      f'total_steps: {global_step} \n'
                      f'loss: {train_loss / (idx+1):.3f} \n'
                      f'acc : {acc:.3f} \n'
                      f'batch_time : {batch_time} \n'
                      )
            writer.add_scalar('Loss/train', batch_loss, global_step)
            writer.add_scalars('Loss', {'Train_Loss': batch_loss}, global_step)
            writer.add_scalar('Accuracy/train', acc, global_step)

        # test
        test_loss = 0
        correct = 0
        total = 0
        model.eval()
        elapse_time = datetime.timedelta(seconds=time.time() - epoch_start)
        # clear_output(wait=True) is deferred until the next output arrives, so
        # it must come *before* the print; otherwise the epoch time is wiped by
        # the test banner before it can be read.
        clear_output(wait=True)
        print(f"Epoch training: {elapse_time}")
        # test
        print('===================================================')
        print('=================== test ==========================')
        print('===================================================')
        with torch.no_grad():
            for inputs in test_loader:
                inputs = {k: v.cuda(args.gpu) for k, v in inputs.items()}
                outputs = model(**inputs)

                total += inputs['labels'].size(0)
                correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
                test_loss += outputs.loss.item()

        # Test accuracy (percent) over the whole test loader.
        acc = 100 * correct / total
        # Average of the per-batch losses (a partial last batch counts as much
        # as a full one).
        test_loss = test_loss / len(test_loader)

        # write test result
        # global_step still holds the last training step of this epoch, so the
        # test metrics line up with the end of the epoch on the step axis.
        test_iter_list.append(global_step)
        test_loss_list.append(test_loss)
        test_acc_list.append(acc)
        print(f'Epoch: {epoch} \n'
              f'total_steps: {global_step} \n'
              f'loss: {test_loss:.3f} \n'
              f'test_acc : {acc:.3f} \n'
              )
        writer.add_scalar('Loss/test', test_loss, global_step)
        writer.add_scalars('Loss', {'Test_Loss': test_loss}, global_step)
        writer.add_scalar('Accuracy/test', acc, global_step)

        print(f'Epoch {epoch} finished!. Save model')
        os.makedirs(f'./{args.arch}', exist_ok=True)
        # model parameter save per epoch.
        checkpoint_path = f'./{args.arch}/{epoch}_{test_loss:.3f}_{acc:.3f}.pt'
        torch.save(model.state_dict(), checkpoint_path)
        if best_acc < acc:
            best_acc = acc
            best_model_name = checkpoint_path
finally:
    # Flush and close the TensorBoard event file even when training is
    # interrupted (e.g. KeyboardInterrupt), so logged scalars are not lost.
    writer.close()

print(f'Total training time: {time.time() - total_epoch_start}')

Epoch: 6 
total_steps: 21874 
loss: 0.388 
test_acc : 90.102 

Epoch 6 finished!. Save model
Epoch: 7 
total_steps: 21875 
loss: 0.069 
acc : 96.875 
batch_time : 0.29984259605407715 

Epoch: 7 
total_steps: 22187 
loss: 0.027 
acc : 99.141 
batch_time : 0.30092811584472656 

Epoch: 7 
total_steps: 22499 
loss: 0.027 
acc : 99.070 
batch_time : 0.30440831184387207 

Epoch: 7 
total_steps: 22811 
loss: 0.027 
acc : 99.116 
batch_time : 0.30628156661987305 

Epoch: 7 
total_steps: 23123 
loss: 0.028 
acc : 99.097 
batch_time : 0.30324769020080566 

Epoch: 7 
total_steps: 23435 
loss: 0.028 
acc : 99.069 
batch_time : 0.30539774894714355 

Epoch: 7 
total_steps: 23747 
loss: 0.028 
acc : 99.081 
batch_time : 0.305239200592041 

Epoch: 7 
total_steps: 24059 
loss: 0.028 
acc : 99.096 
batch_time : 0.3074514865875244 



KeyboardInterrupt: 

In [38]:
# Stand-alone evaluation of the current model weights on the test loader.
# Leaves the raw *sum* of batch losses in test_loss (divide by
# len(test_loader) for the mean) and the accuracy in percent in acc.
model.eval()
# Make sure the model sits on the same GPU the batches are moved to, matching
# the final test cell further down.
model.cuda(args.gpu)
clear_output(wait=True)

test_loss = 0
total = 0
correct = 0
# test
print('===================================================')
print('=================== test ==========================')
print('===================================================')
with torch.no_grad():
    for idx, inputs in enumerate(test_loader):
        inputs = {k: v.cuda(args.gpu) for k, v in inputs.items()}
        outputs = model(**inputs)

        # Read the loss back once; every .item() call blocks on a
        # GPU -> CPU transfer.
        batch_loss = outputs.loss.item()
        total += inputs['labels'].size(0)
        correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
        test_loss += batch_loss
        # Running accuracy (percent) over the batches seen so far.
        acc = 100 * correct / total

        if idx % 100 == 0:
            print(f'{idx} - loss: {batch_loss}')

0 - loss: 0.7621586322784424
100 - loss: 0.3927501440048218
200 - loss: 0.6057535409927368
300 - loss: 0.5997530221939087
400 - loss: 0.28308576345443726
500 - loss: 0.300104022026062
600 - loss: 0.1321668177843094
700 - loss: 0.22531689703464508
800 - loss: 0.24568776786327362
900 - loss: 0.32960253953933716
1000 - loss: 0.805182933807373
1100 - loss: 0.24527928233146667
1200 - loss: 0.24607160687446594
1300 - loss: 0.14246365427970886
1400 - loss: 0.6441746354103088
1500 - loss: 0.34215572476387024


In [28]:
# Count of correctly classified samples left by whichever loop ran last.
correct

207452

In [29]:
# Number of samples counted by whichever loop ran last (denominator of acc).
total

222528

In [30]:
# Logits of the most recent batch passed through the model.
outputs.logits

tensor([[ 3.3943, -4.1652],
        [-3.5154,  4.1229],
        [-2.9718,  3.2908],
        [-1.2286,  1.8589],
        [ 3.4289, -4.2430],
        [ 3.4299, -4.2364],
        [ 3.4250, -4.2448],
        [-3.4777,  4.1581],
        [-3.4675,  4.1722],
        [-3.4856,  4.1616],
        [-2.1384,  2.6234],
        [-3.5232,  3.9496],
        [ 3.4252, -4.2436],
        [ 3.3885, -4.1542],
        [ 3.4289, -4.2480],
        [ 1.6961, -1.2069]], device='cuda:7')

In [34]:
# Average loss per test batch. Only meaningful while test_loss still holds the
# un-normalised sum of batch losses (i.e. right after the stand-alone
# evaluation loop, not after a cell that already divides it).
test_loss / len(test_loader)

0.48114596343403676

In [39]:
# Inspect test_loss as left by the cell that ran last: the raw sum of batch
# losses after the stand-alone evaluation loop, but the per-batch average
# after the training cell or the final test cell.
test_loss

751.0741811843473

In [40]:
# Average loss per test batch. Only meaningful while test_loss still holds the
# un-normalised sum of batch losses (i.e. right after the stand-alone
# evaluation loop, not after a cell that already divides it).
test_loss / len(test_loader)

0.4805337051723271

In [36]:
# Loss of the most recent batch passed through the model.
outputs.loss

tensor(0.0071, device='cuda:7')

In [15]:
# Inspect test_loss as left by the cell that ran last: the raw sum of batch
# losses after the stand-alone evaluation loop, but the per-batch average
# after the training cell or the final test cell.
test_loss

0.9569596630521356

In [16]:
# Accuracy in percent (100 * correct / total) from whichever loop ran last.
acc

94.0496615042196

In [13]:
# Number of batches per pass over the training and test loaders.
len(train_loader), len(test_loader)

(3125, 1563)

In [None]:
# Commented-out re-initialisation of the history lists; running it would
# discard the values recorded by the training loop.
# train_iter_list = []
# train_loss_list = []
# train_acc_list = []
# test_iter_list = []
# test_loss_list = []
# test_acc_list = []

# Plotting library used by the loss-curve cell below.
import matplotlib.pyplot as plt

In [None]:
# Test losses appended by the training loop, one entry per finished epoch.
test_loss_list

In [None]:
# plot train and test loss
# Per-step training loss (red) against per-epoch test loss (blue), both on the
# training-step axis.
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot()
ax.scatter(train_iter_list, train_loss_list, color='r', label='Train_Loss')
ax.scatter(test_iter_list, test_loss_list, color='b', label='Test_Loss')
ax.set_xlabel('Training step')
ax.set_ylabel('Loss')
ax.set_title('Train vs. test loss')
# The label= arguments above are only rendered once a legend is requested.
ax.legend()
ax.grid(True)

In [None]:
# Path of the checkpoint with the highest test accuracy so far; stays '' until
# the training loop has finished at least one epoch.
best_model_name

In [None]:
# Highest test accuracy (percent) recorded by the training loop; 0 until an
# epoch has finished.
best_acc

In [None]:
# Restore the weights stored in the checkpoint that best_model_name points to.
best_state_dict = torch.load(best_model_name)
model.load_state_dict(best_state_dict)

In [None]:
# test
# Evaluate the model's current weights on the full test loader and leave the
# average per-batch loss in test_loss and the accuracy (percent) in acc.
test_loss = 0
correct = 0
total = 0
model.eval()
model.cuda(args.gpu)
# test
print('===================================================')
print('=================== test ==========================')
print('===================================================')
with torch.no_grad():
    for inputs in test_loader:
        inputs = {k: v.cuda(args.gpu) for k, v in inputs.items()}
        outputs = model(**inputs)

        total += inputs['labels'].size(0)
        correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
        test_loss += outputs.loss.item()

# Accuracy only needs computing once every batch has been counted.
acc = 100 * correct / total
# Average of the per-batch losses (a partial last batch counts as much as a
# full one).
test_loss = test_loss / len(test_loader)
test_loss

In [None]:
# Average per-batch test loss, as computed by the final test cell when it was
# the last one to run.
test_loss