In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
import torch.multiprocessing as mp

import pandas as pd
import time
import datetime
import os
from IPython.display import clear_output
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup 

class Args():
    def __init__(self):
        self.epochs = 5
        self.arch = 'BERT_NSMC_singleGPU'
        self.lr = 5e-5
        self.batch_size = 64
        self.gpu = 2
        
args = Args()
model = BertForSequenceClassification.from_pretrained("kykim/bert-kor-base")
tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

class NSMCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# load nsmc dataset
nsmc_train = pd.read_csv('./nsmc/ratings_train.txt', sep='\t', encoding='utf-8')
nsmc_test = pd.read_csv('./nsmc/ratings_test.txt', sep='\t', encoding='utf-8')

# slicing dataset
nsmc_train = nsmc_train[:10000]
nsmc_test = nsmc_test[:2000]

nsmc_train['document'] = nsmc_train['document'].apply(str)
nsmc_test['document'] = nsmc_test['document'].apply(str)


# encoding
train_encodings = tokenizer(list(nsmc_train['document']), truncation=True, padding=True)
test_encodings = tokenizer(list(nsmc_test['document']), truncation=True, padding=True)

train_dataset = NSMCDataset(train_encodings, nsmc_train['label'])
test_dataset = NSMCDataset(test_encodings, nsmc_test['label'])

# dataloader

train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

# define optimizer and scheduler 

# layerNorm and bias do not trained
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# scheduler reference => https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_linear_schedule_with_warmup
# using scheduler 
optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
scheduler = get_linear_schedule_with_warmup(optimizer, len(train_loader) * args.epochs // 8, len(train_loader)*args.epochs)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [2]:
# training
# Single GPU

torch.cuda.empty_cache()

writer = SummaryWriter(f'./runs/{args.arch}')
print_train = len(train_loader) // 10
total_epoch_start = time.time()

# for load model
best_acc = 0
best_model_name = ''

# for plot train and test loss
train_iter_list = []
train_loss_list = []
train_acc_list = []
test_iter_list = []
test_loss_list = []
test_acc_list = []

In [5]:
def train(epoch):
    model.train()
    model.cuda(args.gpu)
    
    train_loss = 0
    correct = 0
    total = 0
    print('====================================================')
    print('=================== Training =======================')
    print('====================================================')

    epoch_start = time.time()
    for idx, batch in enumerate(train_loader):
        start = time.time()
        inputs = {k: v.cuda(args.gpu) for k, v in batch.items()}
        outputs = model(**inputs)
        
        optimizer.zero_grad()
        outputs.loss.backward()
        optimizer.step()
        scheduler.step()
        
        train_loss += outputs.loss.item()
        total += inputs['labels'].size(0)
        correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
        
        acc = 100 * correct / total
        batch_time = time.time() - start
        
        train_iter_list.append(epoch * len(train_loader) + idx)
        train_loss_list.append(outputs.loss.item())
        train_acc_list.append(acc)
        if idx % print_train == 0:
            print(f'Epoch: {epoch} \n'
                  f'total_steps: {epoch * len(train_loader) + idx} \n'
                  f'loss: {train_loss / (idx+1):.3f} \n'
                  f'acc : {acc:.3f} \n'
                  f'batch_time : {batch_time} \n'
                  )
        writer.add_scalar('Loss/train',
                           outputs.loss.item(),
                           epoch * len(train_loader) + idx)
        writer.add_scalars('Loss', 
                           {'Train_Loss': outputs.loss.item()}, 
                          epoch * len(train_loader) + idx)
        writer.add_scalar('Accuracy/train',
                          acc,
                          epoch * len(train_loader) + idx)

In [6]:
train(0)

Epoch: 0 
total_steps: 0 
loss: 0.697 
acc : 54.688 
batch_time : 0.41045498847961426 

Epoch: 0 
total_steps: 15 
loss: 0.692 
acc : 54.004 
batch_time : 0.34609556198120117 

Epoch: 0 
total_steps: 30 
loss: 0.665 
acc : 58.770 
batch_time : 0.35009098052978516 

Epoch: 0 
total_steps: 45 
loss: 0.600 
acc : 65.353 
batch_time : 0.3516230583190918 

Epoch: 0 
total_steps: 60 
loss: 0.549 
acc : 69.723 
batch_time : 0.3528482913970947 

Epoch: 0 
total_steps: 75 
loss: 0.518 
acc : 72.266 
batch_time : 0.35560178756713867 

Epoch: 0 
total_steps: 90 
loss: 0.501 
acc : 73.884 
batch_time : 0.3559277057647705 

Epoch: 0 
total_steps: 105 
loss: 0.476 
acc : 75.531 
batch_time : 0.35722899436950684 

Epoch: 0 
total_steps: 120 
loss: 0.456 
acc : 76.911 
batch_time : 0.35912227630615234 

Epoch: 0 
total_steps: 135 
loss: 0.442 
acc : 77.941 
batch_time : 0.3595130443572998 

Epoch: 0 
total_steps: 150 
loss: 0.430 
acc : 78.787 
batch_time : 0.36043334007263184 



In [8]:
# test
test_loss = 0
correct = 0
total = 0
model.eval()
clear_output(wait=True)
# test
print('===================================================')
print('=================== test ==========================')
print('===================================================')
with torch.no_grad():
    for inputs in test_loader:
        start = time.time()
        inputs = {k: v.cuda(args.gpu) for k, v in inputs.items()}
        outputs = model(**inputs)

        total += inputs['labels'].size(0)
        correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
        test_loss += outputs.loss.item()
        acc = 100 * correct / total

test_loss = test_loss / len(test_loader)
print(f'Loss: {test_loss}, Acc: {acc}')

0.2906091446056962


In [12]:
acc

89.25

In [9]:
train(1)

Epoch: 1 
total_steps: 157 
loss: 0.182 
acc : 93.750 
batch_time : 0.38258957862854004 

Epoch: 1 
total_steps: 172 
loss: 0.243 
acc : 90.137 
batch_time : 0.3526291847229004 

Epoch: 1 
total_steps: 187 
loss: 0.241 
acc : 90.323 
batch_time : 0.35072946548461914 

Epoch: 1 
total_steps: 202 
loss: 0.234 
acc : 90.625 
batch_time : 0.35261964797973633 

Epoch: 1 
total_steps: 217 
loss: 0.236 
acc : 90.471 
batch_time : 0.35449767112731934 

Epoch: 1 
total_steps: 232 
loss: 0.240 
acc : 90.440 
batch_time : 0.3553934097290039 

Epoch: 1 
total_steps: 247 
loss: 0.238 
acc : 90.591 
batch_time : 0.35776686668395996 

Epoch: 1 
total_steps: 262 
loss: 0.241 
acc : 90.448 
batch_time : 0.3586394786834717 

Epoch: 1 
total_steps: 277 
loss: 0.242 
acc : 90.276 
batch_time : 0.36110949516296387 

Epoch: 1 
total_steps: 292 
loss: 0.240 
acc : 90.384 
batch_time : 0.35903358459472656 

Epoch: 1 
total_steps: 307 
loss: 0.240 
acc : 90.459 
batch_time : 0.3609626293182373 



In [11]:
# test
test_loss = 0
correct = 0
total = 0
model.eval()
clear_output(wait=True)
# test
print('===================================================')
print('=================== test ==========================')
print('===================================================')
with torch.no_grad():
    for inputs in test_loader:
        start = time.time()
        inputs = {k: v.cuda(args.gpu) for k, v in inputs.items()}
        outputs = model(**inputs)

        total += inputs['labels'].size(0)
        correct += inputs['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()
        test_loss += outputs.loss.item()
        acc = 100 * correct / total

test_loss = test_loss / len(test_loader)
print(f'Loss: {test_loss}, Acc: {acc}')

Loss: 0.28330868063494563, Acc: 89.25
