In [1]:
#!pip install tensorflow-gpu
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2Config, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from datasets import load_metric
import wandb
from tqdm import tqdm
import os

In [2]:
gpt_name = 'skt/ko-gpt-trinity-1.2B-v0.5'

tokenizer = GPT2TokenizerFast.from_pretrained(gpt_name)

config = GPT2Config.from_pretrained(gpt_name)
config.num_labels = 2


In [3]:
def accuracy(output, target):
    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        correct = 0
        correct += torch.sum(pred == target).item()
    return correct / len(target)

In [4]:
def read_boolq(data_path):
    data = pd.read_csv(data_path, delimiter='\t')
    label_col = 'Answer(FALSE = 0, TRUE = 1)'
    texts = []
    labels = []
    sos = '<s>'
    eos = '<\s>'
    t = '<unused0>'
    q = '<unused1>'

    for i in range(len(data)):
        text = t + sos + data['Text'][i] + eos + q + sos + data['Question'][i] + eos
        texts.append(text)
        labels.append(data[label_col][i])
    return texts, labels

In [5]:
TRAIN_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Train.tsv'
VALID_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Dev.tsv'

In [6]:
train_texts, train_labels = read_boolq(TRAIN_PATH)
valid_texts, valid_labels = read_boolq(VALID_PATH)

In [7]:
class BooqDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
  
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
train_dataset = BooqDataset(train_encodings, train_labels)
valid_dataset = BooqDataset(valid_encodings, valid_labels)

In [13]:
'''
text token + sos + 문장 + eos + question token 
<t>
'''

'\ntext token + sos + 문장 + eos + question token \n<t>\n'

In [None]:
model = GPT2ForSequenceClassification(config).from_pretrained(gpt_name)

In [11]:
for param in model.parameters():
    param.requires_grad = False
    
for param in model.score.parameters():
    param.requires_grad = True

In [None]:
config = {'epochs' : 20 , 'learning_rate' : 5e-6, 'batch_size' : 32, 'weight_decay' : 0}
wandb.init(project='nlp_test',config=config)

In [13]:
cfg = wandb.config

In [14]:
train_loader = DataLoader(train_dataset,cfg.batch_size, shuffle = True)
valid_loader = DataLoader(valid_dataset,cfg.batch_size, shuffle = True)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = cfg.learning_rate
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE,amsgrad=True,weight_decay=cfg.weight_decay)
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)
output_dir='./results'

In [None]:
model.to(device)
model.train()

In [None]:
grad_num = 2
layer_idx = [[j - i for i in range(grad_num)] for j in range(len(model.transformer.h)-1, -1, -1 * grad_num)]
vacc_li = [0]


for e in range(cfg.epochs):
    epoch_loss = 0
    epoch_acc = 0
    for batch in tqdm(train_loader,'train: '):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)
        acc = accuracy(outputs.logits, labels)
        wandb.log({'loss': loss.item(), 'accuracy': acc})
        
        loss.backward()
        optimizer.step()
    # valid
        del input_ids
        del attention_mask
        del labels
        del loss
        del outputs
        torch.cuda.empty_cache()
    
    
    
    model.eval()
    valid_acc = 0
    valid_loss = 0
    for batch in tqdm(valid_loader, 'valid: '):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(outputs.logits, labels)
            
            acc = accuracy(outputs.logits, labels)
            valid_acc += acc
            
            valid_loss += loss.item()
            
            del input_ids
            del attention_mask
            del labels
            del loss
            del outputs
            torch.cuda.empty_cache()
    
    vacc = valid_acc/len(valid_loader)
    vacc_f = f'{vacc:.3f}'
    wandb.log({'valid_loss': valid_loss/len(valid_loader)})
    wandb.log({'val_accuracy' :vacc })
    model.train()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if vacc_li[-1] < vacc:
        torch.save(model.state_dict(), os.path.join(output_dir, f'boolq_{e+0:03}_{vacc_f}.pt'))
        vacc_li.append(vacc)
        
    for param in model.parameters():
        param.requires_grad = False

    for param in model.score.parameters():
        param.requires_grad = True

    for idx in layer_idx[e % len(layer_idx)]:
        for param in model.transformer.h[idx].parameters():
            param.requires_grad = True

wandb.finish()