In [1]:
import math
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

from datasets import Dataset as D
from tqdm import tqdm

!git clone https://github.com/TQuad/turkish-nlp-qa-dataset

In [2]:
def tquad2df(path):
    """Flatten a nested question-answering JSON file into one row per question.

    The file is loaded with ``datasets.Dataset.from_json`` and the first
    element of its ``data`` column is walked as
    ``article -> paragraphs -> qas``.

    Args:
        path: Path to the JSON file.

    Returns:
        pandas.DataFrame with the columns ``title`` (article title),
        ``content`` (paragraph ``context``) and ``question``; one row per
        question, in file order. The columns are present even when the file
        contains no questions.
    """
    columns = ['title', 'content', 'question']

    dataset = D.from_json(path)['data'][0]

    # Accumulate plain dicts and build the frame once at the end.
    # Concatenating a one-row DataFrame per question copies the whole frame
    # on every iteration, which is quadratic in the number of questions.
    records = []
    for data in tqdm(dataset):
        title = data['title']
        for para in data['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                records.append({'title': title,
                                'content': context,
                                'question': qa['question']})

    return pd.DataFrame(records, columns=columns)

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        X: Indexable collection of rows; the text to tokenize is read from
            position 0 of each row (``X[i][0]``).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', return_tensors='pt')``
            that returns a mapping of field name -> tensor.
        device: Device every returned tensor is moved to.
    """

    def __init__(self, X, tokenizer, device='cpu'):
        self.X = X
        self.tokenizer = tokenizer
        self.device = device

    def __getitem__(self, i):
        """Tokenize row ``i`` and return the tokenizer's mapping.

        Every tensor in the mapping is moved to ``self.device`` and has its
        leading axis removed when that axis has size 1.
        """
        tokenized_el = self.tokenizer(self.X[i][0], padding='max_length', return_tensors='pt')

        # Iterate over a snapshot so the mapping is not reassigned while it
        # is being iterated.
        for k, v in list(tokenized_el.items()):
            # Assumes the tokenizer returns (1, seq_len) tensors for a single
            # text. squeeze(0) drops only that leading axis; a bare squeeze()
            # would also collapse a length-1 sequence into a 0-d tensor and
            # break batching.
            tokenized_el[k] = v.to(self.device).squeeze(0)

        return tokenized_el

    def __len__(self):
        """Number of rows in ``X``."""
        return len(self.X)


In [4]:
def get_tokenizable(df, tokenizer):
    """Build the training strings and keep only those short enough to tokenize.

    Each row becomes
    ``"{bos}Paragraf: {content} Soru: {question}{eos}"`` and is kept only if
    its token count is strictly below ``tokenizer.model_max_length - 1``.

    Args:
        df: DataFrame with ``content`` and ``question`` columns.
        tokenizer: Object exposing ``bos_token``, ``eos_token``,
            ``model_max_length`` and ``tokenize(text)``.

    Returns:
        pandas.DataFrame with a single column ``concat_content`` holding the
        kept strings in their original order.
    """
    max_tokens = tokenizer.model_max_length - 1
    kept = []

    # Zip the two columns instead of iterrows(): no per-row Series is built,
    # and passing `total` lets tqdm show real progress and an ETA.
    pairs = zip(df['content'], df['question'])
    for content, question in tqdm(pairs, total=len(df)):
        el_sent = f"{tokenizer.bos_token}Paragraf: {content} Soru: {question}{tokenizer.eos_token}"

        if len(tokenizer.tokenize(el_sent)) < max_tokens:
            kept.append(el_sent)

    return pd.DataFrame(data={'concat_content': kept})

In [5]:
def finetune(model, dataloader, epochs, optimizer, device='cpu'):
    """Train a causal language model on the batches of ``dataloader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        dataloader: Iterable of mappings with an ``input_ids`` tensor and,
            optionally, an ``attention_mask`` tensor of the same shape.
        epochs: Number of passes over ``dataloader``.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device the model and every batch are moved to.

    Returns:
        Tuple ``(model, losses)`` where ``losses`` is the list of per-step
        loss values (Python floats) across all epochs.
    """
    losses = []
    running_total = 0.0  # incremental sum, avoids re-summing `losses` every step

    model = model.to(device)
    model.train()

    for epoch in range(epochs):
        pbar = tqdm(dataloader)
        pbar.set_description(f"Epoch #{epoch+1}")
        for batch in pbar:
            # Move the batch explicitly instead of relying on the dataset
            # having produced tensors on the right device.
            inputs = batch['input_ids'].to(device)
            attention_mask = None
            if 'attention_mask' in batch:
                attention_mask = batch['attention_mask'].to(device)

            # Padding positions must not contribute to the loss: Hugging Face
            # LM heads ignore label positions set to -100.
            labels = inputs.clone()
            if attention_mask is not None:
                labels[attention_mask == 0] = -100

            optimizer.zero_grad()

            outputs = model(inputs, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss_value = loss.item()
            losses.append(loss_value)
            running_total += loss_value
            loss.backward()

            optimizer.step()

            # Running mean over every step so far. The postfix key is kept
            # as-is for continuity, although the value is the LM loss
            # returned by the model.
            pbar.set_postfix(bce_loss=f"{running_total/len(losses)}")

    return model, losses

def test(model, dataloader, device='cpu'):
    """Compute the perplexity of ``model`` over ``dataloader``.

    The per-batch loss is weighted by the number of scored target tokens, so
    the result is ``exp`` of the mean loss per token and is not skewed by a
    smaller final batch or by differing amounts of padding.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        dataloader: Iterable of mappings with an ``input_ids`` tensor
            (assumed ``(batch, seq_len)``) and, optionally, a matching
            ``attention_mask``.
        device: Device the model and every batch are moved to.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no scorable tokens.
    """
    model = model.to(device)
    model.eval()

    nll_sum = 0.0
    n_targets = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            inputs = batch['input_ids'].to(device)
            attention_mask = None
            if 'attention_mask' in batch:
                attention_mask = batch['attention_mask'].to(device)

            # Exclude padding from the loss (label -100 is ignored).
            labels = inputs.clone()
            if attention_mask is not None:
                labels[attention_mask == 0] = -100

            # Causal LM heads predict token t+1 from token t, so the scored
            # targets are the labels from position 1 onwards.
            batch_targets = int((labels[:, 1:] != -100).sum())
            if batch_targets == 0:
                continue

            loss = model(inputs, attention_mask=attention_mask, labels=labels).loss

            # The model's loss is assumed to be a mean over scored tokens;
            # undo the mean so batches can be combined exactly.
            nll_sum += loss.item() * batch_targets
            n_targets += batch_targets

    if n_targets == 0:
        raise ValueError("dataloader produced no tokens to evaluate")

    # The original divided by the last batch *index*, under-counting by one
    # and failing with a single batch.
    return math.exp(nll_sum / n_targets)
    
def calc_perplexity(model, tokenizer, sentence):
    """Compute the perplexity of a single sentence under ``model``.

    The sentence is wrapped in the tokenizer's BOS/EOS tokens, tokenized with
    ``padding='max_length'`` and scored with the model's own LM loss. Padding
    positions are excluded from the loss when the tokenizer returns an
    ``attention_mask``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``. It is switched to eval mode.
        tokenizer: Callable tokenizer exposing ``bos_token`` and ``eos_token``.
        sentence: Text to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    sent = tokenizer.bos_token + sentence + tokenizer.eos_token
    encoded = tokenizer(sent, padding='max_length', return_tensors='pt')

    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask'] if 'attention_mask' in encoded else None

    # Run on whatever device the model lives on; the tokenizer always
    # produces CPU tensors, which would fail against a model on GPU.
    params = getattr(model, 'parameters', None)
    first_param = next(params(), None) if callable(params) else None
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(first_param.device)

    # Without this, the padding added by padding='max_length' is scored like
    # real text and dominates the perplexity of short sentences.
    labels = input_ids.clone()
    if attention_mask is not None:
        labels[attention_mask == 0] = -100

    model.eval()
    with torch.no_grad():
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss

    return math.exp(float(loss))
    

In [7]:
# --- Configuration -----------------------------------------------------------
TRAIN_DIR = './turkish-nlp-qa-dataset/train-v0.1.json'
DEV_DIR= './turkish-nlp-qa-dataset/dev-v0.1.json'
MODEL_PATH = 'redrussianarmy/gpt2-turkish-cased'

MAX_LEN = 512
BATCH_SIZE = 8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
THRESHOLD = 35  # not referenced in the cells visible here
EPOCHS = 1
LR = 0.001
SEED = 42

# Fix the RNG so the shuffled batch order (and therefore the run) is repeatable.
torch.manual_seed(SEED)

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, model_max_length=MAX_LEN, force_download=True)
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 
                       'sep_token': '<SEP>', 'pad_token':'<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# --- Data --------------------------------------------------------------------
train_df = tquad2df(TRAIN_DIR)
train_df = get_tokenizable(train_df, tokenizer)

dev_df = tquad2df(DEV_DIR)
dev_df = get_tokenizable(dev_df, tokenizer)

train_set = CustomDataset(train_df.to_numpy(), tokenizer=tokenizer, device=DEVICE)
# Shuffle the training data: rows are emitted article by article, so
# unshuffled batches would contain near-duplicate paragraphs.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

test_set = CustomDataset(dev_df.to_numpy(), tokenizer=tokenizer, device=DEVICE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

# --- Model -------------------------------------------------------------------
# Load the pretrained weights. `from_config` only builds the architecture
# with randomly initialised parameters, so the run below would have trained
# from scratch instead of fine-tuning the checkpoint named by MODEL_PATH.
gpt_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
gpt_config = gpt_model.config
assert gpt_config.max_position_embeddings >= MAX_LEN, \
    "pretrained model cannot handle sequences of length MAX_LEN"
# The tokenizer gained new special tokens above; grow the embedding matrix to match.
gpt_model.resize_token_embeddings(len(tokenizer))

# --- Train / evaluate --------------------------------------------------------
opt = optim.AdamW(gpt_model.parameters(), lr=LR) 
gpt_model, losses = finetune(gpt_model, train_loader, epochs=EPOCHS, optimizer=opt, device=DEVICE)
perplexity = test(gpt_model, test_loader, device=DEVICE)
print(f"Test Set Perplexity: {perplexity}")

# Pickles the whole module object; loading it back requires the same
# transformers/torch class definitions to be importable.
torch.save(gpt_model, f'question_generation_model_{perplexity}')