In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from datetime import datetime
import math
from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer,  AutoModelForCausalLM, get_linear_schedule_with_warmup

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per item, on demand.

    Args:
        X: Indexable, sized collection of raw strings (one sample each).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., return_tensors='pt')``
            and returning a mutable mapping of name -> tensor with a leading
            batch axis of size 1 (presumably a Hugging Face tokenizer).
        device: Device every returned tensor is moved to.
    """

    def __init__(self, X, tokenizer, device='cpu'):
        self.X = X
        self.tokenizer = tokenizer
        self.device = device

    def __getitem__(self, i):
        """Tokenize sample ``i`` and return its tensors on ``self.device``.

        Returns the tokenizer's own mapping, updated in place so each value
        has the leading batch axis removed.
        """
        # Index this dataset's own data. Reading a module-level `X` here would
        # make every instance (train and test alike) serve the same global
        # array and break the class outside the notebook.
        tokenized_el = self.tokenizer(
            self.X[i],
            padding='max_length',
            truncation=True,  # keep every sample at the same fixed length
            return_tensors='pt',
        )

        # Iterate over a snapshot of the keys because values are reassigned.
        for key in list(tokenized_el.keys()):
            on_device = tokenized_el[key].to(self.device)
            # Drop only the batch axis; a bare squeeze() would also collapse
            # any other size-1 dimension.
            tokenized_el[key] = on_device.squeeze(0)

        return tokenized_el

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.X)


In [5]:
def finetune(model, dataloader, epochs, optimizer, device='cpu'):
    """Train a language model on ``dataloader`` for ``epochs`` full passes.

    Each batch's ``input_ids`` are fed to the model as both inputs and labels,
    and the loss the model returns is back-propagated.

    Args:
        model: Module called as ``model(input_ids, labels=input_ids)`` whose
            result exposes a scalar ``.loss`` tensor.
        dataloader: Iterable of mappings containing an ``'input_ids'`` tensor.
        epochs: Number of passes over ``dataloader``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and every batch are moved to.

    Returns:
        Tuple ``(model, losses)`` where ``losses`` holds one Python float per
        optimisation step, across all epochs.
    """
    losses = []
    # Running sum of the step losses, so the progress-bar mean does not have
    # to re-sum the entire history on every step.
    running_total = 0.0

    model = model.to(device)
    model.train()

    for epoch in range(epochs):
        pbar = tqdm(dataloader)
        pbar.set_description(f"Epoch #{epoch+1}")
        for batch in pbar:
            # Move explicitly instead of relying on the dataset having placed
            # the tensors on the right device (no-op when already there).
            inputs = batch['input_ids'].to(device)
            optimizer.zero_grad()

            # NOTE(review): the labels are the raw input ids, so positions that
            # the dataset padded to max_length also contribute to the loss and
            # no attention mask is passed. Masking them would change the loss
            # scale used elsewhere in this notebook - confirm before changing.
            outputs = model(inputs, labels=inputs)

            loss = outputs.loss
            step_loss = loss.item()
            losses.append(step_loss)
            running_total += step_loss
            loss.backward()

            optimizer.step()

            # Release the references now so the previous step's tensors are
            # not kept alive while the next forward pass runs.
            del inputs
            del outputs

            # NOTE(review): the label says "bce_loss" but the value is whatever
            # loss the model reports (mean over all steps so far).
            pbar.set_postfix(bce_loss=f"{running_total / len(losses)}")

    return model, losses

def test(model, dataloader, device='cpu'):
    loss_sum = 0
    
    model.eval()
    with torch.no_grad():
        pbar = tqdm(dataloader)
        for i, batch in enumerate(pbar):
            inputs = batch['input_ids']
            loss = model(inputs, labels=inputs).loss
            
            loss_sum += loss
        
    loss_sum = loss_sum / i
    perplexity = math.exp(loss_sum)
    
    return perplexity
    
def calc_perplexity(model, tokenizer, sentence):
    """Return the perplexity ``model`` assigns to a single ``sentence``.

    The sentence is wrapped in the tokenizer's BOS/EOS tokens, tokenized with
    ``padding='max_length'``, and scored with its own ids as labels.

    Args:
        model: Module called as ``model(input_ids, labels=input_ids)`` whose
            result exposes a scalar ``.loss`` tensor. It is left in eval mode.
        tokenizer: Object with ``bos_token`` / ``eos_token`` string attributes,
            callable as ``tokenizer(text, ..., return_tensors='pt')`` and
            returning a mapping with an ``'input_ids'`` tensor.
        sentence: Raw text to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    wrapped = tokenizer.bos_token + sentence + tokenizer.eos_token
    # truncation=True keeps over-long sentences within the tokenizer's maximum
    # length; note that a truncated sentence loses its trailing EOS token.
    encoded = tokenizer(wrapped, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = encoded['input_ids']

    # Score on whatever device the model lives on; without this a model on GPU
    # receives CPU tensors and the forward pass fails.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    model.eval()
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss

    return math.exp(loss.item())
    

In [6]:
def get_tokenizable(df, tokenizer):
    """Keep only the rows whose combined post + response fits the tokenizer.

    For each row, ``post`` and ``response`` are joined with a space, wrapped in
    the tokenizer's BOS/EOS tokens and tokenized; the row is kept when the
    token count is strictly below ``tokenizer.model_max_length - 1``.

    Args:
        df: DataFrame with string columns ``post`` and ``response``.
        tokenizer: Object exposing ``bos_token``, ``eos_token``,
            ``model_max_length`` and a ``tokenize(text)`` method.

    Returns:
        The selected rows of ``df``, in their original order and with their
        original index labels.
    """
    token_limit = tokenizer.model_max_length - 1
    keep_positions = []

    # Track positional offsets rather than the index labels that iterrows()
    # yields: the selection below uses .iloc, so labels are only correct when
    # df happens to have a default 0..n-1 index.
    rows = tqdm(df.iterrows(), total=len(df))
    for position, (_, row) in enumerate(rows):
        el_sent = ' '.join([row['post'], row['response']])
        el_sent = tokenizer.bos_token + el_sent + tokenizer.eos_token

        if len(tokenizer.tokenize(el_sent)) < token_limit:
            keep_positions.append(position)

    return df.iloc[keep_positions, :]

In [None]:
# --- Configuration ---------------------------------------------------------
MAX_LEN = 512
MODEL_PATH = 'redrussianarmy/gpt2-turkish-cased'
BATCH_SIZE = 2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
THRESHOLD = 35  # NOTE(review): not used in any visible cell - confirm it is still needed
EPOCHS = 10
LR = 0.005
SEED = 42

# Seed the RNGs so the train/test split, weight initialisation and batch order
# are the same on every Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Tokenizer -------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, model_max_length=MAX_LEN)
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# --- Data ------------------------------------------------------------------
df = pd.read_csv('../input/taboo-datasets/forum_dh.csv')
# Drop rows whose combined post + response would not fit in MAX_LEN tokens.
df = get_tokenizable(df, tokenizer)

# Posts and responses are pooled into one flat array of individual texts.
X = np.append(df['post'].to_numpy(), df['response'].to_numpy())
X_train, X_test = train_test_split(X, test_size=0.2, random_state=SEED)

train_set = CustomDataset(X_train, tokenizer=tokenizer, device=DEVICE)
# Reshuffle every epoch so the model does not see an identical batch order.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

test_set = CustomDataset(X_test, tokenizer=tokenizer, device=DEVICE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

# --- Model -----------------------------------------------------------------
gpt_config = AutoConfig.from_pretrained(MODEL_PATH, model_max_length=MAX_LEN, max_position_embeddings=MAX_LEN)
# NOTE(review): from_config builds the architecture only; presumably no
# pretrained weights are loaded, so this trains from a random initialisation
# rather than fine-tuning MODEL_PATH. Confirm that this is intended.
gpt_model = AutoModelForCausalLM.from_config(gpt_config)
# The tokenizer gained special tokens above, so the embedding matrix must grow.
gpt_model.resize_token_embeddings(len(tokenizer))

# --- Train, evaluate, save -------------------------------------------------
opt = optim.AdamW(gpt_model.parameters(), lr=LR)
gpt_model, losses = finetune(gpt_model, train_loader, epochs=EPOCHS, optimizer=opt, device=DEVICE)
perplexity = test(gpt_model, test_loader, device=DEVICE)
print(f"Test Set Perplexity: {perplexity}")

# Pickles the entire model object; loading it later requires the same class
# definitions to be importable.
torch.save(gpt_model, f'fluency_model_{perplexity}')

In [None]:
# Save the trained model. The original referenced `bert_model`, a name that is
# never defined in this notebook and raises NameError on a fresh kernel.
# NOTE(review): this repeats the save at the end of the training cell and can
# probably be deleted.
torch.save(gpt_model, f'fluency_model_{perplexity}')