In [6]:
import pandas as pd
import numpy as np
# from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from datetime import datetime
import math
import re
from tqdm import tqdm, trange

from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup

In [40]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, device='cpu'):
        self.data = data
        self.tokenizer = tokenizer
        self.device = device
        
    def __getitem__(self, i):
        text = self.tokenize(self.data[i])
        tokenized_el = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
        tokenized_el = {k: v.squeeze().to(self.device) for k, v in tokenized_el.items()}
        return tokenized_el

    def tokenize(self, row):
        text = row.post + self.tokenizer.pad_token + row.response
        text = self.tokenizer.bos_token + text + self.tokenizer.eos_token
        return text

    def __len__(self):
        return len(self.data)


In [41]:
def finetune(model, dataloader, epochs, optimizer, device='cpu'):
    losses = []
    
    model = model.to(device)
    model.train()
    
    for epoch in range(epochs):
        pbar = tqdm(dataloader)
        pbar.set_description(f"Epoch #{epoch+1}")
        for i, batch in enumerate(pbar):
            inputs = batch['input_ids']
            optimizer.zero_grad()

            outputs = model(inputs, labels=inputs)
            
            loss = outputs.loss
            losses.append(loss.item())
            loss.backward()

            optimizer.step()
            
            del inputs
            del outputs
            
            pbar.set_postfix(bce_loss=f"{sum(losses)/len(losses)}")

    return model, losses

def test(model, dataloader, device='cpu'):
    loss_sum = 0
    
    model.eval()
    with torch.no_grad():
        pbar = tqdm(dataloader)
        for i, batch in enumerate(pbar):
            inputs = batch['input_ids']
            loss = model(inputs, labels=inputs).loss
            
            loss_sum += loss
        
    loss_sum = loss_sum / i
    perplexity = math.exp(loss_sum)
    
    return perplexity
    
def calc_perplexity(model, tokenizer, sentence):
    sent = tokenizer.bos_token + sentence + tokenizer.eos_token
    sent = tokenizer(sent, padding='max_length', return_tensors='pt')
    sent = sent['input_ids']
    
    model.eval()
    with torch.no_grad():
        loss = model(sent, labels=sent).loss
        perplexity = math.exp(loss/len(sent))
        
        return perplexity


In [42]:
DATA_PATH = '../../../datasets/Turkish-Reddit-Dataset/tr-reddit.parquet'
MAX_LEN = 512
MODEL_PATH = 'redrussianarmy/gpt2-turkish-cased'
BATCH_SIZE = 8
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
THRESHOLD = 35
EPOCHS = 1
LR = 0.0001

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, model_max_length=MAX_LEN)
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'sep_token': '<SEP>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [43]:
df = pd.read_parquet(DATA_PATH)
df = df[['post', 'response']].applymap(lambda x: re.sub('\s', ' ', x))
df = df[['post', 'response']].applymap(lambda x: re.sub('View Poll', '', x))

X_train, X_test = train_test_split(df, test_size=0.2)

train_set = CustomDataset(X_train, tokenizer=tokenizer, device=DEVICE)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)

test_set = CustomDataset(X_test, tokenizer=tokenizer, device=DEVICE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

gpt_config = AutoConfig.from_pretrained(MODEL_PATH, model_max_length=MAX_LEN, max_position_embeddings=MAX_LEN)
gpt_model = AutoModelForCausalLM.from_config(gpt_config)
gpt_model.resize_token_embeddings(len(tokenizer))

opt = optim.AdamW(gpt_model.parameters(), lr=LR)
gpt_model, losses = finetune(gpt_model, train_loader, epochs=EPOCHS, optimizer=opt, device=DEVICE)

perplexity = test(gpt_model, test_loader, device=DEVICE)
print(f"Test Set Perplexity: {perplexity}")

torch.save(gpt_model, f'fluency_model_{perplexity}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [47]:
# PERPLEXITY TRIAL
gpt_model = torch.load('fluency_model_3.2621438768707276', map_location=DEVICE)
gpt_model.device

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [48]:
sent1 = "Bu ben akşam people kalmak kalıypor kalacak"
# sent2 = "Bu konu hakkındaki fikirlerinizi de öğrenmek isterim."
ans1 = None

sep_token = tokenizer.special_tokens_map['sep_token']
sentence = ' '.join([sent1, sep_token, ans1]) if ans1 else sent1

sent = tokenizer.bos_token + sentence + tokenizer.eos_token
sent = tokenizer(sent, return_tensors='pt', truncation=True, padding=True)
sent = sent['input_ids'][0].to(device=DEVICE)

nll_list = []
for i in range(1, len(sent)):
    non_len = len(sent) - i
    
    curr_sent = sent.clone()
    curr_sent[:-non_len] = -100

    with torch.no_grad():
        loss = gpt_model(sent, labels=curr_sent).loss
        nll_list.append(loss*non_len)
        
autoreg_ppl = torch.exp(torch.stack(nll_list).sum() / len(sent)).item()
loss_ppl = torch.exp(gpt_model(sent, labels=sent).loss / len(sent)).item()

loss_ppl

Epoch #1:   0%|          | 1/75427 [01:42<2146:24:48, 102.45s/it, bce_loss=11.081254959106445]


KeyboardInterrupt: 

In [105]:
sent1 = "Bu ben akşam people"
sent1 = "Bu konu hakkındaki fikirlerinizi de öğrenmek isterim."
ans1 = None

sep_token = tokenizer.special_tokens_map['sep_token']
sentence = ' '.join([sent1, sep_token, ans1]) if ans1 else sent1

sent = tokenizer.bos_token + sentence + tokenizer.eos_token
sent = tokenizer(sent, return_tensors='pt', truncation=True, padding=True)
sent = sent['input_ids'][0].to(device=DEVICE)

with torch.no_grad():
    ppl = torch.exp(gpt_model(sent, labels=sent).loss / len(sent))
ppl

tensor(2.2602)

In [None]:
out.logits.reshape(1, 50261, 1)