In [16]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, get_linear_schedule_with_warmup

from tqdm import tqdm
import optuna

# Fix the random seed once, right after the imports, via transformers' set_seed
# so that repeated runs of the notebook start from the same RNG state.
set_seed(42)

In [17]:
def get_gpt_tokenizer(path, max_len=512, more_tokens_dict=None):
    """Load a tokenizer and register the special tokens used in this notebook.

    Args:
        path: Model name or local path handed to ``AutoTokenizer.from_pretrained``.
        max_len: Forwarded as ``model_max_length``.
        more_tokens_dict: Optional mapping merged over the default special-token
            mapping; entries with the same key replace the defaults. ``None``
            (or an empty dict) keeps the defaults only.

    Returns:
        The tokenizer after ``add_special_tokens`` has been applied.
    """
    tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=max_len)

    special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>',
                           'pad_token': '<PAD>', 'sep_token': '<SEP>'}

    # ``None`` default instead of a shared mutable ``{}``; caller-supplied
    # entries take precedence over the defaults above.
    if more_tokens_dict:
        special_tokens_dict.update(more_tokens_dict)

    # Adding tokens can change len(tokenizer); get_fluency_model accepts that
    # length through ``tokenizer_length`` to resize the embeddings accordingly.
    tokenizer.add_special_tokens(special_tokens_dict)

    return tokenizer

def get_fluency_model(model_name_or_path, tokenizer_length=None, device='cpu'):
    """Load a causal language model, place it on ``device`` and optionally resize it.

    Args:
        model_name_or_path: Identifier or path given to
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_length: When truthy, the token embeddings are resized to this
            size; when ``None`` (or 0) the embeddings are left untouched.
        device: Target device passed to ``.to(device=...)``.

    Returns:
        The loaded (and possibly resized) model.
    """
    fluency_lm = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    fluency_lm = fluency_lm.to(device=device)

    # Nothing to resize: hand the model back as loaded.
    if not tokenizer_length:
        return fluency_lm

    fluency_lm.resize_token_embeddings(tokenizer_length)
    return fluency_lm

def tokenize(text, tokenizer):
    """Tokenize ``text``, wrapping it in BOS/EOS markers when the tokenizer has a BOS token.

    Args:
        text: The string to encode.
        tokenizer: Callable tokenizer exposing ``bos_token`` and ``eos_token``.

    Returns:
        Whatever ``tokenizer(...)`` returns when called with
        ``padding='max_length'``, ``truncation=True`` and ``return_tensors='pt'``.
    """
    bos = tokenizer.bos_token

    if bos is None:
        # No BOS token configured: encode the text untouched.
        wrapped = text
    else:
        wrapped = bos + text + tokenizer.eos_token

    return tokenizer(wrapped, padding='max_length', truncation=True, return_tensors='pt')


In [18]:
class FluencyDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for language modelling.

    Args:
        data: Table exposing a ``content`` column that supports positional
            ``.iloc`` access (presumably the pandas DataFrame returned by
            ``get_data`` - verify against the caller).
        tokenizer: Tokenizer forwarded to ``tokenize``.
        device: Device each returned tensor is moved to.
    """

    def __init__(self, data, tokenizer, device='cpu'):
        super().__init__()

        self.data = data
        self.tokenizer = tokenizer
        self.device = device

    def __getitem__(self, i):
        """Return the tokenizer outputs for row ``i`` with the leading batch axis removed."""
        # Positional lookup: ``content[i]`` would be a label lookup and picks the
        # wrong row (or raises KeyError) when the frame's index is not 0..n-1,
        # e.g. for a split that kept its original index.
        text = self.data.content.iloc[i]

        # tokenize() already wraps the text in BOS/EOS; wrapping here as well
        # would feed the model doubled BOS/EOS markers.
        tokenized_text = tokenize(text, self.tokenizer)

        # Drop only the leading batch axis of size 1 produced for a single text;
        # a bare squeeze() would also collapse any other size-1 axis.
        return {k: v.squeeze(0).to(self.device) for k, v in tokenized_text.items()}

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

In [19]:
def get_data(data_name, task_name):
    """Read the train/validation/test parquet files for a dataset and task.

    Args:
        data_name: ``'reddit'`` or ``'forum_dh'``.
        task_name: ``'lm'`` or ``'cls'`` (the ``cls`` files carry a ``-pairs`` suffix).

    Returns:
        Tuple ``(train_data, val_data, test_data)`` as read by ``pd.read_parquet``.

    Raises:
        ValueError: ``'Invalid data name'`` for an unknown dataset, otherwise
            ``'Invalid task name'`` for an unknown task.
    """
    # For every dataset: (task, path stem); the stem is everything that
    # precedes "_<split>.parquet".
    sources = (
        ('reddit', (
            ('lm', 'taboo-datasets/reddit-dataset/tr-reddit'),
            ('cls', 'taboo-datasets/reddit-dataset/tr-reddit-pairs'),
        )),
        ('forum_dh', (
            ('lm', 'taboo-datasets/donanim-haber-dataset/forum_dh'),
            ('cls', 'taboo-datasets/donanim-haber-dataset/forum_dh-pairs'),
        )),
    )

    # The dataset name is validated before the task name, and both before any file is read.
    task_stems = next((stems for name, stems in sources if data_name == name), None)
    if task_stems is None:
        raise ValueError('Invalid data name')

    stem = next((path for task, path in task_stems if task_name == task), None)
    if stem is None:
        raise ValueError('Invalid task name')

    train_data, val_data, test_data = (
        pd.read_parquet(f'{stem}_{split}.parquet') for split in ('train', 'val', 'test')
    )

    return train_data, val_data, test_data

In [20]:
def train(model, train_loader, optimizer, scheduler):
    """Run one training epoch and return the mean per-batch perplexity.

    Args:
        model: Causal LM whose forward pass accepts the batch keys plus
            ``labels`` and returns an object with a ``.loss`` tensor.
        train_loader: Iterable of dict batches containing ``input_ids`` and,
            normally, ``attention_mask``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        The mean over batches of ``exp(batch loss)``. Note this is an average of
        per-batch perplexities, not ``exp`` of the mean loss.
    """
    model.train()
    pbar = tqdm(train_loader)
    ppls = []

    for batch in pbar:
        # The inputs double as labels, but padded positions are set to -100 so
        # the loss ignores them. Without this, sequences padded to max_length
        # are scored mostly on predicting PAD tokens and the perplexity is
        # artificially low.
        labels = batch['input_ids'].clone()
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            labels[attention_mask == 0] = -100

        optimizer.zero_grad()
        loss = model(**batch, labels=labels).loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        ppls.append(np.exp(loss.item()))
        pbar.set_description(f'PPL: {np.mean(ppls):.5f}')

    return np.mean(ppls)


def evaluate(model, val_loader):
    """Compute the mean per-batch perplexity of ``model`` without updating it.

    Args:
        model: Causal LM whose forward pass accepts the batch keys plus
            ``labels`` and returns an object with a ``.loss`` tensor.
        val_loader: Iterable of dict batches containing ``input_ids`` and,
            normally, ``attention_mask``.

    Returns:
        The mean over batches of ``exp(batch loss)``. Note this is an average of
        per-batch perplexities, not ``exp`` of the mean loss.
    """
    model.eval()
    pbar = tqdm(val_loader)
    ppls = []

    with torch.no_grad():
        for batch in pbar:
            # Exclude padded positions from the loss (label -100 is ignored);
            # otherwise PAD predictions dominate and deflate the perplexity.
            labels = batch['input_ids'].clone()
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                labels[attention_mask == 0] = -100

            loss = model(**batch, labels=labels).loss
            ppls.append(np.exp(loss.item()))
            pbar.set_description(f'PPL: {np.mean(ppls):.5f}')

    return np.mean(ppls)


def train_val_fn(data_name, model_name, batch_size, learning_rate, num_epochs, device='cpu',
                 save_dir='modules/judge_system/saved_fluency_models'):
    """Fine-tune a causal LM on the ``lm`` split of a dataset and keep the best checkpoint.

    Args:
        data_name: Dataset name understood by ``get_data``.
        model_name: Model/tokenizer identifier used for both the tokenizer and the model.
        batch_size: Batch size of the training and validation loaders.
        learning_rate: Peak learning rate for AdamW.
        num_epochs: Number of passes over the training data.
        device: Device for the model and the dataset tensors.
        save_dir: Directory the model is written to whenever the validation
            perplexity improves. Defaults to the previously hard-coded path.

    Returns:
        The best (lowest) validation perplexity seen, i.e. the score of the
        checkpoint that was saved. ``np.inf`` when ``num_epochs`` is 0.
    """
    # Only the train and validation splits are used here.
    train_data, val_data, _ = get_data(data_name, 'lm')

    # create a tokenizer
    tokenizer = get_gpt_tokenizer(model_name, max_len=256)

    train_set = FluencyDataset(train_data, tokenizer, device=device)
    val_set = FluencyDataset(val_data, tokenizer, device=device)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # No shuffling for validation: it keeps batch composition (and therefore the
    # per-batch mean metric) identical from epoch to epoch.
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    # create a model; the embeddings are resized to cover the added special tokens
    fluency_model = get_fluency_model(model_name, tokenizer_length=len(tokenizer), device=device)

    # create an optimizer
    optimizer = optim.AdamW(fluency_model.parameters(), lr=learning_rate)

    # create a learning rate scheduler: linear warmup over the first 10% of steps
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 10
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    best_val_ppl = np.inf
    for _ in range(num_epochs):
        train_ppl = train(fluency_model, train_loader, optimizer, scheduler)
        val_ppl = evaluate(fluency_model, val_loader)
        print(f'Train PPL: {train_ppl:.5f} - Val PPL: {val_ppl:.5f}')

        if val_ppl < best_val_ppl:
            # NOTE(review): only the model is saved; the tokenizer with its added
            # special tokens is not, so loaders must rebuild it the same way.
            fluency_model.save_pretrained(save_dir)
            best_val_ppl = val_ppl

    # Return the best score rather than the last epoch's: it matches the saved
    # checkpoint and is defined even when the loop body never runs.
    return best_val_ppl

In [21]:
def objective(trial, data_name, device):
    """Optuna objective: sample a batch size and learning rate, then train for one epoch.

    Args:
        trial: Optuna trial used to sample ``batch_size`` (integer in 1..16) and
            ``learning_rate`` (log-scaled float in 1e-6..1e-2).
        data_name: Dataset name forwarded to ``train_val_fn``.
        device: Device forwarded to ``train_val_fn``.

    Returns:
        The value returned by ``train_val_fn`` for the sampled configuration.
    """
    # Sampling order (batch size first, then learning rate) is kept as-is.
    sampled = {
        'batch_size': trial.suggest_int('batch_size', 1, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True),
    }

    return train_val_fn(
        data_name,
        'redrussianarmy/gpt2-turkish-cased',
        sampled['batch_size'],
        sampled['learning_rate'],
        1,  # a single epoch per trial
        device,
    )

def tune_hyperparameters(data_name, num_trials, device):
    """Run (or resume) the Optuna study that searches batch size and learning rate.

    The study is stored in ``sqlite:///fluency_study.db`` under the name
    ``fluency_study`` and is reloaded if it already exists, so repeated calls
    add trials to the same study.

    Args:
        data_name: Dataset name forwarded to ``objective``.
        num_trials: Number of trials to run in this call.
        device: Device forwarded to ``objective``.

    Returns:
        The ``optuna`` study, so callers can read e.g. ``study.best_params``
        without reopening the storage (previously nothing was returned).
    """
    # NOTE(review): objective() never reports intermediate values to the trial,
    # so the SuccessiveHalvingPruner configured here cannot prune anything.
    study = optuna.create_study(study_name='fluency_study', storage='sqlite:///fluency_study.db', direction="minimize",
                                load_if_exists=True, pruner=optuna.pruners.SuccessiveHalvingPruner())
    study.optimize(lambda x: objective(x, data_name, device), n_trials=num_trials, gc_after_trial=True)

    return study


In [None]:
# Fine-tune on the Reddit language-modelling split with fixed hyperparameters
# (3 epochs on CPU); the cell's value is whatever train_val_fn returns.
train_val_fn(
    data_name='reddit',
    model_name='redrussianarmy/gpt2-turkish-cased',
    batch_size=16,
    learning_rate=0.0002,
    num_epochs=3,
    device='cpu',
)