In [23]:
import os

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, AutoModel, get_linear_schedule_with_warmup, set_seed
from transformers.utils.logging import set_verbosity_error

# Silence transformers' log output below error level for the rest of the notebook.
set_verbosity_error()
# Fixed seed passed to transformers' set_seed so repeated runs start from the same seed.
set_seed(42)

In [24]:
def get_bert_tokenizer(path, max_len=512):
    """Load a ``BertTokenizer`` from ``path``.

    Args:
        path: Name or path handed to ``BertTokenizer.from_pretrained``.
        max_len: Value forwarded as ``model_max_length``.

    Returns:
        The tokenizer object returned by ``BertTokenizer.from_pretrained``.
    """
    return BertTokenizer.from_pretrained(path, model_max_length=max_len)


In [25]:
class ClassifyBERTurk(nn.Module):
    """Binary classifier: a pretrained encoder followed by a linear layer and a sigmoid.

    Args:
        model_name_or_path: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name_or_path):
        super().__init__()

        # Pretrained encoder; its hidden size sets the width of the classification head.
        self.berturk = AutoModel.from_pretrained(model_name_or_path)
        hidden_size = self.berturk.config.hidden_size
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, **inputs):
        """Run the encoder on ``inputs`` and return a sigmoid score per example.

        The keyword arguments are forwarded unchanged to the encoder. Only the
        vector at position 0 of the sequence axis of ``last_hidden_state`` is fed
        to the linear head (presumably the [CLS] token for BERT-style inputs).

        Returns:
            Tensor whose last dimension has size 1, with values in (0, 1).
        """
        first_token_state = self.berturk(**inputs).last_hidden_state[:, 0, :]
        return self.sigmoid(self.fc(first_token_state))


In [26]:
def get_relevancy_model(model_name_or_path, trainable_llm=False, device='cpu'):
    """Build a ``ClassifyBERTurk`` on ``device`` and set whether its encoder is trainable.

    Args:
        model_name_or_path: Identifier passed to ``ClassifyBERTurk``.
        trainable_llm: Value assigned to ``requires_grad`` on every parameter of
            ``model.berturk``. The classification head is not touched.
        device: Device the model is moved to.

    Returns:
        The constructed model.
    """
    classifier = ClassifyBERTurk(model_name_or_path).to(device=device)
    # Flip requires_grad on the encoder parameters only.
    classifier.berturk.requires_grad_(trainable_llm)
    return classifier

In [27]:
class RelevancyDataset(Dataset):
    """Dataset of (post, response) pairs with a binary pairing label.

    Args:
        data: Table indexed positionally through ``.iloc`` whose rows expose the
            keys ``'post'``, ``'response'`` and ``'is_pair'``.
        tokenizer: Callable invoked as ``tokenizer(post, resp, padding=...,
            truncation=..., return_tensors='pt')`` that returns a mapping of tensors.
        device: Device each returned tensor is moved to.
    """

    def __init__(self, data, tokenizer, device='cpu'):
        super().__init__()

        self.data = data
        self.tokenizer = tokenizer
        self.device = device

    def __getitem__(self, i):
        """Return ``(encoded_pair, label)`` for row ``i``.

        ``encoded_pair`` is a dict of tensors with the tokenizer's leading batch
        axis removed; ``label`` is a float scalar tensor built from ``is_pair``.
        """
        row = self.data.iloc[i]
        post, resp, y = row['post'], row['response'], row['is_pair']

        tokenized_pair = self.tokenizer(post, resp, padding='max_length', truncation='longest_first', return_tensors='pt')
        # squeeze(0) removes only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also drop any other size-1 axis (e.g. a length-1
        # sequence), yielding a 0-d tensor that cannot be batched correctly.
        tokenized_pair = {k: v.squeeze(0).to(self.device) for k, v in tokenized_pair.items()}

        return tokenized_pair, torch.tensor(y, dtype=torch.float, device=self.device)

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)


In [28]:
def get_data(data_name, task_name):
    """Read the train/val/test parquet files for a dataset and task.

    Args:
        data_name: ``'reddit'`` or ``'forum_dh'``.
        task_name: ``'lm'`` or ``'cls'``.

    Returns:
        ``(train_data, val_data, test_data)`` as returned by ``pd.read_parquet``.

    Raises:
        ValueError: ``'Invalid data name'`` for an unknown ``data_name``, otherwise
            ``'Invalid task name'`` for an unknown ``task_name``.
    """
    # dataset -> task -> (train, val, test) file paths, relative to the working directory
    split_paths = {
        'reddit': {
            'lm': (
                'taboo-datasets/reddit-dataset/tr-reddit_train.parquet',
                'taboo-datasets/reddit-dataset/tr-reddit_val.parquet',
                'taboo-datasets/reddit-dataset/tr-reddit_test.parquet',
            ),
            'cls': (
                'taboo-datasets/reddit-dataset/tr-reddit-pairs_train.parquet',
                'taboo-datasets/reddit-dataset/tr-reddit-pairs_val.parquet',
                'taboo-datasets/reddit-dataset/tr-reddit-pairs_test.parquet',
            ),
        },
        'forum_dh': {
            'lm': (
                'taboo-datasets/donanim-haber-dataset/forum_dh_train.parquet',
                'taboo-datasets/donanim-haber-dataset/forum_dh_val.parquet',
                'taboo-datasets/donanim-haber-dataset/forum_dh_test.parquet',
            ),
            'cls': (
                'taboo-datasets/donanim-haber-dataset/forum_dh-pairs_train.parquet',
                'taboo-datasets/donanim-haber-dataset/forum_dh-pairs_val.parquet',
                'taboo-datasets/donanim-haber-dataset/forum_dh-pairs_test.parquet',
            ),
        },
    }

    # The dataset name is validated before the task name.
    if data_name not in split_paths:
        raise ValueError('Invalid data name')
    paths_by_task = split_paths[data_name]
    if task_name not in paths_by_task:
        raise ValueError('Invalid task name')

    train_data, val_data, test_data = (pd.read_parquet(p) for p in paths_by_task[task_name])
    return train_data, val_data, test_data

In [29]:
def calc_accuracy(y_pred, y, relevancy_threshold=0.4):
    """Fraction of entries where ``y_pred > relevancy_threshold`` equals the label.

    Args:
        y_pred: Predicted scores.
        y: Labels compared element-wise against the thresholded predictions.
        relevancy_threshold: Scores strictly above this count as positive.

    Returns:
        Number of matching entries divided by ``len(y)``.
    """
    predicted_positive = y_pred > relevancy_threshold
    matches = predicted_positive == y
    n_correct = matches.sum().item()
    return n_correct / len(y)

def train(model, train_loader, criterion, optimizer, scheduler, relevancy_threshold=0.4):
    """Run one optimisation pass over ``train_loader``.

    For every batch the gradients are reset, the loss is back-propagated, and both
    the optimizer and the scheduler are stepped (the scheduler advances per batch).

    Args:
        model: Module called as ``model(**batch)``; its output is reshaped to ``len(y)``.
        train_loader: Iterable yielding ``(batch, y)`` pairs.
        criterion: Loss called as ``criterion(y_pred, y)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        relevancy_threshold: Threshold forwarded to ``calc_accuracy``.

    Returns:
        Accuracy over all examples seen in this pass (``nan`` if the loader is empty).
    """
    model.train()
    pbar = tqdm(train_loader)
    losses = []
    correct, seen = 0.0, 0

    for batch, y in pbar:
        optimizer.zero_grad()
        y_pred = model(**batch).reshape(len(y))
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        scheduler.step()

        losses.append(loss.item())
        # calc_accuracy returns a per-batch fraction; weight it by the batch size so
        # a short final batch does not count as much as a full one.
        batch_size = len(y)
        correct += calc_accuracy(y_pred, y, relevancy_threshold=relevancy_threshold) * batch_size
        seen += batch_size
        pbar.set_description(f'Loss: {np.mean(losses):.5f} - Accuracy: {correct / seen:.5f}')

    return correct / seen if seen else float('nan')

def evaluate(model, val_loader, criterion, relevancy_threshold=0.4):
    """Score ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: Module called as ``model(**batch)``; its output is reshaped to ``len(y)``.
        val_loader: Iterable yielding ``(batch, y)`` pairs.
        criterion: Loss called as ``criterion(y_pred, y)`` (used for the progress bar only).
        relevancy_threshold: Threshold forwarded to ``calc_accuracy``.

    Returns:
        Accuracy over all examples in the loader (``nan`` if the loader is empty).
    """
    model.eval()
    pbar = tqdm(val_loader)
    losses = []
    correct, seen = 0.0, 0

    with torch.no_grad():
        for batch, y in pbar:
            y_pred = model(**batch).reshape(len(y))
            loss = criterion(y_pred, y)

            losses.append(loss.item())
            # Weight each batch's accuracy by its size: averaging per-batch fractions
            # over-weights a short final batch, and this value is compared across
            # runs that use different batch sizes.
            batch_size = len(y)
            correct += calc_accuracy(y_pred, y, relevancy_threshold=relevancy_threshold) * batch_size
            seen += batch_size
            pbar.set_description(f'Loss: {np.mean(losses):.5f} - Accuracy: {correct / seen:.5f}')

    return correct / seen if seen else float('nan')


In [30]:
def train_val_fn(data_name, model_name, batch_size, learning_rate, num_epochs,
                 device='cpu', threshold=0.4, trainable_llm=False):
    """Train the relevancy classifier and validate it after every epoch.

    Args:
        data_name: Dataset name forwarded to ``get_data`` (task is fixed to ``'cls'``).
        model_name: Name/path used for both the tokenizer and the model.
        batch_size: Batch size for the train and validation loaders.
        learning_rate: Peak learning rate for AdamW.
        num_epochs: Number of passes over the training data; must be at least 1.
        device: Device used for the model and the dataset tensors.
        threshold: Decision threshold used when computing accuracy.
        trainable_llm: Forwarded to ``get_relevancy_model``.

    Returns:
        ``(model, val_acc)`` where both come from the final epoch (not the best one).

    Raises:
        ValueError: If ``num_epochs`` is less than 1.
    """
    # Fail before any data or model is loaded; with zero epochs there would be no
    # validation accuracy to return.
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # load data (the test split is not used in this function)
    train_data, val_data, _ = get_data(data_name, 'cls')

    # create a tokenizer
    tokenizer = get_bert_tokenizer(model_name)

    train_set = RelevancyDataset(train_data, tokenizer, device=device)
    val_set = RelevancyDataset(val_data, tokenizer, device=device)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    # create a model
    relevancy_model = get_relevancy_model(model_name, trainable_llm=trainable_llm, device=device)

    # create an optimizer
    optimizer = optim.AdamW(relevancy_model.parameters(), lr=learning_rate)

    # create a criterion (the model already applies a sigmoid, hence plain BCE)
    criterion = nn.BCELoss()

    # create a learning rate scheduler: the first 10% of all steps are warmup
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 10
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    val_acc = None
    for _ in range(num_epochs):
        train_acc = train(relevancy_model, train_loader, criterion, optimizer, scheduler, relevancy_threshold=threshold)
        val_acc = evaluate(relevancy_model, val_loader, criterion, relevancy_threshold=threshold)
        print(f'Train Accuracy: {train_acc:.5f} - Val Accuracy: {val_acc:.5f}')

    return relevancy_model, val_acc

In [31]:
# Best validation accuracy for which `objective` has saved a checkpoint in this kernel session.
best_acc = 0


def objective(trial, data_name, device):
    """Optuna objective: train with sampled hyperparameters and return validation accuracy.

    Samples ``batch_size``, ``learning_rate`` (log scale) and ``threshold`` from
    ``trial``, trains for one epoch via ``train_val_fn`` and, when the result beats
    the best accuracy known so far, pickles the whole model to ``best_model.pt``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``.
        data_name: Dataset name forwarded to ``train_val_fn``.
        device: Device forwarded to ``train_val_fn``.

    Returns:
        The validation accuracy reported by ``train_val_fn``.
    """
    global best_acc

    model_name = 'dbmdz/bert-base-turkish-cased'
    batch_size = trial.suggest_int('batch_size', 1, 32)
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e-2, log=True)
    num_epochs = 1  # fixed at one epoch per trial; not tuned
    threshold = trial.suggest_float('threshold', 0.1, 0.9)

    model, acc = train_val_fn(data_name, model_name, batch_size, learning_rate, num_epochs, device, threshold)

    # `best_acc` restarts at 0 in a fresh kernel, while the study may have been
    # resumed from storage. If a checkpoint is already on disk, take the study's
    # recorded best into account so a worse trial does not overwrite it.
    if os.path.exists('best_model.pt'):
        try:
            best_acc = max(best_acc, trial.study.best_value)
        except (AttributeError, ValueError):
            # No completed trial yet, or the trial object exposes no study:
            # keep the in-session value.
            pass

    if acc > best_acc:
        # NOTE(review): this pickles the entire module, so loading it later needs
        # the same class definitions to be importable.
        torch.save(model, 'best_model.pt')
        print('='*50)
        print('CURRENT BEST MODEL SAVED!')
        print('='*50)
        print(f'Batch Size: {batch_size} - Learning Rate: {learning_rate} - Num Epochs: {num_epochs} - Threshold: {threshold} - Accuracy: {acc:.5f}')
        best_acc = acc

    return acc


def tune_hyperparameters(study_name, data_name, num_trials, device):
    """Create (or resume) an Optuna study and run ``num_trials`` trials of ``objective``.

    The study maximises the objective and is persisted to ``{study_name}.db``
    (SQLite) in the working directory; an existing study with the same name is reused.

    Args:
        study_name: Study name, also used for the SQLite file name.
        data_name: Dataset name forwarded to ``objective``.
        num_trials: Number of trials to run in this call.
        device: Device forwarded to ``objective``.

    Returns:
        The study object, so callers can inspect results such as the best parameters.
    """
    # NOTE(review): `objective` never reports intermediate values or checks for
    # pruning, so the pruner configured here presumably has no effect as written.
    study = optuna.create_study(study_name=study_name, storage=f'sqlite:///{study_name}.db',
                                direction="maximize", load_if_exists=True,
                                pruner=optuna.pruners.SuccessiveHalvingPruner())
    study.optimize(lambda x: objective(x, data_name, device), n_trials=num_trials, gc_after_trial=True)
    return study


In [32]:
# Single training run on the Reddit pair data with the encoder frozen.
# The parquet files read by get_data must exist relative to the working directory.
model, _ = train_val_fn(
    data_name='reddit',
    model_name='dbmdz/bert-base-turkish-cased',
    batch_size=4,
    learning_rate=1e-4,
    num_epochs=3,
    device='cpu',
    threshold=0.4,
    trainable_llm=False,
)

FileNotFoundError: [Errno 2] No such file or directory: 'taboo-datasets/reddit-dataset/tr-reddit-pairs_train.parquet'