## Setup

In [None]:
!pip install transformers torch datasets "ray[tune]"

In [None]:
from pathlib import Path

WORKING_ENV = 'COLAB' # Can be LABS, COLAB or PAPERSPACE

assert WORKING_ENV in ['COLAB', 'PAPERSPACE']

if WORKING_ENV == 'COLAB':
    from google.colab import drive
    %load_ext google.colab.data_table
    content_path = '/content/drive/MyDrive/'
    drive.mount('/content/drive/', force_remount=True) # Outputs will be saved in your google drive

else: # Using Paperspace
    # Paperspace does not properly render animated progress bars
    # Strongly recommend using the JupyterLab UI instead of theirs
    !pip install ipywidgets 
    content_path = '/notebooks'

content_path = Path(content_path)

In [None]:
# Every project directory sits under <content_path>/NLP and is kept as a plain string.
_nlp_root = f"{content_path}/NLP"

data_folder = f"{_nlp_root}/data"
results_folder = f"{_nlp_root}/results"
logging_folder = f"{_nlp_root}/logs"
hp_search_folder = f"{_nlp_root}/hp_search"

In [None]:
import pandas as pd
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, DebertaTokenizer
import torch.nn as nn
import torch
import datasets
# from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os
import itertools

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

## Load data

In [None]:
# CSV files to read from data_folder, in the order they are unpacked below:
# three training variants (preprocessed, augmented, ChatGPT-augmented),
# then the train-dev split and the dev split.
_csv_paths = (
    f"{data_folder}/pcl_df_train_train_preprocessed.csv",
    f"{data_folder}/pcl_df_train_train_aug.csv",
    f"{data_folder}/pcl_df_train_train_aug_chatgpt.csv",
    f"{data_folder}/pcl_df_train_dev_preprocessed.csv",
    f"{data_folder}/pcl_df_dev_preprocessed.csv",
)

(
    pcl_df_train_train,
    pcl_df_train_train_aug,
    pcl_df_train_train_gpt,
    pcl_df_train_dev,
    pcl_df_dev,
) = map(pd.read_csv, _csv_paths)

In [None]:
# (rows, columns) of the frame read from pcl_df_train_train_preprocessed.csv.
pcl_df_train_train.shape

In [None]:
# Number of rows per value of the "class" column in the preprocessed training frame.
pcl_df_train_train["class"].value_counts()

In [None]:
# (rows, columns) of the frame read from pcl_df_train_train_aug.csv.
pcl_df_train_train_aug.shape

In [None]:
# Number of rows per value of the "class" column in the augmented training frame.
pcl_df_train_train_aug["class"].value_counts()

In [None]:
# (rows, columns) of the frame read from pcl_df_train_train_aug_chatgpt.csv.
pcl_df_train_train_gpt.shape

In [None]:
# Number of rows per value of the "class" column in the ChatGPT-augmented training frame.
pcl_df_train_train_gpt["class"].value_counts()

In [None]:
# Column names available in the preprocessed training frame.
pcl_df_train_train.columns

In [None]:
# Keep only the input text and its class for every split.
_model_columns = ['text', 'class']

(
    pcl_df_train_train,
    pcl_df_train_train_aug,
    pcl_df_train_train_gpt,
    pcl_df_train_dev,
    pcl_df_dev,
) = (
    frame[_model_columns]
    for frame in (
        pcl_df_train_train,
        pcl_df_train_train_aug,
        pcl_df_train_train_gpt,
        pcl_df_train_dev,
        pcl_df_dev,
    )
)


In [None]:
# Convert each pandas frame into a `datasets.Dataset`, keeping the same variable names.
(
    pcl_df_train_train,
    pcl_df_train_train_aug,
    pcl_df_train_train_gpt,
    pcl_df_train_dev,
    pcl_df_dev,
) = map(
    datasets.Dataset.from_pandas,
    (
        pcl_df_train_train,
        pcl_df_train_train_aug,
        pcl_df_train_train_gpt,
        pcl_df_train_dev,
        pcl_df_dev,
    ),
)

In [None]:
# Show the object's type after the datasets.Dataset.from_pandas conversion.
type(pcl_df_train_train)

### Helper functions

In [None]:
# Class index -> label name, and the inverse lookup derived from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Single source of truth for the checkpoint used by both the model and the tokenizer.
MODEL_CHECKPOINT = "microsoft/deberta-base"


def model_init():
    """Build a fresh DeBERTa sequence classifier with a custom MLP head.

    Loads the pretrained checkpoint with two labels and replaces its
    ``classifier`` with a 4-layer MLP (Linear -> BatchNorm -> Dropout -> ReLU
    blocks followed by a final Linear projection).

    Returns:
        The model with the replaced ``classifier`` head. The head emits raw
        logits, one per label.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )

    # Size the new head from the stock linear head it replaces instead of
    # hard-coding the encoder width (768 for deberta-base).
    in_features = model.classifier.in_features

    # The head must output raw logits: the model's forward pass applies
    # cross-entropy to the classifier output, and cross-entropy already
    # contains a log-softmax. A trailing Softmax layer would normalise twice,
    # flattening the gradients. argmax over logits gives the same predictions.
    model.classifier = torch.nn.Sequential(
        torch.nn.Linear(in_features, 1024),
        torch.nn.BatchNorm1d(1024),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(1024, 256),
        torch.nn.BatchNorm1d(256),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 64),
        torch.nn.BatchNorm1d(64),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(64, model.config.num_labels),
    )

    return model


tokenizer = DebertaTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def tokenization(batched_text):
    """Tokenize the ``'text'`` entries of a batch with the module-level tokenizer.

    Args:
        batched_text: Mapping with a ``'text'`` key holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with padding to
        ``max_length``, truncation enabled and ``max_length=512``.
    """
    texts = batched_text['text']
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(texts, **encode_options)

In [None]:
# Evaluation metrics used for the evaluation runs.
def compute_metrics(pred):
    """Score predictions with accuracy, F1, precision and recall.

    Args:
        pred: A pair ``(scores, labels)``; the predicted class for each row is
            the argmax of ``scores`` along axis 1.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'`` (precision/recall/F1 use ``average='binary'``).
    """
    scores, labels = pred
    predicted = np.argmax(scores, axis=1)

    prf = precision_recall_fscore_support(labels, predicted, average='binary')

    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }


### Tokenization

In [None]:
def _tokenize_split(split):
    """Run `tokenization` over a whole split as one batch (batch_size == len(split))."""
    return split.map(tokenization, batched=True, batch_size=len(split))


pcl_df_train_train = _tokenize_split(pcl_df_train_train)
pcl_df_train_train_aug = _tokenize_split(pcl_df_train_train_aug)
pcl_df_train_train_gpt = _tokenize_split(pcl_df_train_train_gpt)

pcl_df_train_dev = _tokenize_split(pcl_df_train_dev)
pcl_df_dev = _tokenize_split(pcl_df_dev)


In [None]:
# Expose only the model inputs and the class column, formatted as 'torch', on every split.
_torch_columns = ('input_ids', 'attention_mask', 'class')

for _split in (
    pcl_df_train_train,
    pcl_df_train_train_aug,
    pcl_df_train_train_gpt,
    pcl_df_train_dev,
    pcl_df_dev,
):
    # A fresh list per call so the splits never share one column-list object.
    _split.set_format('torch', columns=list(_torch_columns))


In [None]:
# Rename the "class" column to "label" on every split.
(
    pcl_df_train_train,
    pcl_df_train_train_aug,
    pcl_df_train_train_gpt,
    pcl_df_train_dev,
    pcl_df_dev,
) = (
    split.rename_column("class", "label")
    for split in (
        pcl_df_train_train,
        pcl_df_train_train_aug,
        pcl_df_train_train_gpt,
        pcl_df_train_dev,
        pcl_df_dev,
    )
)

### Grid search

In [None]:
# Candidate values for each hyperparameter explored by the grid search.
_search_space = {
    'learning_rate': [1e-5, 2e-5],
    'weight_decay': [0.1, 0.01],
    'per_device_train_batch_size': [16, 32],
    'warmup_steps': [0, 200],
}

learning_rate_vals = _search_space['learning_rate']
weight_decay_vals = _search_space['weight_decay']
per_device_train_batch_size_vals = _search_space['per_device_train_batch_size']
warmup_steps_vals = _search_space['warmup_steps']

In [None]:
# Print every hyperparameter combination in the order the grid is enumerated.
_grid_axes = (
    learning_rate_vals,
    weight_decay_vals,
    per_device_train_batch_size_vals,
    warmup_steps_vals,
)

for (
    learning_rate,
    weight_decay,
    per_device_train_batch_size,
    warmup_steps,
) in itertools.product(*_grid_axes):
    print(learning_rate, weight_decay, per_device_train_batch_size, warmup_steps)

In [None]:
# Exhaustive grid search: fine-tune once per hyperparameter combination on the
# augmented training split, evaluate on the train-dev split, and record the
# configuration together with its evaluation metrics (one list entry per run).
experiment_lr = []
experiment_wd = []
experiment_train_batch_size = []
experiment_warmup = []

experiment_acc = []
experiment_precision = []
experiment_recall = []
experiment_f1 = []

# Materialise the grid up front so tqdm can display the total number of runs.
hp_grid = list(
    itertools.product(
        learning_rate_vals, weight_decay_vals,
        per_device_train_batch_size_vals, warmup_steps_vals,
    )
)

for learning_rate, weight_decay, per_device_train_batch_size, warmup_steps in tqdm(hp_grid):

    # Give every configuration its own checkpoint/log directory so runs do not
    # write into each other's folders.
    run_name = (
        f"lr{learning_rate}_wd{weight_decay}"
        f"_bs{per_device_train_batch_size}_wu{warmup_steps}"
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(hp_search_folder, run_name),
        learning_rate=learning_rate,  # config
        warmup_steps=warmup_steps,  # config
        weight_decay=weight_decay,  # config
        per_device_train_batch_size=per_device_train_batch_size,  # config
        num_train_epochs=10,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Without a limit every epoch of every run leaves a checkpoint on disk.
        # The best checkpoint is still retained for load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        # Restore the checkpoint with the best F1 (the metric the grid is ranked
        # by) instead of the default, which is the lowest evaluation loss.
        metric_for_best_model="f1",
        greater_is_better=True,
        gradient_accumulation_steps=8,
        logging_steps=100,
        logging_dir=os.path.join(logging_folder, run_name),
    )

    trainer = Trainer(
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=pcl_df_train_train_aug,
        eval_dataset=pcl_df_train_dev,
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Metrics of the restored best checkpoint on the train-dev split.
    metrics = trainer.evaluate()

    experiment_lr.append(learning_rate)
    experiment_wd.append(weight_decay)
    experiment_train_batch_size.append(per_device_train_batch_size)
    experiment_warmup.append(warmup_steps)
    experiment_acc.append(metrics['eval_accuracy'])
    experiment_precision.append(metrics['eval_precision'])
    experiment_recall.append(metrics['eval_recall'])
    experiment_f1.append(metrics['eval_f1'])


In [None]:
# One row per grid-search run: the hyperparameters followed by the evaluation metrics.
_result_names = (
    'learning_rate',
    'weight_decay',
    'per_device_train_batch_size',
    'warmup_steps',
    'accuracy',
    'precision',
    'recall',
    'f1',
)
_result_values = (
    experiment_lr,
    experiment_wd,
    experiment_train_batch_size,
    experiment_warmup,
    experiment_acc,
    experiment_precision,
    experiment_recall,
    experiment_f1,
)

grid_search_results = pd.DataFrame(dict(zip(_result_names, _result_values)))

In [None]:
# Rank the runs by F1 (best first) and persist the ranking.
grid_search_results = grid_search_results.sort_values(by='f1', ascending=False)
# NOTE(review): this writes "<results_folder>_grid_search_results.csv", i.e. a file
# next to the results folder rather than inside it -- confirm that is intended.
grid_search_results.to_csv(f"{results_folder}_grid_search_results.csv", index=False)

# The top row after sorting is the configuration with the highest F1.
best_hyperparameters = grid_search_results.iloc[0]

# Float-valued settings are taken as stored in the row.
best_learning_rate, best_weight_decay = (
    best_hyperparameters[key] for key in ('learning_rate', 'weight_decay')
)
# Integer-valued settings are cast back to int after the row lookup.
best_per_device_train_batch_size, best_warmup_steps = (
    int(best_hyperparameters[key])
    for key in ('per_device_train_batch_size', 'warmup_steps')
)

In [None]:
# Display the top-ranked grid-search row (highest F1 after sorting).
best_hyperparameters