## Setup

In [None]:
!pip install transformers torch datasets "ray[tune]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ray[tune]
  Downloading ray-2.3.0-cp38-cp38-manylinux2014_x86_64.whl (58.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface

In [None]:
from pathlib import Path

# Selects the execution environment; must be one of the values accepted by the assert below.
WORKING_ENV = 'COLAB' # Can be COLAB or PAPERSPACE

assert WORKING_ENV in ['COLAB', 'PAPERSPACE']

if WORKING_ENV == 'COLAB':
    from google.colab import drive
    %load_ext google.colab.data_table
    content_path = '/content/drive/MyDrive/'
    drive.mount('/content/drive/', force_remount=True) # Outputs will be saved in your google drive

else: # Using Paperspace
    # Paperspace does not properly render animated progress bars
    # Strongly recommend using the JupyterLab UI instead of theirs
    !pip install ipywidgets 
    content_path = '/notebooks'

# Normalise to a Path; later cells interpolate it into f-strings to build folder names.
content_path = Path(content_path)

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table
Mounted at /content/drive/


In [None]:
# Working folders under `content_path`, kept as plain strings.
data_folder = f"{content_path}/NLP/data"  # input CSV files
results_folder = f"{content_path}/NLP/results"  # output_dir of the final training run and saved model
logging_folder = f"{content_path}/NLP/logs"  # passed as `logging_dir` to TrainingArguments
hp_search_folder = f"{content_path}/NLP/hp_search"  # output_dir of the grid-search runs

In [None]:
import pandas as pd
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, DebertaTokenizer
import torch.nn as nn
import torch
import datasets
# from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os
import itertools

In [None]:
# from ray.tune.suggest.hyperopt import HyperOptSearch
# from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
# from ray.tune import CLIReporter
# from ray import tune

In [None]:
# Pick the GPU when PyTorch can see one, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the selected device.
device

'cuda'

## Load data

In [None]:
# Read the three CSV splits from `data_folder`:
#   - train/train: the "_aug" file (presumably an augmented training split; verify against the preprocessing notebook)
#   - train/dev:   used as the evaluation set during training
#   - dev:         the official dev set, used only for the final predictions
pcl_df_train_train = pd.read_csv(f"{data_folder}/pcl_df_train_train_aug.csv")
pcl_df_train_dev = pd.read_csv(f"{data_folder}/pcl_df_train_dev_preprocessed.csv")
pcl_df_dev = pd.read_csv(f"{data_folder}/pcl_df_dev_preprocessed.csv")

In [None]:
pcl_df_train_train.columns

Index(['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label', 'class',
       'preprocessed_text'],
      dtype='object')

In [None]:
# Keep only the raw text and the target column (`class`); all other columns are dropped.
pcl_df_train_train = pcl_df_train_train[['text', 'class']]
pcl_df_train_dev = pcl_df_train_dev[['text', 'class']]
pcl_df_dev = pcl_df_dev[['text', 'class']]

In [None]:
# Convert each pandas frame to a `datasets.Dataset`.
# Note: the names are rebound, so from here on the `pcl_df_*` variables are Datasets, not DataFrames.
pcl_df_train_train = datasets.Dataset.from_pandas(pcl_df_train_train)
pcl_df_train_dev = datasets.Dataset.from_pandas(pcl_df_train_dev)
pcl_df_dev = datasets.Dataset.from_pandas(pcl_df_dev)

In [None]:
type(pcl_df_train_train)

datasets.arrow_dataset.Dataset

### Helper functions

In [None]:
# Mappings between class ids and label names; both are passed to `from_pretrained`
# so they end up in the model configuration.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [None]:
def model_init():
    """Build a fresh DeBERTa sequence classifier with a custom MLP head.

    Intended to be passed as `Trainer(model_init=...)` so that every training
    run starts from the same pretrained encoder with a newly created head.

    Returns:
        The model produced by `AutoModelForSequenceClassification.from_pretrained`
        with its `classifier` attribute replaced by an MLP that emits raw
        (unnormalised) logits, one per label.
    """
    # `sinusoidal_pos_embds` is no longer passed: it is a DistilBERT config
    # option and has no meaning for the DeBERTa checkpoint loaded here.
    model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-base",
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )

    # Size the new head from the linear layer it replaces instead of
    # hard-coding the encoder width.
    in_features = model.classifier.in_features

    # The head must output logits: the model's built-in cross-entropy loss
    # applies log-softmax itself, so a trailing Softmax would normalise twice
    # and flatten the gradients.
    model.classifier = torch.nn.Sequential(
        torch.nn.Linear(in_features, 1024),
        torch.nn.BatchNorm1d(1024),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(1024, 256),
        torch.nn.BatchNorm1d(256),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 64),
        torch.nn.BatchNorm1d(64),
        torch.nn.Dropout(0.2),
        torch.nn.ReLU(),
        torch.nn.Linear(64, model.config.num_labels),
    )

    return model


# Tokenizer matching the "microsoft/deberta-base" checkpoint loaded in `model_init`;
# shared by the `tokenization` helper and the Trainer instances below.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

In [None]:
def tokenization(batched_text):
    """Tokenize the `text` entry of a batch with the notebook-level `tokenizer`.

    Sequences longer than 512 tokens are truncated and shorter ones are padded,
    so every encoded example has length 512.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(batched_text['text'], **encode_options)

In [None]:
# define accuracy metrics
def compute_metrics(pred):
    """Score a batch of predictions for the Trainer.

    Args:
        pred: a pair that unpacks into (class scores, true labels); the
            predicted class is the argmax of the scores along axis 1.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall', where
        precision/recall/F1 use binary averaging.
    """
    scores, labels = pred
    predicted = np.argmax(scores, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='binary'
    )
    accuracy = accuracy_score(labels, predicted)

    metric_names = ('accuracy', 'f1', 'precision', 'recall')
    metric_values = (accuracy, f1, precision, recall)
    return dict(zip(metric_names, metric_values))


### Tokenization

In [None]:
# Apply `tokenization` to each split. `batch_size` is set to the dataset length,
# so each split is tokenized in a single batch.
pcl_df_train_train = pcl_df_train_train.map(
    tokenization, batched = True, batch_size = len(pcl_df_train_train)
)
pcl_df_train_dev = pcl_df_train_dev.map(
    tokenization, batched = True, batch_size = len(pcl_df_train_dev)
)

pcl_df_dev = pcl_df_dev.map(
    tokenization, batched = True, batch_size = len(pcl_df_dev)
)


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

In [None]:
# Switch each split to the 'torch' format for the model inputs and the target column.
# The target is still called `class` at this point; it is renamed in the next cell.
pcl_df_train_train.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)
pcl_df_train_dev.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)
pcl_df_dev.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)

In [None]:
# Rename the target column from `class` to `label`
# (presumably the name the Trainer expects for the target; confirm against the Trainer docs).
pcl_df_train_train = pcl_df_train_train.rename_column("class", "label")
pcl_df_train_dev = pcl_df_train_dev.rename_column("class", "label")
pcl_df_dev = pcl_df_dev.rename_column("class", "label")

### Grid search

In [None]:
# Candidate values for the grid search; the loop below tries every combination
# (2 x 2 x 2 x 2 = 16 training runs).
learning_rate_vals = [1e-5, 2e-5]
weight_decay_vals = [0.1, 0.01]
per_device_train_batch_size_vals = [16, 32]
warmup_steps_vals = [0, 200]

In [None]:
# Parallel result lists: entry i of every list describes the i-th configuration tried.
experiment_lr = []
experiment_wd = []
experiment_train_batch_size = []
experiment_warmup = []

experiment_acc = []
experiment_precision = []
experiment_recall = []
experiment_f1 = []

# Materialise the grid so tqdm knows the total number of runs.
hyperparameter_grid = list(
    itertools.product(
        learning_rate_vals, weight_decay_vals,
        per_device_train_batch_size_vals, warmup_steps_vals,
    )
)

for learning_rate, weight_decay, per_device_train_batch_size, warmup_steps in tqdm(
    hyperparameter_grid
):

    # One checkpoint directory per configuration, so runs neither overwrite
    # each other's `checkpoint-N` folders nor leave stale ones behind.
    run_output_dir = (
        f"{hp_search_folder}/lr_{learning_rate}_wd_{weight_decay}"
        f"_bs_{per_device_train_batch_size}_warmup_{warmup_steps}"
    )

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        learning_rate=learning_rate,  # config
        warmup_steps=warmup_steps,  # config
        weight_decay=weight_decay,  # config
        per_device_train_batch_size=per_device_train_batch_size,  # config
        num_train_epochs=10,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # Configurations are ranked by F1 below, so the "best" checkpoint of
        # each run must also be chosen by F1 (the default is the eval loss).
        # This matches the final training run.
        metric_for_best_model='eval_f1',
        greater_is_better=True,
        # Bound disk usage: a checkpoint is written every epoch for every run.
        save_total_limit=1,
        gradient_accumulation_steps=8,
        logging_steps=100,
        logging_dir=logging_folder,
    )

    trainer = Trainer(
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=pcl_df_train_train,
        eval_dataset=pcl_df_train_dev,
        model_init=model_init,  # fresh model for every configuration
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluates the model left in the trainer after training (the best
    # checkpoint, because `load_best_model_at_end=True`) on the train/dev split.
    metrics = trainer.evaluate()

    experiment_lr.append(learning_rate)
    experiment_wd.append(weight_decay)
    experiment_train_batch_size.append(per_device_train_batch_size)
    experiment_warmup.append(warmup_steps)
    experiment_acc.append(metrics['eval_accuracy'])
    experiment_precision.append(metrics['eval_precision'])
    experiment_recall.append(metrics['eval_recall'])
    experiment_f1.append(metrics['eval_f1'])


In [None]:
# One row per grid-search configuration: the four hyperparameters followed by
# the four evaluation metrics collected in the loop above.
grid_search_results = pd.DataFrame(
    dict(
        learning_rate=experiment_lr,
        weight_decay=experiment_wd,
        per_device_train_batch_size=experiment_train_batch_size,
        warmup_steps=experiment_warmup,
        accuracy=experiment_acc,
        precision=experiment_precision,
        recall=experiment_recall,
        f1=experiment_f1,
    )
)

In [None]:
# get the best hyperparameters with highest f1 score
grid_search_results = grid_search_results.sort_values(by='f1', ascending=False)
grid_search_results.to_csv(f"{results_folder}_grid_search_results.csv", index=False)

# get the first row of the dataframe
best_hyperparameters = grid_search_results.iloc[0]

# get the best hyperparameters
best_learning_rate = best_hyperparameters['learning_rate']
best_weight_decay = best_hyperparameters['weight_decay']
best_per_device_train_batch_size = int(best_hyperparameters['per_device_train_batch_size'])
best_warmup_steps = int(best_hyperparameters['warmup_steps'])

In [None]:
best_hyperparameters

### Train with best hyperparameters on the entire train data

In [None]:
# hyperparameters
lr = best_learning_rate
weight_decay = best_weight_decay
train_batch_size = best_per_device_train_batch_size
warmup_steps = best_warmup_steps
eval_batch_size = 16
gradient_accumulation_steps = 8
logging_steps = 100

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [None]:
# Configuration of the final training run: checkpoints go to `results_folder`,
# evaluation and saving happen once per epoch, and the checkpoint with the highest
# `eval_f1` is reloaded when training ends.
training_args = TrainingArguments(
    output_dir = results_folder,
    num_train_epochs = 10,
    per_device_train_batch_size = train_batch_size,
    learning_rate = lr,
    gradient_accumulation_steps = gradient_accumulation_steps,    
    per_device_eval_batch_size= eval_batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    metric_for_best_model = 'eval_f1',
    greater_is_better = True,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_steps = logging_steps,
    fp16 = True,  # NOTE(review): the grid-search runs above do not set fp16
    logging_dir=logging_folder,
    dataloader_num_workers = 0,
    run_name = 'deberta-classification'
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Trainer for the final run. It trains on the train/train split and evaluates on the
# train/dev split, exactly like the grid search.
# NOTE(review): the section heading says "entire train data", but train/dev is not
# merged into the training set here. Confirm which is intended.
trainer = Trainer(
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=pcl_df_train_train,
        eval_dataset=pcl_df_train_dev,
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6700
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 1040
  Number of trainable parameters = 68022594
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.7849,0.681555,0.61791,0.230769,0.144796,0.568047
1,0.7436,0.681269,0.740299,0.333844,0.225207,0.64497
2,0.6505,0.611077,0.846567,0.447312,0.351351,0.615385
3,0.6022,0.599019,0.82806,0.452471,0.333333,0.704142
4,0.5821,0.577781,0.865672,0.501109,0.400709,0.668639
5,0.5684,0.57672,0.845373,0.489152,0.366864,0.733728
6,0.5579,0.551221,0.885373,0.524752,0.451064,0.627219
7,0.5483,0.571757,0.863284,0.503254,0.39726,0.686391
8,0.5529,0.537113,0.89194,0.534704,0.472727,0.615385
9,0.5449,0.553026,0.872836,0.517007,0.419118,0.674556


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/NLP/results/checkpoint-104
Configuration saved in /content/drive/MyDrive/NLP/results/checkpoint-104/config.json
Model weights saved in /content/drive/MyDrive/NLP/results/checkpoint-104/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP/results/checkpoint-104/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP/results/checkpoint-104/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertFor

TrainOutput(global_step=1040, training_loss=0.6201488990050096, metrics={'train_runtime': 357.9391, 'train_samples_per_second': 187.183, 'train_steps_per_second': 2.906, 'total_flos': 9089077250433024.0, 'train_loss': 0.6201488990050096, 'epoch': 9.99})

In [None]:
# evaluate the model on eval_dataset=pcl_df_train_dev, this should give the 
# best performance found during the training process
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16


{'eval_loss': 0.30444014072418213,
 'eval_accuracy': 0.9164179104477612,
 'eval_f1': 0.5953757225433526,
 'eval_precision': 0.5819209039548022,
 'eval_recall': 0.6094674556213018,
 'eval_runtime': 1.9852,
 'eval_samples_per_second': 843.722,
 'eval_steps_per_second': 52.89,
 'epoch': 9.99}

### Make predictions on official dev set

In [None]:
# Run the trained model on the official dev set. The result unpacks into the raw
# predictions, the labels and a metrics dict whose keys are prefixed with "dev".
dev_set_preds, dev_set_labels, dev_set_metrics = trainer.predict(
    pcl_df_dev, metric_key_prefix="dev"
)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2094
  Batch size = 16


In [None]:
dev_set_metrics

{'dev_loss': 0.5393843054771423,
 'dev_accuracy': 0.8930276981852913,
 'dev_f1': 0.5193133047210301,
 'dev_precision': 0.45318352059925093,
 'dev_recall': 0.6080402010050251,
 'dev_runtime': 2.4904,
 'dev_samples_per_second': 840.817,
 'dev_steps_per_second': 52.601}

In [None]:
# Take the argmax along the class axis (as `compute_metrics` does) to get one predicted
# label per example. Without `axis`, np.argmax flattens the array and returns a single index.
dev_set_pred_labels = np.argmax(dev_set_preds, axis=1)

### Saving trained model

In [None]:
# save the best model
trainer.save_model(f'{results_folder}/deberta')

Saving model checkpoint to /content/drive/MyDrive/NLP/results/distilbert
Configuration saved in /content/drive/MyDrive/NLP/results/distilbert/config.json
Model weights saved in /content/drive/MyDrive/NLP/results/distilbert/pytorch_model.bin


### Load trained model

In [None]:
# The saved weights belong to the architecture built by `model_init()`, whose
# classifier is a custom `torch.nn.Sequential` head. Calling `from_pretrained`
# on the saved folder would rebuild the stock single-linear classifier, fail to
# match the saved head weights and leave the classifier randomly initialised.
# Rebuild the same architecture first, then load the weights into it.
model = model_init()

# The head is created after `from_pretrained`, so it starts in training mode;
# switch the whole model to inference mode (BatchNorm/Dropout).
model.eval()

# NOTE(review): assumes `trainer.save_model` wrote `pytorch_model.bin`, as the
# save log of this notebook shows; adjust the file name if the installed
# transformers version saves a different weight file.
# Last expression: displays the key-matching report returned by `load_state_dict`.
model.load_state_dict(
    torch.load(f'{results_folder}/deberta/pytorch_model.bin', map_location='cpu')
)


loading configuration file /content/drive/MyDrive/NLP/results/distilbert/config.json
You are using a model of type distilbert to instantiate a model of type longformer. This is not supported for all configurations of models and can yield errors.
Model config LongformerConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "n_heads": 12,
  