## Setup

In [1]:
!pip install transformers torch datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
Collectin

In [2]:
from pathlib import Path

WORKING_ENV = 'COLAB' # Can be LABS, COLAB or PAPERSPACE

assert WORKING_ENV in ['COLAB', 'PAPERSPACE']

if WORKING_ENV == 'COLAB':
    from google.colab import drive
    %load_ext google.colab.data_table
    content_path = '/content/drive/MyDrive/'
    drive.mount('/content/drive/', force_remount=True) # Outputs will be saved in your google drive

else: # Using Paperspace
    # Paperspace does not properly render animated progress bars
    # Strongly recommend using the JupyterLab UI instead of theirs
    !pip install ipywidgets 
    content_path = '/notebooks'

content_path = Path(content_path)

Mounted at /content/drive/


In [3]:
data_folder = f"{content_path}/NLP/data"
results_folder = f"{content_path}/NLP/results"
logging_folder = f"{content_path}/NLP/logs"

# data_folder = f"{content_path}/data"
# results_folder = f"{content_path}/results"
# logging_folder = f"{content_path}/logs"

In [4]:
import torch
from transformers import AutoTokenizer, LongformerForSequenceClassification

In [5]:
import pandas as pd
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification
import torch.nn as nn
import torch
import datasets
# from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load data

In [55]:
pcl_df_train_train = pd.read_csv(f"{data_folder}/pcl_df_train_train.csv")
pcl_df_train_dev = pd.read_csv(f"{data_folder}/pcl_df_train_dev.csv")
pcl_df_dev = pd.read_csv(f"{data_folder}/pcl_df_dev.csv")
pcl_df_dev = pcl_df_dev.dropna()

In [52]:
chatgpt = pd.read_csv(f"{data_folder}/chatgpt_reword_random_200samples.csv")

In [60]:
chatgpt["class"] = 1

In [57]:
pcl_df_train_train = pcl_df_train_train[["text", "class"]].copy()

In [63]:
pcl_df_train_train.shape

(6700, 2)

In [64]:
chatgpt.shape

(198, 2)

In [65]:
pcl_df_train_train = pd.concat([pcl_df_train_train, chatgpt], ignore_index=True)

In [66]:
pcl_df_train_train.shape

(6898, 2)

In [31]:
# pcl_df_train_train = pd.read_csv(f"{data_folder}/pcl_df_train_train_aug.csv")
# pcl_df_train_dev = pd.read_csv(f"{data_folder}/pcl_df_train_dev_processed.csv")
# pcl_df_dev = pd.read_csv(f"{data_folder}/pcl_df_dev_processed.csv")

In [67]:
pcl_df_train_train = pcl_df_train_train[['text', 'class']]
pcl_df_train_dev = pcl_df_train_dev[['text', 'class']]
pcl_df_dev = pcl_df_dev[['text', 'class']]

In [68]:
pcl_df_train_train = datasets.Dataset.from_pandas(pcl_df_train_train)
pcl_df_train_dev = datasets.Dataset.from_pandas(pcl_df_train_dev)
pcl_df_dev = datasets.Dataset.from_pandas(pcl_df_dev)

In [69]:
type(pcl_df_train_train)

datasets.arrow_dataset.Dataset

### DistilBERT

In [70]:
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [71]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [72]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [73]:
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a

### Functions for Tokenization and Metrics Calculation

In [74]:
tokenizer = distilbert_tokenizer
# define a function that will tokenize the model, and will return the relevant 
# inputs for the model
def tokenization(batched_text):
    return tokenizer(
        batched_text['text'], 
        padding = 'max_length', 
        truncation=True, 
        max_length = 512
    )

In [75]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Tokenization

In [76]:
pcl_df_train_train = pcl_df_train_train.map(
    tokenization, batched = True, batch_size = len(pcl_df_train_train)
)
pcl_df_train_dev = pcl_df_train_dev.map(
    tokenization, batched = True, batch_size = len(pcl_df_train_dev)
)

pcl_df_dev = pcl_df_dev.map(
    tokenization, batched = True, batch_size = len(pcl_df_dev)
)

Map:   0%|          | 0/6898 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

In [77]:
pcl_df_train_train.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)
pcl_df_train_dev.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)
pcl_df_dev.set_format(
    'torch', columns=['input_ids', 'attention_mask', 'class']
)

In [78]:
pcl_df_train_train = pcl_df_train_train.rename_column("class", "label")
pcl_df_train_dev = pcl_df_train_dev.rename_column("class", "label")
pcl_df_dev = pcl_df_dev.rename_column("class", "label")

### Training

In [79]:
# hyperparameters
train_batch_size = 8
eval_batch_size = 16
lr = 2e-5
num_epochs = 10
gradient_accumulation_steps = 8
warmup_steps = 200
weight_decay = 0.01
logging_steps = 4

In [80]:
training_args = TrainingArguments(
    output_dir = results_folder,
    num_train_epochs = num_epochs,
    per_device_train_batch_size = train_batch_size,
    learning_rate = lr,
    gradient_accumulation_steps = gradient_accumulation_steps,    
    per_device_eval_batch_size= eval_batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    metric_for_best_model = 'eval_f1',
    greater_is_better = True,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_steps = logging_steps,
    fp16 = True,
    logging_dir=logging_folder,
    dataloader_num_workers = 0,
    run_name = 'distilbert-classification'
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [81]:
trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=pcl_df_train_train,
    eval_dataset=pcl_df_train_dev
)


Using cuda_amp half precision backend


In [82]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6898
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 1070
  Number of trainable parameters = 66955010


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2773,0.301811,0.894328,0.0,0.0,0.0
1,0.1669,0.268484,0.902687,0.204878,0.75,0.118644
2,0.1566,0.225041,0.902687,0.507553,0.545455,0.474576
3,0.1286,0.273485,0.916418,0.448819,0.74026,0.322034
4,0.0643,0.28436,0.899104,0.512968,0.523529,0.502825
5,0.0595,0.377547,0.88597,0.501305,0.466019,0.542373
6,0.0038,0.426555,0.905075,0.516717,0.559211,0.480226
7,0.0024,0.466945,0.899701,0.505882,0.527607,0.485876
8,0.0087,0.482433,0.909254,0.512821,0.592593,0.451977
9,0.0012,0.486653,0.911045,0.517799,0.606061,0.451977


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/MyDrive/NLP/results/checkpoint-107
Configuration saved in /content/drive/MyDrive/NLP/results/checkpoint-107/config.json
Model weights saved in /content/drive/MyDrive/NLP/results/checkpoint-107/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16
Saving mod

TrainOutput(global_step=1070, training_loss=0.12616111923914367, metrics={'train_runtime': 347.1995, 'train_samples_per_second': 198.675, 'train_steps_per_second': 3.082, 'total_flos': 9130977789358080.0, 'train_loss': 0.12616111923914367, 'epoch': 9.99})

### Saving trained model

In [83]:
# save the best model
trainer.save_model(f'{results_folder}/distilbert')

Saving model checkpoint to /content/drive/MyDrive/NLP/results/distilbert
Configuration saved in /content/drive/MyDrive/NLP/results/distilbert/config.json
Model weights saved in /content/drive/MyDrive/NLP/results/distilbert/pytorch_model.bin


### Load trained model

In [84]:
trained_model = LongformerForSequenceClassification.from_pretrained(
    f'{results_folder}/distilbert',
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id
)

loading configuration file /content/drive/MyDrive/NLP/results/distilbert/config.json
You are using a model of type distilbert to instantiate a model of type longformer. This is not supported for all configurations of models and can yield errors.
Model config LongformerConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "n_heads": 12,
  

In [85]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16


{'eval_loss': 0.4866526424884796,
 'eval_accuracy': 0.9110447761194029,
 'eval_f1': 0.5177993527508091,
 'eval_precision': 0.6060606060606061,
 'eval_recall': 0.4519774011299435,
 'eval_runtime': 1.9692,
 'eval_samples_per_second': 850.619,
 'eval_steps_per_second': 53.322,
 'epoch': 9.99}

### Make predictions on official dev set

In [86]:
dev_set_preds, labels, metrics = trainer.predict(
    pcl_df_dev, metric_key_prefix="dev"
)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2093
  Batch size = 16


In [87]:
metrics

{'dev_loss': 0.4379754960536957,
 'dev_accuracy': 0.9168657429526995,
 'dev_f1': 0.5139664804469273,
 'dev_precision': 0.5786163522012578,
 'dev_recall': 0.4623115577889447,
 'dev_runtime': 2.658,
 'dev_samples_per_second': 787.421,
 'dev_steps_per_second': 49.284}

In [None]:
dev_set_preds = np.argmax(dev_set_preds)