## Setup

In [1]:
!pip install transformers torch datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting dil

In [19]:
from pathlib import Path

WORKING_ENV = 'COLAB' # Can be LABS, COLAB or PAPERSPACE

assert WORKING_ENV in ['COLAB', 'PAPERSPACE']

if WORKING_ENV == 'COLAB':
    from google.colab import drive
    %load_ext google.colab.data_table
    content_path = '/content/drive/MyDrive/NLP'
    drive.mount('/content/drive/', force_remount=True) # Outputs will be saved in your google drive

else: # Using Paperspace
    # Paperspace does not properly render animated progress bars
    # Strongly recommend using the JupyterLab UI instead of theirs
    !pip install ipywidgets 
    content_path = '/notebooks'

content_path = Path(content_path)

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table
Mounted at /content/drive/


In [20]:
# data_folder = f"{content_path}/NLP/data"
# results_folder = f"{content_path}/NLP/results"
# logging_folder = f"{content_path}/NLP/logs"

data_folder = f"{content_path}/data"
results_folder = f"{content_path}/results"
logging_folder = f"{content_path}/logs"

In [21]:
import torch
from transformers import AutoTokenizer, LongformerForSequenceClassification

In [22]:
import pandas as pd
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
import datasets
# from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load data

In [24]:
pcl_df_train_train = pd.read_csv(f"{data_folder}/pcl_df_train_train.csv")
pcl_df_train_dev = pd.read_csv(f"{data_folder}/pcl_df_train_dev.csv")

In [58]:
pcl_df_dev = pd.read_csv(f"{data_folder}/pcl_df_dev.csv")
pcl_df_dev = pcl_df_dev.dropna()

In [25]:
pcl_df_train_train = pcl_df_train_train[['text', 'class']]
pcl_df_train_dev = pcl_df_train_dev[['text', 'class']]

In [59]:
pcl_df_dev = pcl_df_dev[['text', 'class']]

In [26]:
pcl_df_train_train = datasets.Dataset.from_pandas(pcl_df_train_train)
pcl_df_train_dev = datasets.Dataset.from_pandas(pcl_df_train_dev)

In [60]:
pcl_df_dev = datasets.Dataset.from_pandas(pcl_df_dev)

In [27]:
type(pcl_df_train_train)

datasets.arrow_dataset.Dataset

## Longformer

In [28]:
# hyperparameters
train_batch_size = 8
eval_batch_size = 16
lr = 5e-5
num_epochs = 3
gradient_accumulation_steps = 8
warmup_steps = 200
weight_decay = 0.01
logging_steps = 4

In [29]:
config = LongformerConfig()

config

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [30]:
# load model and tokenizer and define length of the text sequence
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                           gradient_checkpointing=False,
                                                           attention_window = 512)
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length = 1024)

Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weig

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [31]:
model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [32]:
# define a function that will tokenize the model, and will return the relevant inputs for the model
def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = 'max_length', truncation=True, max_length = 1024)

In [33]:
pcl_df_train_train = pcl_df_train_train.map(tokenization, batched = True, batch_size = len(pcl_df_train_train))
pcl_df_train_dev = pcl_df_train_dev.map(tokenization, batched = True, batch_size = len(pcl_df_train_dev))

Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

In [35]:
pcl_df_train_train.set_format('torch', columns=['input_ids', 'attention_mask', 'class'])
pcl_df_train_dev.set_format('torch', columns=['input_ids', 'attention_mask', 'class'])

In [36]:
pcl_df_train_train = pcl_df_train_train.rename_column("class", "label")
pcl_df_train_dev = pcl_df_train_dev.rename_column("class", "label")

In [37]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [38]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = results_folder,
    num_train_epochs = 5,
    per_device_train_batch_size = train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,    
    per_device_eval_batch_size= eval_batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_steps = logging_steps,
    fp16 = True,
    logging_dir=logging_folder,
    dataloader_num_workers = 0,
    run_name = 'longformer-classification-updated-rtx3090_paper_replication_2_warm'
)

In [39]:
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=pcl_df_train_train,
    eval_dataset=pcl_df_train_dev
)

Using cuda_amp half precision backend


In [65]:
pcl_df_train_dev

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1675
})

In [40]:
# train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: text. If text are not expected by `LongformerForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6700
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 520
  Number of trainable parameters = 148660994
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2331,0.229561,0.898507,0.086022,0.888889,0.045198
1,0.2087,0.235277,0.888358,0.555819,0.479508,0.661017
2,0.1807,0.262809,0.906269,0.587927,0.54902,0.632768
3,0.0478,0.302933,0.920597,0.52669,0.711538,0.418079
4,0.0445,0.437221,0.917015,0.585075,0.620253,0.553672


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

TrainOutput(global_step=520, training_loss=0.1686536344914482, metrics={'train_runtime': 1806.7347, 'train_samples_per_second': 18.542, 'train_steps_per_second': 0.288, 'total_flos': 2.197562751929549e+16, 'train_loss': 0.1686536344914482, 'epoch': 4.99})

In [41]:
# save the best model
trainer.save_model(f'{results_folder}/longformer')

Saving model checkpoint to /content/drive/MyDrive/NLP/results/longformer
Configuration saved in /content/drive/MyDrive/NLP/results/longformer/config.json
Model weights saved in /content/drive/MyDrive/NLP/results/longformer/pytorch_model.bin


### Load trained model

In [66]:
trained_model = LongformerForSequenceClassification.from_pretrained(f'{results_folder}/longformer',
                                                           gradient_checkpointing=False,
                                                           attention_window = 512)

loading configuration file /content/drive/MyDrive/NLP/results/longformer/config.json
Model config LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "architectures": [
    "LongformerForSequenceClassification"
  ],
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "sep_token_id": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 

In [None]:
from transformers import pipeline
clf = pipeline("text-classification", trained_model, tokenizer)

In [67]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: text. If text are not expected by `LongformerForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16
Initializing global attention on CLS token...


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

{'eval_loss': 0.22956061363220215,
 'eval_accuracy': 0.8985074626865671,
 'eval_f1': 0.08602150537634408,
 'eval_precision': 0.8888888888888888,
 'eval_recall': 0.04519774011299435,
 'eval_runtime': 18.4752,
 'eval_samples_per_second': 90.662,
 'eval_steps_per_second': 5.683,
 'epoch': 4.99}

### Make predictions on official dev set

In [61]:
pcl_df_dev = pcl_df_dev.map(tokenization, batched = True, batch_size = len(pcl_df_dev))
pcl_df_dev.set_format('torch', columns=['input_ids', 'attention_mask', 'class'])
pcl_df_dev = pcl_df_dev.rename_column("class", "label")

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

In [62]:
dev_set_preds, labels, metrics = trainer.predict(pcl_df_dev, metric_key_prefix="dev")

The following columns in the test set don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `LongformerForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2093
  Batch size = 16
Initializing global attention on CLS token...


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

In [63]:
dev_set_preds

array([[ 0.838  , -0.45   ],
       [ 1.072  , -0.695  ],
       [ 0.965  , -0.5435 ],
       ...,
       [ 1.473  , -1.23   ],
       [ 1.997  , -1.62   ],
       [ 0.08295,  0.1744 ]], dtype=float16)

In [64]:
metrics

{'dev_loss': 0.22298593819141388,
 'dev_accuracy': 0.9087434304825609,
 'dev_f1': 0.09478672985781991,
 'dev_precision': 0.8333333333333334,
 'dev_recall': 0.05025125628140704,
 'dev_runtime': 23.0962,
 'dev_samples_per_second': 90.621,
 'dev_steps_per_second': 5.672}

In [48]:
dev_set_preds = np.argmax(dev_set_preds)

In [51]:
dev_set_preds

0

In [46]:
pcl_df_dev

Dataset({
    features: ['text', 'class'],
    num_rows: 2094
})