## Setup

In [1]:
!pip install transformers torch datasets

[0m

In [2]:
from pathlib import Path


# Selects the platform-specific setup below; the assert accepts only COLAB or PAPERSPACE.
WORKING_ENV = 'PAPERSPACE' # Can be COLAB or PAPERSPACE
assert WORKING_ENV in ['COLAB', 'PAPERSPACE']

if WORKING_ENV == 'COLAB':
    from google.colab import drive
    %load_ext google.colab.data_table
    content_path = '/content/drive/MyDrive/nlp'
    drive.mount('/content/drive/') # Outputs will be saved in your google drive

else: # Using Paperspace
    # Paperspace does not properly render animated progress bars
    # Strongly recommend using the JupyterLab UI instead of theirs
    !pip install ipywidgets 
    content_path = '/notebooks'

# Wrap the chosen root directory in a Path object.
content_path = Path(content_path)

[0m

In [3]:
# Data, results and logs all live under <content_path>/NLP (kept as plain strings).
nlp_root = f"{content_path}/NLP"
data_folder = f"{nlp_root}/data"
results_folder = f"{nlp_root}/results"
logging_folder = f"{nlp_root}/logs"

In [4]:
import torch
from transformers import AutoTokenizer, LongformerForSequenceClassification

In [5]:
import pandas as pd
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
import datasets
# from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

## Load data

In [7]:
# Read the three CSV splits from the data folder, one file per split name.
split_names = ("pcl_df_train_train", "pcl_df_train_dev", "pcl_df_dev")
pcl_df_train_train, pcl_df_train_dev, pcl_df_dev = (
    pd.read_csv(f"{data_folder}/{split_name}.csv") for split_name in split_names
)

In [8]:
# Keep only the text and its class label; every other column is dropped.
keep_columns = ['text', 'class']
pcl_df_train_train = pcl_df_train_train[keep_columns]
pcl_df_train_dev = pcl_df_train_dev[keep_columns]
pcl_df_dev = pcl_df_dev[keep_columns]

In [9]:
# Convert each pandas frame into a `datasets.Dataset`.
to_dataset = datasets.Dataset.from_pandas
pcl_df_train_train = to_dataset(pcl_df_train_train)
pcl_df_train_dev = to_dataset(pcl_df_train_dev)
pcl_df_dev = to_dataset(pcl_df_dev)

In [10]:
# Display the type of the converted training split.
type(pcl_df_train_train)

datasets.arrow_dataset.Dataset

## Longformer

In [11]:
# hyperparameters for the Longformer fine-tuning run
train_batch_size = 8             # per-device batch size while training
eval_batch_size = 16             # per-device batch size while evaluating
lr = 5e-5                        # learning rate
num_epochs = 5                   # matches the recorded training run ("Num Epochs = 5")
gradient_accumulation_steps = 8  # 8 batches of 8 -> 64 examples per optimisation step (per device)
warmup_steps = 200               # learning-rate warm-up steps
weight_decay = 0.01
logging_steps = 4                # log every 4 steps

In [12]:
# Instantiate a LongformerConfig with no arguments and display it.
# NOTE(review): `config` is not passed to the model loaded below, and the values it prints
# (e.g. vocab_size, max_position_embeddings) differ from `model.config` -- presumably shown
# for reference only; confirm before relying on it.
config = LongformerConfig()

config

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [13]:
# Load the pretrained Longformer checkpoint for sequence classification together with its
# fast tokenizer. The classification head is newly initialised (see the loading message).
checkpoint = 'allenai/longformer-base-4096'
model = LongformerForSequenceClassification.from_pretrained(checkpoint,
                                                            gradient_checkpointing=False,
                                                            attention_window = 512)
# `model_max_length` is the tokenizer setting that caps the sequence length. A bare
# `max_length` keyword is not a tokenizer init option and does not limit anything here.
tokenizer = LongformerTokenizerFast.from_pretrained(checkpoint, model_max_length = 1024)

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [14]:
# Display the configuration of the loaded model.
model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [15]:
# define a function that tokenizes a batch of texts and returns the inputs for the model
def tokenization(batched_text, max_length = 1024):
    """Tokenize the 'text' entries of a batch with the notebook's `tokenizer`.

    Args:
        batched_text: mapping whose 'text' entry holds the raw strings of the batch.
        max_length: number of tokens every sequence is padded or truncated to.
            Defaults to 1024, the value that used to be hard-coded here.

    Returns:
        Whatever `tokenizer` returns for the batch.
    """
    return tokenizer(batched_text['text'], padding = 'max_length', truncation=True, max_length = max_length)

In [16]:
def _tokenize_in_one_batch(split):
    """Run `tokenization` over `split` as a single batch covering every row."""
    return split.map(tokenization, batched = True, batch_size = len(split))

pcl_df_train_train = _tokenize_in_one_batch(pcl_df_train_train)
pcl_df_train_dev = _tokenize_in_one_batch(pcl_df_train_dev)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
# Show the columns and row count of the tokenized train/dev split.
pcl_df_train_dev

Dataset({
    features: ['text', 'class', 'input_ids', 'attention_mask'],
    num_rows: 1675
})

In [18]:
# Return the token ids, attention mask and class column of both splits as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'class']
for split in (pcl_df_train_train, pcl_df_train_dev):
    split.set_format('torch', columns=tensor_columns)

In [19]:
# Rename the target column from "class" to "label" on both splits.
pcl_df_train_train, pcl_df_train_dev = (
    split.rename_column("class", "label")
    for split in (pcl_df_train_train, pcl_df_train_dev)
)

In [20]:
# define evaluation metrics
def compute_metrics(pred):
    """Compute accuracy and binary precision / recall / F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (the gold labels) and `predictions`
            (class scores whose last axis indexes the classes, or a tuple whose
            first entry holds those scores).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # If the predictions arrive as a tuple of outputs, assume the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 yields the same 0.0 score as the default when a metric is undefined,
    # without emitting a warning on every such evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = results_folder,
    # Epoch count and learning rate come from the hyperparameter cell, so the values
    # declared there are the ones actually used for training.
    num_train_epochs = num_epochs,
    learning_rate = lr,
    per_device_train_batch_size = train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    per_device_eval_batch_size= eval_batch_size,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the two to match.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    # Choose the "best" checkpoint by the F1 returned from compute_metrics instead of the
    # default eval loss, since F1 is the metric this notebook reports.
    metric_for_best_model = "f1",
    greater_is_better = True,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_steps = logging_steps,
    fp16 = True,
    logging_dir=logging_folder,
    dataloader_num_workers = 0,
    run_name = 'longformer-classification-updated-rtx3090_paper_replication_2_warm'
)

In [22]:
# Build the Trainer from the model, its training arguments, the two tokenized splits
# and the metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=pcl_df_train_train,
    eval_dataset=pcl_df_train_dev,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

Using cuda_amp half precision backend


In [23]:
# Fine-tune the model; evaluation and checkpointing follow the per-epoch strategies
# set in `training_args`.
trainer.train()

The following columns in the training set don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: text. If text are not expected by `LongformerForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6700
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 520
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

TrainOutput(global_step=520, training_loss=0.17459068136432995, metrics={'train_runtime': 4024.9794, 'train_samples_per_second': 8.323, 'train_steps_per_second': 0.129, 'total_flos': 2.197562751929549e+16, 'train_loss': 0.17459068136432995, 'epoch': 4.99})

In [None]:
# Save the trainer's model under the results folder. With load_best_model_at_end=True
# in the training arguments, this is the best checkpoint once training has finished.
best_model_dir = f'{results_folder}/longformer'
trainer.save_model(best_model_dir)

In [24]:
# Evaluate on the eval_dataset handed to the Trainer (the train/dev split).
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: text. If text are not expected by `LongformerForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1675
  Batch size = 16
Initializing global attention on CLS token...


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

{'eval_loss': 0.23045866191387177,
 'eval_accuracy': 0.8967164179104478,
 'eval_f1': 0.04419889502762431,
 'eval_precision': 1.0,
 'eval_recall': 0.022598870056497175,
 'eval_runtime': 42.5018,
 'eval_samples_per_second': 39.41,
 'eval_steps_per_second': 2.47,
 'epoch': 4.99}

### Make predictions on official dev set

In [None]:
# The official dev split has only been loaded and reduced to its 'text' / 'class' columns.
# Give it the same preparation as the training splits (tokenize, rename the target column,
# torch format) so the model receives input_ids / attention_mask and the metrics get labels.
# A separate name keeps this cell safe to re-run.
pcl_df_dev_prepared = pcl_df_dev
if 'input_ids' not in pcl_df_dev_prepared.column_names:
    pcl_df_dev_prepared = pcl_df_dev_prepared.map(
        tokenization, batched = True, batch_size = len(pcl_df_dev_prepared)
    )
if 'class' in pcl_df_dev_prepared.column_names:
    pcl_df_dev_prepared = pcl_df_dev_prepared.rename_column("class", "label")
pcl_df_dev_prepared.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

dev_set_preds, labels, metrics = trainer.predict(pcl_df_dev_prepared, metric_key_prefix="dev")

In [None]:
# Turn the per-class scores into predicted class ids (index of the largest score per row).
dev_set_preds = np.asarray(dev_set_preds).argmax(axis=1)