In [1]:
# https://jesusleal.io/2020/11/24/Longformer-with-IMDB/
# https://github.com/jlealtru/website_tutorials/blob/main/notebooks/Longformer%20with%20IMDB.ipynb

In [2]:
#import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
# import wandb
import os

In [3]:
# Default Longformer configuration object; no later cell visible in this notebook reads `config`.
config = LongformerConfig()

In [4]:
# Load SNLI and keep only the examples whose label is not -1
# (presumably the marker for pairs without a gold label -- confirm against the dataset card).
snli_dataset = (
    datasets.load_dataset("stanfordnlp/snli")
    .filter(lambda example: example['label'] != -1)
)

In [5]:
# Sequence-classification model (3 labels) restored from a local training checkpoint,
# plus the tokenizer of the base Longformer model.

ckpt_path = "../ignored_dir/training_outputs/longformer_snli/run_2/results/checkpoint-1500"

# To start from the pretrained base weights instead, pass 'allenai/longformer-base-4096'
# as the first argument of the model's from_pretrained call below.
model = LongformerForSequenceClassification.from_pretrained(
    ckpt_path,
    num_labels=3,
    gradient_checkpointing=False,
    attention_window=512,
)
tokenizer = LongformerTokenizerFast.from_pretrained(
    'allenai/longformer-base-4096',
    max_length=1024,
)

In [6]:
# tokenize premise/hypothesis pairs and return the encoded inputs for the model
def tokenization(batched_text):
    """Encode a batch of premise/hypothesis pairs with the module-level ``tokenizer``.

    ``batched_text`` is indexed by the keys 'premise' and 'hypothesis'. The
    tokenizer is asked to truncate and pad every pair to ``max_length=1024``;
    its output is returned unchanged.
    """
    premises = batched_text['premise']
    hypotheses = batched_text['hypothesis']
    return tokenizer(
        premises,
        hypotheses,
        padding='max_length',
        truncation=True,
        max_length=1024,
    )

# Tokenize every split. `batch_size` is left at the `datasets` default on purpose:
# `len(snli_dataset)` is the number of splits in the DatasetDict, not the number
# of examples, so using it as the batch size makes `map` tokenize only a handful
# of rows per call. Padding is fixed to max_length inside `tokenization`, so the
# encoded output does not depend on the batch size.
snli_dataset_mapped = snli_dataset.map(tokenization, batched=True)
train_data, eval_data = snli_dataset_mapped['train'], snli_dataset_mapped['validation']

In [7]:
# Set up a numbered run directory with `results/` and `logs/` subfolders.

exp_dir = "../ignored_dir/training_outputs/longformer_snli"
# makedirs also creates missing parent folders and is a no-op when exp_dir already exists.
os.makedirs(exp_dir, exist_ok=True)

# Runs are numbered by the count of entries already present in exp_dir.
run_fold = f"run_{len(os.listdir(exp_dir)) + 1}"
run_path = os.path.join(exp_dir, run_fold)

# Create both folders directly. Guarding with os.path.exists(os.path.exists(path))
# would test a bool (interpreted as a file descriptor) rather than the path, so the
# folders would never be created.
results_path = os.path.join(run_path, "results")
os.makedirs(results_path, exist_ok=True)
logs_path = os.path.join(run_path, "logs")
os.makedirs(logs_path, exist_ok=True)

# Hyper-parameters handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    run_name='longformer-snli',
    # where checkpoints and logs are written
    output_dir=results_path,
    logging_dir=logs_path,
    logging_steps=4,
    disable_tqdm=False,
    # optimisation schedule
    num_train_epochs=5,
    warmup_steps=200,
    weight_decay=0.01,
    # batching: 8 samples per device, gradients accumulated over 8 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=16,
    dataloader_num_workers=0,
    # evaluate once per epoch; train with fp16 enabled
    evaluation_strategy="epoch",
    fp16=True,
)



In [8]:
# Return torch tensors and expose only these three columns on both splits.
for split in (train_data, eval_data):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [9]:
# Test split, returned as torch tensors with the same three columns as train/validation.
test_data = snli_dataset_mapped['test']
test_data.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

In [10]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics below calls it.

import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        Whatever ``accuracy.compute`` returns for these predictions and references.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

In [11]:
# Build the Trainer, then show whether a CUDA device is available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


'cuda'

In [12]:
# trainer.evaluate(eval_dataset=test_data)

In [13]:
# Run training with the arguments configured in training_args.
# NOTE(review): the model weights were loaded from ckpt_path, but no resume_from_checkpoint
# is passed here, so presumably optimizer/scheduler state is not restored -- confirm this is intended.
trainer.train()

Initializing global attention on CLS token...
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Initializing global attention on CLS token...


In [None]:
# Save the trainer's current model.
# NOTE(review): training_args does not set load_best_model_at_end, so this is presumably
# the model as it stands after training rather than the best checkpoint -- confirm.
trainer.save_model('tmpresults/paper_replication_lr_warmup200')

In [None]:
# Evaluate with no explicit dataset; the Trainer was constructed with eval_dataset=eval_data
# (the SNLI validation split), which is presumably what gets scored here.
trainer.evaluate()