In [2]:
# !pip install datasets
# !pip install transformers
# !pip install wandb
# !pip install accelerate>=0.21.0
# !pip install transformers[torch]

# Text classification with the Longformer


In [3]:
# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
import datasets  # For loading datasets for NLP tasks

# For working with Longformer model
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn  # PyTorch neural network module
import torch  # PyTorch library
from torch.utils.data import Dataset, DataLoader  # PyTorch data utilities
import numpy as np  # Numerical computing library
from tqdm import tqdm  # For displaying progress bars
import wandb  # For logging training runs
import os  # Operating system module

2024-03-28 04:58:47.117030: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-28 04:58:47.117176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-28 04:58:47.256454: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


1 LongformerTokenizerFast:

Explanation: A fast tokenizer specifically designed for the Longformer model.

Why Use: Efficiently tokenizes text inputs for Longformer model inference or fine-tuning.

2 LongformerForSequenceClassification:

Explanation: Longformer model fine-tuned for sequence classification tasks.

Why Use: Utilized for sequence classification tasks such as sentiment analysis or document classification.

3 Trainer:

Explanation: High-level interface for training and evaluating transformer models.

Why Use: Simplifies the training process by providing a consistent API for training and evaluation.

4 TrainingArguments:

Explanation: A class to hold all the configuration parameters for training a model.

Why Use: Allows customization of training process including batch size, number of epochs, etc., without changing the training code.

5 LongformerConfig:

Explanation: Configuration class for Longformer model architecture.

Why Use: Provides flexibility to configure Longformer model architecture according to specific requirements such as number of layers, attention mechanism, etc.

In [4]:
config = LongformerConfig()
config

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

# download IMDB Dataset

In [5]:
# Load IMDb dataset using Hugging Face datasets library, specifying parameters:
# - 'imdb': IMDb dataset
# - split=['train', 'test']: Specify the splits to load (training and test)
# - cache_dir= Specify cache directory for dataset caching (give any name as you wish)
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'],
                                              cache_dir='data')


Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to data/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to data/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [7]:
train_data.features['label']

ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)

In [8]:
# Display a few rows from the training dataset
train_data[:5]  # Get the first 5 rows

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

# Dowloand Pretrained Longformer and its tokenizer

In [9]:
# Load Longformer model for sequence classification, specifying parameters:
# - 'allenai/longformer-base-4096': Pre-trained Longformer model
# - gradient_checkpointing=False: This parameter disables gradient checkpointing. Gradient checkpointing is a technique used to reduce 
# memory consumption during training by trading compute for memory. When set to False, gradient checkpointing is turned off, 
# which can lead to higher memory usage but potentially faster computation.
# - attention_window=512:  This parameter defines the attention window size for the Longformer model. Attention window determines 
# the range of tokens that each token attends to during self-attention computation.

model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                           gradient_checkpointing=False,
                                                           attention_window=512)

# Load Longformer tokenizer, specifying parameters:
# - 'allenai/longformer-base-4096': Pre-trained Longformer tokenizer
# - max_length=1024: Defines the maximum sequence length for tokenization
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length=1024)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
# The model gives a warning anout needing to train the model first before usage in downstream tasks.

model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "transformers_version": "4.38.2",
  "type_vocab_size": 1,
  "vocab_size": 50265
}

# Tokenization

In [11]:
# Define a function to tokenize the text data and prepare inputs for the model
def tokenization(batched_text):
    # Tokenize batched text using the tokenizer
    # - 'text': Key for the input text in the batched_text dictionary
    # - padding='max_length': Pad sequences to the maximum length in the batch
    # - truncation=True: Truncate sequences longer than max_length
    # - max_length=1024: Define the maximum sequence length for tokenization
    return tokenizer(batched_text['text'], padding='max_length', truncation=True, max_length=1024)

# Tokenize and prepare inputs for training data using the tokenization function
# Batch=True:This parameter indicates that the tokenization function should be applied in a batched manner, which can be more memory efficient
train_data = train_data.map(tokenization, batched=True, batch_size=len(train_data))
# Tokenize and prepare inputs for test data using the tokenization function
test_data = test_data.map(tokenization, batched=True, batch_size=len(test_data))


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training


In the paper the authors trained for 15 epochs, with batch size of 32, learning rate of 3e-5 and linear warmup steps equal to 0.1 of the total training steps. For this quick tutorial I went for the default learning rate of the trainer class which is 5e-5, 5 epochs for training, batch size of 8 with gradient accumulation of 8 steps for an effective batch size of 64 and 200 warm up steps (roughly 10 percent of total training steps). The overall training time for this implementation was 2 hours and 54 minutes.

In [12]:
# Define training arguments for the Trainer
training_args = TrainingArguments(
    # Directory to save the trained model and other outputs
    output_dir='results',
    # Number of training epochs. One epoch is a complete pass through the entire training dataset.
    num_train_epochs=1,
    # Batch size per GPU for training. It specifies how many training examples are processed simultaneously on each GPU during training
    per_device_train_batch_size=8,
    # Number of gradient accumulation steps. It controls the accumulation of gradients over multiple batches before performing a weight update, which helps in training with larger effective batch sizes.
    gradient_accumulation_steps=8,
    # Batch size per GPU for evaluation
    per_device_eval_batch_size=16,
    # Evaluation strategy, "epoch" means evaluate at the end of each epoch
    evaluation_strategy="epoch",
    # Whether to disable the tqdm progress bar during training
    disable_tqdm=False,
    # Whether to load the best model at the end of training
    load_best_model_at_end=True,
    # Number of warmup steps for the learning rate scheduler. Warmup steps gradually increase the learning rate from a very small 
    # value to its target value over the initial training steps.
    warmup_steps=200,
    # Weight decay coefficient for the optimizer. It is a regularization term added to the loss function to prevent 
    # overfitting by penalizing large weights.
    weight_decay=0.01,
    # Logging steps, how often to log training metrics
    logging_steps=4,
    # Whether to use mixed precision training (FP16). Mixed precision training uses half-precision floating-point
    # format to speed up training and reduce memory usage.
    fp16=True,
    # Directory to save training logs
    logging_dir='logs',
    # Number of dataloader workers for data loading
    dataloader_num_workers=0,
    # Save strategy, "epoch" means save the model at the end of each epoch
    save_strategy="epoch",
    # Name of the training run for logging purposes
    run_name='longformer-classification-updated-rtx3090_paper_replication_2_warm'
)


In [14]:
# Create Trainer object
trainer = Trainer(
    model=model,                    # The instantiated model to be trained
    args=training_args,             # Training arguments created earlier
    train_dataset=train_data,       # Training dataset
    eval_dataset=test_data          # Evaluation dataset
)

# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss
0,0.1362,0.125362


TrainOutput(global_step=390, training_loss=0.2649148473372826, metrics={'train_runtime': 7558.0536, 'train_samples_per_second': 3.308, 'train_steps_per_second': 0.052, 'total_flos': 1.639501622673408e+16, 'train_loss': 0.2649148473372826, 'epoch': 1.0})

In [20]:
trainer.evaluate() 

{'eval_loss': 0.1256919503211975,
 'eval_runtime': 2277.9361,
 'eval_samples_per_second': 10.975,
 'eval_steps_per_second': 0.686,
 'epoch': 1.0}

In [21]:
# save the best model
trainer.save_model('results/paper_replication_lr_warmup200')