In [None]:
# Install necessary libraries
!pip install transformers

In [None]:
# Importing necessary libraries
import os
import random
import time

import numpy as np
import pandas as pd
import spacy
import torch
from google.colab import drive
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm
from transformers import (
    AdamW, 
    LongformerForSequenceClassification, 
    LongformerTokenizer, 
    get_linear_schedule_with_warmup
)

# Load the spaCy model, leaving out the pipeline components listed in `exclude`
nlp = spacy.load("en_core_web_sm", exclude=[
    "ner", "tagger", "parser", "lemmatizer", "textcat", "attribute_ruler"
])

# Mount Google Drive
drive.mount('/content/drive')

# Directories
DATA_DIR = '/content/drive/MyDrive/MSMARCO/'  # Data directory
MAIN_DIR = '/content/drive/MyDrive/'  # Main directory

# Reproducibility: seed every random number generator the notebook draws from
# (Python's `random`, NumPy and PyTorch) so a fresh Restart & Run All repeats
# the same random choices.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# GPU Configuration
print('We will use the GPU:', torch.cuda.get_device_name(0))
DEVICE = torch.device("cuda")  # To run the model and process the tensors on GPU

In [None]:
# Data preprocessing block. The MSMARCO dataset only provides positive labels (qrels),
# so this block assigns each query a random document from the collection to be used as a negative example.
# It also adds the query text and the document text, so the final file can be used without referencing the huge collection file.
# The resulting file has these columns: ['qid', '0', 'docid', 'label', 'query_text', 'doc_text']


# Load datasets
# All files are read from DATA_DIR, the data directory constant defined in the configuration cell.

# Training queries (tab-separated: qid, query_text), plus a copy indexed by qid for text lookups
train_queries = pd.read_csv(os.path.join(DATA_DIR, 'queries.doctrain.tsv'), sep='\t', header=None, names=['qid', 'query_text'])
train_queries_index = train_queries.set_index('qid')

# Training relevance judgements (space-separated: qid, '0', docid, label)
train_relations = pd.read_csv(os.path.join(DATA_DIR, 'msmarco-doctrain-qrels.tsv'), sep=' ', header=None, names=['qid', '0', 'docid', 'label'])

# Validation queries, plus a copy indexed by qid
val_queries = pd.read_csv(os.path.join(DATA_DIR, 'queries.docdev.tsv'), sep='\t', header=None, names=['qid', 'query_text'])
val_queries_index = val_queries.set_index('qid')

# Validation relevance judgements
val_relations = pd.read_csv(os.path.join(DATA_DIR, 'msmarco-docdev-qrels.tsv'), sep=' ', header=None, names=['qid', '0', 'docid', 'label'])

# Document lookup table; kept as a plain list so random document ids can be drawn from it
lookup = pd.read_csv(os.path.join(DATA_DIR, 'msmarco-docs-lookup.tsv'), sep='\t', header=None, names=['docid', 'trec_offset', 'tsv_offset'], usecols=['docid', 'trec_offset', 'tsv_offset'])
lookup_list = lookup.values.tolist()

# Full document collection, indexed by docid so document text can be fetched by id
collection = pd.read_csv(os.path.join(DATA_DIR, 'msmarco-docs.tsv'), sep='\t', header=None, names=['docid', 'url', 'title', 'doc_text'], index_col='docid')


def negative_samples(queries, relations, queries_index):
    """Build the labelled (query, document) rows: random negatives plus the given relations.

    For every query, one document id is drawn at random from the module-level
    ``lookup_list`` and emitted as a negative row (label 0). A document that is
    already listed in ``relations`` for that query is never used as its
    negative. Every row of ``relations`` is emitted with the text of its query
    appended.

    Args:
        queries: list of ``[qid, query_text]`` rows.
        relations: list of ``[qid, '0', docid, label]`` rows. The rows are
            copied, not modified.
        queries_index: DataFrame indexed by qid with a ``query_text`` column.

    Returns:
        A list of ``[qid, '0', docid, label, query_text]`` rows: the sampled
        negatives first, followed by the rows from ``relations``.

    Raises:
        ValueError: if every document in ``lookup_list`` is already judged for
            some query, so no negative can be drawn for it.
        KeyError: if a qid in ``relations`` is missing from ``queries_index``.
    """
    # Map each qid to the docids already judged for it, so a random draw can
    # never relabel one of those documents as a negative for the same query.
    judged_docids = {}
    for row in relations:
        judged_docids.setdefault(row[0], set()).add(row[2])

    negative_samples_list = []
    for row in tqdm(queries):
        qid, query_text = row[0], row[1]
        excluded = judged_docids.get(qid, ())
        docid = random.choice(lookup_list)[0]
        if docid in excluded:
            # Rare collision with a judged document: redraw uniformly from the
            # documents that are not judged for this query.
            candidates = [entry[0] for entry in lookup_list if entry[0] not in excluded]
            if not candidates:
                raise ValueError(f'No unjudged document available as a negative for qid {qid}')
            docid = random.choice(candidates)
        negative_samples_list.append([qid, '0', docid, 0, query_text])

    # Column lookup hoisted out of the loop; new rows are built instead of
    # appending to the caller's lists.
    query_texts = queries_index['query_text']
    relation_rows = [[*row, query_texts.loc[row[0]]] for row in tqdm(relations)]

    return negative_samples_list + relation_rows


def add_doc_text(dataset):
    """Attach the document text to every row and return the result as a DataFrame.

    The text is looked up by docid (row index 2) in the module-level
    ``collection`` DataFrame, which is indexed by docid.

    Args:
        dataset: list of ``[qid, '0', docid, label, query_text]`` rows. The
            rows are copied, not modified.

    Returns:
        DataFrame with columns
        ``['qid', '0', 'docid', 'label', 'query_text', 'doc_text']``.

    Raises:
        KeyError: if a docid is not present in ``collection``.
    """
    # Select the text column once; indexing the full frame per docid would
    # build a whole row object on every iteration.
    doc_texts = collection['doc_text']
    rows_with_text = [[*row, doc_texts.loc[row[2]]] for row in tqdm(dataset)]
    return pd.DataFrame(rows_with_text, columns=['qid', '0', 'docid', 'label', 'query_text', 'doc_text'])

# Build the labelled train/validation frames (negatives + positives, with query and document text)
train_dataset = add_doc_text(negative_samples(train_queries.values.tolist(), train_relations.values.tolist(), train_queries_index))
val_dataset = add_doc_text(negative_samples(val_queries.values.tolist(), val_relations.values.tolist(), val_queries_index))

# Cache both frames as CSV in DATA_DIR so the preprocessing does not have to be repeated
train_dataset.to_csv(os.path.join(DATA_DIR, 'train_dataset.csv'))
val_dataset.to_csv(os.path.join(DATA_DIR, 'val_dataset.csv'))

In [None]:
# This block is to be used when a preprocessed dataset is already created (that has positive and negative labels)


# Read the cached CSVs from DATA_DIR; header=0 discards the stored header row
# and `names` supplies the column names instead.
train_dataset = pd.read_csv(os.path.join(DATA_DIR, 'train_dataset.csv'), header=0,
                            names=['qid', '0', 'docid', 'label', 'query_text', 'doc_text'])

val_dataset = pd.read_csv(os.path.join(DATA_DIR, 'val_dataset.csv'), header=0,
                          names=['qid', '0', 'docid', 'label', 'query_text', 'doc_text'])


In [None]:
# Tokenizer initialization
tokenizer = LongformerTokenizer.from_pretrained(
    'allenai/longformer-base-4096'
)

In [None]:
# This function handles the tokenization and returns a TensorDataset to be fed to the model

def tokenize(dataset, labels, max_length=1024):
    """Encode (query, document) rows for Longformer and bundle them with their labels.

    Each row is encoded as a sequence pair with the module-level ``tokenizer``.
    Only the document is truncated, and every sequence is padded to
    ``max_length``. Global attention is switched on for every position before
    the first separator token, i.e. the leading ``<s>`` token and the query
    tokens, measured in the tokenizer's own token positions so the mask lines
    up with ``input_ids``.

    Args:
        dataset: list of rows where index 4 holds the query text and index 5
            the document text. Both are converted with ``str()``.
        labels: array-like with one label per row.
        max_length: padded/truncated sequence length (default 1024).

    Returns:
        TensorDataset of ``(input_ids, attention_masks, labels,
        global_attention_masks)``, in that order.
    """
    input_ids = []
    attention_masks = []
    global_attention_masks = []

    for row in tqdm(dataset):
        encoded_dict = tokenizer.encode_plus(
            str(row[4]),  # Query
            str(row[5]),  # Document
            add_special_tokens=True,  # Add '<s>' and '</s>'
            max_length=max_length,  # Pad & truncate all sequences to this length.
            padding='max_length',
            return_attention_mask=True,  # Construct attention masks.
            truncation='only_second',  # Only the document may be shortened.
            return_tensors='pt',  # Return pytorch tensors.
        )
        ids = encoded_dict['input_ids']
        input_ids.append(ids)
        attention_masks.append(encoded_dict['attention_mask'])

        # The first separator token closes the query segment, so every position
        # before it is '<s>' or a query token. If no separator is found, fall
        # back to global attention on the first position only.
        sep_positions = (ids[0] == tokenizer.sep_token_id).nonzero(as_tuple=True)[0]
        query_end = int(sep_positions[0]) if sep_positions.numel() > 0 else 1

        global_attention = torch.zeros_like(ids)
        global_attention[0, :query_end] = 1
        global_attention_masks.append(global_attention)

    # Stack the per-row (1, max_length) tensors into (num_rows, max_length)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    global_attention_masks = torch.cat(global_attention_masks, dim=0)
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels, global_attention_masks)


In [None]:
# This block calls the tokenize function defined above and saves the tensor datasets to .pt files,
# so we do not have to go through this process every time we run the model



# Prepare tensor datasets
train_dataset_list = train_dataset.values.tolist()  # Converting the dataframe to list (more efficient)
val_dataset_list = val_dataset.values.tolist()  # Converting the dataframe to list (more efficient)
train_Labels = train_dataset.label.values  # Extracting labels
val_labels = val_dataset.label.values  # Extracting labels

train_dataset_tensor = tokenize(train_dataset_list, train_Labels)  # Tokenization and creation of TensorDataset
val_dataset_tensor = tokenize(val_dataset_list, val_labels)  # Tokenization and creation of TensorDataset

# Saving the TensorDatasets to disk. The file names match the ones read back
# by the loading cell ('Longformer_*_dataset_tensor.pt' in the main Drive directory).
torch.save(train_dataset_tensor, os.path.join(MAIN_DIR, 'Longformer_train_dataset_tensor.pt'))
torch.save(val_dataset_tensor, os.path.join(MAIN_DIR, 'Longformer_val_dataset_tensor.pt'))

In [None]:
# This block is to be used when a TensorDataset is already created and saved to disk

# The files hold pickled TensorDataset objects rather than plain tensors, so they
# cannot be read by the restricted weights-only loader that torch.load uses by
# default since PyTorch 2.6. weights_only=False runs the full unpickler, which can
# execute arbitrary code: only load files that you created yourself.
train_dataset_tensor = torch.load('/content/drive/MyDrive/Longformer_train_dataset_tensor.pt', weights_only=False)  # Training dataset, tokenized and converted to tensors. Type is TensorDataset
val_dataset_tensor = torch.load('/content/drive/MyDrive/Longformer_val_dataset_tensor.pt', weights_only=False)  # Validation dataset, tokenized and converted to tensors. Type is TensorDataset

In [None]:
# DataLoader parameters
BATCH_SIZE = 16

# Training set: batches of BATCH_SIZE samples drawn in random order
train_sampler = RandomSampler(train_dataset_tensor)
train_dataloader = DataLoader(train_dataset_tensor, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation set: batches of BATCH_SIZE samples read sequentially
validation_sampler = SequentialSampler(val_dataset_tensor)
validation_dataloader = DataLoader(val_dataset_tensor, sampler=validation_sampler, batch_size=BATCH_SIZE)

In [None]:
# Pretrained Longformer with a two-label sequence-classification head, moved to the GPU
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=2,
)
model.cuda()

In [None]:
# Optimizer and learning-rate scheduler setup

# One scheduler step is taken per training batch, so the schedule spans
# (number of batches) x (number of epochs) steps.
EPOCHS = 1
total_steps = EPOCHS * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=3e-6)

# Linear schedule with 2500 warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=2500,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tuning loop: one training pass followed by one validation pass per epoch
for epoch_i in range(EPOCHS):
    print(f"\n======== Epoch {epoch_i + 1} / {EPOCHS} ========")
    print('Training...')

    # ---- Training pass ----
    t0 = time.time()
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Progress report every 100 batches (the very first batch is skipped)
        if step > 0 and step % 100 == 0:
            print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.')

        # Each batch holds, in order: input ids, attention mask, labels, global attention mask
        t_input_ids, t_input_mask, t_labels, t_global = [b.to(DEVICE) for b in batch]

        # Clear gradients left over from the previous step before the forward pass
        model.zero_grad()
        result = model(
            input_ids=t_input_ids,
            attention_mask=t_input_mask,
            global_attention_mask=t_global,
            labels=t_labels,
            return_dict=True,
        )
        loss = result.loss
        total_train_loss += loss.item()

        # Backward pass, gradient-norm clipping at 1.0, then optimizer and scheduler updates
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\n  Average training loss: {avg_train_loss:.2f}")

    # ---- Validation pass ----
    print("\nRunning Validation...")
    t0 = time.time()
    model.eval()
    total_eval_loss = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch in validation_dataloader:
            v_input_ids, v_input_mask, v_labels, v_global = [b.to(DEVICE) for b in batch]
            result = model(
                input_ids=v_input_ids,
                attention_mask=v_input_mask,
                global_attention_mask=v_global,
                labels=v_labels,
                return_dict=True,
            )
            loss = result.loss
            total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f"  Validation Loss: {avg_val_loss:.2f}")

print("\nTraining complete!")

In [None]:
# Write the fine-tuned model to the checkpoint directory on Google Drive
output_dir = '/content/drive/MyDrive/Longformer_checkpoint/'
model.save_pretrained(save_directory=output_dir)