In [None]:
!pip install transformers

In [None]:
import csv
import datetime
import os
import random
import time

import numpy as np
import pandas as pd
import spacy
import torch
from google.colab import drive
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# spaCy English pipeline; the listed components are excluded at load time.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["ner", "tagger", "parser", "lemmatizer", "textcat", "attribute_ruler"],
)

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Project locations on the mounted drive.
dir = '/content/drive/MyDrive/'  # Main directory (note: this name shadows the built-in `dir`)
data_dir = '/content/drive/MyDrive/MSMARCO/'  # Data directory

# Report the GPU at index 0, then create the device handle on which the model
# and the tensors are placed.
print('We will use the GPU:', torch.cuda.get_device_name(0))
device = torch.device("cuda")


In [None]:
# Data preprocessing block: this takes the MSMARCO dataset, which only has positive labels (qrels),
# and assigns a random document from the collection to each query to be used as a negative label.
# It also adds the query text and the document text, so the final file can be used without referencing the huge collection file.
# In the end we have a file with these columns: ['qid', '0', 'docid', 'label', 'query_text', 'doc_text']

# The query and document files are raw tab-separated text, so quote characters
# inside a query or a document must be read literally. With pandas' default
# quoting, a field that starts with '"' is parsed as a quoted field and can
# absorb the tabs/newlines that follow it; QUOTE_NONE disables that handling.
train_queries = pd.read_csv(os.path.join(data_dir, 'queries.doctrain.tsv'),
                            sep='\t', header=None, names=['qid', 'query_text'],
                            quoting=csv.QUOTE_NONE)
# Same data indexed by qid, for query-text lookups by id.
train_queries_index = train_queries.set_index('qid')

# qrels files are space-separated: qid, a constant column, docid, label.
train_relations = pd.read_csv(os.path.join(data_dir, 'msmarco-doctrain-qrels.tsv'),
                              sep=' ', header=None, names=['qid', '0', 'docid', 'label'])

val_queries = pd.read_csv(os.path.join(data_dir, 'queries.docdev.tsv'),
                          sep='\t', header=None, names=['qid', 'query_text'],
                          quoting=csv.QUOTE_NONE)
val_queries_index = val_queries.set_index('qid')

val_relations = pd.read_csv(os.path.join(data_dir, 'msmarco-docdev-qrels.tsv'),
                            sep=' ', header=None, names=['qid', '0', 'docid', 'label'])

# Document lookup table; `names` already covers every column that is read,
# so no separate `usecols` is needed.
lookup = pd.read_csv(os.path.join(data_dir, 'msmarco-docs-lookup.tsv'),
                     sep='\t', header=None,
                     names=['docid', 'trec_offset', 'tsv_offset'])
# Plain list of [docid, trec_offset, tsv_offset] rows, used for random sampling.
lookup_list = lookup.values.tolist()

# Full document collection indexed by docid (loaded entirely into memory).
collection = pd.read_csv(os.path.join(data_dir, 'msmarco-docs.tsv'),
                         sep='\t', header=None,
                         names=['docid', 'url', 'title', 'doc_text'], index_col='docid',
                         quoting=csv.QUOTE_NONE)

def negative_samples(queries, relations, queries_index):
  """Build the labelled (query, document) rows for one data split.

  For every query one document id is drawn at random from the module-level
  `lookup_list` and emitted with label 0. A draw is rejected and repeated when
  it hits a document that `relations` already associates with the same query,
  so a judged document is never emitted as a negative for its own query.

  Args:
    queries: list of rows whose first two items are the qid and the query text.
    relations: list of [qid, '0', docid, label] rows (the qrels). The rows are
      copied, not modified.
    queries_index: DataFrame indexed by qid with a `query_text` column.

  Returns:
    A list of [qid, '0', docid, label, query_text] rows: the sampled negatives
    followed by the relation rows with their query text appended.

  Raises:
    RuntimeError: if no usable negative is found for a query within the
      allowed number of draws.
  """
  max_attempts = 100  # upper bound on re-draws per query, avoids an endless loop

  # qid -> set of docids that the qrels already pair with that query.
  judged_docids = {}
  for relation in relations:
    judged_docids.setdefault(relation[0], set()).add(relation[2])

  negative_rows = []
  for query_row in tqdm(queries):
    qid, query_text = query_row[0], query_row[1]
    excluded = judged_docids.get(qid, ())
    for _ in range(max_attempts):
      docid_n = random.choice(lookup_list)[0]
      if docid_n not in excluded:
        break
    else:
      raise RuntimeError(
          'Could not sample a negative document for qid {}'.format(qid))
    negative_rows.append([qid, '0', docid_n, 0, query_text])

  labelled_rows = []
  for relation in tqdm(relations):
    query_text = queries_index.loc[relation[0]].query_text
    # Copy the row so the caller's list is left untouched.
    labelled_rows.append(list(relation) + [query_text])

  return negative_rows + labelled_rows

def add_doc_text(dataset):
  """Append the document text to every row and return the rows as a DataFrame.

  Each row of `dataset` is extended in place with the `doc_text` value that the
  module-level `collection` frame holds for the docid stored at position 2.

  Args:
    dataset: list of [qid, '0', docid, label, query_text] rows.

  Returns:
    DataFrame with columns qid, 0, docid, label, query_text and doc_text.
  """
  output_columns = ['qid', '0', 'docid', 'label', 'query_text','doc_text']
  for row in tqdm(dataset):
    document_record = collection.loc[row[2]]
    row.append(document_record.doc_text)
  return pd.DataFrame(dataset, columns=output_columns)

def _build_split(queries_frame, relations_frame, queries_index):
  """Turn one split's queries and qrels into the final labelled DataFrame."""
  rows = negative_samples(queries_frame.values.tolist(),
                          relations_frame.values.tolist(),
                          queries_index)
  return add_doc_text(rows)


# Build both splits first, then write each one as a CSV file into the data directory.
train_dataset = _build_split(train_queries, train_relations, train_queries_index)
val_dataset = _build_split(val_queries, val_relations, val_queries_index)

for split_frame, split_filename in ((train_dataset, 'train_dataset.csv'),
                                    (val_dataset, 'val_dataset.csv')):
  split_frame.to_csv(os.path.join(data_dir, split_filename))


In [None]:
# This block is to be used when having a ready dataset (that has positive and negative labels)


# Column names applied to both prepared CSV files; header=0 makes pandas skip
# the header row stored in the file and use these names instead.
dataset_columns = ['qid', '0', 'docid', 'label', 'query_text', 'doc_text']

train_dataset = pd.read_csv(
    os.path.join(data_dir, 'train_dataset.csv'),
    names=dataset_columns,
    header=0,
)

val_dataset = pd.read_csv(
    os.path.join(data_dir, 'val_dataset.csv'),
    names=dataset_columns,
    header=0,
)


In [None]:
# Tokenizer initialization from the pretrained Longformer base checkpoint.
tokenizer = LongformerTokenizer.from_pretrained(
    'allenai/longformer-base-4096',
)

In [None]:
# This function handles the tokenizer and returns a TensorDataset to be fed to the model

def tokenize(DS, Label, max_length=1024):
  """Encode (query, document) pairs into a TensorDataset for Longformer.

  Every row is encoded as a sequence pair with the module-level `tokenizer`,
  padded to `max_length`; only the document part is truncated. The global
  attention mask is 1 on the start token and on the query tokens and 0
  elsewhere. The query span is taken from the encoded ids themselves
  (everything before the first separator token), so it always matches the
  tokenizer's own positions and can never exceed `max_length`.

  Args:
    DS: sequence of rows; item 4 is the query text and item 5 the document text.
      Both are converted with `str()` before encoding.
    Label: labels aligned with `DS`; converted with `torch.tensor`.
    max_length: length every encoded pair is padded/truncated to.

  Returns:
    TensorDataset of (input_ids, attention_masks, labels, global_attention_masks).
  """
  input_ids = []
  attention_masks = []
  global_attention_masks = []

  for row in tqdm(DS):
    encoded_dict = tokenizer.encode_plus(
                        str(row[4]), # Query
                        str(row[5]), # Document
                        add_special_tokens = True, # Add '<s>' and '</s>'
                        max_length = max_length,   # Pad & truncate all pairs.
                        padding='max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        truncation='only_second',  # Never cut the query.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    pair_ids = encoded_dict['input_ids']  # one row per pair (concatenated on dim 0 below)
    input_ids.append(pair_ids)
    # Attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

    # The first separator closes the query segment, so the positions before it
    # are the start token plus the query tokens.
    separator_positions = (pair_ids[0] == tokenizer.sep_token_id).nonzero(as_tuple=True)[0]
    if len(separator_positions) > 0:
      query_end = int(separator_positions[0])
    else:
      # Not expected when special tokens are added; fall back to the start token only.
      query_end = 1
    global_attention = torch.zeros_like(pair_ids)
    global_attention[0, :query_end] = 1
    global_attention_masks.append(global_attention)

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  global_attention_masks = torch.cat(global_attention_masks, dim=0)
  labels = torch.tensor(Label)

  return TensorDataset(input_ids, attention_masks, labels, global_attention_masks)
  

In [None]:
# This block calls the tokenize function defined above and saves the tensor datasets to .pt files,
# so we do not have to go through this process every time we run the model


# Plain Python lists of rows, which is what tokenize() indexes into.
train_dataset_list = train_dataset.values.tolist()
val_dataset_list = val_dataset.values.tolist()

# Labels as arrays, in the same row order as the lists above.
train_Labels = train_dataset.label.values
val_labels = val_dataset.label.values

# Tokenization and creation of the TensorDatasets.
train_dataset_tensor = tokenize(train_dataset_list, train_Labels)
val_dataset_tensor = tokenize(val_dataset_list, val_labels)

# Save the TensorDatasets to disk. The file names are the ones the loading cell
# reads ('Longformer_*_dataset_tensor.pt' in the main directory), so a fresh
# run of the notebook finds the files it has just written.
torch.save(train_dataset_tensor, os.path.join(dir, 'Longformer_train_dataset_tensor.pt'))
torch.save(val_dataset_tensor, os.path.join(dir, 'Longformer_val_dataset_tensor.pt'))

In [None]:
# Tokenized training and validation data, saved earlier as tensors
# (expected type: TensorDataset).
train_dataset_tensor = torch.load('/content/drive/MyDrive/Longformer_train_dataset_tensor.pt')
val_dataset_tensor = torch.load('/content/drive/MyDrive/Longformer_val_dataset_tensor.pt')

# Number of samples per batch, shared by training and evaluation.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset_tensor)
train_dataloader = DataLoader(
    train_dataset_tensor,
    batch_size=batch_size,
    sampler=train_sampler,
)

# The order does not matter for validation, so the samples are read sequentially.
validation_sampler = SequentialSampler(val_dataset_tensor)
validation_dataloader = DataLoader(
    val_dataset_tensor,
    batch_size=batch_size,
    sampler=validation_sampler,
)

In [None]:
# Pretrained Longformer with a two-label sequence-classification head,
# moved to the GPU.
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=2,
)
model.cuda()

In [None]:
# This block defines the optimizer and the scheduler

# One optimizer/scheduler step is taken per training batch, so the schedule
# length is the number of batches times the number of epochs.
epochs = 1
total_steps = epochs * len(train_dataloader)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6)

# Linear warmup for the first 2500 steps, then linear decay until total_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=2500,
    num_training_steps=total_steps,
)

In [None]:
# Seed every random number generator used from here on.
# NOTE(review): in this notebook's cell order the model is created before this
# cell runs, so any random initialisation done there is not covered by the seed.
seed_val = 50
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict per epoch with the average losses and the elapsed times.
training_stats = []

# Start of the whole run, for the total elapsed time reported at the end.
total_t0 = time.time()


for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches.
        if step % 100 == 0 and not step == 0:

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # Batch layout follows the TensorDataset order:
        # input ids, attention mask, labels, global attention mask.
        t_input_ids = batch[0].to(device)
        t_input_mask = batch[1].to(device)
        t_labels = batch[2].to(device)
        t_global = batch[3].to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        result = model(t_input_ids,
                       attention_mask=t_input_mask,
                       labels=t_labels,
                       global_attention_mask=t_global,
                       return_dict=True)

        loss = result.loss

        # Summing training loss to calculate AVG
        total_train_loss += loss.item()

        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem
        # (it is gradient clipping, not a regularization term).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))


    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode
    model.eval()

    total_eval_loss = 0

    for batch in validation_dataloader:

        v_input_ids = batch[0].to(device)
        v_input_mask = batch[1].to(device)
        v_labels = batch[2].to(device)
        v_global = batch[3].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            result = model(v_input_ids,
                           attention_mask=v_input_mask,
                           labels=v_labels,
                           global_attention_mask=v_global,
                           return_dict=True)

        loss = result.loss

        # Summing the validation loss.
        total_eval_loss += loss.item()

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Keep the per-epoch numbers so they can be inspected after the run.
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time,
    })

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(
    str(datetime.timedelta(seconds=int(round(time.time() - total_t0))))))

In [None]:
# This block saves the finetuned model

# Destination folder on the mounted drive for the fine-tuned checkpoint.
output_dir = '/content/drive/MyDrive/Longformer_checkpoint/'

# Write the model files into that folder.
model.save_pretrained(
    output_dir,
)