In [None]:
# Install necessary libraries
!pip install transformers

In [None]:
# Import necessary libraries
import os
import random
import time
import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
from transformers import (AdamW, BertForSequenceClassification, BertTokenizer,
                          get_linear_schedule_with_warmup)
from tqdm import tqdm
from google.colab import drive


# Mount Google Drive so the dataset and output folders below are reachable
drive.mount('/content/drive')

# Set directories
data_dir = '/content/drive/MyDrive/MSMARCO/'  # Data directory
main_dir = '/content/drive/MyDrive/'  # Main directory

# Check for GPU: the rest of the notebook moves the model and every batch to CUDA,
# so stop here with an actionable message instead of an opaque CUDA error later on.
if not torch.cuda.is_available():
    raise RuntimeError(
        'No CUDA GPU detected. In Colab choose Runtime > Change runtime type > GPU '
        'and re-run this cell.'
    )
device = torch.device("cuda")
print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Data preprocessing block: the MSMARCO qrels only contain positive labels, so for each query
# a random document is drawn from the collection to be used as a negative example.
# The query text and document text are also added to every row, so the final file can be used
# without referencing the huge collection file.
# The resulting file has the columns ['qid', '0', 'docid', 'label', 'query_text', 'doc_text'].


# Load datasets
# None of the raw files has a header row, so column names are supplied explicitly.
# Query files are tab-separated; the qrels files are space-separated.

# --- training split ---
train_queries = pd.read_csv(
    os.path.join(data_dir, 'queries.doctrain.tsv'),
    sep='\t',
    header=None,
    names=['qid', 'query_text'],
)
train_queries_index = train_queries.set_index('qid')  # qid -> query_text lookup

train_relations = pd.read_csv(
    os.path.join(data_dir, 'msmarco-doctrain-qrels.tsv'),
    sep=' ',
    header=None,
    names=['qid', '0', 'docid', 'label'],
)

# --- validation split ---
val_queries = pd.read_csv(
    os.path.join(data_dir, 'queries.docdev.tsv'),
    sep='\t',
    header=None,
    names=['qid', 'query_text'],
)
val_queries_index = val_queries.set_index('qid')  # qid -> query_text lookup

val_relations = pd.read_csv(
    os.path.join(data_dir, 'msmarco-docdev-qrels.tsv'),
    sep=' ',
    header=None,
    names=['qid', '0', 'docid', 'label'],
)

# --- document lookup table and full collection ---
lookup = pd.read_csv(
    os.path.join(data_dir, 'msmarco-docs-lookup.tsv'),
    sep='\t',
    header=None,
    names=['docid', 'trec_offset', 'tsv_offset'],
    usecols=['docid', 'trec_offset', 'tsv_offset'],
)
lookup_list = lookup.values.tolist()  # plain list of rows; the docid is element 0 of each row

collection = pd.read_csv(
    os.path.join(data_dir, 'msmarco-docs.tsv'),
    sep='\t',
    header=None,
    names=['docid', 'url', 'title', 'doc_text'],
    index_col='docid',
)


def negative_samples(queries, relations, queries_index, doc_lookup=None, max_attempts=100):
    """Build the labelled rows of one split: one random negative per query plus all positives.

    Args:
        queries: list of ``[qid, query_text]`` rows.
        relations: list of qrels rows ``[qid, '0', docid, label]`` (the positive pairs).
            The rows are copied, not modified.
        queries_index: DataFrame indexed by qid that exposes a ``query_text`` column.
        doc_lookup: optional list of rows whose first element is a docid; negatives are
            drawn from it. Defaults to the notebook-level ``lookup_list``.
        max_attempts: maximum number of random draws per query while looking for a
            document that is not one of that query's positives.

    Returns:
        A list of ``[qid, '0', docid, label, query_text]`` rows: the sampled negatives
        (label 0) first, followed by the positive rows from ``relations``.

    Raises:
        ValueError: if there are queries to sample for but ``doc_lookup`` is empty.
        RuntimeError: if no non-positive document is found within ``max_attempts`` draws.
    """
    if doc_lookup is None:
        doc_lookup = lookup_list
    if len(queries) > 0 and len(doc_lookup) == 0:
        raise ValueError('Cannot sample negatives from an empty document lookup.')

    # Documents judged relevant for each query. A uniformly drawn document can be one
    # of them, which would put the same (query, document) pair in the data with both labels.
    positives_by_qid = {}
    for relation in relations:
        positives_by_qid.setdefault(relation[0], set()).add(relation[2])

    negative_samples_list = []
    for query in tqdm(queries):
        qid, query_text = query[0], query[1]
        relevant_docids = positives_by_qid.get(qid, ())
        for _ in range(max_attempts):
            docid = random.choice(doc_lookup)[0]
            if docid not in relevant_docids:
                break
        else:
            raise RuntimeError(
                f'Could not draw a negative document for query {qid} '
                f'in {max_attempts} attempts.'
            )
        negative_samples_list.append([qid, '0', docid, 0, query_text])

    # Attach the query text to copies of the positive rows so the caller's list stays untouched.
    positive_samples_list = []
    for relation in tqdm(relations):
        query_text = queries_index.loc[relation[0]].query_text
        positive_samples_list.append(list(relation) + [query_text])

    return negative_samples_list + positive_samples_list


def add_doc_text(dataset):
    """Append the document text to every row and return the rows as a DataFrame.

    Each row of ``dataset`` is a list whose element 2 is a docid. The docid is looked up
    in the notebook-level ``collection`` frame and its ``doc_text`` is appended to the
    row in place.

    Returns:
        DataFrame with the columns
        ``['qid', '0', 'docid', 'label', 'query_text', 'doc_text']``.
    """
    for row in tqdm(dataset):
        row.append(collection.loc[row[2]].doc_text)

    column_names = ['qid', '0', 'docid', 'label', 'query_text', 'doc_text']
    return pd.DataFrame(dataset, columns=column_names)

# Build the labelled splits: negatives are sampled for the training split first,
# then for the validation split.
train_rows = negative_samples(
    train_queries.values.tolist(),
    train_relations.values.tolist(),
    train_queries_index,
)
train_dataset = add_doc_text(train_rows)

val_rows = negative_samples(
    val_queries.values.tolist(),
    val_relations.values.tolist(),
    val_queries_index,
)
val_dataset = add_doc_text(val_rows)

# Write both splits to the data directory so later runs can start from the CSV files.
train_csv_path = os.path.join(data_dir, 'train_dataset.csv')
val_csv_path = os.path.join(data_dir, 'val_dataset.csv')
train_dataset.to_csv(train_csv_path)
val_dataset.to_csv(val_csv_path)

In [None]:
# Use this cell when the preprocessed datasets (with both positive and negative labels)
# have already been written to disk.

dataset_columns = ['qid', '0', 'docid', 'label', 'query_text', 'doc_text']

# header=0 discards the header row stored in the file; `names` supplies the column names.
train_dataset = pd.read_csv(
    os.path.join(data_dir, 'train_dataset.csv'),
    header=0,
    names=dataset_columns,
)

val_dataset = pd.read_csv(
    os.path.join(data_dir, 'val_dataset.csv'),
    header=0,
    names=dataset_columns,
)


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Tokenizer initialization (pretrained 'bert-base-uncased' vocabulary)

In [None]:
# This function handles the tokenization and returns a TensorDataset to be fed to the model
 
    
def tokenize(dataset, label):
    """Encode (query, document) pairs with the notebook-level ``tokenizer``.

    Args:
        dataset: list of rows in which element 4 is the query text and element 5 the
            document text; both are passed through ``str`` before encoding.
        label: labels aligned with ``dataset``; converted with ``torch.tensor``.

    Returns:
        TensorDataset of ``(input_ids, attention_masks, token_type_ids, labels)``.
    """
    # One list of per-example tensors for each field returned by the tokenizer.
    encoded_fields = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}

    for row in tqdm(dataset):
        query_text = str(row[4])
        doc_text = str(row[5])
        encoding = tokenizer.encode_plus(
            query_text,
            doc_text,
            add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
            max_length=512,              # Every pair is padded / truncated to 512 tokens
            padding='max_length',
            truncation='only_second',    # Only the document is shortened, never the query
            return_attention_mask=True,  # Construct attention masks
            return_token_type_ids=True,  # Differentiates the query segment from the document segment
            return_tensors='pt',         # Return PyTorch tensors
        )
        for field, chunks in encoded_fields.items():
            chunks.append(encoding[field])

    # Stack the per-example tensors along the batch dimension.
    input_ids = torch.cat(encoded_fields['input_ids'], dim=0)
    attention_masks = torch.cat(encoded_fields['attention_mask'], dim=0)
    token_type_ids = torch.cat(encoded_fields['token_type_ids'], dim=0)
    labels = torch.tensor(label)

    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)


In [None]:
# This block calls the tokenizer defined above and saves the tensor datasets to .pt files,
# so we do not have to go through this process every time we run the model


train_dataset_list = train_dataset.values.tolist() # Converting the dataframe to a list of rows
val_dataset_list = val_dataset.values.tolist() # Converting the dataframe to a list of rows
train_Labels = train_dataset.label.values # Extracting labels
val_labels = val_dataset.label.values # Extracting labels

train_dataset_tensor = tokenize(train_dataset_list, train_Labels) # Tokenization and creation of TensorDataset
val_dataset_tensor = tokenize(val_dataset_list, val_labels) # Tokenization and creation of TensorDataset

# Each torch.save call is now properly closed; the missing ')' made this cell a SyntaxError.
torch.save(train_dataset_tensor, os.path.join(main_dir, 'train_dataset_tensor.pt')) # Saving TensorDataset to disk
torch.save(val_dataset_tensor, os.path.join(main_dir, 'val_dataset_tensor.pt')) # Saving TensorDataset to disk

In [None]:
# This block is to be used when a TensorDataset is already created and saved to disk.
# The paths are built from `main_dir`, exactly as the saving cell builds them, so the
# two cells keep pointing at the same files if the directory is changed.
# NOTE(review): recent PyTorch releases are believed to default `torch.load` to
# weights_only=True, which may refuse a pickled TensorDataset — confirm against the
# installed version and pass weights_only=False if needed.

train_dataset_tensor = torch.load(os.path.join(main_dir, 'train_dataset_tensor.pt')) # Training dataset, tokenized and converted to tensors; type is TensorDataset
val_dataset_tensor = torch.load(os.path.join(main_dir, 'val_dataset_tensor.pt')) # Validation dataset, tokenized and converted to tensors; type is TensorDataset


In [None]:
# Define dataloaders

batch_size = 16  # used for both training and evaluation

# Training: batches are selected in random order.
train_sampler = RandomSampler(train_dataset_tensor)
train_dataloader = DataLoader(train_dataset_tensor, sampler=train_sampler, batch_size=batch_size)

# Validation: batches are pulled out sequentially.
validation_sampler = SequentialSampler(val_dataset_tensor)
validation_dataloader = DataLoader(val_dataset_tensor, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Pretrained BERT with a sequence-classification head configured for 2 labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
# Move the model to the same `device` object the training loop sends its batches to
model.to(device)

In [None]:
# Optimizer and learning-rate scheduler

learning_rate = 3e-6
warmup_steps = 10000
epochs = 2

optimizer = AdamW(model.parameters(), lr=learning_rate)

# The schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Training loop
# Seed every RNG that is seeded in this notebook so the shuffled batch order is repeatable.
# NOTE(review): the model is created in an earlier cell, i.e. before this seeding runs, so any
# random initialisation done there is not covered by these seeds.
seed_val = 50
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict per epoch with the average losses and the wall-clock durations.
training_stats = []

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    print("Training...")

    # Start the timer before the first batch: the progress line below reads it, and it must
    # measure the current epoch only.
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches.
        if step % 40 == 0 and step != 0:
            elapsed = str(datetime.timedelta(seconds=int(time.time() - t0)))
            print(f'  Batch {step}  of  {len(train_dataloader)}.    Elapsed: {elapsed}.')

        # Batch layout follows the TensorDataset: ids, attention mask, token types, labels.
        t_input_ids, t_input_mask, t_token_type_ids, t_labels = (b.to(device) for b in batch)
        model.zero_grad()  # Clear gradients left over from the previous step
        result = model(t_input_ids, token_type_ids=t_token_type_ids, attention_mask=t_input_mask, labels=t_labels, return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()  # Summing training loss to calculate the average
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is gradient clipping (not L2 regularization); it helps prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()  # Update the learning rate.

    avg_train_loss = total_train_loss / len(train_dataloader)  # Average loss over all of the batches.
    training_time = str(datetime.timedelta(seconds=int(time.time() - t0)))
    print(f"\n  Average training loss: {avg_train_loss:.2f}")

    print("\nRunning Validation...")
    t0 = time.time()
    model.eval()
    total_eval_loss = 0

    for batch in validation_dataloader:
        v_input_ids, v_input_mask, v_token_type_ids, v_labels = (b.to(device) for b in batch)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(v_input_ids, token_type_ids=v_token_type_ids, attention_mask=v_input_mask, labels=v_labels, return_dict=True)
        total_eval_loss += result.loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = str(datetime.timedelta(seconds=int(time.time() - t0)))
    print(f"  Validation Loss: {avg_val_loss:.2f}")

    # Keep the per-epoch numbers so they can be inspected after training.
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time,
    })

print("\nTraining complete!")

In [None]:
# This block saves the fine-tuned model.
# The checkpoint folder is derived from `main_dir` instead of repeating the Drive root,
# so it follows the directory configured at the top of the notebook.

output_dir = os.path.join(main_dir, 'BERT_checkpoint/')

model.save_pretrained(output_dir)