# Finetuning script for Cross-encoder



In [None]:
# Installing Sentence Transformers for Cross Encoder and Importing other Dependencies
!pip install -U sentence-transformers
!pip install gdown
import gdown
import numpy as np
import pandas as pd
import os
import json
import random
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import sys
from ast import literal_eval
import os
import gzip
import csv
from tqdm import tqdm

#Data Extraction and Pre-Processing

In [None]:
# Fetch the provided training data CSV into the current working directory.
url = "https://drive.google.com/u/1/uc?id=1jHR-T1PH4xkd4ljGWn8lD-HaJPYMo4EC&export=download"  # provided training data
cwd = os.getcwd()
output = f"{cwd}/train_data.csv"  # destination file for the download
gdown.download(url, output, quiet=False)

In [None]:
# Relative path of the training data CSV that is read for fine-tuning.
data_path = './train_data.csv'

# Relative weights of the train / validation / test partitions, in that order.
# They are normalised by their sum when the data is split.
split = [80, 10, 10]

In [None]:
# Read the data used for fine-tuning.

full_data = pd.read_csv(data_path)
themes = full_data.Theme.unique()
# Answer_start / Answer_text are parsed with literal_eval, i.e. the CSV is
# expected to store them as Python literals (e.g. lists) in string form.
full_data.Answer_start = full_data.Answer_start.apply(literal_eval)
full_data.Answer_text = full_data.Answer_text.apply(literal_eval)
# The unnamed first CSV column is used as the sample id, stored as a string.
full_data['Unnamed: 0'] = full_data['Unnamed: 0'].astype(str)
train_samples = []
val_samples = []
test_samples = []

# Divide the complete data into training, validation and testing data in the
# defined split ratio, separately for each theme. Rows keep their file order:
# the first share of a theme goes to training, the next to validation and the
# remainder to testing.
total_weight = sum(split)
for theme in themes:
    theme_df = full_data[full_data['Theme'] == theme]
    n = len(theme_df)
    # Row-position boundaries of the partitions; they only depend on the
    # theme size, so they are computed once per theme rather than per row.
    train_end = int(split[0] * n / total_weight)
    val_end = int((split[0] + split[1]) * n / total_weight)
    for i, (row_label, theme_row) in enumerate(theme_df.iterrows()):
        # Named `sample` (not `input`) so the builtin input() stays usable
        # in the rest of the session.
        sample = {
            'Answer_start': theme_row['Answer_start'],
            'Answer_text': theme_row['Answer_text'],
            'Paragraph': theme_row['Paragraph'],
            'id': theme_row['Unnamed: 0'],
            'Question': theme_row['Question'],
            'Theme': theme_row['Theme'],
            'Answer_possible': theme_row['Answer_possible'],
        }
        if i < train_end:
            train_samples.append(sample)
        elif i < val_end:
            val_samples.append(sample)
        else:
            test_samples.append(sample)
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

In [None]:
# Convert the given data into the MS-MARCO dataset format for Cross Encoder
# fine-tuning.

def _is_answerable(flag):
    """Return True when an ``Answer_possible`` value marks an answerable question.

    String values are compared case-insensitively against ``'TRUE'`` so that a
    textual ``'FALSE'`` (a non-empty and therefore truthy string) is not
    mistaken for a positive. Any other value is interpreted through ``bool``.
    """
    if isinstance(flag, str):
        return flag.strip().upper() == 'TRUE'
    return bool(flag)


def preprocess_data(df, max_passages=10):
    """Convert question/paragraph rows into MS-MARCO-style ranking records.

    For every row of *df* one record is produced containing the question, a
    list of candidate passages taken from the same theme, and a parallel 0/1
    list. The row's own paragraph is always among the candidates, at a random
    position, and is flagged with 1 only when the row is answerable.

    Args:
        df: DataFrame with ``Question``, ``Paragraph``, ``Theme``,
            ``Answer_possible`` and ``Answer_text`` columns.
        max_passages: Upper bound on the number of passages per record
            (values below 1 are treated as 1). Defaults to 10.

    Returns:
        A DataFrame with one row per input row and the columns ``query``,
        ``passages`` (a dict with the keys ``is_selected`` and
        ``passage_text``) and ``answer``. An empty input yields an empty
        DataFrame.
    """
    records = []
    if len(df) == 0:
        return pd.DataFrame(records)

    limit = max(1, max_passages)
    # Unique paragraphs of each theme, computed once per theme instead of
    # re-filtering the whole frame for every row.
    paragraphs_by_theme = {}
    rows = zip(df.Question, df.Paragraph, df.Theme, df.Answer_possible,
               df.Answer_text)
    for question, true_para, theme, answerable, answer in tqdm(rows, total=len(df)):
        if theme not in paragraphs_by_theme:
            paragraphs_by_theme[theme] = (
                df.loc[df.Theme == theme, 'Paragraph'].unique().tolist()
            )
        # Other paragraphs of the same theme act as negative candidates.
        negatives = [p for p in paragraphs_by_theme[theme] if p != true_para]
        random.shuffle(negatives)
        took_para = negatives[:limit]

        if took_para:
            # The true paragraph replaces one randomly chosen candidate.
            true_index = random.randint(0, len(took_para) - 1)
            took_para[true_index] = true_para
        else:
            # The theme has no other paragraph: keep the true paragraph as
            # the only candidate instead of failing on an empty range.
            took_para = [true_para]
            true_index = 0

        is_selected = [0] * len(took_para)
        if _is_answerable(answerable):
            is_selected[true_index] = 1

        records.append({
            'query': question,
            'passages': {'is_selected': is_selected, 'passage_text': took_para},
            'answer': answer,
        })
    return pd.DataFrame(records)

In [None]:
# Replace the raw train/validation splits with the query / passages / answer
# records produced by preprocess_data.
train_df = preprocess_data(train_df) # pre-process the training split
val_df = preprocess_data(val_df) # pre-process the validation split
# The test split is not pre-processed in this cell (call left commented out):
#test_df = preprocess_data(test_df) #pre-process for testing split

In [None]:
# Turn the pre-processed data into labelled InputExamples for train_samples
# and val_samples: a (question, passage) pair gets label 1 when the passage is
# the one flagged in 'is_selected', otherwise label 0.
def _to_input_examples(df):
    """Build one InputExample per (query, passage) pair of a pre-processed frame.

    Args:
        df: DataFrame with a ``query`` column and a ``passages`` column whose
            cells are dicts holding the parallel lists ``passage_text`` and
            ``is_selected``.

    Returns:
        A list of InputExample objects with ``texts=[query, passage]`` and an
        integer label: 1 for the first passage flagged in ``is_selected``,
        0 for every other passage.
    """
    examples = []
    for query, passages in zip(df['query'], df['passages']):
        flags = passages['is_selected']
        # -1 never matches a list position, i.e. no passage is positive.
        positive_idx = flags.index(1) if 1 in flags else -1
        for j, context in enumerate(passages['passage_text']):
            label = 1 if j == positive_idx else 0
            examples.append(InputExample(texts=[query, context], label=label))
    return examples


train_samples = _to_input_examples(train_df)
# Validation labels are read from val_df itself; taking them from train_df
# would pair validation passages with the flags of unrelated training rows.
val_samples = _to_input_examples(val_df)

#Model

In [None]:
# Load the base Cross-Encoder 'cross-encoder/ms-marco-MiniLM-L-4-v2' from the
# Hugging Face library as the starting point for fine-tuning.
# NOTE(review): num_labels=1 presumably requests a single relevance score per
# (query, passage) pair - confirm against the CrossEncoder documentation.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-4-v2', num_labels=1)

In [None]:
# Fine-tuning configuration
train_batch_size = 16  # number of training examples per batch
num_epochs = 4  # passed to model.fit as `epochs`
# Batches are drawn from train_samples in shuffled order.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

In [None]:
# Evaluation and training
# Evaluator built from the validation examples; it is handed to model.fit.
evaluator = CECorrelationEvaluator.from_input_examples(val_samples, name='val')

# Warm-up length: 10% of the total number of training batches
# (batches per epoch x epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# Fine-tune the model, using './minilml4-v2' as the output path.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='./minilml4-v2',
)