# Finetuning script for DeLADE

## Run instructions:
Change only the following variables, all of which are defined in the Variables section:

1. **save_path**: Path where DeLADE checkpoints will be periodically saved during finetuning

2. **data_url**: The direct-download URL of the training-data CSV (see "Generating the Download Link" below)

4. **split**: The percentages of the data used for training, validation, and testing, in that order (for example `[80, 10, 10]`)

5. **DATA_DIR**: Directory where the downloaded CSV and the generated training data are saved.

6. Generating the Download Link:
  * Upload the CSV to Google Drive.

  * Change the sharing settings so that anyone with the link can view the file.

  * Copy the share link and convert it into a direct download link using https://sites.google.com/site/gdocs2direct/

7. After setting the appropriate variables, run all cells.

# Variables

In [None]:
import os
# Directory the notebook is running from; every default path below is derived
# from it.
cwd = os.getcwd()

# Directory that receives the finetuned model checkpoints.
save_path = f"{cwd}/finetuned_model"

# Direct-download URL of the training CSV (SQuAD v2 style data).
data_url = 'https://drive.google.com/u/1/uc?id=1jHR-T1PH4xkd4ljGWn8lD-HaJPYMo4EC&export=download'

# Relative sizes of the train / validation / test partitions, in that order:
# 80% training, 10% validation and the remaining 10% testing.
split = [80, 10, 10]

# Directory in which the finetuning data is stored.
DATA_DIR = cwd

# Imports

In [None]:
# Clone the castorini/dhr repository; it contains the `tevatron` package whose
# files are patched below and whose training driver is run in the Training cell.
!git clone https://github.com/castorini/dhr.git

In [None]:
# Install third-party Python packages (transformers, nmslib and
# sentence-transformers are imported by later cells).
!pip install transformers datasets nmslib sentence-transformers

In [None]:
# Make the cloned repository the notebook's working directory.
# NOTE: `cwd` was captured before this change, so it still refers to the
# directory the notebook was started from.
%cd dhr

In [None]:
# Install megatools, which provides the `megadl` command used in the next cell.
!sudo apt install megatools

In [None]:
!rm /content/dhr/tevatron/data.py
!megadl --path $cwd/dhr/tevatron/ 'https://mega.nz/#!8rpzDYyL!AuHeWGGAIbpAV4vBm91A8KNjzXjOvvsW1ON_pQGqoS0'
!rm /content/dhr/tevatron/trainer.py
!megadl --path $cwd/dhr/tevatron/ 'https://mega.nz/#!VjJwzbhR!_cy2OBYGUUQPVD_Zy4ez1iutSG5Gh6jSwPmPAcT5318'

In [None]:
# NMSLIB settings for the nearest-neighbour index that is used to build the
# DeLADE finetuning data.

# Index type and distance space.
nmslib_method = 'hnsw'
nmslib_space = 'cosinesimil'

# Index-construction parameters.
M = 100
efC = 2000
num_threads = 4
index_time_params = dict(M=M, indexThreadQty=num_threads, efConstruction=efC, post=0)

# Query-time parameters.
efS = 2000
query_time_params = dict(efSearch=efS)

# Sentence-embedding model whose vectors are stored in the NMSLIB index
# (helper model only).
embed_model_name = 'all-distilroberta-v1'

In [None]:
!pip install gdown
import gdown

In [None]:
# Download the training CSV from `data_url` into DATA_DIR using gdown.
# DATA_DIR is user-configurable, so make sure it exists before writing to it.
os.makedirs(DATA_DIR, exist_ok=True)
data_path = f"{DATA_DIR}/train_data.csv"
downloaded_path = gdown.download(data_url, data_path, quiet=False)
# Stop here with a clear message if nothing was downloaded, instead of failing
# later inside pd.read_csv with a less obvious error.
if downloaded_path is None:
    raise RuntimeError(
        f"Could not download the training data from {data_url!r}; "
        "check the share settings of the file and the download link."
    )

# Preprocessing code for input data

In [None]:
import pandas as pd
# Directory for the intermediate paragraph table. It follows DATA_DIR so that
# changing DATA_DIR in the Variables section relocates every generated file
# (with the default settings DATA_DIR is the original working directory).
DATA_PATH = DATA_DIR
# Raw training data downloaded in the previous cell.
df = pd.read_csv(data_path)

In [None]:
# Build the paragraph table: one row per distinct paragraph with a 1-based id
# and the theme found on the paragraph's first row, then save it as CSV.

# Use a single spelling for the answer-text column; later cells read
# `Answer_text`.
df = df.rename(columns={'Answer_Text': 'Answer_text'})

# Keep the first row of every distinct paragraph in one pass instead of
# filtering the whole frame once per paragraph.
first_rows = df.drop_duplicates(subset='Paragraph', keep='first')
paras = first_rows['Paragraph'].to_numpy()
data = [
    {'id': para_id, 'paragraph': para, 'theme': theme}
    for para_id, (para, theme) in enumerate(
        zip(first_rows['Paragraph'], first_rows['Theme']), start=1
    )
]
# Explicit columns keep the CSV header present even when there are no rows.
df2 = pd.DataFrame(data, columns=['id', 'paragraph', 'theme'])
df2.to_csv(DATA_PATH+'/input_para1.csv', header=True, index=False)

In [None]:
# Location of the paragraph table (id, paragraph, theme) written by the previous cell.
para_data_path = f"{DATA_PATH}/input_para1.csv"

In [None]:
# Functions for Loading and Saving the Data in the jsonl format (as required by
# DeLADE)

def dump_jsonl(data, output_path, append=False):
    """Write an iterable of objects to a JSON Lines file.

    Each object is serialised with ``json.dumps`` (non-ASCII characters are
    kept as-is) and written on its own line.

    Args:
        data: Iterable of JSON-serialisable objects. Any iterable works,
            including generators.
        output_path: Path of the file to write.
        append: If true, add the records to the end of an existing file;
            otherwise the file is overwritten.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported `json` before it is called.
    import json

    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            # Count while writing so `data` does not need to support len().
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))


def load_jsonl(input_path) -> list:
    """Read a JSON Lines file into a list of objects.

    Args:
        input_path: Path of the file to read; one JSON document per line.

    Returns:
        The decoded objects in file order. Lines that contain only whitespace
        are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported `json` before it is called.
    import json

    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file) is
            # not a record and would make json.loads raise.
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
# Loading Tokenizer for generating tokens for Training 

from transformers import AutoTokenizer
# Tokenizer belonging to the DeLADE checkpoint that is finetuned below;
# use_fast=False asks for the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained('jacklin/DeLADE-CLS-P', use_fast=False)

def preprocess_text(text, max_length=512):
    """Convert text into token ids using the module-level ``tokenizer``.

    Args:
        text: Either a single string, or an iterable of strings (text
            segments) that are joined with the tokenizer's separator token
            before encoding.
        max_length: Maximum number of token ids to return; longer inputs are
            truncated.

    Returns:
        The list of token ids, without special tokens added around the
        sequence.
    """
    if isinstance(text, str):
        # A plain string is a single segment. Joining it would iterate over
        # its characters and put a separator token between every character.
        joined = text
    else:
        joined = tokenizer.sep_token.join(text)
    return tokenizer.encode(
        joined,
        add_special_tokens=False,
        max_length=max_length,
        truncation=True,
    )

In [None]:
# Generating Paragraph Embeddings for NMSLIB search for generating Negative Para
# Ids, as required by DeLADE

from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm
# Sentence-embedding model and the paragraph table saved earlier.
embed_model = SentenceTransformer(embed_model_name)
para_df = pd.read_csv(para_data_path)

# Themes in order of first appearance in the paragraph table.
themes = para_df["theme"].unique().tolist()

# theme -> embeddings of that theme's distinct paragraphs, in table order.
all_theme_embeddings = {}
for theme in tqdm(themes):
    belongs_to_theme = para_df["theme"] == theme
    paras = para_df[belongs_to_theme].paragraph.unique()
    all_theme_embeddings[theme] = embed_model.encode(paras)

In [None]:
# Generating Query Embeddings for NMSLIB search for generating Negative Para,
# Ids, as required by DeLADE
from ast import literal_eval
# `embed_model` from the paragraph-embedding cell is reused here; loading the
# same model a second time only costs time and memory.
full_data = df.copy()
# Answer_start / Answer_text hold Python literals stored as text; parse them
# back into Python objects.
full_data.Answer_start = full_data.Answer_start.apply(literal_eval)
full_data.Answer_text = full_data.Answer_text.apply(literal_eval)
# The unnamed first CSV column is used as the query id; keep it as a string.
full_data['Unnamed: 0'] = full_data['Unnamed: 0'].astype(str)

# theme -> {row label in full_data: embedding of that row's question}
queries = {}
for theme in tqdm(themes):
    theme_df = full_data[full_data['Theme'] == theme]
    # Encode all questions of the theme in one call rather than one call per
    # row; the result has one embedding per question, in row order.
    question_vecs = embed_model.encode(theme_df['Question'].tolist())
    queries[theme] = dict(zip(theme_df.index, question_vecs))

In [None]:
# Splitting and Generating the Data
import numpy as np
import nmslib
# Build one sample per question: the paragraph it belongs to is the positive
# passage, and its nearest paragraphs of the same theme (excluding the positive
# one) are the negative passages. Each theme is split by row order into train /
# dev / test according to `split`.
train_samples = []
dev_samples = []
test_samples = []

# paragraph text -> paragraph id, built once so every lookup is a dictionary
# access instead of a scan over the whole paragraph table. `.tolist()` yields
# plain Python ints, which stay JSON-serialisable. setdefault keeps the first
# id should a paragraph text ever occur twice.
para_id_by_text = {}
for para_id, para_text in zip(para_df['id'].tolist(), para_df['paragraph'].tolist()):
    para_id_by_text.setdefault(para_text, para_id)

split_total = sum(split)

for theme in tqdm(themes):
    theme_df = full_data[full_data['Theme'] == theme]
    n = len(theme_df)
    # Row positions (within the theme) at which the dev and test parts begin.
    train_end = int(split[0] * n / split_total)
    dev_end = int((split[0] + split[1]) * n / split_total)

    # knnQuery returns positions into all_theme_embeddings[theme], which was
    # computed from exactly this paragraph list, so the positions must be
    # mapped back through the same list.
    theme_paras = para_df[para_df['theme'] == theme].paragraph.unique()

    index = nmslib.init(method=nmslib_method, space=nmslib_space, data_type=nmslib.DataType.DENSE_VECTOR)
    index.addDataPointBatch(all_theme_embeddings[theme])
    index.createIndex(index_time_params)
    index.setQueryTimeParams(query_time_params)

    for i, (idx, theme_row) in enumerate(theme_df.iterrows()):
        # Retrieve between 5 and 9 neighbours (the upper bound is exclusive).
        k = np.random.randint(5, 10)
        neighbour_ids, _distances = index.knnQuery(queries[theme][idx], k=k)
        pred_para = theme_paras[neighbour_ids].tolist()
        real_para = theme_row.Paragraph
        # The question's own paragraph must not appear among the negatives.
        if real_para in pred_para:
            pred_para.remove(real_para)
        sample = {
            'query_id': theme_row['Unnamed: 0'],
            'query': theme_row['Question'],
            'positive_passages': [{
                'doc_id': para_id_by_text[real_para],
                'title': theme,
                'text': real_para,
            }],
            'negative_passages': [
                {
                    'doc_id': para_id_by_text[para],
                    'title': theme,
                    'text': para,
                } for para in pred_para],
        }
        if i < train_end:
            train_samples.append(sample)
        elif i < dev_end:
            dev_samples.append(sample)
        else:
            test_samples.append(sample)

In [None]:
# Generating Tokens for the Data

# query id -> token ids of the question
query_toks = {
    query_id: preprocess_text(question)
    for query_id, question in zip(full_data['Unnamed: 0'], full_data['Question'])
}
# paragraph id -> token ids of the paragraph
para_toks = {
    para_id: preprocess_text(paragraph)
    for para_id, paragraph in zip(para_df['id'], para_df['paragraph'])
}

In [None]:
# Generating the Train Data

# One training record per train sample: the tokenized query plus the ids of
# its positive paragraph and of its negative paragraphs.
train_data = [
    {
        'query': query_toks[entry['query_id']],
        'positive_pids': [entry['positive_passages'][0]['doc_id']],
        'negative_pids': [passage['doc_id'] for passage in entry['negative_passages']],
    }
    for entry in tqdm(train_samples)
]

In [None]:
# Generating the Corpus Data

# One corpus record per paragraph: its id as a string and its token ids.
corpus_data = [
    {'text_id': str(para_id), 'text': token_ids}
    for para_id, token_ids in para_toks.items()
]

In [None]:
# Saving the Generated Data

import os
import json
# Create the output directories and write the training and corpus files.
# exist_ok=True makes the cell safe to re-run, and each directory is created
# independently: an already existing train/ directory no longer prevents
# corpus/ from being created. Any other error (e.g. missing permissions) is
# raised instead of being silently ignored.
for subdir in ('train', 'corpus'):
    os.makedirs(f'{DATA_DIR}/{subdir}', exist_ok=True)

dump_jsonl(train_data,f'{DATA_DIR}/train/75k_trainsplit.json')
dump_jsonl(corpus_data,f'{DATA_DIR}/corpus/corpus.json')

# Training

In [None]:
!export CUDA_VISIBLE_DEVICES=0
!export MODEL=DHR
!export CLSDIM=128
!export DLRDIM=768
!export MODEL_DIR=${MODEL}_CLS${CLSDIM}

!python -m tevatron.driver.train \
  --output_dir $save_path/DHR \
  --train_dir $DATA_DIR/train \
  --corpus_dir $DATA_DIR/corpus \
  --model_name_or_path jacklin/DeLADE-CLS-P  \
  --do_train \
  --save_steps 5000 \
  --per_device_train_batch_size 4 \
  --learning_rate 7e-6 \
  --q_max_len 32 \
  --p_max_len 150 \
  --num_train_epochs 1 \
  --add_pooler \
  --model DHR \
  --projection_out_dim 128 \
  --train_n_passages 8 \
  --dataloader_num_workers 2 \
  --seed 42 \
  --overwrite_output_dir \
  --combine_cls 