In [None]:
%pip install torch sentence-transformers


In [None]:
%pip help

In [None]:
%pip show sentence-transformers

In [None]:
# A transformers update a few weeks ago made the supplied training code non-functional; see
# https://github.com/UKPLab/sentence-transformers/issues/3021
#

In [1]:
import csv
import datetime
import json
import string
from bs4 import BeautifulSoup
from argparse import ArgumentParser


In [2]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer checkpoint into `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def remove_html(string):
    """Return the text content of an HTML fragment with the markup removed.

    The input is parsed with BeautifulSoup using the "lxml" parser and the
    text of the parsed document is returned.
    """
    soup = BeautifulSoup(string, "lxml")
    return soup.text

In [4]:
def load_topic_file(topic_filepath):
    """Read a topics JSON file into a dictionary keyed by topic id.

    Args:
        topic_filepath: path to a JSON file holding a list of topic objects,
            each providing 'Id', 'Title', 'Body' and 'Tags' entries.

    Returns:
        dict mapping each topic 'Id' to ``[title, body, tags]``. Punctuation
        is stripped from the title and body, and HTML is removed from the
        body before the punctuation is stripped.
    """
    # Context manager so the file handle is closed (a bare open() leaked it).
    with open(topic_filepath) as topic_file:
        topics = json.load(topic_file)

    # Build the punctuation-removal table once instead of once per field.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    result = {}
    for item in topics:
        title = item['Title'].translate(strip_punctuation)
        body = remove_html(item['Body']).translate(strip_punctuation)
        tags = item['Tags']
        result[item['Id']] = [title, body, tags]
    return result

In [5]:
def read_qrel_file(qrel_filepath):
    """Read a tab-separated qrel file into nested dictionaries.

    Column 0 is used as the query id, column 2 as the document id and
    column 3 as the integer relevance score; column 1 is ignored.
    Blank lines (for example a trailing newline) are skipped.

    Args:
        qrel_filepath: path to the qrel TSV file.

    Returns:
        dict of ``{query_id: {doc_id: score}}`` with string ids and int scores.

    Raises:
        IndexError: if a non-blank row has fewer than four columns.
        ValueError: if the score column is not an integer.
    """
    result = {}
    # newline="" is how the csv module documentation recommends opening files.
    with open(qrel_filepath, "r", newline="") as f:
        reader = csv.reader(f, delimiter='\t', lineterminator='\n')
        for line in reader:
            if not line:
                # csv.reader yields [] for an empty line; indexing it would crash.
                continue
            query_id = line[0]
            doc_id = line[2]
            score = int(line[3])
            result.setdefault(query_id, {})[doc_id] = score
    return result

In [6]:
def read_collection(answer_filepath):
  """Read the answer collection into a dictionary of cleaned texts.

  Args:
      answer_filepath: path to a JSON file holding a list of objects with
          'Id' and 'Text' entries.

  Returns:
      dict mapping each 'Id' to its 'Text' with HTML removed and
      punctuation stripped.
  """
  # Context manager so the file handle is closed (a bare open() leaked it).
  with open(answer_filepath) as answer_file:
    lst = json.load(answer_file)

  # Build the punctuation-removal table once instead of once per document.
  strip_punctuation = str.maketrans('', '', string.punctuation)

  result = {}
  for doc in lst:
    result[doc['Id']] = remove_html(doc['Text']).translate(strip_punctuation)
  return result

In [7]:
#modifies each query to contain the title and body
def prep_queries(topics):
    """Build one query string per topic from its title and body.

    Args:
        topics: dict mapping topic id to a sequence whose first two items
            are the title and the body.

    Returns:
        dict mapping topic id to "[TITLE]" + title + "[BODY]" + body.
    """
    return {
        query_id: "[TITLE]" + fields[0] + "[BODY]" + fields[1]
        for query_id, fields in topics.items()
    }

In [8]:
from sentence_transformers import InputExample

def cross_dataset_gen(queries,qrel,answers):
    """Build cross-encoder samples from the relevance judgements.

    Args:
        queries: dict mapping topic id to query text.
        qrel: dict mapping topic id to ``{doc id: relevance score}``.
        answers: dict mapping doc id to answer text.

    Returns:
        list of InputExample, one per judged (topic, doc) pair, with
        ``texts=[query, answer]``. Scores >= 1 are collapsed to label 1;
        any other score is used as the label unchanged.

    Raises:
        KeyError: if a judged topic id is missing from ``queries`` or a
            judged doc id is missing from ``answers``.
    """
    # Single pass: the former per-topic debug print, the intermediate tuple
    # list and the unused ex_1/ex_2 locals are gone.
    sample_list = []
    for topic, judgements in qrel.items():
        for doc, score in judgements.items():
            label = 1 if score >= 1 else score
            sample_list.append(InputExample(texts=[queries[topic], answers[doc]], label=label))
    return sample_list

In [9]:
def bi_dataset_gen(queries,qrel,answers):
    """Unfinished placeholder for building the bi-encoder dataset.

    At present this only walks over ``queries`` without producing anything
    and returns None; ``qrel`` and ``answers`` are not used.
    """
    # TODO: implement the dataset construction.
    for _ in queries:
        pass
    return None

In [10]:
def main():
    """Command-line entry point: load the data files named by the options.

    Options:
        -i/--input: answers file (default "Answers.json").
        -t/--topic: topics file (default "topics_1.json").
        -q/--qrel: qrel file (default "qrel_1.tsv").

    The topics, qrels and answers are loaded and the query strings are
    built; nothing is returned.
    """
    parser = ArgumentParser()
    # The options used to be declared required=True, which made their
    # defaults unreachable. They are optional now so the defaults apply;
    # every invocation that passed all three options behaves as before.
    parser.add_argument('-i', '--input', help='search domain file e.g. Answers.json', default="Answers.json")
    parser.add_argument('-t', '--topic', help='topic source files e.g. topics_1.json', default="topics_1.json")
    parser.add_argument('-q', '--qrel', help='qrel source files e.g. qrel_1.tsv', default="qrel_1.tsv")
    args = parser.parse_args()
    answer_filepath = args.input
    topic_filepath = args.topic
    qrel_filepath = args.qrel

    topics = load_topic_file(topic_filepath)
    qrel = read_qrel_file(qrel_filepath)
    answers = read_collection(answer_filepath)

    # NOTE(review): the loaded data is not used or returned yet.
    queries = prep_queries(topics)
    

In [11]:
# Load the topics, relevance judgements and answers from the working directory,
# then build the "[TITLE]...[BODY]..." query strings.
topics = load_topic_file("topics_1.json")
qrel = read_qrel_file("qrel_1.tsv")
answers = read_collection("Answers.json")
queries = prep_queries(topics)

In [12]:
import random

def partition(data, seed=None):
    """Shuffle ``data`` and split it 80/10/10 into train, validation and test.

    Args:
        data: sequence of samples. It is copied before shuffling, so the
            caller's object is left untouched.
        seed: optional seed for a reproducible shuffle. When None, the
            module-level ``random`` generator is used.

    Returns:
        tuple ``(train, validation, test)`` of lists covering the first 80%,
        the next 10% and the remaining items of the shuffled copy.
    """
    # Shuffle a copy: shuffling `data` itself silently reordered the caller's list.
    items = list(data)
    if seed is None:
        random.shuffle(items)
    else:
        random.Random(seed).shuffle(items)

    n = len(items)
    split1 = int(n * 0.8)
    split2 = int(n * 0.9)

    train = items[:split1]
    validation = items[split1:split2]
    test = items[split2:]

    return train, validation, test


In [13]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers import datasets

# Pretrained bi-encoder and cross-encoder checkpoints.
bi_model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")
cross_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
# Only the checkpoint name is stored here; no model is loaded on this line.
splade_model = 'naver/splade_v2_max'

# Second instance of the same bi-encoder checkpoint, paired with a CoSENTLoss.
bi_finetune = SentenceTransformer("multi-qa-mpnet-base-cos-v1")
bi_loss = CoSENTLoss(bi_finetune)

# Second instance of the same cross-encoder checkpoint.
cross_finetune = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
# NOTE(review): CoSENTLoss is given a CrossEncoder here — confirm it supports that model type.
cross_loss = CoSENTLoss(cross_finetune)


In [None]:
# Build the cross-encoder samples and partition them into 80:10:10 sets
cross_dataset = cross_dataset_gen(queries,qrel,answers)
# `dataset` was never defined (NameError); the samples are in `cross_dataset`.
train_set, validation_set, test_set = partition(cross_dataset)


In [29]:
#finetune Bi encoder

from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer,
    SentencesDataset,
    InputExample,
    losses,
    evaluation,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData
)
from torch.utils.data import DataLoader
import pandas as pd
import math

def fine_tune_bi(model,loss,train,valid,model_name,epochs=10,batch_size=4):
    """Fine-tune a bi-encoder with ``model.fit`` and save it under ``models/<model_name>``.

    Args:
        model: SentenceTransformer to fine-tune.
        loss: loss module used as the training objective.
        train: sequence of InputExample training samples.
        valid: sequence of InputExample validation samples; each needs two
            texts and a numeric label. When empty, no evaluator is used.
        model_name: directory name under ``models/`` for the saved model.
        epochs: number of training epochs.
        batch_size: training batch size (default 4, formerly hard-coded).
    """
    model_save_path = "models/"+model_name

    train_dataloader = DataLoader(train, shuffle=True, batch_size=batch_size)

    # Build the evaluator from `valid`. The previous body ignored `valid` and
    # read the undefined names evaluator_samples_1/2/score (NameError).
    if valid:
        evaluator = evaluation.EmbeddingSimilarityEvaluator(
            [example.texts[0] for example in valid],
            [example.texts[1] for example in valid],
            [float(example.label) for example in valid],
            write_csv="evaluation-epoch.csv",
        )
    else:
        evaluator = None

    warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)  # 10% of train data for warm-up

    # Removed: an unused SentenceTransformerTrainingArguments object (it
    # referenced the unimported BatchSamplers, raising NameError) and an
    # unused MultipleNegativesRankingLoss. Training uses the `loss` argument.
    model.fit(
        train_objectives=[(train_dataloader, loss)],
        evaluator=evaluator,
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=model_save_path,
        save_best_model=True
    )
    # NOTE(review): this saves to the same path that fit() was given with
    # save_best_model=True — confirm it does not replace the best checkpoint.
    model.save(model_save_path)

In [None]:
# Fine-tune the bi-encoder (bi_finetune with bi_loss) on the train/validation split
fine_tune_bi(bi_finetune,bi_loss,train_set,validation_set,"fine_tuned_Bi")


In [None]:
# Fine-tuning Bi-encoder
# Models: https://sbert.net/docs/sentence_transformer/pretrained_models.html
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from itertools import islice
import json
import torch
import math
import string
import csv
import random
import os
os.environ["WANDB_DISABLED"] = "true"

def read_qrel_file(file_path):
    """Read a tab-separated qrel file into nested dictionaries.

    Column 0 is the topic id (kept as a string), column 2 the answer id and
    column 3 the relevance score (both converted to int); column 1 is
    ignored. Blank lines (for example a trailing newline) are skipped.

    Args:
        file_path: path to the qrel TSV file.

    Returns:
        dict of ``{topic_id: {answer_id: relevance_score}}``.

    Raises:
        IndexError: if a non-blank row has fewer than four columns.
        ValueError: if the answer id or score is not an integer.
    """
    dic_topic_id_answer_id_relevance = {}
    # newline="" is how the csv module documentation recommends opening files.
    with open(file_path, newline="") as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in rd:
            if not row:
                # csv.reader yields [] for an empty line; indexing it would crash.
                continue
            topic_id = row[0]
            answer_id = int(row[2])
            relevance_score = int(row[3])
            dic_topic_id_answer_id_relevance.setdefault(topic_id, {})[answer_id] = relevance_score
    return dic_topic_id_answer_id_relevance


def load_topic_file(topic_filepath):
    """Read a topics JSON file into a dictionary keyed by topic id.

    Args:
        topic_filepath: path to a JSON file holding a list of topic objects,
            each providing 'Id', 'Title', 'Body' and 'Tags' entries.

    Returns:
        dict mapping each topic 'Id' to ``[title, body, tags]`` with
        punctuation stripped from the title and body. HTML is not parsed
        here; only the punctuation characters are dropped.
    """
    # Context manager so the file handle is closed (a bare open() leaked it).
    with open(topic_filepath) as topic_file:
        queries = json.load(topic_file)

    # Build the punctuation-removal table once instead of once per field.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    result = {}
    for item in queries:
        # You may do additional preprocessing here
        title = item['Title'].translate(strip_punctuation)
        body = item['Body'].translate(strip_punctuation)
        tags = item['Tags']
        result[item['Id']] = [title, body, tags]
    return result


def read_collection(answer_filepath):
  """Read the answer collection into a dictionary.

  Args:
      answer_filepath: path to a JSON file holding a list of objects with
          'Id' and 'Text' entries.

  Returns:
      dict mapping ``int(Id)`` to the unmodified 'Text' of each document.
  """
  # Context manager so the file handle is closed (a bare open() leaked it).
  with open(answer_filepath) as answer_file:
    lst = json.load(answer_file)
  return {int(doc['Id']): doc['Text'] for doc in lst}


# Uses the posts file, topic file(s) and qrel file(s) to build our training and evaluation sets.
def process_data(queries, train_dic_qrel, val_dic_qrel, collection_dic):
    """Turn the qrel splits into training samples and evaluator inputs.

    Args:
        queries: dict mapping topic id to query text.
        train_dic_qrel: training qrels, ``{topic_id: {answer_id: score}}``.
        val_dic_qrel: validation qrels with the same layout.
        collection_dic: dict mapping answer id to answer text.

    Returns:
        tuple ``(train_samples, evaluator_samples_1, evaluator_samples_2,
        evaluator_samples_score)``: a list of InputExample with labels 1.0
        (score > 1) or 0.0, followed by the parallel validation lists of
        questions, answers and labels 1.0 (score > 1), 0.5 (score == 1)
        or 0.0.
    """
    # Training set: binary labels.
    train_samples = []
    for topic_id, judged_answers in train_dic_qrel.items():
        question = queries[topic_id]
        for answer_id, score in judged_answers.items():
            answer = collection_dic[answer_id]
            binary_label = 1.0 if score > 1 else 0.0
            train_samples.append(InputExample(texts=[question, answer], label=binary_label))

    # Validation set: three parallel lists with graded labels.
    evaluator_samples_1, evaluator_samples_2, evaluator_samples_score = [], [], []
    for topic_id, judged_answers in val_dic_qrel.items():
        question = queries[topic_id]
        for answer_id, score in judged_answers.items():
            answer = collection_dic[answer_id]
            if score > 1:
                graded_label = 1.0
            else:
                graded_label = 0.5 if score == 1 else 0.0
            evaluator_samples_1.append(question)
            evaluator_samples_2.append(answer)
            evaluator_samples_score.append(graded_label)

    return train_samples, evaluator_samples_1, evaluator_samples_2, evaluator_samples_score



def shuffle_dict(d):
    """Return a new dict with the same items as ``d`` in shuffled key order.

    The keys are shuffled with ``random.shuffle``; ``d`` itself is not modified.
    """
    shuffled_keys = [*d.keys()]
    random.shuffle(shuffled_keys)
    return dict((key, d[key]) for key in shuffled_keys)


def split_train_validation(qrels, ratio=0.9):
    """Shuffle ``qrels`` and split it into a train and a validation dict.

    Args:
        qrels: dict to split.
        ratio: fraction of entries placed in the train part; the cut
            position is ``int(len(qrels) * ratio)``.

    Returns:
        tuple ``(train, validation)``: the first ``cut`` entries of the
        shuffled dict and the remaining entries.
    """
    cut = int(len(qrels) * ratio)
    shuffled = shuffle_dict(qrels)
    head = dict(islice(shuffled.items(), cut))
    tail = dict(islice(shuffled.items(), cut, None))
    return head, tail


def train(model):
    """Fine-tune ``model`` as a bi-encoder on the local topic, qrel and answer files.

    Reads "topics_1.json", "qrel_1.tsv" and "Answers.json", splits the qrels
    into train and validation parts, and runs ``model.fit`` with a
    CoSENTLoss objective and an EmbeddingSimilarityEvaluator. The best model
    is written to the directory named by ``MODEL``.

    Args:
        model: the SentenceTransformer to fine-tune.
    """
    # Training configuration.
    num_epochs = 5
    batch_size = 10
    # Rename this when training the model and keep track of results
    MODEL = "bi_multi-qa-mpnet-base-cos-v1"

    # Queries: "[TITLE]<title>[BODY]<body>" per topic id.
    dic_topics = load_topic_file("topics_1.json")
    queries = {
        query_id: "[TITLE]" + fields[0] + "[BODY]" + fields[1]
        for query_id, fields in dic_topics.items()
    }

    # Relevance judgements and answers, then the train/validation split.
    qrel = read_qrel_file("qrel_1.tsv")
    collection_dic = read_collection('Answers.json')
    train_dic_qrel, val_dic_qrel = split_train_validation(qrel)

    # Training samples plus the three parallel validation lists.
    samples = process_data(queries, train_dic_qrel, val_dic_qrel, collection_dic)
    train_samples, evaluator_samples_1, evaluator_samples_2, evaluator_samples_score = samples

    train_dataloader = DataLoader(
        SentencesDataset(train_samples, model=model),
        shuffle=True,
        batch_size=batch_size,
    )
    train_loss = losses.CoSENTLoss(model=model)

    evaluator = evaluation.EmbeddingSimilarityEvaluator(
        evaluator_samples_1,
        evaluator_samples_2,
        evaluator_samples_score,
        write_csv="evaluation-epoch.csv",
    )
    # 10% of the total training steps are used for warm-up.
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

    # The evaluator is passed to fit() alongside the training objective.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        use_amp=True,
        save_best_model=True,
        show_progress_bar=True,
        output_path=MODEL,
    )

# Load the base bi-encoder checkpoint.
model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
# NOTE(review): looks like a leftover debug marker — confirm it is still wanted.
print("here")
# Run the fine-tuning defined above.
train(model)

In [None]:
# Fine-tuning Cross-encoder
import csv
import datetime
import json
import string
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer, util, CrossEncoder, losses
import torch
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator, CEBinaryClassificationEvaluator, \
    CERerankingEvaluator
from torch.utils.data import DataLoader
import math


def read_qrel_file(qrel_filepath):
    """Read a tab-separated qrel file into nested dictionaries.

    Column 0 is used as the query id, column 2 as the document id and
    column 3 as the integer relevance score; column 1 is ignored.
    Blank lines (for example a trailing newline) are skipped.

    Args:
        qrel_filepath: path to the qrel TSV file.

    Returns:
        dict of ``{query_id: {doc_id: score}}`` with string ids and int scores.

    Raises:
        IndexError: if a non-blank row has fewer than four columns.
        ValueError: if the score column is not an integer.
    """
    result = {}
    # newline="" is how the csv module documentation recommends opening files.
    with open(qrel_filepath, "r", newline="") as f:
        for line in csv.reader(f, delimiter='\t', lineterminator='\n'):
            if len(line) == 0:
                # An empty line parses to []; indexing it would raise IndexError.
                continue
            query_id, doc_id, score = line[0], line[2], int(line[3])
            if query_id not in result:
                result[query_id] = {}
            result[query_id][doc_id] = score
    return result


def load_topic_file(topic_filepath):
    """Read a topics JSON file into a dictionary keyed by topic id.

    Args:
        topic_filepath: path to a JSON file holding a list of topic objects,
            each providing 'Id', 'Title', 'Body' and 'Tags' entries.

    Returns:
        dict mapping each topic 'Id' to ``[title, body, tags]`` with
        punctuation stripped from the title and body.
    """
    # Context manager so the file handle is closed (a bare open() leaked it).
    with open(topic_filepath) as topic_file:
        queries = json.load(topic_file)

    # One translation table shared by every title and body.
    no_punctuation = str.maketrans('', '', string.punctuation)

    # You may do additional preprocessing here
    return {
        item['Id']: [
            item['Title'].translate(no_punctuation),
            item['Body'].translate(no_punctuation),
            item['Tags'],
        ]
        for item in queries
    }


def read_collection(answer_filepath):
  """Read the answer collection into a dictionary.

  Args:
      answer_filepath: path to a JSON file holding a list of objects with
          'Id' and 'Text' entries.

  Returns:
      dict mapping each 'Id' (as stored in the file) to its unmodified 'Text'.
  """
  # Context manager so the file handle is closed (a bare open() leaked it).
  with open(answer_filepath) as answer_file:
    lst = json.load(answer_file)
  return {doc['Id']: doc['Text'] for doc in lst}


## reading queries and collection
dic_topics = load_topic_file("topics_1.json")
# Each query is "[TITLE]<title>[BODY]<body>"; the two markers are registered
# as special tokens with the tokenizer further down in this cell.
queries = {}
for query_id in dic_topics:
    queries[query_id] = "[TITLE]" + dic_topics[query_id][0] + "[BODY]" + dic_topics[query_id][1]
# qrel: {query_id: {doc_id: score}}; collection_dic: {doc_id: answer text}
qrel = read_qrel_file("qrel_1.tsv")
collection_dic = read_collection('Answers.json')

## Preparing pairs of training instances
num_topics = len(queries.keys())
# Queries whose loop position is below this threshold go to training, the rest to validation.
# NOTE(review): the threshold is 90% of the topic count, but the loop below counts
# qrel entries — confirm both cover the same set of topics.
number_training_samples = int(num_topics*0.9)


## Preparing the content
# 1-based position of the current query in the qrel iteration order.
counter = 1
# Training pairs as InputExample objects (scores >= 1 collapsed to label 1).
train_samples = []
# Validation data: qid -> {'query': text, 'positive': set of texts, 'negative': set of texts}
valid_samples = {}
for qid in qrel:
    # key: doc id, value: relevance score
    dic_doc_id_relevance = qrel[qid]
    # query text
    topic_text = queries[qid]

    if counter < number_training_samples:
        # Training portion: one InputExample per judged document.
        for doc_id in dic_doc_id_relevance:
            label = dic_doc_id_relevance[doc_id]
            content = collection_dic[doc_id]
            if label >= 1:
                label = 1
            train_samples.append(InputExample(texts=[topic_text, content], label=label))
    else:
        # Validation portion: group the answer texts per query by relevance.
        for doc_id in dic_doc_id_relevance:
            label = dic_doc_id_relevance[doc_id]
            if qid not in valid_samples:
                valid_samples[qid] = {'query': topic_text, 'positive': set(), 'negative': set()}
            # A score of exactly 0 is 'negative'; every other score is 'positive'.
            if label == 0:
                label = 'negative'
            else:
                label = 'positive'
            content = collection_dic[doc_id]
            valid_samples[qid][label].add(content)
    counter += 1

print("Training and validation set prepared")

# selecting cross-encoder
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Learn how to use GPU with this!
model = CrossEncoder(model_name)

# Adding special tokens
tokens = ["[TITLE]", "[BODY]"]
model.tokenizer.add_tokens(tokens, special_tokens=True)
# Resize the embedding matrix to the enlarged tokenizer vocabulary.
model.model.resize_token_embeddings(len(model.tokenizer))

num_epochs = 10
# NOTE(review): the save path is the same string as model_name above — confirm that
# writing the fine-tuned model under this name is intended.
model_save_path = "cross-encoder/ms-marco-MiniLM-L-6-v2"
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=4)
# During training, the dev set (valid_samples) is evaluated with CERerankingEvaluator.
evaluator = CERerankingEvaluator(valid_samples, name='train-eval')
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
# NOTE(review): train_loss is created here but is not passed to model.fit below.
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          save_best_model=True)

# NOTE(review): saves to the same path fit() used with save_best_model=True —
# confirm this does not replace the best checkpoint.
model.save(model_save_path)


In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

def splade():
    
    model_id = 'naver/splade_v2_max'
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForMaskedLM.from_pretrained(model_id)