# Imports

In [1]:
import torch
import ir_datasets
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F

import numpy as np
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import pickle
from collections import defaultdict
import ir_measures

from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [2]:
cross_encoder = pipeline('text-classification', model='cross-encoder/ms-marco-MiniLM-L-6-v2')

# Downloading Dataset

In [3]:
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")
dataset
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

In [None]:
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
    scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)

    return scores

# Training

In [6]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [7]:
query_embeddings = [
    (query[0], model.encode(query[1], convert_to_tensor=True)) 
    for query in tqdm(english_queries, desc="Encoding queries")
]

with open('document_embeddings.pkl', 'rb') as file:
    document_embeddings = pickle.load(file)

with open('data/multi-subset.pkl', 'rb') as file:
    multi_subset = pickle.load(file)

multi_ids = [item[0] for item in multi_subset]
document_embeddings = list(zip(multi_ids, document_embeddings))

Encoding queries:   0%|          | 0/76 [00:00<?, ?it/s]

In [8]:
def cosine_simarlity(query, documents):
    similarities = []

    query_id, query_embedding = query
    for doc_id, doc_embedding in documents:
        # Normalize the document embedding
        doc_embedding = doc_embedding / doc_embedding.norm()
        
        # Compute cosine similarity (dot product)
        cos_sim = F.cosine_similarity(query_embedding, doc_embedding, dim=0)
        
        # Append the result as a tuple of (document ID, cosine similarity)
        similarities.append((query_id, doc_id, cos_sim.item()))

    return similarities

def euclidean_distance(query, documents):
    similarities = []

    query_id, query_embedding = query
    for doc_id, doc_embedding in documents:

        doc_embedding = doc_embedding / doc_embedding.norm()

        # Compute Euclidean distance
        distance = torch.norm(query_embedding - doc_embedding)
        
        # Append the result as a tuple of (document ID, Euclidean distance)
        similarities.append((query_id, doc_id, distance.item()))

    return similarities


def dot_product(query, documents):
    similarities = []

    query_id, query_embedding = query
    for doc_id, doc_embedding in documents:

        doc_embedding = doc_embedding / doc_embedding.norm()

        # Compute dot product
        dot_prod = torch.dot(query_embedding, doc_embedding)
        
        # Append the result as a tuple of (document ID, dot product)
        similarities.append((query_id, doc_id, dot_prod.item()))

    return similarities

In [9]:
def process_documents(queries, documents, similarity_measure):
    all_queries = []
    for query in tqdm(queries):
        cosine_similarity_documents = similarity_measure(query, documents) 

        sorted_documents = sorted(cosine_similarity_documents, key=lambda x: x[2], reverse=True)
        all_queries.extend([sorted_documents])


        
    return all_queries

In [10]:
scores = process_documents(query_embeddings, document_embeddings, cosine_simarlity)

  0%|          | 0/76 [00:00<?, ?it/s]

In [None]:
merged_list = [item for sublist in scores for item in sublist]

In [17]:
evaluation_score = evaluate(qrels, merged_list)

In [18]:
evaluation_score

{RBP(rel=1): 0.45510694717835337,
 nDCG@20: 0.3148521932945857,
 R@100: 0.2100646377835137,
 R@1000: 0.6727884396541376,
 AP: 0.2237307845483261}

In [11]:
with open('data/multi-subset.pkl', 'rb') as file:
    document_subset = pickle.load(file)
    
model_name = "cross-encoder/ms-marco-TinyBERT-L-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
document_dict = {doc_id: doc_title for doc_id, doc_title, _ in document_subset}

In [None]:
updated_scores = []
for query_scores in tqdm(scores):
    score_in_query = []
    for query in query_scores:
        
        uuid = query[1]

        text = document_dict.get(uuid, "No text found")
        
        if text not in query:

            score_in_query.append(query + (text,))
        else:
            score_in_query.append(query)
    updated_scores.append(score_in_query)

  0%|          | 0/76 [00:00<?, ?it/s]

In [14]:
cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')  

In [None]:
cross_encoded_scores = []
for new_scores in tqdm(updated_scores):
    top_100  = new_scores[:100]
    pairs = [(english_queries[0][1], doc[3]) for doc in top_100]
    top_100_scores = cross_encoder.predict(pairs)

    sorted_top_100_indices = np.argsort(top_100_scores)[::-1]

    sorted_top_100 = [top_100[i] for i in sorted_top_100_indices]

    items = []
    for index ,item in enumerate(sorted_top_100[:100]):
        adjustment = (100 - index) / 500 - 0.1

        items.append((item[0], item[1],item[2] + adjustment, item[3]))

        

    sorted_top_100[:100] 

    new_scores[:100] = items

    cross_encoded_scores.extend(new_scores)


  0%|          | 0/76 [00:00<?, ?it/s]

In [16]:
cross_encoded_scores
cross_encoded_scores_evaluation = [tup[:3] for tup in cross_encoded_scores]


In [17]:
flattened_scores = [item for sublist in scores for item in sublist]
flattened_scores

[('200', 'c6fa30d0-dbfc-4bc3-892c-5e95a65f46b1', 0.7512326836585999),
 ('200', 'b7d9703b-bbe4-46e4-8b2f-1e4c4ef4ea20', 0.7421060800552368),
 ('200', 'a9ef1603-604e-43f9-b8cd-80f98b3ac144', 0.7155982255935669),
 ('200', '3f669212-6d89-4826-8ac5-ea1f9adaeb10', 0.705804705619812),
 ('200', '40b1d7b9-856b-416f-9e9e-daa14daab532', 0.7012007832527161),
 ('200', 'c8b66c15-d538-4f8a-9534-f5adfca6188c', 0.6989572644233704),
 ('200', '164fef6f-9200-4f5c-8778-335d4e689b55', 0.6896619200706482),
 ('200', 'd84ebc49-2b1f-433b-b8c5-3f2572d6d273', 0.6877737641334534),
 ('200', '99a91ff3-c6f9-43bc-8963-2e21468d6b20', 0.6854864358901978),
 ('200', 'adc44758-18ce-4893-848c-efb6261f3cc4', 0.6827200055122375),
 ('200', 'bb4bfa48-51ad-40ad-9438-64dc9e65b554', 0.6813896894454956),
 ('200', 'fb49ace6-aaa3-4ce0-a6bb-5504213cac27', 0.6804773211479187),
 ('200', '02d3c393-227d-4573-b7bb-df798d132186', 0.6795737743377686),
 ('200', '7f7b7999-31c9-4c9b-8f86-8c8e00ddf555', 0.6781703233718872),
 ('200', 'dce80923-b7

In [18]:
evaluation_score = evaluate(qrels, cross_encoded_scores_evaluation)

In [19]:
evaluation_score

{R@1000: 0.6728081962528415,
 nDCG@20: 0.2574415083961664,
 AP: 0.2084441975338404,
 RBP(rel=1): 0.3826343516937159,
 R@100: 0.20394230185676054}

In [20]:
evaluation_score

{R@1000: 0.6728081962528415,
 nDCG@20: 0.2574415083961664,
 AP: 0.2084441975338404,
 RBP(rel=1): 0.3826343516937159,
 R@100: 0.20394230185676054}