In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import ir_datasets
from sentence_transformers import SentenceTransformer

import numpy as np
from tqdm.notebook import tqdm
import pickle
import ir_measures

from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc


In [None]:
# Define evaluation function 
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
    scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)

    return scores

In [3]:
with open('data/multi-subset.pkl', 'rb') as file:
    document_subset = pickle.load(file)

dataset = ir_datasets.load("neuclir/1/multi/trec-2023")
dataset
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]


model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

query_embeddings = [
    (query[0], model.encode(query[1], convert_to_tensor=True)) 
    for query in tqdm(english_queries, desc="Encoding queries")
]

with open('document_embeddings.pkl', 'rb') as file:
    document_embeddings = pickle.load(file)

with open('data/multi-subset.pkl', 'rb') as file:
    multi_subset = pickle.load(file)

multi_ids = [item[0] for item in multi_subset]
document_embeddings = list(zip(multi_ids, document_embeddings))

Encoding queries:   0%|          | 0/76 [00:00<?, ?it/s]

In [220]:
def get_embedding_for_id(qrels, document_embeddings, target_id):
    # Create a dictionary from document_embeddings for fast lookup
    doc_embeddings_dict = {doc_id: emb for doc_id, emb in document_embeddings}

    # Iterate through qrels to find the target id and return its embedding
    for _, doc_id, _ in qrels:
        if doc_id == target_id:
            return doc_embeddings_dict.get(doc_id, None)
        
    return None


In [None]:
training_subset = []
document_embedding_subset = [] 
labels_subset = [] 
count = 0
for qrel_number in tqdm(range(200, 210)):
    filtered_qrels = [tup for tup in qrels if tup[0] == str(qrel_number)]
  
    for qrel in tqdm(filtered_qrels): 
        target_id = qrel[1]
        embedding = get_embedding_for_id(qrels, document_embeddings, target_id)
        document_embedding_subset.append(embedding)
        labels_subset.append(qrel[2])
    
    training_subset_current_iteration = [torch.cat((tensor, query_embeddings[count][1]), dim=0) for tensor in document_embedding_subset]
    training_subset.extend(training_subset_current_iteration)
    count += 1

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1448 [00:00<?, ?it/s]

  0%|          | 0/862 [00:00<?, ?it/s]

  0%|          | 0/1361 [00:00<?, ?it/s]

  0%|          | 0/967 [00:00<?, ?it/s]

  0%|          | 0/1477 [00:00<?, ?it/s]

  0%|          | 0/1523 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

  0%|          | 0/1019 [00:00<?, ?it/s]

  0%|          | 0/1119 [00:00<?, ?it/s]

  0%|          | 0/2293 [00:00<?, ?it/s]

In [109]:
import torch
from torch.utils.data import Dataset

class DocumentQueryDataset(Dataset):
    def __init__(self, feature_embeddings, relevance_scores):
        self.feature_embeddings = feature_embeddings
        self.relevance_scores = relevance_scores

    def __len__(self):
        return len(self.relevance_scores)

    def __getitem__(self, idx):
        feature_embeddings = torch.tensor(self.feature_embeddings[idx], dtype=torch.float32)
        relevance_score = torch.tensor(self.relevance_scores[idx], dtype=torch.float32)
        return feature_embeddings, relevance_score

In [110]:
from torch.utils.data import DataLoader

# Assuming doc_embeddings, query_embeddings, and relevance_scores are numpy arrays or lists
dataset = DocumentQueryDataset(training_subset, labels_subset)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [111]:
class MLPModel(nn.Module):
    def __init__(self):
        super(MLPModel, self).__init__()
        # Input layer is 2 * embedding_size
        self.input_size = 768
        
        # Define the hidden layers
        self.hidden1 = nn.Linear(self.input_size, 768)
        self.hidden2 = nn.Linear(768, 384)
        self.output = nn.Linear(384, 4)  
        
        # Activation functions
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.hidden1(x))
        x = self.relu(self.hidden2(x))
        x = self.output(x)
        return x  # Raw logits for multi-class classification

In [112]:
# Initialize the model, loss function, and optimizer
model = MLPModel()  # Set embedding size based on your embeddings
criterion = nn.CrossEntropyLoss()  # Cross entropy for multi-class classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [115]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for data, target in dataloader:
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(data)

        target = target.long()
        
        # Compute the loss
        loss = criterion(output, target)
        
        # Backpropagation
        loss.backward()
        
        # Update the weights
        optimizer.step()
        
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

  feature_embeddings = torch.tensor(self.feature_embeddings[idx], dtype=torch.float32)


Epoch 1/5, Loss: 1.3241698741912842
Epoch 2/5, Loss: 1.2908841371536255
Epoch 3/5, Loss: 0.7200946807861328
Epoch 4/5, Loss: 0.4602888822555542
Epoch 5/5, Loss: 0.5500671863555908


In [208]:
document_embeddings[0]

('7707eaec-e3a6-422a-9e49-68ef7d2baaec',
 tensor([ 0.2471, -0.0563,  0.1989,  0.2372,  0.2023,  0.0907,  0.1063,  0.0344,
          0.0772,  0.0111,  0.1059,  0.0678, -0.0716,  0.1183,  0.2134, -0.0463,
          0.1280, -0.1676, -0.3580,  0.2306, -0.2797, -0.0795,  0.1890,  0.1065,
         -0.1021,  0.0766, -0.1166, -0.0851, -0.1898,  0.0535,  0.0759, -0.0718,
          0.0698, -0.0335, -0.1357,  0.0945, -0.0415,  0.0902, -0.1679, -0.0291,
         -0.0273, -0.1866,  0.0859, -0.0446, -0.1004,  0.1884,  0.0150,  0.0557,
         -0.0983, -0.0696, -0.0465,  0.0185,  0.1249,  0.0862,  0.0469,  0.2441,
         -0.0667, -0.0874, -0.1384,  0.1545,  0.1402, -0.0100,  0.0443,  0.2315,
          0.1856, -0.0649, -0.1911, -0.0896, -0.0506, -0.1312,  0.1476,  0.0244,
         -0.0703, -0.0612,  0.0666, -0.0420, -0.1119, -0.0168, -0.1232, -0.3195,
          0.0789, -0.0761,  0.0936,  0.0570, -0.0341, -0.0104, -0.0372, -0.0689,
         -0.2534,  0.1144, -0.0940,  0.0567,  0.1318, -0.0337,  0.14

In [306]:
testing_subset = []
testing_document_embedding_subset = [] 
testing_labels_subset = [] 
document_ids = []
query_id = []

count = 0
for qrel_number in tqdm(range(200, 270)):
    filtered_qrels = [tup for tup in qrels if tup[0] == str(qrel_number)]
  
    for qrel in filtered_qrels: 
        target_id = qrel[1]
        embedding = get_embedding_for_id(qrels, document_embeddings, target_id)
        testing_document_embedding_subset.append(embedding)
        document_ids.append(str(200+ count))
        testing_labels_subset.append(qrel[2])
        query_id.append(target_id)
    
    testing_subset_current_iteration = [torch.cat((tensor, query_embeddings[count][1]), dim=0) for tensor in testing_document_embedding_subset]
    testing_subset.extend(training_subset_current_iteration)
    count += 1

  0%|          | 0/70 [00:00<?, ?it/s]

In [307]:
with open('testing_subset.pkl', 'wb') as file:
    # Serialize the list and save it to the file
    pickle.dump(testing_subset, file)

with open('labels_subset.pkl', 'wb') as file:
    # Serialize the list and save it to the file
    pickle.dump(labels_subset, file)

In [308]:
model.eval()

testing_subset_tensor = torch.stack(testing_subset)
# Ensure no gradients are computed for inference
with torch.no_grad():
    # Pass data through the model
    predictions = model(testing_subset_tensor)
    
    # For classification, get the predicted class
    predicted_classes = torch.argmax(predictions, dim=1)  # Shape: [batch_size]


In [309]:
len(qrels)

79934

In [310]:
len(predicted_classes)

887320

In [311]:
scores = list(zip(document_ids, query_id, predicted_classes.tolist()))

In [312]:
from itertools import groupby

scores_sorted = []
for key, group in groupby(sorted(scores, key=lambda x: x[0]), key=lambda x: x[0]):
    # Sort each group by the third item
    scores_sorted.extend(sorted(group, key=lambda x: x[2], reverse=True))

In [340]:
evaluate(qrels, scores_sorted)

{AP: 0.17663493184716525,
 R@1000: 0.8140258447147039,
 RBP(rel=1): 0.1955172311579157,
 R@100: 0.0984033707842195,
 nDCG@20: 0.11625619697869208}