In [3]:
import pandas as pd
import ir_datasets
import pickle
from sentence_transformers import SentenceTransformer
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc, calc_aggregate
from tqdm.notebook import tqdm
import torch
from sentence_transformers import CrossEncoder
import numpy as np
import ast
from sklearn.model_selection import train_test_split


In [4]:
df_documents = pd.read_csv('data/sbert_documents.csv')
df_queries = pd.read_csv('data/queries.csv')
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")
english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]

qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance) for qrel in dataset.qrels_iter()]

In [5]:
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")

english_queries = [(query.query_id, query.title) for query in dataset.queries_iter()]
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
query_embeddings = model.encode([query[1] for query in english_queries], convert_to_tensor=True)

In [None]:
df_queries['query_embedding'] = df_queries['query_embedding'].apply(lambda x: torch.tensor(ast.literal_eval(x.split('tensor')[1])))
df_documents['content_embedding'] = df_documents['content_embedding'].apply(lambda x: torch.tensor(ast.literal_eval(x.split('tensor')[1])))
df_documents['title_embedding'] = df_documents['title_embedding'].apply(lambda x: torch.tensor(ast.literal_eval(x.split('tensor')[1])))

In [7]:
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
    scores = calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)

    return scores

In [None]:
import numpy as np

train_feature_set = []
train_labels = []

# Iterate through query 200-220 as training data
for query_id, doc_id, score in tqdm(qrels):
    if 200 <= int(query_id) <= 220:
        # Retrieve the corresponding query embedding and document embedding
        query_embedding = df_queries.loc[df_queries['id'] == int(query_id), 'query_embedding'].values[0]
        content_embedding = df_documents.loc[df_documents['id'] == doc_id, 'content_embedding'].values[0]

        # Concatenate the query embedding and document content embedding
        feature_vector = np.concatenate((query_embedding, content_embedding))
        
        # Append the feature vector and score
        train_feature_set.append(feature_vector)
        train_labels.append(score)

train_feature_set = np.array(train_feature_set)  # Numpy array of feature vectors
train_labels = np.array(train_labels)  # Numpy array of scores


  0%|          | 0/79934 [00:00<?, ?it/s]

In [None]:
test_feature_set = []
test_labels = []
test_doc_ids = []
test_query_id = []
for query_id, doc_id, score in tqdm(qrels):
    # Iterate through query 221-275 as test data
    if 221 <= int(query_id) <= 275:
        # Retrieve the corresponding query embedding and document embedding
        query_embedding = df_queries.loc[df_queries['id'] == int(query_id), 'query_embedding'].values[0]
        content_embedding = df_documents.loc[df_documents['id'] == doc_id, 'content_embedding'].values[0]
        test_doc_ids.append(doc_id)
        test_query_id.append(query_id)
        
        # Concatenate query embedding and document content embedding
        feature_vector = np.concatenate((query_embedding, content_embedding))
        
        # Append the feature vector and score
        test_feature_set.append(feature_vector)
        test_labels.append(score)

# Convert feature_set and labels into appropriate formats
test_feature_set = np.array(test_feature_set)  
test_labels = np.array(test_labels)  

  0%|          | 0/79934 [00:00<?, ?it/s]

In [10]:
class DocumentQueryDataset(torch.utils.data.Dataset):
    def __init__(self, feature_embeddings, relevance_scores):
        self.feature_embeddings = feature_embeddings
        self.relevance_scores = relevance_scores

    def __len__(self):
        return len(self.relevance_scores)

    def __getitem__(self, idx):
        feature_embeddings = torch.tensor(self.feature_embeddings[idx], dtype=torch.float32)
        relevance_score = torch.tensor(self.relevance_scores[idx], dtype=torch.float32)
        return feature_embeddings, relevance_score
    
dataset = DocumentQueryDataset(train_feature_set, train_labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
class MLPModel(torch.nn.Module):
    def __init__(self):
        super(MLPModel, self).__init__()

        self.input_size = 768
        
        self.hidden1 = torch.nn.Linear(self.input_size, self.input_size)
        self.hidden2 = torch.nn.Linear(self.input_size, int(self.input_size/2))
        self.output = torch.nn.Linear(int(self.input_size/2), 4)  
        
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.hidden1(x))
        x = self.relu(self.hidden2(x))
        x = self.output(x)
        return x  

In [None]:
model = MLPModel()  
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for data, target in dataloader:
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(data)

        target = target.long()
        
        # Compute loss
        loss = criterion(output, target)
        
        # Backpropagation
        loss.backward()
        
        # Update weights
        optimizer.step()
        
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/5, Loss: 0.6278221011161804
Epoch 2/5, Loss: 0.906505286693573
Epoch 3/5, Loss: 0.5591813921928406
Epoch 4/5, Loss: 0.487682968378067
Epoch 5/5, Loss: 0.8473197221755981


In [None]:
model.eval()  

features = torch.tensor(test_feature_set, dtype=torch.float32)

with torch.no_grad():  
    output = model(features) 

    predicted_class = torch.argmax(output, dim=1).numpy().tolist()

In [None]:
# Convert test qrels into correct format
test_qrels = [item for item in qrels if int(item[0]) >= 221]


In [None]:
# convert predictions into correct format
mlp_scores = list(zip(test_query_id, test_doc_ids, predicted_class))

In [17]:
evaluate(mlp_scores, test_qrels)

{nDCG@20: 0.022632390815729223,
 R@100: 0.06808907513492216,
 R@1000: 0.6295431973386293,
 RBP(rel=1): 0.022045023113816155,
 AP: 0.016838619148803944}