In [None]:
pip install sentence_transformers torch

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import torch
import os
import tqdm
import time
import pickle
from torch.utils.data import DataLoader, Dataset
import multiprocessing as mp
from sklearn.preprocessing import normalize
import gc

In [None]:
"""
Load Datasets, only need Test data for metric evaluation
"""
# Every file is tab-separated and read with explicit column names.
# Rows pandas cannot parse are dropped for the collection and the queries
# (on_bad_lines="skip"); the qrels file uses pandas' default bad-line handling.
df_collection = pd.read_csv(
    "Data/collection.tsv",
    sep="\t",
    names=["doc_id", "passage"],
    on_bad_lines="skip",
)
df_queries = pd.read_csv(
    "Data/queries.test.tsv",
    sep="\t",
    names=["query_id", "query"],
    on_bad_lines="skip",
)
df_qrels = pd.read_csv(
    "Data/qrels.test.tsv",
    sep="\t",
    names=["query_id", "doc_id"],
)

In [None]:
"""
Choose model for embedding
Basemodel: mixedbread-ai/deepset-mxbai-embed-de-large-v1
Finetuned model: Mxbai/finetuning_v21/model_finetuned
"""
# Instantiate the base checkpoint on the CUDA device.
model = SentenceTransformer(
    "mixedbread-ai/deepset-mxbai-embed-de-large-v1",
    device="cuda",
)

In [None]:
"""
For multiprocessing
"""
# Start the multi-process encoding pool of the base model and keep the handle
# in `pool`; the same handle is later handed to model.stop_multi_process_pool().
# NOTE(review): none of the active encode calls in this notebook pass `pool` —
# confirm the pool is still needed before paying for the worker start-up.
pool = model.start_multi_process_pool()

In [None]:
# Distinct doc_ids referenced by the test qrels; used to restrict the collection.
relevant_doc_ids = {*df_qrels["doc_id"].unique()}

# Report how many distinct documents the qrels refer to.
print(len(relevant_doc_ids))

In [None]:
# Keep only passages whose doc_id occurs in the qrels, then renumber the rows
# from 0 so the positional index matches the order of the retained passages.
df_collection_filtered = (
    df_collection
    .loc[lambda frame: frame["doc_id"].isin(relevant_doc_ids)]
    .reset_index(drop=True)
)

# Number of passages that remain after filtering.
print(len(df_collection_filtered))

In [None]:
"""
Method to calculate embeddings with multiple GPUs
-------
With one GPU use model.encode() without the pool argument (batch_size still applies)
"""
# Encode the filtered passages with the base model in the current process.
# The multi-GPU alternative is model.encode_multi_process(<passages>, pool, ...)
# with the same keyword arguments.
batch_size = 256
print("Start normalized embeddings")
corpus_embeddings = model.encode(
    df_collection_filtered["passage"],
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,  # request normalized vectors from the encoder
)

In [None]:
# Container for the passage embeddings.
# 'doc_ids' maps the row position in df_collection_filtered to its doc_id;
# presumably row i of 'embeddings' belongs to doc_ids[i] — verify against the
# encoder's output order.
passage_to_doc_id = {
    position: doc_id
    for position, doc_id in enumerate(df_collection_filtered["doc_id"])
}
embedding_data = {
    "doc_ids": passage_to_doc_id,
    "embeddings": corpus_embeddings,
}

In [None]:
"""
Save passage embeddings to pickle
"""
# Serialize the current embedding_data dict; the target directory must exist.
with open("Data/models/mxbai/embedding_passages_only_test_v2.pkl", mode="wb") as pickle_out:
    pickle.dump(embedding_data, pickle_out)

In [None]:
"""
Load Finetuned Model
"""

# Instantiate the finetuned checkpoint from its local path on the CUDA device.
model_v1 = SentenceTransformer(
    "Mxbai/finetuning_v21/model_finetuned",
    device="cuda",
)

In [None]:
"""
Method to calculate embeddings with multiple GPUs
-------
With one GPU use model.encode(), without the pool
"""
# Encode the filtered passages with the finetuned model in the current process.
# This rebinds corpus_embeddings, replacing whatever an earlier cell stored there.
batch_size = 256
print("Start normalized embeddings")
start = time.time()  # wall-clock reference for the elapsed-time report below
corpus_embeddings = model_v1.encode(
    df_collection_filtered["passage"],
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,  # request normalized vectors from the encoder
)
# Report the duration so the timer started above is actually used.
print(f"Finished passage embeddings in {time.time() - start:.1f}s")

In [None]:
# Rebuild the passage container from the current corpus_embeddings.
# 'doc_ids' maps the row position in df_collection_filtered to its doc_id;
# presumably row i of 'embeddings' belongs to doc_ids[i] — verify against the
# encoder's output order.
passage_to_doc_id = {
    position: doc_id
    for position, doc_id in enumerate(df_collection_filtered["doc_id"])
}
embedding_data = {
    "doc_ids": passage_to_doc_id,
    "embeddings": corpus_embeddings,
}

In [None]:
"""
Save passage embeddings to pickle
"""
# Serialize the current embedding_data dict; the target directory must exist.
with open("Data/models/mxbai/embedding_passages_only_test_v3.pkl", mode="wb") as pickle_out:
    pickle.dump(embedding_data, pickle_out)

In [None]:
"""
Method to calculate query embeddings with multiple GPUs
-------
With one GPU use model.encode() without the pool argument (batch_size still applies)
"""
# Encode every test query with the finetuned model in the current process.
# The multi-GPU alternative is model.encode_multi_process(<queries>, pool, ...)
# with the same keyword arguments.
batch_size = 256
print("Start normalized embeddings")
query_embeddings = model_v1.encode(
    df_queries["query"],
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,  # request normalized vectors from the encoder
)

In [None]:
# Container for the query embeddings.
# 'query_ids' maps the row position in df_queries to its query_id; presumably
# row i of 'embeddings' belongs to query_ids[i] — verify against the encoder's
# output order.
query_id_dict = {
    position: query_id
    for position, query_id in enumerate(df_queries["query_id"])
}
query_data = {
    "query_ids": query_id_dict,
    "embeddings": query_embeddings,
}

In [None]:
"""
Save query embeddings to pickle
"""
# Serialize the query_data dict; the target directory must exist.
with open("Data/models/mxbai/query_embeddings_test_v3.pkl", mode="wb") as pickle_out:
    pickle.dump(query_data, pickle_out)

In [None]:
# Shut down the multi-process pool whose handle was stored in `pool` by the
# earlier model.start_multi_process_pool() call.
model.stop_multi_process_pool(pool)