# Imports

In [8]:
import pandas as pd
from datasets import load_dataset, Dataset
from datasets import concatenate_datasets
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim

# Initialization

In [2]:
# Location the fine-tuned model is loaded from further down.
final_model_path = "gte-finance-model"

# Read each CSV and rename its columns in one chained expression:
# "query" becomes "anchor" and "corpus" becomes "positive".
train_df = (
    load_dataset("csv", data_files="training_data.csv")
    .rename_column("query", "anchor")
    .rename_column("corpus", "positive")
)
eval_df = (
    load_dataset("csv", data_files="eval_data.csv")
    .rename_column("query", "anchor")
    .rename_column("corpus", "positive")
)

# Process and Evaluate

In [3]:
# The retrieval corpus is every "positive" passage from both files.
corpus_dataset = concatenate_datasets([train_df["train"], eval_df["train"]])

# Corpus lookup (cid => document). If an id appears more than once,
# the last occurrence is the one that is kept.
corpus = {
    cid: document
    for cid, document in zip(corpus_dataset["id"], corpus_dataset["positive"])
}

# Query lookup (qid => question), built from the evaluation file only.
queries = {
    qid: question
    for qid, question in zip(eval_df["train"]["id"], eval_df["train"]["anchor"])
}

In [4]:
 # Create a mapping of relevant document (1 in our case) for each query
# Query ID to relevant documents (qid => set of relevant cids).
# Each query has exactly one relevant document, stored under the same id.
# The values are sets, as the evaluator documents for `relevant_docs`,
# rather than single-element lists.
relevant_docs = {q_id: {q_id} for q_id in queries}

In [5]:
# One evaluator instance, reused for both the base and the fine-tuned model
# so that they are scored on the same queries, corpus and relevance labels.
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    # Report metrics under the "cosine" prefix using cosine similarity.
    score_functions={"cosine": cos_sim},
    # Truncate the embeddings to at most this many dimensions.
    # NOTE(review): 2048 looks larger than the embedding size of the models
    # evaluated below, in which case nothing is truncated — confirm.
    truncate_dim=2048,
)

# Base Model Metrics

In [6]:
# Baseline: the off-the-shelf checkpoint, with inputs capped at 512 tokens.
model = SentenceTransformer(
    "Alibaba-NLP/gte-multilingual-base",
    trust_remote_code=True,
)
model.max_seq_length = 512

# The cell's last expression is the metrics dict returned by the evaluator.
ir_evaluator(model)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'cosine_accuracy@1': 0.5535911602209945,
 'cosine_accuracy@3': 0.6883977900552486,
 'cosine_accuracy@5': 0.7359116022099448,
 'cosine_accuracy@10': 0.7900552486187845,
 'cosine_precision@1': 0.5535911602209945,
 'cosine_precision@3': 0.2294659300184162,
 'cosine_precision@5': 0.14718232044198895,
 'cosine_precision@10': 0.07900552486187845,
 'cosine_recall@1': 0.5535911602209945,
 'cosine_recall@3': 0.6883977900552486,
 'cosine_recall@5': 0.7359116022099448,
 'cosine_recall@10': 0.7900552486187845,
 'cosine_ndcg@10': 0.6706808418958197,
 'cosine_mrr@10': 0.6325269665877401,
 'cosine_map@100': 0.6386555675603985}

# Fine-tuned Model Metrics

In [7]:
# Load the fine-tuned checkpoint from `final_model_path` and score it with
# the same evaluator that was used for the base model.
# NOTE(review): the base model cell sets max_seq_length to 512; whether this
# checkpoint uses the same limit depends on its saved config — confirm.
finetuned_model = SentenceTransformer(
    model_name_or_path=final_model_path,
    trust_remote_code=True,
)
ir_evaluator(finetuned_model)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


{'cosine_accuracy@1': 0.6397790055248619,
 'cosine_accuracy@3': 0.7646408839779005,
 'cosine_accuracy@5': 0.8066298342541437,
 'cosine_accuracy@10': 0.850828729281768,
 'cosine_precision@1': 0.6397790055248619,
 'cosine_precision@3': 0.25488029465930023,
 'cosine_precision@5': 0.1613259668508287,
 'cosine_precision@10': 0.08508287292817679,
 'cosine_recall@1': 0.6397790055248619,
 'cosine_recall@3': 0.7646408839779005,
 'cosine_recall@5': 0.8066298342541437,
 'cosine_recall@10': 0.850828729281768,
 'cosine_ndcg@10': 0.7448964960962077,
 'cosine_mrr@10': 0.7109396650004388,
 'cosine_map@100': 0.7156103926622395}

In [21]:
# Subset of the evaluator metrics reported above, copied by hand from the
# base-model and fine-tuned-model outputs. Both dicts list the same metrics
# in the same order.
data_base = {
    "cosine_accuracy@10": 0.7900552486187845,
    "cosine_precision@1": 0.5535911602209945,
    "cosine_recall@10": 0.7900552486187845,
    "cosine_ndcg@10": 0.6706808418958197,
    "cosine_mrr@10": 0.6325269665877401,
    "cosine_map@100": 0.6386555675603985,
}

data_finetuned = {
    "cosine_accuracy@10": 0.850828729281768,
    "cosine_precision@1": 0.6397790055248619,
    "cosine_recall@10": 0.850828729281768,
    "cosine_ndcg@10": 0.7448964960962077,
    "cosine_mrr@10": 0.7109396650004388,
    "cosine_map@100": 0.7156103926622395,
}

In [38]:
# Build the comparison table with one row per metric.
# Scores are looked up by metric name rather than paired by position, so a
# different key order in `data_finetuned` cannot silently mis-pair values;
# a metric missing from `data_finetuned` raises KeyError instead.
metric_name = list(data_base)
score = [data_base[metric] for metric in metric_name]
score_new = [data_finetuned[metric] for metric in metric_name]

data_json = {
    'Metrics': metric_name,
    'Scores_Base_Model': score,
    'Score_Fine-tuned_Model': score_new,
    # Absolute difference (fine-tuned minus base) expressed in percentage
    # points, not a change relative to the base score.
    'Performance Gain': [
        f"{round((tuned - base) * 100, 2)}%"
        for tuned, base in zip(score_new, score)
    ],
}
df_metrics = pd.DataFrame(data=data_json)

In [39]:
# Display the base vs. fine-tuned comparison table built in the previous cell.
df_metrics

Unnamed: 0,Metrics,Scores_Base_Model,Score_Fine-tuned_Model,Performance Gain
0,cosine_accuracy@10,0.790055,0.850829,6.08%
1,cosine_precision@1,0.553591,0.639779,8.62%
2,cosine_recall@10,0.790055,0.850829,6.08%
3,cosine_ndcg@10,0.670681,0.744896,7.42%
4,cosine_mrr@10,0.632527,0.71094,7.84%
5,cosine_map@100,0.638656,0.71561,7.7%


In [40]:
# Write the comparison table to disk without the DataFrame index.
# The earlier `errors=False` argument is removed: `errors` selects the text
# encoding error handler and is documented as a string (default 'strict'),
# so a bool was never a meaningful value for it.
df_metrics.to_csv("results_gte_model.csv", index=False)