In [None]:
# Model identifiers in `organisation/name` form.
# Note: this list is not referenced by any other cell visible in this notebook.
models = [
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/LaBSE",
    "ahdsoft/persian-sentence-transformer-news-wiki-pairs-v3",
    "AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2",
    "ViravirastSHZ/Hafez_Bert",
    "HooshvareLab/bert-base-parsbert-uncased",
    "pedramyazdipoor/persian_xlm_roberta_large",
    "FacebookAI/xlm-roberta-large",
    "Linq-AI-Research/Linq-Embed-Mistral",
]

# Semantic Search Pipeline for Test Data using Sentence Transformers

### 1- Import Libraries and set parameters

In [None]:
import os 
# Configure PyTorch's CUDA caching allocator through its environment variable.
# The assignment happens in this cell, i.e. before the cell that imports torch.
# NOTE(review): max_split_size_mb presumably caps the size of blocks the
# allocator may split, to limit fragmentation - confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
import pandas as pd

In [None]:
import pandas as pd
import numpy as np
from io import StringIO
import pickle
# from tqdm.autonotebook import tqdm, trange
import torch

from sentence_transformers import SentenceTransformer, util

# parameters
# Folder that holds the evaluation workbooks (concatenated with dataset_name,
# so the trailing slash is required).
dataset_path = 'Test_collections/'

# Folder the result workbooks are written to. The search/evaluation cell builds
# file names as results_path + model_name + ..., so keep the trailing slash.
# This name was previously used without ever being defined, which made a fresh
# "Restart & Run All" fail with a NameError. 'Results/' is a default location;
# adjust it if the results should live elsewhere.
results_path = 'Results/'
# DataFrame.to_excel does not create missing folders, so make sure it exists.
os.makedirs(results_path, exist_ok=True)

#model_folder = 'FT_models/'
# models_path = 'D:/Amini/Dev_works/HF_models'

# Workbook of the regular test collection.
dataset_name = 'Results_Pools_For_Queries_Mordad_1403_sorted.xlsx'

# Zero-shot test collection (uncomment to evaluate on it instead).
# dataset_name = 'Results_pool_for_ZERO_shut_Mordad_1403_sorted.xlsx'

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no secret is stored in the notebook; empty string when unset.
write_access_token = os.environ.get('HF_TOKEN', '')



### Model Info

In [None]:
# Checkpoint string handed to SentenceTransformer(...) when the embedder is built.
# Alternative kept for reference: 'myrkur/sentence-transformer-parsbert-fa'
model = 'Best_FT_models/ahdsoft/persian-sentence-transformer-news-wiki-pairs-v3'

# Short tag used as the prefix of the stored result files.
# (An earlier variant derived it as 'sentence-transformers/' + model.)
model_name = 'PSTNWPv3'

## Defining functions

In [None]:
def count_string_occurrences(list1, list2):
    """Count how many items of ``list1`` are also contained in ``list2``.

    Each item of ``list1`` is tested on its own, so a value that is repeated
    in ``list1`` is counted once per repetition.

    Args:
        list1: Iterable of items to look up.
        list2: Container the items are tested against with ``in``.

    Returns:
        int: Number of items of ``list1`` found in ``list2`` (0 if none).
    """
    return sum(1 for candidate in list1 if candidate in list2)


### 2- Loading the Evaluation Dataset

In [None]:
# Read the evaluation workbook; its 'query' and 'verse' columns are used here.
dataset = pd.read_excel(f'{dataset_path}{dataset_name}')
query_set = dataset['query']

# Search corpus: the verse text of every row, in sheet order.
verses = dataset['verse'].tolist()

# Distinct queries, in order of first appearance in the sheet.
queries = query_set.unique().tolist()

In [None]:
# Preview the first rows of the loaded evaluation sheet.
dataset.head()

In [None]:
# Number of verse rows that make up the search corpus.
len(verses)

In [None]:
# Number of distinct queries to be searched.
len(queries)

### 3- Perform Test

In [None]:
# Release cached CUDA memory held by PyTorch before the checkpoint is loaded.
torch.cuda.empty_cache()

# trust_remote_code=True allows the checkpoint to run custom code shipped with
# it; only use it with checkpoints from a trusted source.
embedder = SentenceTransformer(
    model_name_or_path=model,
    trust_remote_code=True,
)


In [None]:
# Encode the corpus and the queries with the loaded embedder.
# Both results are requested as tensors (convert_to_tensor=True) and are kept
# exactly as returned by encode(): no explicit L2 normalisation is applied here.

# Embeddings of the verses (search corpus)
verse_embeddings = embedder.encode(verses, convert_to_tensor=True)

# Embeddings of the queries
query_embeddings = embedder.encode(queries, convert_to_tensor=True)


In [None]:
# Lookup table: similarity-function name -> scoring callable from
# sentence_transformers.util.
sim_function = {
    'cosine': util.cos_sim,
    'dot': util.dot_score,
    'euclidean': util.euclidean_sim,
    'manhattan': util.manhattan_sim,
}

In [None]:
# Name of the similarity function reported by the model; fall back to
# 'cosine' when the model does not specify one.
sim_fn = embedder.similarity_fn_name
sim_fn = 'cosine' if sim_fn is None else sim_fn

In [None]:
# embedder.similarity_fn_name = 'cosine'

In [None]:
# Semantic search + evaluation, repeated for every cut-off k.
# For each k this cell writes two workbooks (raw hits and per-query scores)
# and prints the mean Recall@k, P@k and NDCG@k.

# Imported once up front instead of on every query iteration.
from sklearn.metrics import ndcg_score

for top_k in [10, 20]:
    # Perform Semantic Search: answers[i] holds the hits for queries[i]
    print('the model similarity function is: ', sim_fn)
    answers = util.semantic_search(
        query_embeddings,
        verse_embeddings,
        top_k=top_k,
        score_function=sim_function[sim_fn],
    )

    # Create one frame from the search results of all queries.
    # enumerate/zip pairs each query with its own hit list directly, which
    # avoids the repeated (quadratic) queries.index(query) lookups.
    df_list = []
    for query_pos, (query, hits) in enumerate(zip(queries, answers)):
        df_temp = pd.DataFrame(hits)
        df_temp['query_id'] = query_pos + 1  # 1-based id, in the order of `queries`
        df_temp['query'] = query
        df_list.append(df_temp)
    # ignore_index renumbers the rows 0..N-1 across all queries.
    df = pd.concat(df_list, ignore_index=True)

    # Map every hit back to its verse text.
    df['answer'] = [verses[corpus_id] for corpus_id in df['corpus_id']]
    search_results = df[['query_id', 'query', 'answer', 'corpus_id', 'score']]

    # Save the raw search results into an Excel file.
    search_results.to_excel(results_path + model_name + 'k_{}.xlsx'.format(top_k))

    ###  Evaluation
    q_results = search_results[['query_id', 'query', 'answer', 'score']]
    gt = dataset[['query', 'verse', 'relevance score']]

    # Per-query metrics, computed in a single pass over the queries.
    TPR = {}        # recall: matched answers / number of ground-truth rows
    Precision = {}  # precision@k: matched answers / top_k
    NDCG = {}       # NDCG of the retrieved list
    for q in queries:
        q_result = q_results.loc[q_results['query'] == q, ['answer', 'score']]
        gt_verses = gt.loc[gt['query'] == q, ['verse', 'relevance score']]

        # How many retrieved answers occur among the ground-truth verses.
        # NOTE(review): an answer text retrieved more than once is counted
        # once per occurrence - confirm the corpus has no duplicate verses.
        occurences = count_string_occurrences(
            q_result['answer'].to_list(), gt_verses['verse'].tolist()
        )
        TPR[q] = occurences / len(gt_verses)
        Precision[q] = occurences / top_k  # divided by k even if fewer hits came back

        # Attach the ground-truth relevance to every retrieved answer;
        # answers that are not in the ground truth get relevance 0.
        merged_df = pd.merge(q_result, gt_verses, left_on='answer', right_on='verse', how='left')
        merged_df['relevance score'] = merged_df['relevance score'].fillna(0)

        # ndcg_score expects 2-D arrays of shape (n_samples, n_documents).
        # Only the retrieved answers enter the computation, so the ideal
        # ranking is the best ordering of the retrieved list itself.
        true_relevance = merged_df['relevance score'].to_numpy().reshape(1, -1)
        predicted_scores = merged_df['score'].to_numpy().reshape(1, -1)
        NDCG[q] = ndcg_score(true_relevance, predicted_scores)

    # Per-query score table (one row per query).
    query_scores = pd.DataFrame({
        'query': list(TPR.keys()),
        'Recall': list(TPR.values()),
        'Precision': list(Precision.values()),
        'NDCG': list(NDCG.values()),
    })

    # Means are taken before the summary row is appended, so the printed
    # values are exactly the ones stored in the 'mean' row.
    mean_recall = np.mean(query_scores['Recall'])
    mean_precision = np.mean(query_scores['Precision'])
    mean_ndcg = np.mean(query_scores['NDCG'])
    query_scores.loc[len(query_scores.index)] = ['mean', mean_recall, mean_precision, mean_ndcg]

    # Save the per-query scores into an Excel file.
    query_scores.to_excel(results_path + model_name + 'k_{}_scores.xlsx'.format(top_k))

    print('The Testing process for the model {} is finished.\n'.format(model_name))
    print(' The recall @ k for k = {} is:{} '. format(top_k , mean_recall))
    print(' The  P@K for top_k = {} is: {}'. format(top_k, mean_precision))
    print(' The NDCG @ k for k = {} is:  {}'. format(top_k , mean_ndcg))
    print ('****************************************************************************\n')




###### 