In [1]:
# change dir to root
# Moves the kernel's working directory one level up so that the `core` import
# and the relative `./data/...` path used by later cells can resolve
# (presumably this notebook lives one level below the project root — confirm).
# NOTE: the target is relative, so re-running this cell moves up one more
# level each time; run it exactly once per kernel session.
%cd ../

c:\Users\Anant\Work\search-app


## Evaluation

In [2]:
import json
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import ndcg_score, average_precision_score
from core import TFIDFSearch, BM25Search, NeuralSearch, HybridSearch, RETRIVAL_MODELS, SCORE_TYPES, MEAN_TYPES, NORM_TYPES

In [3]:
def get_metrics(true_scores_df, model, mean_type=None, norm_type=None, k=20):
    """Return the mean nDCG@k of ``model`` over a set of labelled queries.

    Parameters
    ----------
    true_scores_df : pandas.DataFrame
        One row per query. The ``QUERY`` column is handed to the model and the
        ``SCORES`` column holds a JSON-encoded list of ground-truth relevance
        values for that query.
    model : object
        Search model exposing ``get_scores``. The frame it returns is sorted
        by index and its ``SCORE`` column is used as the predicted relevance.
        A model whose class is named ``HybridSearch`` is called as
        ``get_scores(query, mean_type, norm_type)``; any other model is called
        as ``get_scores(query)``.
    mean_type, norm_type : optional
        Forwarded to ``get_scores`` only for ``HybridSearch`` models.
    k : int, default 20
        Rank cut-off passed to ``sklearn.metrics.ndcg_score``.

    Returns
    -------
    float
        Mean of the per-query nDCG@k values, or NaN when ``true_scores_df``
        has no rows.
    """
    n_queries = true_scores_df.shape[0]
    if n_queries == 0:
        # Nothing to average: return NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        return float("nan")

    # The call signature depends only on the model, so decide it once.
    is_hybrid = type(model).__name__ == "HybridSearch"

    ndcg = []
    # The context manager guarantees the progress bar is closed even when a
    # query fails part-way through (or the run is interrupted).
    with tqdm(
        desc="Evaluating nDCG Score ",
        total=n_queries,
        unit="query",
        leave=False,
    ) as pbar:
        for query, raw_scores in zip(true_scores_df["QUERY"], true_scores_df["SCORES"]):
            if is_hybrid:
                scores_df = model.get_scores(query, mean_type, norm_type)
            else:
                scores_df = model.get_scores(query)
            # Sorting by index puts predictions in a fixed order; this assumes
            # the stored SCORES list follows that same order — TODO confirm.
            y_score = scores_df.sort_index().SCORE.tolist()
            y_true = json.loads(raw_scores)
            ndcg.append(ndcg_score([y_true], [y_score], k=k))
            pbar.update(1)

    return np.mean(ndcg)

In [4]:
# Labelled evaluation set: get_metrics reads its QUERY and SCORES columns.
true_scores = pd.read_csv(filepath_or_buffer='./data/processed/true_scores.csv')

### TFIDF

In [None]:
# TF-IDF baseline: mean nDCG over the labelled queries (default cut-off k=20).
model = TFIDFSearch()
print('NDCG Score:', get_metrics(true_scores_df=true_scores, model=model))

### BM25

In [None]:
# BM25 baseline: mean nDCG over the labelled queries (default cut-off k=20).
model = BM25Search()
print('NDCG Score:', get_metrics(true_scores_df=true_scores, model=model))

### Neural

In [None]:
# Sweep every retrieval model with every similarity function and report the
# mean nDCG of each combination.
for model_name in RETRIVAL_MODELS:
    print(f'Model: {model_name}')
    for score_type in SCORE_TYPES:
        model = NeuralSearch(model=model_name, score_type=score_type)
        score = get_metrics(true_scores_df=true_scores, model=model)
        print(f'Score Type: {score_type}\t;\tScore: {score}')
        # Drop the model and release cached GPU memory before loading the next one.
        del model
        torch.cuda.empty_cache()
    # Blank separator between models.
    print('\n')

In [None]:
Model: BAAI/bge-base-en-v1.5
Score Type: Dot Product	        ;	Score: 0.8575061348159865
Score Type: Cosine Similarity	;	Score: 0.8575061348159865

Model: thenlper/gte-large
Score Type: Dot Product	        ;	Score: 0.8773796912454879
Score Type: Cosine Similarity	;	Score: 0.8773796912454879

Model: llmrails/ember-v1
Score Type: Dot Product	        ;	Score: 0.8041243430795993
Score Type: Cosine Similarity	;	Score: 0.8434121092569689

Model: thenlper/gte-base
Score Type: Dot Product	        ;	Score: 0.8843679159293903
Score Type: Cosine Similarity	;	Score: 0.8843679159293903

Model: all-distilroberta-v1
Score Type: Dot Product	        ;	Score: 0.7205834484750344
Score Type: Cosine Similarity	;	Score: 0.7205834484750344

Model: msmarco-distilbert-base-v4
Score Type: Dot Product	        ;	Score: 0.7505768344386088
Score Type: Cosine Similarity	;	Score: 0.7674091011409188

Model: msmarco-MiniLM-L-6-v3
Score Type: Dot Product	        ;	Score: 0.6871907026867569
Score Type: Cosine Similarity	;	Score: 0.7588424587892733

Model: msmarco-MiniLM-L-12-v3
Score Type: Dot Product	        ;	Score: 0.7376601596872528
Score Type: Cosine Similarity	;	Score: 0.7880065060269283

Model: msmarco-distilbert-base-tas-b
Score Type: Dot Product	        ;	Score: 0.9012300542971058
Score Type: Cosine Similarity	;	Score: 0.8716073130245685

### Hybrid

In [7]:
# Grid over retrieval model x mean type x normalisation (None = no
# normalisation) x similarity function, reporting the mean nDCG of each.
# NOTE(review): the model is rebuilt for every mean/norm combination although
# its constructor only receives model_name and score_type — consider reusing it.
for model_name in RETRIVAL_MODELS:
    print(f'Model: {model_name}')
    norm_options = [None, *NORM_TYPES]
    for mean_type in MEAN_TYPES:
        for norm_type in norm_options:
            for score_type in SCORE_TYPES:
                model = HybridSearch(model=model_name, score_type=score_type)
                score = get_metrics(
                    true_scores,
                    model,
                    mean_type=mean_type,
                    norm_type=norm_type,
                )
                print(f'Mean Type: {mean_type}\t;\tNorm Type: {norm_type}\t;\tScore Type: {score_type}\t;\tScore: {score}')
                # Drop the model and release cached GPU memory before the next run.
                del model
                torch.cuda.empty_cache()
    # Blank separator between models.
    print('\n')

Model: BAAI/bge-base-en-v1.5


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: None	;	Score Type: Dot Product	;	Score: 0.9404823059231748


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: None	;	Score Type: Cosine Similarity	;	Score: 0.9404823059231748


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: L2	;	Score Type: Dot Product	;	Score: 0.9406323735097765


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: L2	;	Score Type: Cosine Similarity	;	Score: 0.9406323735097765


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: Min-Max	;	Score Type: Dot Product	;	Score: 0.9381730386226522


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Arithmetic	;	Norm Type: Min-Max	;	Score Type: Cosine Similarity	;	Score: 0.9381730386226522


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

Mean Type: Geometric	;	Norm Type: None	;	Score Type: Dot Product	;	Score: 0.906591376131558


Evaluating nDCG Score :   0%|          | 0/3333 [00:00<?, ?query/s]

In [None]:
Model: BAAI/bge-base-en-v1.5
Mean Type: Arithmetic	;	Norm Type: None	;	Score Type: Dot Product	;	Score: 0.9404823059231748
Mean Type: Arithmetic	;	Norm Type: None	;	Score Type: Cosine Similarity	;	Score: 0.9404823059231748
Mean Type: Arithmetic	;	Norm Type: L2	;	Score Type: Dot Product	;	Score: 0.9406323735097765
Mean Type: Arithmetic	;	Norm Type: L2	;	Score Type: Cosine Similarity	;	Score: 0.9406323735097765
Mean Type: Arithmetic	;	Norm Type: Min-Max	;	Score Type: Dot Product	;	Score: 0.9381730386226522