In [91]:
from pymilvus import MilvusClient, DataType
from openai import OpenAI, Embedding
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np
import json


In [92]:
# Client for the Milvus server reachable at localhost:19530.
milvus_client = MilvusClient(uri="http://localhost:19530")


In [93]:
# p selects which collection variant is queried: it appears in the collection
# name as int(p * 100) and in the name of the results file.
p=0.1

In [94]:
# Evaluation set: only the last 50 rows of the query/document CSV are used.
queries = (
    pd.read_csv("../queries_for_documents.csv")
    .tail(50)
)

In [95]:
# Sanity check: number of queries that will be evaluated.
len(queries)

50

In [96]:
### BERT Embeddings
# Sentence-embedding model used to encode the queries; its name is also
# reused when building the results filename.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Alternative model (disabled):
# model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
def get_embeddings_from_bert(sentence):
    """Return the embedding of a single sentence.

    The sentence is encoded as a one-element batch with the notebook-level
    ``model`` and the first (only) row of the result is returned.
    """
    encoded_batch = model.encode([sentence])
    return encoded_batch[0]





In [97]:
# Preview the first rows of the evaluation set (query text and doc_index).
queries.head()

Unnamed: 0,query,doc_index
25,Who are the plaintiffs involved in the Mazzaga...,556
26,How much of a hit did Best Buy's stock take af...,302
27,How are medicare pricing adjustments affecting...,312
28,How much has twitter been devalued since rebra...,314
29,What effect is China's imports on crude oil ha...,545


In [98]:
def compute_score(row, p, q=None):
    """Score one query against a Milvus collection and time the search.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            query text under "query" and the expected document id under
            "doc_index".
        p: Value encoded into the collection name as ``int(p * 100)``.
        q: Value encoded into the collection name after the ``Q`` marker.
            When omitted, the notebook-level variable ``Q`` is used, so
            existing ``compute_score(row, p)`` calls keep working.

    Returns:
        A tuple ``(top1, top3, top5, exec_time)``. Each ``topK`` is 1 when the
        expected document id is among the first K returned hits and 0
        otherwise; ``exec_time`` is the duration of the search call in
        seconds (embedding the query is not included).

    Raises:
        NameError: if ``q`` is omitted and no global ``Q`` has been defined.
    """
    if q is None:
        # Backward-compatible fallback to the loop variable set by the caller.
        q = Q

    query, ground_truth_document_id = row["query"], row["doc_index"]
    collection_name = f"yahoo_finance_article_BIG_BERT_AUTOPHRASE_Q{q}_{int(p * 100)}"
    embedded_query = get_embeddings_from_bert(query)

    # perf_counter is monotonic and high-resolution, which matters for
    # millisecond-scale intervals; time.time() can jump with clock updates.
    start = time.perf_counter()
    res = milvus_client.search(
        collection_name=collection_name,
        data=[embedded_query],
        limit=5,
        search_params={"metric_type": "IP", "params": {}},  # Search parameters
    )
    exec_time = time.perf_counter() - start

    # One query vector was sent, so res[0] holds its hits; collect the ids
    # once and reuse them for every cut-off.
    hit_ids = [hit["id"] for hit in res[0]]
    top1, top3, top5 = (
        1 if ground_truth_document_id in hit_ids[:k] else 0 for k in (1, 3, 5)
    )

    return (top1, top3, top5, exec_time)

In [99]:
# Score every query once per Q variant. The loop variable must stay named Q:
# compute_score reads it to build the collection name. Each result column holds
# (top1, top3, top5, exec_time) tuples.
for Q in [30, 70, 90]:
    score_column = f"scores_{Q}"
    queries[score_column] = queries.apply(
        lambda query_row: compute_score(query_row, p),
        axis=1,
    )



In [100]:
# Average each position of the per-query score tuples, per Q variant:
# the first three entries become hit rates, the last the mean search time.
scores_percentage = {}

for Q in [30, 70, 90]:
    per_metric_values = zip(*queries[f'scores_{Q}'])
    scores_percentage[str(Q)] = tuple(
        sum(metric) / len(metric) for metric in per_metric_values
    )

In [101]:
# Show the averaged (top1, top3, top5, exec_time) tuple for each Q variant.
scores_percentage

{'30': (0.72, 0.84, 0.9, 0.004431514739990234),
 '70': (0.72, 0.82, 0.88, 0.004140405654907226),
 '90': (0.68, 0.84, 0.88, 0.0041989898681640625)}

In [102]:
# Persist the averaged scores as the dict's text representation; the filename
# records p and the model name without its "sentence-transformers/" prefix.
model_tag = model_name.replace("sentence-transformers/", "")
output_path = f'scores_BIG_BERT_AUTOPHRASE_P{p}_{model_tag}.txt'
with open(output_path, 'w') as out_file:
    out_file.write(str(scores_percentage))