In [31]:
from pymilvus import MilvusClient, DataType
from openai import OpenAI, Embedding
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np
import json


In [32]:
# Shared Milvus client used by every search below; points at a local server.
milvus_client = MilvusClient(uri="http://localhost:19530")


In [33]:
# Evaluation set: the cells below read the "query" and "document" columns.
# NOTE(review): the preview shows an "Unnamed: 0" column, so the CSV presumably
# carries a saved index - consider index_col=0 if nothing relies on it.
queries = pd.read_csv(filepath_or_buffer="example_queries.csv")

In [34]:
### BERT Embeddings
# Sentence-embedding model ("all-MiniLM-L6-v2") loaded once at module level and
# reused by get_embeddings_from_bert for every query.
model = SentenceTransformer("all-MiniLM-L6-v2")
def get_embeddings_from_bert(sentence):
    """Encode one sentence with the module-level ``model`` and return its vector.

    The sentence is wrapped in a one-element batch for ``model.encode`` and the
    single resulting embedding is unwrapped again.
    """
    batch = [sentence]
    encoded_batch = model.encode(batch)
    return encoded_batch[0]





In [35]:
# Preview the first rows of the loaded query set.
queries.head()

Unnamed: 0,query,document
0,How much did OPAP's gross gaming revenue incre...,0
1,How expensive is owning a home?,6


In [36]:
def compute_score(row, p):
    """Check whether the expected document is retrieved for one query.

    Embeds ``row["query"]``, searches the Milvus collection selected by ``p``
    for the five best hits (inner-product metric), and tests whether
    ``row["document"]`` appears among the first 1, 3 and 5 returned ids.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            ``"query"`` text and the expected ``"document"`` id.
        p: Fractional rate used to build the collection name suffix as a
            whole percentage, e.g. ``0.3`` ->
            ``yahoo_finance_article_DROPOUT_30``.

    Returns:
        Tuple ``(top1, top3, top5)`` of 0/1 integer flags.
    """
    query, ground_truth_document_id = row["query"], row["document"]

    embedded_query = get_embeddings_from_bert(query)

    # Round before converting: binary floats make e.g. 0.29 * 100 evaluate to
    # 28.999999999999996, which a bare int() would truncate to 28 and thereby
    # point the search at the wrong collection.
    percent = int(round(p * 100))
    collection_name = f"yahoo_finance_article_DROPOUT_{percent}"

    res = milvus_client.search(
        collection_name=collection_name,
        data=[embedded_query],
        limit=5,
        search_params={"metric_type": "IP", "params": {}},  # Search parameters
    )

    # Only one query vector is sent, so res[0] is its ranked hit list.
    # Extract the ids once and slice them for each cut-off.
    ranked_ids = [hit["id"] for hit in res[0]]

    top1, top3, top5 = (
        1 if ground_truth_document_id in ranked_ids[:k] else 0 for k in (1, 3, 5)
    )

    return (top1, top3, top5)

In [37]:
# Score every query once per rate and keep the (top1, top3, top5) tuple in a
# dedicated column named scores_<p>.
for p in (0, 0.1, 0.3, 0.5, 0.7, 0.9):
    queries[f"scores_{p}"] = queries.apply(compute_score, axis=1, args=(p,))



In [38]:
# Preview the query set again, now with the per-rate scores_<p> columns added.
queries.head()

Unnamed: 0,query,document,scores_0,scores_0.1,scores_0.3,scores_0.5,scores_0.7,scores_0.9
0,How much did OPAP's gross gaming revenue incre...,0,"(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)"
1,How expensive is owning a home?,6,"(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)","(1, 1, 1)"


In [39]:
# Aggregate per rate: unzip the (top1, top3, top5) tuples into three flag
# sequences and average each one, giving hit rates as fractions in [0, 1].
scores_percentage = {}

for p in (0, 0.1, 0.3, 0.5, 0.7, 0.9):
    scores_percentage[str(p)] = tuple(
        map(lambda flags: sum(flags) / len(flags), zip(*queries[f'scores_{p}']))
    )

In [40]:
# Show the averaged (top1, top3, top5) hit rates, keyed by str(p).
scores_percentage

{'0': (1.0, 1.0, 1.0),
 '0.1': (1.0, 1.0, 1.0),
 '0.3': (1.0, 1.0, 1.0),
 '0.5': (1.0, 1.0, 1.0),
 '0.7': (1.0, 1.0, 1.0),
 '0.9': (1.0, 1.0, 1.0)}