In [35]:
from pymilvus import MilvusClient, DataType
from openai import OpenAI, Embedding
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np
import json


In [36]:
# Client for the Milvus instance reachable at localhost:19530.
milvus_client = MilvusClient(uri="http://localhost:19530")


In [37]:
# Evaluation set: the last 50 rows of the query / ground-truth CSV.
queries_csv_path = "../queries_for_documents.csv"
queries = pd.read_csv(queries_csv_path).tail(50)

In [38]:
# Sanity check: number of evaluation queries that were loaded.
len(queries)

50

In [39]:
### BERT Embeddings
# Alternative checkpoint that can be swapped in:
# model_name = "all-MiniLM-L6-v2"
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)


def get_embeddings_from_bert(sentence):
    """Encode one sentence with the notebook-level ``model``.

    The sentence is wrapped in a one-element batch for ``model.encode`` and
    the single resulting embedding is returned.
    """
    batch = model.encode([sentence])
    return batch[0]





In [40]:
# Preview the first rows of the evaluation queries.
queries.head()

Unnamed: 0,query,doc_index
25,Who are the plaintiffs involved in the Mazzaga...,556
26,How much of a hit did Best Buy's stock take af...,302
27,How are medicare pricing adjustments affecting...,312
28,How much has twitter been devalued since rebra...,314
29,What effect is China's imports on crude oil ha...,545


In [41]:
def compute_score(row, p, *, client=None, embed=None):
    """Score one query against the collection built with dropout rate ``p``.

    Parameters
    ----------
    row : mapping
        Must provide ``row["query"]`` (the query text) and
        ``row["doc_index"]`` (the id of the document expected to be found).
    p : float
        Dropout rate; selects the collection
        ``yahoo_finance_article_DROPOUT_<round(p * 100)>``.
    client : optional
        Object exposing ``search(collection_name=, data=, limit=,
        search_params=)``. Defaults to the notebook-level ``milvus_client``.
    embed : callable, optional
        Maps the query text to its embedding. Defaults to the notebook-level
        ``get_embeddings_from_bert``.

    Returns
    -------
    tuple
        ``(top1, top3, top5, exec_time)``: each ``topK`` is 1 if the expected
        id is among the first K hits and 0 otherwise; ``exec_time`` is the
        duration of the search call in seconds.
    """
    query, ground_truth_document_id = row["query"], row["doc_index"]

    # round() before int(): p * 100 is not exact in binary floating point
    # (0.29 * 100 == 28.999999999999996), and a bare int() would truncate it
    # to the wrong collection suffix.
    collection_name = f"yahoo_finance_article_DROPOUT_{int(round(p * 100))}"

    # Fall back to the notebook-level client / encoder when none is injected.
    search_client = milvus_client if client is None else client
    encode = get_embeddings_from_bert if embed is None else embed

    # Embed before starting the clock so exec_time covers the search only.
    embedded_query = encode(query)

    # perf_counter is monotonic and high-resolution, which suits the
    # millisecond-scale durations measured here better than time.time().
    start = time.perf_counter()
    res = search_client.search(
        collection_name=collection_name,
        data=[embedded_query],
        limit=5,
        search_params={"metric_type": "IP", "params": {}},  # Search parameters
    )
    exec_time = time.perf_counter() - start

    # One result list per query vector; extract the ranked ids a single time.
    ranked_ids = [hit["id"] for hit in res[0]] if len(res) else []
    top1, top3, top5 = (
        int(ground_truth_document_id in ranked_ids[:k]) for k in (1, 3, 5)
    )

    return (top1, top3, top5, exec_time)

In [42]:
# Evaluate every query against each dropout collection; each new column holds
# the (top1, top3, top5, exec_time) tuple returned by compute_score.
for p in (0, 0.1, 0.3, 0.5, 0.7, 0.9):
    column = f"scores_{p}"
    queries[column] = queries.apply(compute_score, axis=1, args=(p,))



In [43]:
# Display the queries together with the per-dropout score columns added above.
queries

Unnamed: 0,query,doc_index,scores_0,scores_0.1,scores_0.3,scores_0.5,scores_0.7,scores_0.9
25,Who are the plaintiffs involved in the Mazzaga...,556,"(1, 1, 1, 0.012018442153930664)","(1, 1, 1, 0.005983114242553711)","(0, 1, 1, 0.006123781204223633)","(1, 1, 1, 0.005982637405395508)","(0, 1, 1, 0.005545139312744141)","(0, 0, 1, 0.00588679313659668)"
26,How much of a hit did Best Buy's stock take af...,302,"(1, 1, 1, 0.0058934688568115234)","(0, 1, 1, 0.0039825439453125)","(0, 0, 1, 0.002978801727294922)","(0, 0, 0, 0.0039920806884765625)","(0, 0, 0, 0.0038831233978271484)","(0, 0, 0, 0.0049817562103271484)"
27,How are medicare pricing adjustments affecting...,312,"(1, 1, 1, 0.004968881607055664)","(1, 1, 1, 0.003983974456787109)","(1, 1, 1, 0.003983259201049805)","(1, 1, 1, 0.0040035247802734375)","(0, 1, 1, 0.003994464874267578)","(0, 0, 0, 0.003981828689575195)"
28,How much has twitter been devalued since rebra...,314,"(1, 1, 1, 0.003982067108154297)","(1, 1, 1, 0.0049724578857421875)","(1, 1, 1, 0.003991603851318359)","(0, 1, 1, 0.0029892921447753906)","(1, 1, 1, 0.004400014877319336)","(0, 0, 1, 0.003995418548583984)"
29,What effect is China's imports on crude oil ha...,545,"(1, 1, 1, 0.004147529602050781)","(0, 1, 1, 0.004984855651855469)","(0, 1, 1, 0.003987789154052734)","(0, 1, 1, 0.0038983821868896484)","(0, 0, 0, 0.0049822330474853516)","(0, 0, 0, 0.00498199462890625)"
30,Why is Brazil looking to make income tax cuts ...,12,"(1, 1, 1, 0.003981351852416992)","(1, 1, 1, 0.003983259201049805)","(1, 1, 1, 0.004981279373168945)","(1, 1, 1, 0.003993511199951172)","(1, 1, 1, 0.003999471664428711)","(1, 1, 1, 0.004013776779174805)"
31,What background checking service do large firm...,25,"(1, 1, 1, 0.0040056705474853516)","(1, 1, 1, 0.004980564117431641)","(0, 1, 1, 0.003989458084106445)","(1, 1, 1, 0.002994060516357422)","(0, 0, 1, 0.002996683120727539)","(0, 0, 0, 0.004978656768798828)"
32,How could President-elect Donald Trump's propo...,238,"(0, 1, 1, 0.004981517791748047)","(0, 1, 1, 0.00498652458190918)","(0, 1, 1, 0.004974842071533203)","(0, 1, 1, 0.003883838653564453)","(0, 0, 0, 0.005886554718017578)","(1, 1, 1, 0.00489044189453125)"
33,How is scaled solutions providing value to cli...,70,"(1, 1, 1, 0.005016326904296875)","(0, 1, 1, 0.005036592483520508)","(0, 0, 0, 0.00497746467590332)","(0, 0, 1, 0.0049970149993896484)","(0, 0, 0, 0.0050389766693115234)","(0, 0, 0, 0.0049359798431396484)"
34,How big of a problem was it for Symbotic to de...,121,"(1, 1, 1, 0.00500035285949707)","(1, 1, 1, 0.004492521286010742)","(1, 1, 1, 0.004897117614746094)","(1, 1, 1, 0.004884481430053711)","(1, 1, 1, 0.0029990673065185547)","(0, 1, 1, 0.003971576690673828)"


In [44]:
# For each dropout rate, average every position of the score tuples:
# (top1, top3, top5, exec_time) -> (mean top1, mean top3, mean top5, mean time).
scores_percentage = {
    str(p): tuple(
        sum(column) / len(column)
        for column in zip(*queries[f'scores_{p}'])
    )
    for p in [0, 0.1, 0.3, 0.5, 0.7, 0.9]
}

In [45]:
# Per-dropout averages: (top-1, top-3, top-5 hit rate, mean search time in seconds).
scores_percentage

{'0': (0.7, 0.86, 0.9, 0.004512143135070801),
 '0.1': (0.58, 0.8, 0.84, 0.004561066627502441),
 '0.3': (0.6, 0.86, 0.88, 0.004333887100219726),
 '0.5': (0.68, 0.8, 0.82, 0.004228873252868652),
 '0.7': (0.48, 0.6, 0.64, 0.004445037841796875),
 '0.9': (0.34, 0.54, 0.6, 0.004518542289733887)}

In [46]:
# For each dropout rate, total every position of the score tuples:
# hit counts for top1 / top3 / top5 and the summed search time.
sums = {
    str(p): tuple(map(sum, zip(*queries[f'scores_{p}'])))
    for p in [0, 0.1, 0.3, 0.5, 0.7, 0.9]
}
sums

{'0': (35, 43, 45, 0.22560715675354004),
 '0.1': (29, 40, 42, 0.22805333137512207),
 '0.3': (30, 43, 44, 0.21669435501098633),
 '0.5': (34, 40, 41, 0.21144366264343262),
 '0.7': (24, 30, 32, 0.22225189208984375),
 '0.9': (17, 27, 30, 0.22592711448669434)}