In [27]:
from pymilvus import MilvusClient, DataType
from openai import OpenAI, Embedding
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np


In [28]:
# Client for the Milvus instance at localhost:19530; every later cell in this
# notebook talks to Milvus through this object.
milvus_client = MilvusClient(uri="http://localhost:19530")


In [29]:
# Q is interpolated into every collection name (as "Q{Q}"), into the input CSV
# file name as Q/100 ("Q_Threshold_{Q/100}"), and into the timing output file name.
# NOTE(review): presumably the AutoPhrase quality threshold in percent — confirm.
Q = 90

In [30]:
# Remove the collection for each p value so that the creation cell that follows
# starts from scratch on every run of the notebook.
for p in (0.1, 0.3, 0.5, 0.7, 0.9):
    collection_name = f"yahoo_finance_article_AUTOPHRASE_Q{Q}_{int(p * 100)}"
    milvus_client.drop_collection(collection_name=collection_name)

In [31]:
# Create one collection per p value, each with the same schema and indexes,
# then print the load state the server reports for it.
for p in (0.1, 0.3, 0.5, 0.7, 0.9):
    collection_name = f"yahoo_finance_article_AUTOPHRASE_Q{Q}_{int(p * 100)}"

    # Schema: caller-supplied INT64 primary key plus one float vector field.
    # Known embedding widths for the `dim` argument below:
    #   100  -> GloVe-100
    #   1536 -> ada-002
    #   384  -> all-MiniLM-L6-v2
    #   768  -> all-mpnet-base-v2
    schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="embedded_document", datatype=DataType.FLOAT_VECTOR, dim=384)

    # Indexes: STL_SORT on the primary key, IVF_FLAT with inner-product metric
    # on the vector field.
    index_params = milvus_client.prepare_index_params()
    index_params.add_index(field_name="id", index_type="STL_SORT")
    index_params.add_index(
        field_name="embedded_document",
        index_type="IVF_FLAT",
        metric_type="IP",
        params={"nlist": 128},
    )

    milvus_client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params,
    )

    # Fixed 5 s pause before asking for the load state.
    # NOTE(review): presumably gives the server time to finish loading — confirm.
    time.sleep(5)

    res = milvus_client.get_load_state(collection_name=collection_name)
    print(res)
    
    


{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}


In [32]:
### BERT Embeddings
# Sentence-embedding model used for every document. `model_name` is also
# interpolated into the name of the timing file written at the end of the notebook.
# Alternative model (768 dimensions per the schema notes); switching to it
# requires changing `dim` in the collection schema as well:
# model_name = "sentence-transformers/all-mpnet-base-v2"
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
def get_embeddings_from_bert(sentence):
    """Embed a single document with the notebook-level ``model``.

    Args:
        sentence: Text to embed. Any non-string value is embedded as the empty
            string instead, so the caller always gets a vector back
            (presumably to cope with missing cells in the input CSV — confirm).

    Returns:
        The first (and only) element of ``model.encode`` applied to a
        one-element list containing the text.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    text = sentence if isinstance(sentence, str) else ""
    # encode() receives a one-element batch; unwrap its single result.
    return model.encode([text])[0]




In [33]:
# Embed and insert the documents of every p value into its collection, timing
# each run. `insertion_times` ends up as a list of
# {"p": <p>, "insertion_time": <seconds>} records, one per p value.
# To sweep Q instead of p, iterate `for Q in [30, 70, 90]` and adapt the messages.
insertion_times = []

for p in [0.1, 0.3, 0.5, 0.7, 0.9]:

    print(f"Starting insertion of documents for p = {p}")

    insertion_time = {"p": p}

    # Input file is selected by Q (as a fraction) and p; rows get a fresh
    # 0..n-1 id that serves as the primary key in Milvus.
    df = pd.read_csv(f'../high_quality_dropout/Q_Threshold_{Q / 100}-Drop_P_{p}.csv')
    df['id'] = range(len(df))

    collection_name = f"yahoo_finance_article_AUTOPHRASE_Q{Q}_{int(p * 100)}"
    content_column_name = "filtered_content"

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during a run.
    start = time.perf_counter()

    # The timed region deliberately covers both the per-row embedding and the
    # Milvus insert call.
    insertions = df.apply(
        lambda x: {
            "id": x["id"],
            "embedded_document": get_embeddings_from_bert(x[content_column_name]),
        },
        axis=1,
    ).to_list()

    res = milvus_client.insert(
        collection_name=collection_name,
        data=insertions
    )

    end = time.perf_counter()

    exec_time = end - start
    insertion_time["insertion_time"] = exec_time

    insertion_times.append(insertion_time)

    print(f"Insertion finished. Execution took {exec_time} seconds.")
    

Starting insertion of documents for p = 0.1
Insertion finished. Execution took 82.70732045173645 seconds.
Starting insertion of documents for p = 0.3
Insertion finished. Execution took 79.14713215827942 seconds.
Starting insertion of documents for p = 0.5
Insertion finished. Execution took 76.34741449356079 seconds.
Starting insertion of documents for p = 0.7
Insertion finished. Execution took 65.89435648918152 seconds.
Starting insertion of documents for p = 0.9
Insertion finished. Execution took 28.601293802261353 seconds.


In [25]:
"""
with open(f'insertion_times_AUTOPHRASE_Q{Q}_{model_name.replace("sentence-transformers/", "")}.txt', 'w') as file:
    file.write(str(insertion_times))
"""

'\nwith open(f\'insertion_times_AUTOPHRASE_Q{Q}_{model_name.replace("sentence-transformers/", "")}.txt\', \'w\') as file:\n    file.write(str(insertion_times))\n'

In [34]:
import json

# Persist the per-p timing records as JSON. The file name carries Q and the
# model name with any "sentence-transformers/" prefix stripped.
with open(
    f'insertion_times_AUTOPHRASE_Q{Q}_{model_name.replace("sentence-transformers/", "")}.json',
    'w',
) as fp:
    json.dump(insertion_times, fp)