In [1]:
from pymilvus import MilvusClient, DataType
from openai import OpenAI, Embedding
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np


In [3]:
# Client handle for the Milvus instance at the URI below (local host, port 19530).
milvus_uri = "http://localhost:19530"
milvus_client = MilvusClient(uri=milvus_uri)


In [4]:
# Articles CSV: holds the original `content` column plus one
# `reduced_content_p=<p>` column per word-dropout level.
articles_csv_path = '../word-dropout/11_20_26_27_Articles_DROPOUT.csv'
df = pd.read_csv(articles_csv_path)

# Preview the first rows.
df.head()

Unnamed: 0,id,title,url,date,content,reduced_content_p=0.1,reduced_content_p=0.3,reduced_content_p=0.5,reduced_content_p=0.7,reduced_content_p=0.9
0,0,Yahoo Finance,https://finance.yahoo.com/news/threads-now-let...,"Wed, Nov 27, 2024, 12:15 PM",Threads is rolling out a redesign that keeps y...,Threads is a redesign keeps your feeds visible...,Threads rolling out a redesign that your visib...,"Threads is out a on home screen, head Adam ann...",is your the announced on The allows between ju...,redesign Instagram head you out can a should m...
1,1,Yahoo Finance,https://finance.yahoo.com/news/elon-musk-calls...,"Wed, Nov 27, 2024, 1:34 PM","Billionaire Elon Musk, a key adviser to the in...","Billionaire Musk, a key adviser to the incomin...","Elon Musk, a key adviser the incoming administ...","Elon adviser to the administration, called the...",adviser the incoming Trump the Protection to b...,Billionaire Trump for Consumer to setting on t...
2,2,Yahoo Finance,https://finance.yahoo.com/news/tpg-angelo-gord...,"Wed, Nov 27, 2024, 11:23 AM",(Bloomberg) -- Americans borrowing against the...,(Bloomberg) -- Americans borrowing against the...,(Bloomberg) borrowing against homes after a su...,their after surge in creating trillion opportu...,"borrowing their homes prices a investors, acco...",against a in prices Gordon. Brace Are Lanes eq...
3,3,Yahoo Finance,https://finance.yahoo.com/news/microsoft-faces...,"Wed, Nov 27, 2024, 12:58 PM",(Bloomberg) -- The US Federal Trade Commission...,(Bloomberg) -- The US Federal Trade Commission...,(Bloomberg) The Trade has an investigation of ...,(Bloomberg) -- Federal Commission has opened a...,The opened of from computing and licensing art...,and artificial Read Kansas Takes according to ...
4,4,Yahoo Finance,https://finance.yahoo.com/news/crowdstrike-fal...,"Wed, Nov 27, 2024, 6:40 AM",(Bloomberg) -- CrowdStrike Holdings Inc. fell ...,(Bloomberg) -- CrowdStrike Holdings Inc. fell ...,(Bloomberg) CrowdStrike Holdings in trading on...,(Bloomberg) -- Holdings Inc. in on after the c...,(Bloomberg) early the cybersecurity company we...,cybersecurity weaker-than-expected who compute...


In [5]:
# Number of article rows loaded from the CSV.
df.shape[0]

580

In [6]:
# Start from a clean slate: drop the collection belonging to every dropout
# level. The collection-name suffix is the dropout probability in percent.
for p in [0, 0.1, 0.3, 0.5, 0.7, 0.9]:
    collection_name = f"yahoo_finance_article_DROPOUT_{int(p * 100)}"
    milvus_client.drop_collection(collection_name=collection_name)

In [7]:
# Create one Milvus collection per dropout level, all sharing the same schema
# and index configuration, then print each collection's load state.
for p in [0, 0.1, 0.3, 0.5, 0.7, 0.9]:

    # The collection-name suffix is the dropout probability in percent.
    collection_name = f"yahoo_finance_article_DROPOUT_{int(p * 100)}"

    # Primary keys are supplied by the caller (auto_id is off); dynamic fields
    # are enabled.
    schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)

    # `dim` has to match the output size of the embedding model in use:
    #   100  -> GloVe-100
    #   1536 -> ada-002
    #   384  -> all-MiniLM-L6-v2   (value used here)
    #   768  -> all-mpnet-base-v2
    schema.add_field(
        field_name="embedded_document",
        datatype=DataType.FLOAT_VECTOR,
        dim=384,
    )

    # Scalar index on the primary key, IVF_FLAT index on the vector field.
    index_params = milvus_client.prepare_index_params()
    index_params.add_index(field_name="id", index_type="STL_SORT")
    index_params.add_index(
        field_name="embedded_document",
        index_type="IVF_FLAT",
        metric_type="COSINE",  # alternative kept for reference: "IP"
        params={"nlist": 128},
    )

    milvus_client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params,
    )

    # Fixed pause before asking for the load state -- presumably to give the
    # server time to finish loading the new collection (TODO confirm).
    time.sleep(5)

    res = milvus_client.get_load_state(collection_name=collection_name)
    print(res)
    
    


{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}
{'state': <LoadState: Loaded>}


In [8]:
### BERT Embeddings
# Alternative model kept for reference: "sentence-transformers/all-mpnet-base-v2"
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)


def get_embeddings_from_bert(sentence):
    """Return the embedding of a single piece of text.

    The text is encoded as a one-element batch with the module-level
    `model`, and the first (only) row of the result is returned.
    """
    encoded_batch = model.encode([sentence])
    return encoded_batch[0]





In [9]:
# For every word-dropout level: embed each article, insert the vectors into the
# matching Milvus collection, and record the elapsed time keyed by str(p).
insertion_times = {}

for p in [0, 0.1, 0.3, 0.5, 0.7, 0.9]:

    print(f"Starting insertion of documents for p = {p}")

    collection_name = f"yahoo_finance_article_DROPOUT_{int(p * 100)}"
    # p == 0 uses the original article text; every other level has its own column.
    content_column_name = "content" if p == 0 else f"reduced_content_p={p}"

    # "Tokens" here are the pieces obtained by splitting on single spaces.
    avg_number_of_tokens = df[content_column_name].map(lambda text: len(text.split(" "))).mean()

    print(f"Average number of tokens in p={p} collection: {avg_number_of_tokens}")

    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments (unlike time.time()).
    # The timed window covers both the embedding step and the Milvus insert call.
    start = time.perf_counter()

    # Iterate only the two columns that are needed instead of materialising a
    # full row Series per article with df.apply(axis=1); this keeps pandas
    # overhead out of the measurement.
    insertions = [
        {"id": doc_id, "embedded_document": get_embeddings_from_bert(text)}
        for doc_id, text in zip(df["id"], df[content_column_name])
    ]

    res = milvus_client.insert(
        collection_name=collection_name,
        data=insertions
    )

    end = time.perf_counter()

    exec_time = end - start
    insertion_times[str(p)] = exec_time

    print(f"Insertion finished. Execution took {exec_time} seconds.")
    

Starting insertion of documents for p = 0
Average number of tokens in p=0 collection: 634.1172413793104
Insertion finished. Execution took 129.37971711158752 seconds.
Starting insertion of documents for p = 0.1
Average number of tokens in p=0.1 collection: 571.1517241379311
Insertion finished. Execution took 111.4962170124054 seconds.
Starting insertion of documents for p = 0.3
Average number of tokens in p=0.3 collection: 444.3155172413793
Insertion finished. Execution took 106.65193438529968 seconds.
Starting insertion of documents for p = 0.5
Average number of tokens in p=0.5 collection: 317.298275862069
Insertion finished. Execution took 103.35703992843628 seconds.
Starting insertion of documents for p = 0.7
Average number of tokens in p=0.7 collection: 190.7206896551724
Insertion finished. Execution took 90.66617560386658 seconds.
Starting insertion of documents for p = 0.9
Average number of tokens in p=0.9 collection: 63.86206896551724
Insertion finished. Execution took 39.530922

In [32]:
# Persist the timing dict (as its str() representation) to a text file named
# after the embedding model, with any "sentence-transformers/" prefix removed.
model_file_stem = model_name.replace("sentence-transformers/", "")
with open(f'insertion_times_{model_file_stem}.txt', 'w') as file:
    file.write(str(insertion_times))