In [1]:
import os

# Remember the directory the kernel started in the first time this cell runs,
# so that re-running the cell does not climb one more directory level each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Work from the parent of the start directory; later cells use paths relative
# to it (e.g. 'data/...').
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))


In [36]:
import pandas as pd 
import openai
from tqdm import tqdm
from ast import literal_eval
from openai.embeddings_utils import  get_embedding
import numpy as np
from pymilvus import connections
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema
from pymilvus import utility
from time import time

# Read the OpenAI API key from the environment instead of hard-coding it.
# os.getenv returns None when OPENAI_API_KEY is unset, so this line itself
# does not fail if the variable is missing.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [9]:
# Register tqdm with pandas; the embedding cell relies on this for progress_apply.
tqdm.pandas()

In [4]:
# Load the BBC news articles together with their summary and keywords columns.
# The path is relative to the current working directory.
df = pd.read_csv('data/bbc-news-data-with-summary&keywords.csv')
df

Unnamed: 0,category,filename,title,content,summary,keywords
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,TimeWarner has reported a 76% increase in quar...,"['TimeWarner', 'profits', 'AOL', 'internet adv..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,The US dollar has risen to its highest level a...,"['dollar', 'Federal Reserve', 'US trade defici..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"Yukos' owners are to demand that Rosneft, the ...","['Yukos', 'Rosneft', 'Yugansk unit', 'Menatep ..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,British Airways has reported a 40% drop in pro...,"['British Airways', 'fuel prices', 'pre-tax pr..."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Shares in UK drinks and food firm Allied Domec...,"['Allied Domecq', 'takeover', 'Pernod Ricard',..."
...,...,...,...,...,...,...
2216,tech,396.txt,New consoles promise big problems,Making games for future consoles will require...,Creating games for the upcoming next-generatio...,"['future consoles', 'graphic artists', 'next g..."
2217,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,BT is launching two initiatives to combat rogu...,"['BT', 'rogue dialler scams', 'Modem Protectio..."
2218,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,A report by the Business Software Alliance (BS...,"['spam e-mails', 'software', 'clothes', 'jewel..."
2219,tech,399.txt,Be careful how you code,A new European directive could put software w...,The Dutch government is pushing through a cont...,"['European directive', 'software writers', 'pa..."


In [12]:
def openai_embedder(text: str, engine: str = "text-embedding-ada-002") -> np.ndarray:
    """Embed a piece of text with an OpenAI embedding model.

    Args:
        text: The text to embed.
        engine: Name of the OpenAI embedding model to request. Defaults to
            "text-embedding-ada-002", the model this notebook was built with.

    Returns:
        The embedding returned by ``get_embedding`` wrapped in a NumPy array
        (1536 values for the default model, per the notebook's recorded output).
    """
    return np.array(get_embedding(text, engine=engine))


In [13]:
# Embed every article summary with openai_embedder, one call per row.
# NOTE(review): the recorded run took about five minutes for 2221 rows; consider
# caching the result on disk so a full re-run does not have to repeat it.
# NOTE(review): this column is named 'summary_embeddings' (plural) while the
# Milvus field declared later is 'summary_embedding' (singular).
df['summary_embeddings'] = df['summary'].progress_apply(openai_embedder)

100%|████████████████████████████████████████████████████| 2221/2221 [05:11<00:00,  7.13it/s]


In [14]:
# Display the frame again to check the newly added 'summary_embeddings' column.
df

Unnamed: 0,category,filename,title,content,summary,keywords,summary_embeddings
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,TimeWarner has reported a 76% increase in quar...,"['TimeWarner', 'profits', 'AOL', 'internet adv...","[-0.015091367065906525, -0.02089376002550125, ..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,The US dollar has risen to its highest level a...,"['dollar', 'Federal Reserve', 'US trade defici...","[-0.006941323634237051, -0.014485102146863937,..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"Yukos' owners are to demand that Rosneft, the ...","['Yukos', 'Rosneft', 'Yugansk unit', 'Menatep ...","[-0.008478711359202862, -0.04009078070521355, ..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,British Airways has reported a 40% drop in pro...,"['British Airways', 'fuel prices', 'pre-tax pr...","[-0.022860826924443245, -0.02874668501317501, ..."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Shares in UK drinks and food firm Allied Domec...,"['Allied Domecq', 'takeover', 'Pernod Ricard',...","[0.003200878156349063, -0.020808186382055283, ..."
...,...,...,...,...,...,...,...
2216,tech,396.txt,New consoles promise big problems,Making games for future consoles will require...,Creating games for the upcoming next-generatio...,"['future consoles', 'graphic artists', 'next g...","[0.0065853954292833805, -0.01052235160022974, ..."
2217,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,BT is launching two initiatives to combat rogu...,"['BT', 'rogue dialler scams', 'Modem Protectio...","[-0.014873337931931019, -0.017099644988775253,..."
2218,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,A report by the Business Software Alliance (BS...,"['spam e-mails', 'software', 'clothes', 'jewel...","[0.0011268907692283392, -0.02139226347208023, ..."
2219,tech,399.txt,Be careful how you code,A new European directive could put software w...,The Dutch government is pushing through a cont...,"['European directive', 'software writers', 'pa...","[0.0008752755238674581, -0.025408955290913582,..."


## Milvus Setup
- Connecting to the Milvus Container
- Setting up the schema for the Milvus Collection
- Converting the DataFrame to a Milvus-acceptable data format (one list of values per column)
- Insert Data into Milvus
- Flushing the collection so the inserted data is sealed into segments before building indexes


In [16]:
# Register the "default" connection alias against the Milvus server at localhost:19530.
connections.connect("default", host="localhost", port="19530")

In [39]:
# Start from a clean slate: remove any previous 'bbc_news' collection so it can
# be re-created with the current schema. This permanently deletes its data.
# The existence check keeps the cell safe to run on a server where the
# collection has never been created.
if utility.has_collection('bbc_news'):
    utility.drop_collection('bbc_news')

In [40]:
# List the collections currently on the server (sanity check after the drop).
utility.list_collections()

['covid_data',
 'chatgpt_bucket',
 'backyard_topics_2022',
 'test',
 'bucketed_data',
 'bertopic_bucket',
 'bertopic_small_buckets']

In [18]:
# Length of one embedding vector; the Milvus vector field is declared with the same dim.
print(len(df['summary_embeddings'][0]))

1536


In [21]:
# Report the longest string in every text column; the VARCHAR max_length
# values in the schema are sized against these maxima.
for column_name in df.columns:
    values = df[column_name].to_list()
    # A column counts as text when its first value is exactly a str.
    if type(values[0]) is not str:
        continue
    longest = np.max(list(map(len, values)))
    print(column_name, longest)

category 13
filename 7
title 52
content 13797
summary 875
keywords 185


In [41]:
def _varchar_field(name, max_length, **options):
    """Build a VARCHAR FieldSchema with the given name and maximum length."""
    return FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=max_length, **options)


# Scalar fields. Each max_length leaves headroom over the longest value measured
# in the data (category 13, filename 7, title 52, content 13797, summary 875,
# keywords 185).
category = _varchar_field('category', 20)
# NOTE(review): filename is the primary key, but the filenames look like
# 'NNN.txt' and appear to restart for each category, so they are probably not
# unique across the 2221 rows. Verify, and consider a key that also includes
# the category.
filename = _varchar_field('filename', 10, is_primary=True)
title = _varchar_field('title', 60)
content = _varchar_field('content', 15000)
summary = _varchar_field('summary', 1000)
keywords = _varchar_field('keywords', 250)

# Vector field for the summary embedding; dim matches the embedding length (1536).
summary_embedding = FieldSchema(
    name='summary_embedding',
    dtype=DataType.FLOAT_VECTOR,
    dim=1536,
)

# The field order here is the same as the DataFrame column order that is
# inserted later.
schema = CollectionSchema(
    fields=[category, filename, title, content, summary, keywords, summary_embedding],
    description='Building a POC for article search on BBC news data',
)
collection_name = 'bbc_news'

In [42]:
# Bind bbc_collection to the 'bbc_news' collection using the schema defined
# above, the "default" connection alias and two shards.
bbc_collection = Collection(
    name=collection_name, 
    schema=schema, 
    using='default', 
    shards_num=2
)

In [43]:
# Show the schema as reported by the collection object, to confirm the field definitions.
bbc_collection.schema

{
  auto_id: False
  description: Building a POC for article search on BBC news data
  fields: [{
    name: category
    description: 
    type: 21
    params: {'max_length': 20}
  }, {
    name: filename
    description: 
    type: 21
    params: {'max_length': 10}
    is_primary: True
    auto_id: False
  }, {
    name: title
    description: 
    type: 21
    params: {'max_length': 60}
  }, {
    name: content
    description: 
    type: 21
    params: {'max_length': 15000}
  }, {
    name: summary
    description: 
    type: 21
    params: {'max_length': 1000}
  }, {
    name: keywords
    description: 
    type: 21
    params: {'max_length': 250}
  }, {
    name: summary_embedding
    description: 
    type: 101
    params: {'dim': 1536}
  }]
}

In [44]:
def format_df(df: pd.DataFrame, columns=None):
    """Convert a DataFrame into column-ordered lists for a Milvus insert.

    Args:
        df: The frame holding the data to insert.
        columns: Optional sequence of column names to export, in the order
            the collection schema declares its fields. When omitted, every
            column of ``df`` is exported in the frame's own column order.

    Returns:
        A list with one inner list of values per exported column.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    # Pinning the order explicitly protects against the frame's columns being
    # in a different order (or containing extra columns) than the schema expects.
    selected = list(df.columns) if columns is None else list(columns)
    return [df[column].to_list() for column in selected]
# Build the insert payload: one inner list per DataFrame column.
collection_data = format_df(df)
# Number of columns collected.
len(collection_data)

7

In [45]:
# Insert the column lists into the collection; mr holds the result returned by insert.
# NOTE(review): presumably the lists are matched to the schema fields by position; confirm.
mr = bbc_collection.insert(collection_data)

In [46]:
# Number of rows the insert call reports as inserted.
mr.insert_count

2221

In [47]:
# Flush the pending inserts before counting entities and building the index.
bbc_collection.flush()

In [48]:
# Entity count reported by the collection; compare it with the insert count.
bbc_collection.num_entities

2221

## Milvus Indexing
We use the following configuration:

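1. We are using IVF_FLAT because it is fast: it is a quantization-based (inverted-file) index that groups the vectors into clusters and searches only the closest clusters for each query.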
2. IVF_FLAT is comparable to FAISS in terms of performance
3. We are using IP (inner product) as the distance metric; it is the dot product of the two vectors.
4. We are creating 190 clusters (`nlist`), based on the rule of thumb 4 * sqrt(num_index_data_points): 4 * sqrt(2221) ≈ 188.5, rounded up to 190.

In [49]:
# Index settings for the embedding field: IVF_FLAT with inner-product metric
# and 190 clusters (nlist).
index = dict(
    index_type="IVF_FLAT",
    metric_type="IP",
    params=dict(nlist=190),
)
# Build the index on the vector field; the returned status is the cell output.
bbc_collection.create_index("summary_embedding", index)

Status(code=0, message=)

In [50]:
# Release the collection before loading it again with the newly built index.
bbc_collection.release()

In [51]:
# Load the collection so it can be searched.
bbc_collection.load()

In [53]:
# Use the first two article summaries as queries. They are embedded again here
# rather than reusing the vectors already stored in df['summary_embeddings'].
query_texts = [df['summary'][0], df['summary'][1]]
vectors_to_search = [openai_embedder(text) for text in query_texts]

# Same metric as the index; nprobe is the search-time parameter.
search_params = dict(metric_type="IP", params=dict(nprobe=10))

# Time only the Milvus search call, and report it in milliseconds.
start = time()
result = bbc_collection.search(
    vectors_to_search,
    "summary_embedding",
    search_params,
    limit=10,
    output_fields=['title', 'content', 'summary', 'keywords'],
)
elapsed_ms = (time() - start)*1000
print(elapsed_ms)

9.811878204345703


In [54]:
def _hit_to_record(hit):
    """Flatten one search hit into a plain dict of its id, fields and score."""
    entity = hit.entity
    return {
        # hit.id is the primary key, which is the filename field in this schema.
        'filename': hit.id,
        'title': entity.get('title'),
        'content': entity.get('content'),
        'summary': entity.get('summary'),
        # keywords come back as the string form of a Python list; parse it.
        'keywords': literal_eval(entity.get('keywords')),
        'score': hit.distance,
    }


# One list of records per query vector, in the order the hits were returned.
result_list = [[_hit_to_record(hit) for hit in hits] for hits in result]

In [57]:
# Inspect the hits for the second query (the summary of row 1).
result_list[1]

[{'filename': '002.txt',
  'title': 'Dollar gains on Greenspan speech',
  'content': ' The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.  And Alan Greenspan highlighted the US government\'s willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan\'s speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman\'s taking a much more sanguine view on the current account deficit than he\'s taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He\'s taking a lo