In [22]:
import os
from dotenv import load_dotenv
load_dotenv()

True

# Milvus to manage vector datasets
Milvus work as vector store to enable quick document queries. We compute document embeddings using a small BERT model for semantic search.

### Download the dataset
Put "wikihow.csv" dataset in the "file" directory. Please download in here URL: `https://ibm.box.com/s/8nvanf974t35d89cmibk75e3gc6d1pbo`

In [49]:
WH_PATH = "file/wikihow.csv"
WH_PATH

'file/wikihow.csv'

### Load and check the data
Observe that some of the data could be cleaner:
- One of the titles seems to be mistakenly registered as a sectionLabel
- Some odd codepoint choices, for example for apostrophes
- Some titles end in spurious numbers

In [24]:
import pandas
doc = pandas.read_csv(WH_PATH)

In [25]:
doc_indexed = doc.set_index(['title', 'headline']).sort_index()

In [26]:
doc_indexed.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,overview,text,sectionLabel
title,headline,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
How to Zydeco,\nTry a side step.,Zydeco dancing is type of Cajun dancing perfo...,If you already have the rock step down (or ev...,Adding Movement
How to Zydeco,\nTry the open position.,Zydeco dancing is type of Cajun dancing perfo...,"The open position is, as it sounds, much more...",Learning the Closed and Open Position
How to Zydeco,\nUse a rock step.,Zydeco dancing is type of Cajun dancing perfo...,"Often, you'll just be shifting your weight ba...",Adding Movement
How to Zydeco,\nUse dance techniques for the extra beat.,Zydeco dancing is type of Cajun dancing perfo...,It can be hard to remember to hold for the ex...,Learning the Beat
,\nInsert the following into your <head> section:\n\n\n\n\n\n,Do you want to change the user's cursor when ...,"Steps,Tips,Related wikiHows",How to Set Cursors for Webpage Links


### Create Milvus connection
We will interact with our Milvus instance using the official pymilvus library. Alternatively, it is possible to use LangChain's Milvus vectorstores class to add documents instance. In that case, a simple `from_documents` or `from_texts` (or similar) will generate the collection using the correct settings expected by LangChain.

In [50]:
# Milvus requires a connection for all operations. Remember to disconnect at the end.

from pymilvus import connections
connections.connect(
  alias="default",
  host=os.getenv("MILVUS_HOST", None),
  port=os.getenv("MILVUS_PORT", None)
)

### Create schema for the milvus store
If a collection with the same name but a different schema exists, Milvus may throw a SchemaNotReady exception.
Also the text fields' max length is in bytes not characters. Even though it's possible to get the byte size of the string and trim it to fit the byte limits in the schema, there are finicky bits and it's better to simply set limits to the max allowable (65535).

fields in the collection must follow some special rules:
- The primary key must be called pk
- The vector must be called vector
- The text entry must be called text

In [28]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, utility

In [29]:
# Milvus also supports schemaless operations if `enable_dynamic_fields=True`.

MAX_TITLE = 512
MAX_TEXT = 1024
MAX_VEC = 384

NAME = "WikiHow"

if NAME in utility.list_collections():
    whcollection = Collection(NAME)
    whcollection.drop()

whschema = CollectionSchema(
    fields=[
        FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=65535, default_value=""),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535, default_value=""),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384)
    ],
    enable_dynamic_fields=False,
    description="WikiHow collection"
)
whcollection = Collection(
    name=NAME,
    schema=whschema,
    consistency_level="Session" # Make sure we read our own writes, otherwise allowed to be a bit out of date.
)

### Batch-wise insertion into milvus
Use small BERT model to compute embeddings for our documents to place in the milvus store.

The choice of batch size in this example is arbitrary, and a double-batch system may be preferable to accomodate both the embedding model and milvus.

When the embedding model runs on GPU, the batch size should be selected so as to optimize the transfer-to-memory vs runtime overheads (too small and a major amount of time will be wasted on memory transfers instead of embedding proper, too large and it won't fit on the device).
If the model is accessed over the network, the batch size should be selected with the same concerns in mind, although further overhead may be incurred depending on how the model is scheduled or how the API is designed.

With regard to milvus, the idea is the same: a batch size that's too small means incurring milvus' operational overhead along with communication overhead. The other tradeoff of note regards any temporary processing or data streaming that may occur: a higher batch size also implies loading more data into memory and possibly generating longer-lasting temporary artifacts before submitting the data to milvus, after which it can all be discarded.



### Load embeddings
use HuggingFaceEmbeddings with the MiniLM BERT model.

In [30]:
import langchain
from langchain.embeddings import HuggingFaceEmbeddings

In [31]:
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

In [32]:
BATCH_SIZE = 2048

batch = []
def insert_data(data):
    import math

    batch = []

    titles = list(data.keys())

    vecs = embeddings.embed_documents(titles)
    
    entries = [[], [], []]

    for b, title in enumerate(titles):
        text = title + ":\n"
        for cat in data[title]:
            text += cat + ":\n"
            text += "\n".join(data[title][cat])
            
        title_len_diff = len(title.encode('utf-16-le')) - len(title)
        text_len_diff = len(text.encode('utf-16-le')) - len(text)
        entries[0].append(title[:MAX_TITLE - title_len_diff])
        entries[1].append(text[:MAX_TEXT - text_len_diff])
        entries[2].append(vecs[b])

    whcollection.insert(entries)

import collections, tqdm
doc_data = collections.defaultdict(lambda: collections.defaultdict(list))
for i in tqdm.tqdm(range(len(doc_indexed)), total=len(doc_indexed)):
    if (type(doc_indexed.index[i][0]) is not str) or (type(doc_indexed.index[i][1]) is not str):
        continue
    die = False
    for col in ['text', 'overview', 'sectionLabel']:
        if type(doc_indexed.iloc[i][col]) is not str:
            die = True
            break
    if die:
        continue
    section_head = doc_indexed.index[i][0] + " (" + doc_indexed.iloc[i]['overview'].strip() + ")"
    category = doc_indexed.index[i][1]
    step = " ".join(map(lambda x: x.strip(), doc_indexed.iloc[i][['sectionLabel', 'text']]))

    if len(doc_data) % BATCH_SIZE == 1 and len(doc_data) != 1:
        insert_data(doc_data)
        doc_data = collections.defaultdict(lambda: collections.defaultdict(list))
    doc_data[section_head][category].append(step)
    if i == len(doc_indexed) - 1:
        insert_data(doc_data)

100%|██████████| 1585695/1585695 [24:50<00:00, 1063.65it/s] 


In [38]:
doc_data

defaultdict(<function __main__.<lambda>()>,
            {"How to Write an Introduction Paragraph5 (When writing an introduction paragraph, you should always include a hook to capture the reader's attention, supporting information about the topic at hand, and a thesis statement. That said, there are still multiple introduction paragraphs you can use for your paper. This article will describe a few common ones, as well as some that you might not have seen.)": defaultdict(list,
                         {'\nProvide context for the quotation while bridging into the topic.': ['Words of Wisdom Context can who spoke or wrote the words originally, what the words are referring to, the time period the quotation came from, or how the quotation addresses your topic.\n\n\nNote that unless the quotation is anonymous, you must always state who is responsible for it.\nThis context will also introduce the topic of your paper and lead into supporting details that can introduce your thesis.'],
           

In [40]:
# Milvus will not seal segments that are too small, a flush is necessary to force it.
whcollection.flush()

## Create index and connect
Search can be accelerated significantly by creating an index on the vector. Here we use L2 similarity with a flat index using inverted files (`IVF_FLAT`).

If using the langchain milvus store interface, now is a good time to disconnect as well. Otherwise, now is the time to load the collection.

In [None]:
whcollection.create_index(field_name="vector", index_params={"metric_type": "L2", "index_type": "IVF_FLAT", "nlist": "1024"})

In [48]:
whcollection.load()
# To actually use the data, we would have to do a `whcollection.load()` before any queries.
# Once done with queries, we should then use `whcollection.release()` to stop using resources

## Disconnect
Unload the collection to stop using up resources, then close the connection. We're done!

In [None]:
whcollection.release()
connections.disconnect("default")