### Using Vector DB and LLMs for Entity Resolution

We will use sentence transformers to encode text to embeddings and use Vector DB to store and lookup said embeddings. The vector db used for this tutorial is <a href='https://milvus.io/'>Milvus</a>.

### Starting the DB Instance locally

In [13]:
from milvus import default_server
from pymilvus import connections, utility

'''with default_server:
    connections.connect(host='localhost', port=default_server.listen_port)
    print(utility.get_server_version())'''
default_server.start()
#default_server.stop()



    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.8-lite
 Process:   28663
 Started:   2023-05-23 17:30:53
 Config:    /Users/abdul.jilani/.milvus.io/milvus-server/2.2.8/configs/milvus.yaml
 Logs:      /Users/abdul.jilani/.milvus.io/milvus-server/2.2.8/logs

 Ctrl+C to exit ...


In [15]:
print(utility.get_server_version())

v2.2.8-lite


In [115]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

### Connecting and Inserting embeddings

In [116]:
connections.connect("default", host="localhost", port="19530")

In [151]:
data_records = [[0,'Steve King; 82-84 Kent Rd, North Ryde NSW 2113'],
             [1,'Stephen King; 82-84 Kent Road, North Ryde New South Wales 2113'],
             [2,'George Orwell; 82,84 Kent Road, North Ryde NSW 2113'],
             [3,'William Shakespeare; 610 Bells Line of Rd, Kurmond NSW 2757'],
             [4,'Bill Shakespeare; 980 Bells Line of Road, Kurrajong Heights 2758 NSW']]

In [152]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
data_vecs = [model.encode(record[1]).tolist() for record in data_records]

In [155]:
from pymilvus import utility
utility.drop_collection("person_lookup")

In [156]:
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="data_record", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384)
]
schema = CollectionSchema(fields, "person records store")
person_lookup = Collection("person_lookup", schema)

In [158]:
entities = [
    [i[0] for i in data_records],  # id
    [i[1] for i in data_records],  # details
    data_vecs,  # embeddings
]
insert_result = person_lookup.insert(entities)
person_lookup.flush()  

In [159]:
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
person_lookup.create_index("embeddings", index)

Status(code=0, message=)

### Search Query examples 

In [160]:
person_lookup.load()
vectors_to_search = [data_vecs[0]]
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 384},
}
result = person_lookup.search(vectors_to_search, "embeddings", 
                              search_params, limit=3, 
                              output_fields=["pk","data_record"])

In [161]:
[res for res in result[0]]

[id: 0, distance: 0.0, entity: {'pk': 0, 'data_record': 'Steve King; 82-84 Kent Rd, North Ryde NSW 2113'},
 id: 1, distance: 10.103018760681152, entity: {'pk': 1, 'data_record': 'Stephen King; 82-84 Kent Road, North Ryde New South Wales 2113'},
 id: 2, distance: 11.821717262268066, entity: {'pk': 2, 'data_record': 'George Orwell; 82,84 Kent Road, North Ryde NSW 2113'}]

In [173]:
entity_to_search = 'Stephen King; 53/57 Rawson St, Epping NSW 2121'
vectors_to_search = scaler.transform(model.encode(entity_to_search).reshape(1,-1)).tolist()
search_params = {
    "metric_type": "L2",#"IP",
    "params": {"nprobe": 384},
}
result = person_lookup.search(vectors_to_search, "embeddings", 
                              search_params, limit=2, 
                              output_fields=["pk","data_record"])
[res for res in result[0]]

[id: 0, distance: 359.0757751464844, entity: {'pk': 0, 'data_record': 'Steve King; 82-84 Kent Rd, North Ryde NSW 2113'},
 id: 1, distance: 366.83892822265625, entity: {'pk': 1, 'data_record': 'Stephen King; 82-84 Kent Road, North Ryde New South Wales 2113'}]

In [174]:
entity_to_search = 'Steve King; Kent Rd, North Ryde NSW 2113'
vectors_to_search = scaler.transform(model.encode(entity_to_search).reshape(1,-1)).tolist()
search_params = {
    "metric_type": "L2",#"IP",
    "params": {"nprobe": 384},
}
result = person_lookup.search(vectors_to_search, "embeddings", 
                              search_params, limit=2, 
                              output_fields=["pk","data_record"])
[res for res in result[0]]

[id: 0, distance: 253.13133239746094, entity: {'pk': 0, 'data_record': 'Steve King; 82-84 Kent Rd, North Ryde NSW 2113'},
 id: 1, distance: 276.904296875, entity: {'pk': 1, 'data_record': 'Stephen King; 82-84 Kent Road, North Ryde New South Wales 2113'}]

In [167]:
res = person_lookup.query(
  expr = "pk >= 0",
  output_fields=["pk","data_record"]
)
[res1 for res1 in res]


[{'pk': 0, 'data_record': 'Steve King; 82-84 Kent Rd, North Ryde NSW 2113'},
 {'pk': 1,
  'data_record': 'Stephen King; 82-84 Kent Road, North Ryde New South Wales 2113'},
 {'pk': 2,
  'data_record': 'George Orwell; 82,84 Kent Road, North Ryde NSW 2113'},
 {'pk': 3,
  'data_record': 'William Shakespeare; 610 Bells Line of Rd, Kurmond NSW 2757'},
 {'pk': 4,
  'data_record': 'Bill Shakespeare; 980 Bells Line of Road, Kurrajong Heights 2758 NSW'}]

In [177]:
default_server.stop()