In [1]:
# %pip install pymilvus opencv-python



Note: you may need to restart the kernel to use updated packages.


In [1]:
import csv
from glob import glob
from pathlib import Path
from statistics import mean

# from towhee import pipe, ops, DataCollection
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility


In [2]:
def create_milvus_collection(collection_name, dim, drop_existing=True):
    """Create a Milvus collection for image embeddings and index its vector field.

    The collection has two fields:

    * ``path``: VARCHAR primary key (max 500 characters) supplied by the
      caller (``auto_id=False``).
    * ``embedding``: FLOAT_VECTOR of size ``dim``.

    A FLAT index using the L2 metric is created on ``embedding``.

    Args:
        collection_name: Name of the collection to create.
        dim: Dimensionality of the embedding vectors.
        drop_existing: When True (the default, which matches the previous
            behaviour) an existing collection with the same name is dropped
            first, DELETING ALL OF ITS DATA. When False, an existing
            collection is returned as-is and nothing is recreated.

    Returns:
        The ``Collection`` handle.
    """
    if utility.has_collection(collection_name):
        if not drop_existing:
            # Reuse the existing collection instead of wiping it.
            return Collection(name=collection_name)
        utility.drop_collection(collection_name)

    fields = [
        FieldSchema(name='path', dtype=DataType.VARCHAR, description='path to image', max_length=500,
                    is_primary=True, auto_id=False),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='image embedding vectors', dim=dim)
    ]
    schema = CollectionSchema(fields=fields, description='efiss-image-search')
    collection = Collection(name=collection_name, schema=schema)

    index_params = {
        'metric_type': 'L2',
        'index_type': 'FLAT',
        # FLAT takes no build parameters; `nlist` is an IVF_* setting and was
        # misleading here, so it is no longer passed.
        'params': {}
    }
    collection.create_index(field_name='embedding', index_params=index_params)
    return collection

In [2]:
# Connect to the Milvus server. The endpoint can be overridden without editing
# this cell by setting MILVUS_HOST / MILVUS_PORT (e.g. MILVUS_HOST=localhost
# for a local instance); the defaults keep the previously hardcoded endpoint.
import os

connections.connect(
    host=os.environ.get('MILVUS_HOST', '34.87.182.49'),
    port=os.environ.get('MILVUS_PORT', '19530'),
)

In [4]:
# (Re)create the collection for 768-dimensional embeddings.
# WARNING: create_milvus_collection drops any existing collection with this name.
collection = create_milvus_collection(collection_name='efiss_image_search', dim=768)

In [5]:
import torch

# Load the precomputed embedding tensor. `map_location='cpu'` lets a file that
# was saved from a GPU load on a machine without one; every later use converts
# to CPU/numpy anyway.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
embeddings = torch.load(
    '/media/thaiminhpv/Storage/MinhFileServer/Public-Filebrowser/Uploads/EFISS/vit-embeddings.pt',
    map_location='cpu',
)

In [6]:
import os
import sys
import pandas as pd

# Read the database listing: one entry per line, surrounding whitespace removed.
# The result is a single-column DataFrame whose column label is 0.
with open('/media/thaiminhpv/Storage/MinhFileServer/Public-Filebrowser/Uploads/EFISS/database_info.txt', 'r') as f:
    df = pd.DataFrame([line.strip() for line in f])

In [7]:
# Display the loaded listing (pandas truncates the rendering to head/tail rows).
df

Unnamed: 0,0
0,data/shopee_crop_yolo/images/6460ab0252e365505...
1,data/shopee_crop_yolo/images/6460ab0252e365505...
2,data/shopee_crop_yolo/images/6460ab0252e365505...
3,data/shopee_crop_yolo/images/6460ab0252e365505...
4,data/shopee_crop_yolo/images/6460ab0252e365505...
...,...
1347831,data/shopee_crop_yolo/images/6462d71a614eb2054...
1347832,data/shopee_crop_yolo/images/6462d71a614eb2054...
1347833,data/shopee_crop_yolo/images/6462d71a614eb2054...
1347834,data/shopee_crop_yolo/images/6462d71a614eb2054...


In [8]:
# Remove the dataset directory prefix from every path.
# `regex=False` makes this a literal substring replacement regardless of the
# installed pandas version (the default for `regex` changed between releases).
# Note: this overwrites column 0 in place; re-running the cell is harmless
# because the prefix is already gone.
df[0] = df[0].str.replace('data/shopee_crop_yolo/', '', regex=False)

In [9]:
# Take the first 1000 rows of both the path table and the embedding tensor as a
# small sample, then show both shapes to confirm they line up.
_df = df.head(1000)
_embeddings = embeddings[:1000]
(_df.shape, _embeddings.shape)

((1000, 1), torch.Size([1000, 768]))

In [10]:
# collection.insert(data={
#     "path": _df[0].values.tolist(),
#     "embedding": _embeddings.cpu().numpy()
# })

In [25]:
# Smoke test: insert only the first 10 (path, embedding) pairs of the sample.
# Column order follows the schema: primary-key paths first, then vectors.
# NOTE(review): these 10 rows are also covered by the full chunked insert
# further down; running both against the same collection would submit them
# twice. Confirm how duplicate primary keys are handled.
collection.insert(data=[
    _df[0].iloc[:10].tolist(),
    _embeddings[:10].cpu().numpy(),
])

(insert count: 10, delete count: 0, upsert count: 0, timestamp: 443396698044366851, success count: 10, err count: 0)

In [11]:
from tqdm import tqdm, trange

In [12]:
chunk_size = 1000

# Fail fast if paths and embeddings are misaligned: otherwise the mismatch only
# surfaces part-way through a multi-minute insert, or trailing embeddings are
# silently never inserted.
if len(df) != len(embeddings):
    raise ValueError(
        f"paths and embeddings are misaligned: {len(df)} paths vs {len(embeddings)} embeddings"
    )

# Insert into Milvus chunk by chunk; row i of `df` pairs with row i of `embeddings`.
for i in trange(0, len(df), chunk_size):
    collection.insert(data=[
        df[0].iloc[i:i+chunk_size].values.tolist(),
        embeddings[i:i+chunk_size].cpu().numpy()
    ])

100%|██████████| 1348/1348 [08:59<00:00,  2.50it/s]


In [3]:
# Get a handle to the already-populated collection by name (no schema is
# passed, so nothing is created or dropped here).
collection = Collection(name="efiss_image_search")

In [4]:
# Load the collection before running the search below.
collection.load()

In [6]:
# Sanity check: number of entities Milvus reports for the collection.
collection.num_entities

1347836

In [7]:
# len(collection.query(expr='path != "milvus.ipynb"'))

In [58]:
# Query with the stored embedding at index 10 and fetch its 10000 nearest
# neighbours by L2 distance, returning the `path` field for each hit.
# Optional knobs intentionally left unset: expr, offset, ignore_growing,
# nprobe, consistency_level.
query_vector = embeddings[10].cpu().numpy()
search_param = {
    "metric_type": "L2",
    "params": {},
}
result = collection.search(
    data=[query_vector],
    anns_field="embedding",
    param=search_param,
    limit=10000,
    output_fields=['path'],
)

In [60]:
# Number of hits returned for the first (and only) query vector.
len(result[0].distances)

10000

In [31]:
# Check the Python type of the returned ids (the primary key is the `path` field).
type(result[0].ids[0])

str

In [109]:
# Number of hits returned for the first query vector.
# NOTE(review): the recorded output differs from the earlier identical cell;
# presumably this ran after a search with a different `limit`. Confirm.
len(result[0].distances)

1000