In [1]:
import pandas as pd

In [2]:
# Load the product catalogue from a CSV file (path is relative to the working directory).
df = pd.read_csv('DATA/products_with_tags_and_description_seperated.csv')

In [3]:
def to_string(row):
    """Join a product's tag attributes into one comma-separated string.

    The fields 'color', 'craft', 'fabric' and 'Product Types' are read from
    ``row`` in that order and joined with ", ".

    Args:
        row: Mapping-like object exposing ``get(key, default)``, e.g. a
            pandas Series (one DataFrame row) or a plain dict.

    Returns:
        str: The four values joined by ", ". A field that is absent, None or
        NaN contributes an empty string, so the literal text "nan"/"None"
        never ends up in the string that is later embedded.
    """
    def _clean(value):
        # pd.isna covers None, float('nan') and pd.NA.
        # Assumes scalar cell values (as produced by read_csv) — TODO confirm.
        return '' if pd.isna(value) else str(value)

    return ", ".join(
        _clean(row.get(column, ''))
        for column in ('color', 'craft', 'fabric', 'Product Types')
    )

In [4]:
from sentence_transformers import SentenceTransformer
MODEL_NAME = 'WhereIsAI/UAE-Large-V1'
# Let SentenceTransformer pick the device itself: with no explicit device it
# uses a GPU when one is available and otherwise falls back to the CPU, so the
# notebook no longer crashes on machines without CUDA.
model = SentenceTransformer(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Embed the lowercase 'description' column.
# NOTE(review): unlike 'Description' below, missing values are not filled here — confirm the column has none.
embeddings_description = model.encode(df['description'])
# Replace missing 'Description' values with empty strings before encoding (mutates df in place).
df['Description'] = df['Description'].fillna('')
embeddings_image_description = model.encode(df['Description'])
# The capitalised 'Description' column is the one stored as image_description_vector.
df['image_description_vector'] = embeddings_image_description.tolist()
df['description_vector'] = embeddings_description.tolist()
# Build one combined tag string per row with to_string and embed it as well.
df['Tags'] = df.apply(to_string, axis=1)
embeddings_Tags = model.encode(df['Tags'])
df['Tags_vector'] = embeddings_Tags.tolist()

In [6]:
# Inspect the column names of df.
df.columns

Index(['Unnamed: 0', 'Handle', 'Title', 'Type', 'description', 'craft',
       'fabric', 'color', 'Product Types', 'Description',
       'image_description_vector', 'description_vector', 'Tags',
       'Tags_vector'],
      dtype='object')

In [7]:
# Work on a copy so later edits to df_copy do not modify df.
df_copy = df.copy()

In [8]:
# Remove the 'Unnamed: 0' column in place (presumably a saved index column — verify).
# Not idempotent: re-running this cell raises because the column is already gone.
df_copy.drop(columns=['Unnamed: 0'], inplace = True)

In [9]:
# Confirm the column was dropped.
df_copy.columns

Index(['Handle', 'Title', 'Type', 'description', 'craft', 'fabric', 'color',
       'Product Types', 'Description', 'image_description_vector',
       'description_vector', 'Tags', 'Tags_vector'],
      dtype='object')

In [10]:
# Rename two columns in place: 'Tags_vector' -> 'tags_vector' and 'Product Types' -> 'Product_Types'.
df_copy.rename(columns={'Tags_vector': 'tags_vector'}, inplace=True)
df_copy.rename(columns={'Product Types': 'Product_Types'}, inplace=True)

In [None]:
# Replace missing 'craft' values with "Unknown" and force the column to str.
df_copy['craft'] = df_copy['craft'].fillna("Unknown").astype(str)

In [None]:
# Convert the frame to a list of dicts, one per row, keyed by column name.
data = df_copy.to_dict(orient='records')

In [13]:
# Show the keys of the first record.
data[0].keys()

dict_keys(['Handle', 'Title', 'Type', 'description', 'craft', 'fabric', 'color', 'Product_Types', 'Description', 'image_description_vector', 'description_vector', 'Tags', 'tags_vector'])

In [14]:
# import numpy as np

# for record in data:
#     record["Tags_vector"] = record.pop("tags_vector", [0.0]*1024)  # also sets default if missing    

# default_vector = [0.0] * 1024

# for record in data:
#     if 'tags_vector' not in record or record['tags_vector'] is None or len(record['tags_vector']) != 1024:
#         record['tags_vector'] = default_vector

# Milvus

In [15]:
from pymilvus import MilvusClient, DataType

import os

# Connection settings can be overridden through the MILVUS_URI / MILVUS_TOKEN
# environment variables so credentials need not live in the notebook; the
# defaults are the values this notebook previously hard-coded.
client = MilvusClient(
    uri=os.environ.get("MILVUS_URI", "http://localhost:19530"),
    token=os.environ.get("MILVUS_TOKEN", "root:Milvus"),
)

In [16]:
# Non-primary VARCHAR columns as (field name, max_length), in registration order.
_varchar_columns = [
    ("Title", 512),
    ("Type", 512),
    ('craft', 256),
    ("fabric", 256),
    ("Product_Types", 256),
    ("Tags", 256),
    ("color", 256),
    ("description", 1024),
    ("Description", 1024),
]
# Embedding columns; each is a 1024-dimensional float vector.
_vector_columns = ("image_description_vector", "description_vector", "tags_vector")

schema = MilvusClient.create_schema(auto_id=False)

# 'Handle' is the primary key; IDs are supplied by the caller (auto_id=False).
schema.add_field(field_name="Handle", datatype=DataType.VARCHAR, max_length=512, is_primary=True)
for _column, _max_len in _varchar_columns:
    schema.add_field(field_name=_column, datatype=DataType.VARCHAR, max_length=_max_len)
for _column in _vector_columns:
    schema.add_field(field_name=_column, datatype=DataType.FLOAT_VECTOR, dim=1024)

# Display the finished schema as the cell output.
schema

{'auto_id': False, 'description': '', 'fields': [{'name': 'Handle', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}, 'is_primary': True, 'auto_id': False}, {'name': 'Title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'Type', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'craft', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'fabric', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'Product_Types', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'Tags', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'color', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'description', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length

In [17]:
from pymilvus import MilvusClient

index_params = MilvusClient.prepare_index_params()

# Register one FLAT index with the COSINE metric per vector field; each index
# is named "<field>_index". FLAT takes no additional parameters.
for _vector_field in ("description_vector", "image_description_vector", "tags_vector"):
    index_params.add_index(
        field_name=_vector_field,
        index_type="FLAT",
        index_name=f"{_vector_field}_index",
        metric_type="COSINE",
        params={},
    )


In [18]:
_collection = "SemanticSearch"

# Destructive: any existing collection with this name is dropped before it is
# recreated from the schema and index parameters.
if client.has_collection(collection_name=_collection):
    client.drop_collection(collection_name=_collection)

client.create_collection(collection_name=_collection, schema=schema, index_params=index_params)

In [19]:
# Ask Milvus for the load state of the collection and print it.
res = client.get_load_state(
    collection_name="SemanticSearch"
)
print(res)

{'state': <LoadState: Loaded>}


In [20]:
# Insert every record from `data` into the collection; `res` is reused here and
# now holds the insert response.
res = client.insert(
    collection_name="SemanticSearch",
    # partition_name="partitionA",
    data=data
)

In [21]:
# Display the insert response.
# NOTE(review): the full response is large; showing res['insert_count'] would keep the output short.
res

{'insert_count': 1048, 'ids': ['gaia-co-ordinate-set-3583', 'shiqa-co-ordinate-set-3582', 'arzen-co-ordinate-set-3581', 'alisha-co-ordinate-set-3580', 'cyra-co-ordinate-set-3579', 'elara-co-ordinate-set-3578', 'raisa-co-ordinate-set-3577', 'soha-co-ordinate-set-3576', 'lamya-co-ordinate-set-3575', 'sania-co-ordinate-set-3574', 'selia-co-ordinate-set-3573', 'iris-co-ordinate-set-3572', 'alba-co-ordinate-set-3571', 'eden-co-ordinate-set-3570', 'zafira-co-ordinate-set-3569', 'aya-co-ordinate-set-3568', 'elena-co-ordinate-set-3567', 'amina-co-ordinate-set-3566', 'anisha-co-ordinate-set-3565', 'mehzal-co-ordinate-set-3564', 'izel-tunic-set-3563', 'ziana-tunic-set-3562', 'naz-tunic-set-3561', 'hifza-tunic-set-3560', 'yasira-kaftan-set-3559', 'maliha-kaftan-set-3558', 'huda-long-kurta-set-3557', 'iffat-long-kurta-set-3556', 'yasim-long-kurta-set-3555', 'riva-long-kurta-set-3554', 'mishka-long-anarkali-set-3553', 'parisa-long-anarkali-set-3552', 'quirat-short-kurta-set-3551', 'kenza-short-kurt

In [22]:
# Sample search query, encoded with the same model used for the stored vectors.
query = "white cotton kurta with floral embroidery"
# query = "Blue Floral lehenga with silk work"
query_vector = model.encode(query)

In [23]:
# List the column names of df_copy (used below as search output fields).
df_copy.columns

Index(['Handle', 'Title', 'Type', 'description', 'craft', 'fabric', 'color',
       'Product_Types', 'Description', 'image_description_vector',
       'description_vector', 'Tags', 'tags_vector'],
      dtype='object')

In [28]:
client.load_collection("SemanticSearch")

# Scalar fields returned with every hit.
_scalar_fields = ['Handle', 'Title', 'Type', 'description', 'craft', 'fabric', 'color',
                  'Product_Types', 'Description']

# (vector field to search, order in which the vector fields are requested back)
_search_plan = [
    ("description_vector", ['description_vector', 'image_description_vector', 'tags_vector']),
    ("image_description_vector", ['image_description_vector', 'description_vector', 'tags_vector']),
    ("tags_vector", ['description_vector', 'image_description_vector', 'tags_vector']),
]

# Run the same top-3 cosine search against each vector field in turn:
# res1 = description, res2 = image description, res3 = tags.
res1, res2, res3 = [
    client.search(
        collection_name="SemanticSearch",
        anns_field=_anns_field,
        data=[query_vector],
        limit=3,
        search_params={"metric_type": "COSINE", "params": {}},
        output_fields=_scalar_fields + _vector_order,
    )
    for _anns_field, _vector_order in _search_plan
]

# TODO:
# add descriptions, tags, image_descriptions
# output_fields=["Handle", "Title", "description", "tags", "image_description", *vector_field_all*]


In [29]:
# Display the hits of the tags_vector search.
# NOTE(review): each hit carries three full vectors, so this output is very large.
res3

data: [[{'Handle': 'waheeda-kurta-set-ivory', 'distance': 0.9173080325126648, 'entity': {'Title': 'Waheeda Kurta Set', 'color': 'White', 'Description': 'The dress is a three-piece set featuring an off-white kurta with a round neckline and delicate floral embroidery along the placket, a matching dupatta with a blue and yellow border and floral embroidery, and straight-leg pants with similar floral embroidery at the hem, the kurta also has gold floral print on the lower half.\n', 'image_description_vector': [-0.8935191035270691, -0.38839730620384216, -0.3640506863594055, -0.20863154530525208, -1.06609046459198, 0.20536451041698456, 0.17412212491035461, 0.07187548279762268, 1.1781717538833618, 0.3379130959510803, 0.42605310678482056, 0.3467601537704468, 0.15918071568012238, -0.5695711374282837, -0.527640163898468, 0.6064415574073792, -0.4688110053539276, -0.26306021213531494, -0.7246874570846558, 0.6276862621307373, -0.7943631410598755, 0.5854891538619995, -1.5951021909713745, -0.25123137

In [30]:
# Check which type client.search returns.
type(res1)

pymilvus.client.search_result.SearchResult

In [31]:
# First hit for the first (and only) query vector of the tags_vector search.
res3[0][0]

{'Handle': 'waheeda-kurta-set-ivory', 'distance': 0.9173080325126648, 'entity': {'Title': 'Waheeda Kurta Set', 'color': 'White', 'Description': 'The dress is a three-piece set featuring an off-white kurta with a round neckline and delicate floral embroidery along the placket, a matching dupatta with a blue and yellow border and floral embroidery, and straight-leg pants with similar floral embroidery at the hem, the kurta also has gold floral print on the lower half.\n', 'image_description_vector': [-0.8935191035270691, -0.38839730620384216, -0.3640506863594055, -0.20863154530525208, -1.06609046459198, 0.20536451041698456, 0.17412212491035461, 0.07187548279762268, 1.1781717538833618, 0.3379130959510803, 0.42605310678482056, 0.3467601537704468, 0.15918071568012238, -0.5695711374282837, -0.527640163898468, 0.6064415574073792, -0.4688110053539276, -0.26306021213531494, -0.7246874570846558, 0.6276862621307373, -0.7943631410598755, 0.5854891538619995, -1.5951021909713745, -0.2512313723564148

In [32]:
# Fetch a single product by its primary key, including all three stored vectors.
_lookup_fields = [
    "Handle", "Title", "Type", "description", "Description", "Tags",
    'craft', 'fabric', 'color', "Product_Types",
    "description_vector", "tags_vector", "image_description_vector",
]
res = client.query(
    collection_name="SemanticSearch",
    filter='Handle == "waheeda-kurta-set-ivory"',
    output_fields=_lookup_fields,
)
res

data: ["{'Title': 'Waheeda Kurta Set', 'Type': 'Kurta Set', 'fabric': 'Cotton', 'color': 'White', 'description_vector': [np.float32(-0.72806764), np.float32(-0.057942532), np.float32(0.12301643), np.float32(-0.6942318), np.float32(-0.559206), np.float32(-0.31156117), np.float32(-0.22777292), np.float32(0.40123862), np.float32(0.8326455), np.float32(0.63694215), np.float32(-0.2673614), np.float32(0.1153076), np.float32(-0.22410719), np.float32(-0.31239498), np.float32(-0.17531314), np.float32(0.24429134), np.float32(0.17356911), np.float32(-0.049336497), np.float32(-0.114058346), np.float32(0.92679495), np.float32(0.17132172), np.float32(0.93840545), np.float32(-1.3324668), np.float32(-0.9472834), np.float32(0.5641205), np.float32(1.0980657), np.float32(-0.35615668), np.float32(-0.2037988), np.float32(1.0429326), np.float32(0.3231222), np.float32(-0.67500806), np.float32(0.84501046), np.float32(0.39708), np.float32(-1.0783012), np.float32(-0.004996522), np.float32(0.12678026), np.float3

In [33]:
# Show which fields came back for the queried product.
res[0].keys()

dict_keys(['Title', 'Type', 'fabric', 'color', 'description_vector', 'tags_vector', 'image_description_vector', 'description', 'Description', 'Tags', 'craft', 'Product_Types', 'Handle'])

In [34]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def best_results(image_description_result, description_result, tags_result, model, query, client,
                 collection_name="SemanticSearch"):
    """Merge the hits of three vector searches and re-rank them by mean cosine distance.

    Every handle found in any of the three search results is looked up in
    Milvus. Each stored vector of that product is compared with the embedding
    of ``query``, and the products are returned sorted by the average of the
    available cosine distances (smallest first).

    Args:
        image_description_result: Search result for ``image_description_vector``.
        description_result: Search result for ``description_vector``.
        tags_result: Search result for ``tags_vector``.
            Only the handles of the hits are used, so the outcome does not
            depend on which result is passed in which of these three slots.
        model: Object whose ``encode(query)`` returns the query embedding.
        query: Raw user query text.
        client: Milvus client used to fetch each candidate's stored fields.
        collection_name: Collection to query; defaults to ``"SemanticSearch"``.

    Returns:
        list[dict]: One entry per product found, sorted by ascending
        ``"distance"``. Each entry holds ``"handle"``, the mean ``"distance"``,
        one cosine distance per available vector field, and the fields
        'description', 'Description', 'fabric', 'craft', 'color' and
        'Product_Types'. Handles that cannot be fetched, or that have no
        stored vector, are skipped.
    """
    vector_fields = ('description_vector', 'image_description_vector', 'tags_vector')
    metadata_fields = ('description', 'Description', 'fabric', 'craft', 'color', 'Product_Types')

    # Collect candidate handles, de-duplicated in first-seen order. A dict is
    # used instead of a set so the order (and tie-breaking in the final sort)
    # is deterministic.
    handles = {}
    for result in (description_result, tags_result, image_description_result):
        hits = result[0] if result else []
        for hit in hits:
            handle = hit.entity.get('Handle')
            if handle:
                handles[handle] = None

    query_vector = np.asarray(model.encode(query))
    if query_vector.ndim == 1:
        # cosine_similarity expects 2-D inputs: one row per sample.
        query_vector = query_vector.reshape(1, -1)

    final_result = []
    for handle in handles:
        rows = client.query(collection_name=collection_name,
                            filter=f'Handle == "{handle}"',
                            output_fields=["Handle", "Title", "Type", "description", "Description", "Tags", 'craft', 'fabric', 'color', "Product_Types", "description_vector", "tags_vector", "image_description_vector"]
                            )
        # Check for an empty response before indexing into it.
        if not rows or not rows[0]:
            continue
        record = rows[0]

        dist_dict = {}
        for vec_name in vector_fields:
            stored = record.get(vec_name)
            # A missing vector is skipped rather than passed to cosine_similarity.
            if stored is None or len(stored) == 0:
                continue
            product_vec = np.asarray(stored).reshape(1, -1)
            dist_dict[vec_name] = 1 - cosine_similarity(query_vector, product_vec)[0][0]

        if not dist_dict:
            # Nothing to average; also avoids a division by zero.
            continue

        result_entry = {"handle": handle,
                        "distance": sum(dist_dict.values()) / len(dist_dict)}
        result_entry.update(dist_dict)
        for field in metadata_fields:
            result_entry[field] = record.get(field)
        final_result.append(result_entry)

    final_result.sort(key=lambda entry: entry["distance"])

    return final_result

In [35]:
from pymilvus import MilvusClient

def similarity_search_VDB(user_query, model, client, top_k=5):
    """Search the three vector fields for ``user_query`` and return merged, re-ranked hits.

    The query is embedded once, a COSINE search is run against
    ``description_vector``, ``image_description_vector`` and ``tags_vector``
    of the "SemanticSearch" collection, and the three results are handed to
    ``best_results`` for merging.

    Args:
        user_query: Raw query text.
        model: Object whose ``encode(user_query)`` returns the query embedding.
        client: Milvus client exposing ``search`` (and whatever ``best_results`` needs).
        top_k: Maximum number of hits requested per vector field.

    Returns:
        Whatever ``best_results`` returns for the three search results.
    """
    query_vector = model.encode(user_query)

    def _search(vector_field):
        # Identical search settings for every vector field; only the field differs.
        return client.search(
            collection_name="SemanticSearch",
            anns_field=vector_field,
            data=[query_vector],
            limit=top_k,
            search_params={"metric_type": "COSINE", "params": {}},
        )

    description_result = _search("description_vector")
    image_description_result = _search("image_description_vector")
    tags_result = _search("tags_vector")

    # Pass by keyword: best_results declares image_description_result first,
    # so the previous positional call swapped the description and image results.
    return best_results(
        image_description_result=image_description_result,
        description_result=description_result,
        tags_result=tags_result,
        model=model,
        query=user_query,
        client=client,
    )

In [44]:
# Run the combined search for a sample query.
# NOTE(review): the query text contains the typo "taht"; it is embedded as written.
query = "lehenga taht has silver borders and floral embroidery"
results = similarity_search_VDB(user_query=query, model=model, client=client)

In [45]:
# Display the merged results (sorted by ascending mean distance in best_results).
results

[{'handle': 'aaloka-lehenga-set-red-gold',
  'distance': np.float32(0.23092555),
  'description_vector': np.float32(0.2229197),
  'description': "Embrace elegance in our Red Handwoven Silk Tissue Double Layer 20 Kaali Lehenga Set. The lehenga features a handwoven red brocade inner layer with gold and silver aari work. Paired with a handwoven brocade embroidered blouse and a gold and silver tissue fabric blocked dupatta finished with a red tissue border and an old gold kiran, this ensemble is a masterpiece of artistry.This handwoven ensemble showcases unique charm and character with slight irregularities in pattern, colour, or texture, celebrating the craftsmanship and individuality of each piece. Embrace these variations as part of the product's story, crafted with care and attention to detail.",
  'Description': 'The dress is a red lehenga with intricate white and gold floral embroidery, featuring a V-neck blouse with similar embroidery, a high-waisted skirt with tiered detailing and 