# Semantic Search 

In [2]:
import numpy as np
import pandas as pd
import faiss
import warnings
warnings.filterwarnings('ignore')


In [6]:
# Load only the first 500 catalog rows. `nrows` stops parsing after 500 records
# instead of reading the whole CSV and slicing it afterwards, and it returns a
# fresh DataFrame (RangeIndex 0..499) rather than a slice of a larger one.
df = pd.read_csv('myntra_products_catalog.csv', nrows=500)

In [7]:
# Preview the first five rows of the catalog.
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [8]:
# (rows, columns) of the loaded subset.
df.shape

(500, 8)

In [9]:
# Show column names, non-null counts and dtypes of the loaded subset.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     500 non-null    int64 
 1   ProductName   500 non-null    object
 2   ProductBrand  500 non-null    object
 3   Gender        500 non-null    object
 4   Price (INR)   500 non-null    int64 
 5   NumImages     500 non-null    int64 
 6   Description   500 non-null    object
 7   PrimaryColor  468 non-null    object
dtypes: int64(3), object(5)
memory usage: 31.4+ KB


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.

df.describe()

Unnamed: 0,ProductID,Price (INR),NumImages
count,500.0,500.0,500.0
mean,9649705.0,1875.462,4.612
std,1767212.0,3127.722934,1.213425
min,1000182.0,266.0,1.0
25%,10003500.0,653.5,4.0
50%,10012390.0,966.5,5.0
75%,10015170.0,1620.0,5.0
max,10018080.0,31100.0,7.0


In [11]:
# Count missing values per column.

df.isnull().sum()

ProductID        0
ProductName      0
ProductBrand     0
Gender           0
Price (INR)      0
NumImages        0
Description      0
PrimaryColor    32
dtype: int64

In [12]:
# PrimaryColor is the only column with missing values, so fill just that column
# with the "None" placeholder; a blanket fill would also write the string into
# numeric columns if they ever contained NaNs. Reassigning instead of using
# `inplace=True` keeps the cell idempotent and avoids mutating a frame that may
# be a slice of another one.
df = df.fillna({"PrimaryColor": "None"})

In [14]:
# Re-count missing values per column to confirm none remain after the fill.
df.isna().sum()

ProductID       0
ProductName     0
ProductBrand    0
Gender          0
Price (INR)     0
NumImages       0
Description     0
PrimaryColor    0
dtype: int64

In [15]:
# Count rows that are exact duplicates of an earlier row.

df.duplicated().sum()

0

# Convert the Relevant Field to Vectors Using a BERT-Style Sentence-Transformer Model

In [17]:
!pip install tf-keras

Collecting tf-keras
  Using cached tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Using cached tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0


In [18]:
# Load the sentence-embedding model used to vectorize product descriptions.
# NOTE(review): this import presumably stays below the tf-keras install cell
# because importing sentence_transformers needs that package — confirm.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [19]:
# Encode every description in a single batched call instead of one `encode`
# call per row: passing a list lets the model process the texts in batches.
# The result is a 2-D array with one embedding per input, in input order.
description_embeddings = model.encode(df['Description'].tolist())

# Store one 1-D vector per row; building the Series on df.index keeps each
# embedding attached to the row it was computed from.
df['DescriptionVector'] = pd.Series(list(description_embeddings), index=df.index)

In [20]:
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,DescriptionVector
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black,"[0.027645921, -0.00263416, -0.0035883936, 0.05..."
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige,"[-0.02466071, -0.028755339, -0.020332504, 0.03..."
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink,"[-0.046943247, 0.081827976, 0.048335172, -0.00..."
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue,"[-0.015098776, -0.010285394, 0.009487305, -0.0..."
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White,"[-0.017746579, 0.0062096375, 0.021813972, 0.02..."


In [None]:
# Combine the per-row embeddings into one 2-D float32 matrix
# (one row per product, one column per embedding component).
vectors = np.stack(df['DescriptionVector'].to_numpy()).astype("float32")
dimension = vectors.shape[-1]

# Flat L2 index sized to the embedding width, populated with every product vector.
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

In [23]:
def semantic_search(query: str, top_k: int = 5):
    """Return the catalog rows whose descriptions are closest to ``query``.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level FAISS ``index``; hits are mapped back to rows of ``df``
    by position, so ``df`` must be in the same row order the index was built
    from.

    Args:
        query: Free-text search phrase.
        top_k: Maximum number of rows to return. Values larger than the number
            of indexed vectors are clamped to the index size.

    Returns:
        The matching rows of ``df``, nearest first, with an added ``score``
        column holding the distance reported by the index (for the
        ``IndexFlatL2`` built in this notebook, smaller means more similar).

    Raises:
        ValueError: If ``top_k`` is less than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # FAISS pads the result with label -1 when asked for more neighbours than
    # the index holds, and `df.iloc[-1]` would silently return the last row.
    # Never request more than the index contains.
    k = min(top_k, index.ntotal)
    if k == 0:
        # Empty index: nothing to search, return an empty result of the same shape.
        return df.iloc[:0].assign(score=np.empty(0, dtype="float32"))

    query_vector = model.encode([query]).astype("float32")
    distances, indices = index.search(query_vector, k)

    # Drop any remaining -1 padding before positional lookup.
    found = indices[0] >= 0
    return df.iloc[indices[0][found]].assign(score=distances[0][found])



In [28]:
# Run an example query and print the columns of interest.
results = semantic_search("Blue Shoes", top_k=5)
display_columns = ['ProductName', 'Description', 'PrimaryColor', 'Price (INR)', 'score']
print(results[display_columns])

import plotly.graph_objects as go

# Render the same columns as a Plotly table: one header cell and one value
# column per entry in display_columns.
table_header = dict(
    values=list(display_columns),
    fill_color='paleturquoise',
    align='left',
)
table_cells = dict(
    values=[results[column] for column in display_columns],
    fill_color='lavender',
    align='left',
)
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.show()

                                         ProductName  \
310                           Puma Men Blue Sneakers   
495                           Puma Men Blue Sneakers   
362  ID Men Navy Blue Solid Leather Mid-Top Sneakers   
472                      Puma Men Navy Blue Sneakers   
483                  ID Men Navy Blue Leather Derbys   

                                           Description PrimaryColor  \
310  A pair of round-toe blue sneakers, has regular...         Blue   
495  A pair of round-toe blue sneakers, has regular...         Blue   
362  A pair of round-toe navy blue sneakers, has mi...         Blue   
472  A pair of round-toe navy blue & white sneakers...         Blue   
483  A pair of round-toe navy blue derbys, has regu...         Blue   

     Price (INR)     score  
310         1799  0.627885  
495         1749  0.627885  
362         1286  0.713041  
472         1959  0.715494  
483         2685  0.785897  


In [29]:
# Save the catalog, including the DescriptionVector column, to a pickle file
# (presumably so the embeddings can be reloaded without re-encoding — confirm).
df.to_pickle("products_with_vectors.pkl")