In [1]:
#!pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.7.4-cp39-cp39-macosx_11_0_arm64.whl (2.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [1]:
import faiss
import numpy as np
import pandas as pd


## Create an index

In [11]:
image_embeddings = np.load("image_embeddings.npy")
df = pd.read_csv("H&M_items.csv", dtype = str)
df.head()
print(image_embeddings.shape)

(104835, 512)


In [12]:
list(df[df.article_id.isin(['0108775015'])].index)

[0]

In [37]:
I[0]

array([  100,     0, 73844, 35177])

In [35]:
df.iloc[I]

ValueError: Buffer has wrong number of dimensions (expected 1, got 3)

In [14]:
d=512
index = faiss.IndexFlatIP(d)
index.add(image_embeddings.astype('float32'))

In [15]:
# Save the index to a file
faiss.write_index(index, "index.faiss")

## Search index

In [16]:
# Perform a search
k = 4  # number of nearest neighbors
#either IDSelectorBatch, or IDSelectorArray for filtered search
D, I = index.search(image_embeddings[100].reshape((1,512)).astype('float32'),
                    k,
                    params=faiss.SearchParameters(sel = faiss.IDSelectorBatch([0, 35177,73844, 100,1001,1230,122]))
                    )
I

array([[  100,     0, 73844, 35177]])

In [20]:
embeddings = index.reconstruct_batch([100]) # get vector at index 100
print(embeddings)
print(embeddings.shape)

[[ 1.93273854e-02 -9.37842950e-02 -7.37950057e-02  1.17455442e-04
  -3.50538045e-02 -2.61178277e-02 -5.01587167e-02 -5.40749682e-03
  -1.13040013e-02  1.61023568e-02  5.04050665e-02  1.68056821e-03
  -5.08043915e-02 -2.24936549e-02 -3.68862040e-02  3.80077250e-02
  -4.65479232e-02 -2.66675837e-02  7.58600831e-02 -6.38416223e-03
  -6.32607862e-02  4.59217606e-03 -3.52157988e-02  5.32379970e-02
   1.07020428e-02 -2.72748861e-02  5.22757210e-02 -4.37464612e-03
  -1.16495427e-03 -3.11148483e-02  1.19050685e-02  2.80885468e-03
  -1.94660258e-02 -1.56171657e-02  1.91521528e-03  5.07290289e-02
   1.17237596e-02  7.01037282e-03  2.44581886e-03  1.19407736e-02
  -1.58118736e-02  1.68048162e-02  5.22434562e-02  3.88998725e-03
  -1.57615654e-02  8.46302428e-04  7.16961734e-03  4.13147025e-02
  -4.19461392e-02  1.46266809e-02 -1.15152262e-02 -1.06423534e-02
  -8.55518691e-03 -3.80785614e-02  1.57510974e-02  4.26360145e-02
  -1.34986630e-02  1.16961654e-02 -4.14402373e-02  1.91165917e-02
  -2.68566

In [31]:
n = np.mean(image_embeddings, axis=0).reshape((1,512))

n = n/np.linalg.norm(n, ord=2, axis=-1, keepdims=True)
n.shape
type(n.astype('float32'))

numpy.ndarray

In [20]:
def get_embeddings_by_id(ids:list):
    """
    When two or more items are chosen, their ids are passed to this function to get their embeddings
    
        params: ids: list of ids
    
        returns: embeddings of the ids
    """
    query_string = " or ".join([f"article_id=='{id}'" for id in ids])
    
    indexes = df.query(query_string).index
    embeddings = index.reconstruct_batch(indexes)
    
    return embeddings

In [21]:
ids = ["0429322001", "0429322007"]

get_embeddings_by_id(ids)

array([[ 0.05774637,  0.03786624,  0.03480565, ..., -0.01809942,
        -0.00956022, -0.04715981],
       [ 0.02655817,  0.0578466 ,  0.02785705, ...,  0.01810479,
        -0.01383074, -0.00934319]], dtype=float32)

## Filter items

In [22]:
def filter_items(filters:list):
    """
    params:
        filters: list of tuples of the form (column, value)
    """
    query_string = " and ".join([f"{filter[0]}=='{filter[1]}'" for filter in filters])

    print(query_string)
    return df.query(query_string)

In [24]:
string = [("index_group_name", "Ladieswear"),
          ("product_group_name", "Nightwear"),
          ("color", "Black")
          ]

filter_items(string)

index_group_name=='Ladieswear' and product_group_name=='Nightwear' and color=='Black'


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,color,index_group_name,image_src,href
2747,0429322001,Charlotte SP N-Slip Andes,Night gown,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.04293220...
2749,0429322007,Charlotte SP N-Slip Andes,Night gown,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.04293220...
6143,0503729006,India PJ (W),Pyjama set,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.05037290...
6439,0506452003,Sunday SL Set,Pyjama set,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.05064520...
6592,0508156028,Donna N-slip Print (J),Night gown,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.05081560...
...,...,...,...,...,...,...,...,...
91938,0854154002,NW LUNA TRS (J),Pyjama bottom,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.08541540...
93365,0860527001,Emily trouser (W),Pyjama bottom,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.08605270...
99428,0888331005,Bonnie SL set (J),Pyjama set,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.08883310...
100033,0891877001,Sara LL PJ (J),Pyjama set,Nightwear,Black,Ladieswear,https://d11p8vtjlacpl4.cloudfront.net/kaggle-h...,https://www2.hm.com/en_gb/productpage.08918770...


### PLANS

group product_group_name as category with product_type_name as subcategory

group everything as subcategories to index_group_name


- index_group_name:
    - product_group_name
- perceived_colour_master_name


index_group_name is in header

product_group_name on side
perceived_colour_master_name on side as a slidedown
product_type_name on each item






In [154]:
category_structure = {}

cateories = {}
for i_group_name in pd.unique(df.index_group_name):

    sub_cats = {}
    for p_group_name in df.query(f"index_group_name=='{i_group_name}'")['product_group_name'].unique():
        sub_cats.update({p_group_name : df.query(f"product_group_name=='{p_group_name}'")['product_type_name'].unique()})
    cateories.update({i_group_name : list(sub_cats)})

category_structure.update({'Categories': cateories})
category_structure.update({'Color': list(df.perceived_colour_master_name.unique())})

In [155]:
import json

json_string = json.dumps(category_structure, indent=4 )  
print(json_string)

{
    "Categories": {
        "Ladieswear": [
            "Garment Upper body",
            "Underwear",
            "Socks & Tights",
            "Garment Lower body",
            "Accessories",
            "Items",
            "Swimwear",
            "Garment Full body",
            "Nightwear",
            "Shoes",
            "Furniture"
        ],
        "Baby/Children": [
            "Garment Upper body",
            "Nightwear",
            "Garment Lower body",
            "Underwear",
            "Garment Full body",
            "Socks & Tights",
            "Accessories",
            "Underwear/nightwear",
            "Swimwear",
            "Shoes",
            "Cosmetic"
        ],
        "Menswear": [
            "Garment Lower body",
            "Garment Upper body",
            "Underwear",
            "Accessories",
            "Shoes",
            "Socks & Tights",
            "Swimwear",
            "Nightwear",
            "Garment Full body"
        ],
        "Sp