In [1]:
import pandas as pd 

In [2]:
# Load the first product sheet; the other sheets are appended to this frame below.
dataset=pd.read_excel("data/data_test.xlsx")

In [3]:
# Load the second product sheet (file name suggests products 200-300 — TODO confirm).
df=pd.read_excel("data/data_200_300.xlsx")

In [4]:
# Load the third product sheet (file name suggests products 300-400 — TODO confirm).
df1=pd.read_excel("data/data_300_400.xlsx")

In [5]:
# Stack the three sheets into one frame with a fresh 0..n-1 index.
# NOTE(review): `dataset` is rebound to its own concatenation, so re-running
# this cell without reloading appends `df` and `df1` a second time.
dataset=pd.concat([dataset,df,df1],ignore_index=True)

In [6]:
# Drop the "Unnamed: 0" column (presumably an index saved into the workbooks).
# errors="ignore" keeps the cell re-runnable: `dataset` is rebound here, so a
# second execution would otherwise raise KeyError on the missing column.
dataset = dataset.drop(columns="Unnamed: 0", errors="ignore")

In [7]:
# Remove rows that are duplicated across the sheets.
dataset=dataset.drop_duplicates()

In [13]:
# Write the combined frame to the preview workbook that is reloaded further down.
# NOTE(review): the execution count (In [13]) shows this cell ran after the
# dropna cell (In [12]) although it is placed before it; on a top-to-bottom run
# the file is written before rows with missing values are dropped.
dataset.to_excel("data/data_preview.xlsx")

In [9]:
# Preview the first two rows of the combined frame.
dataset.head(2)

Unnamed: 0,Photo produit 1,Photo produit 2,Nom produit,Lien achat,Catégorie produit,Genre,cielab_colors
0,https://wasted.fr/cdn/shop/files/t-shirt-acid-...,-,T-Shirt Acid,https://wasted.fr/collections/t-shirts-homme/p...,T-shirt,H/F,"[[10.588235294117647, 0.0, 1.0]]"
1,https://wasted.fr/cdn/shop/files/t-shirt-happy...,-,T-Shirt Happy When it Rains,https://wasted.fr/collections/t-shirts-homme/p...,T-shirt,H/F,"[[10.588235294117647, 0.0, 0.0]]"


In [10]:
# Inspect the column names. Note from the recorded output that
# 'Photo produit 2 ' and 'Nom produit ' carry a trailing space.
dataset.columns

Index(['Photo produit 1', 'Photo produit 2 ', 'Nom produit ', 'Lien achat',
       'Catégorie produit', 'Genre', 'cielab_colors'],
      dtype='object')

In [12]:
# Keep only rows without missing values.
# NOTE(review): the preview output shows "-" used as a placeholder in the
# second photo column; such strings are not missing values and are kept.
dataset=dataset.dropna()

In [14]:
import ast 

# Parse the colour lists stored as text (e.g. "[[10.58, 0.0, 1.0]]") into
# Python lists. Only strings are parsed so the cell can be re-run safely:
# ast.literal_eval raises on values that are already lists.
dataset["cielab_colors"] = dataset["cielab_colors"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [15]:
# One row per colour vector: explode the list column, then renumber the rows
# 0..n-1 so that row positions can serve as vector ids in the search index.
data=dataset.explode("cielab_colors").reset_index(drop=True)

In [16]:
# (rows, columns) of the exploded frame.
data.shape


(2125, 7)

In [17]:
import faiss
import numpy as np

# --------------------------------------
# 1. Prepare Data for FAISS
# --------------------------------------
# Stack every exploded colour vector into one float32 array; row i of the
# array is built from row i of `data`.
all_colors = np.array(data["cielab_colors"].tolist(), dtype="float32")

# Create a flat FAISS index (L2 distance for CIELAB) and add all vectors to it
dim = 3 # L, a, b
index = faiss.IndexFlatL2(dim)
index.add(all_colors)


In [19]:
# Persist the index to disk.
# NOTE(review): assumes the `vector_store/` directory already exists — confirm.
faiss.write_index(index,"vector_store/faiss_index.idx")

In [20]:
# --------------------------------------
# 2. Query Function
# --------------------------------------
def find_matching_products(query_colors, index, df, top_k=5):
    """Return the products whose indexed colours are closest to the query colours.

    Args:
        query_colors: A single colour vector or a sequence of colour vectors.
            It is converted to a 2-D float32 array before searching.
        index: Object exposing ``search(queries, k)`` that returns a
            ``(distances, indices)`` pair of arrays with one row per query
            (e.g. a FAISS index).
        df: DataFrame whose row positions correspond to the ids returned by
            ``index``; it must contain a "Photo produit 1" column, which is
            used as the product identifier.
        top_k: Number of neighbours requested per query colour and maximum
            number of products returned.

    Returns:
        A list of ``(product_id, distance)`` tuples sorted by ascending
        distance, holding at most ``top_k`` entries. Each product appears once,
        with the smallest distance found across all query colours. An empty
        query yields an empty list.

    Note:
        NOTE(review): FAISS ``IndexFlatL2`` is documented to report squared L2
        distances — confirm before comparing these scores with distances from
        another backend.
    """
    # Accept a bare [L, a, b] vector as well as a list of vectors.
    query_colors = np.atleast_2d(np.asarray(query_colors, dtype="float32"))
    if query_colors.size == 0:
        return []

    distances, indices = index.search(query_colors, top_k)

    # Read the identifier column once instead of building a row Series per hit.
    product_ids = df["Photo produit 1"].to_numpy()

    # Group by product id and keep the best (min) distance.
    product_distances = {}
    for row_distances, row_indices in zip(distances, indices):
        for distance, position in zip(row_distances, row_indices):
            # A negative id marks a padded "no result" slot; indexing with it
            # would silently pick a row counted from the end of the frame.
            if position < 0:
                continue
            product_id = product_ids[position]
            if product_id not in product_distances or distance < product_distances[product_id]:
                product_distances[product_id] = distance

    # Sort products by distance
    sorted_products = sorted(product_distances.items(), key=lambda x: x[1])
    return sorted_products[:top_k]


In [171]:
# --------------------------------------
# 3. Example Usage
# --------------------------------------
# Input: one dominant colour, given as a single CIELAB vector [L, a, b].
# NOTE(review): `query_color` is bound to the same list and is not used in this cell.
query_colors = query_color = [[63.921568627450974, -5.0, -17.0]] # Example CIELAB vectors
matches = find_matching_products(query_colors, index, data, top_k=5)


In [172]:
# Show the ranked (image URL, distance) pairs.
matches

[('https://wasted.fr/cdn/shop/files/varsity-jacket-prey-742723.jpg?v=1737390429',
  np.float32(0.0)),
 ('https://ntmb.it/cdn/shop/products/jeans-baggy-reworked-blue-311475.jpg?v=1694471080&width=3000',
  np.float32(7.8446774)),
 ('https://nestorevintage.fr/cdn/shop/files/055917A6-267F-48CA-87FE-973C1946E906.jpg?v=1725801192&width=2048',
  np.float32(14.384086)),
 ('https://fr.sandro-paris.com/dw/image/v2/BCMW_PRD/on/demandware.static/-/Sites-master-catalog/default/dwcd63739a/images/packshot/Sandro_SHPBE00082-40_H_P.jpg?sw=650&sh=650',
  np.float32(16.153788)),
 ('https://nestorevintage.fr/cdn/shop/files/IMG_23132.jpg?v=1715529282&width=2048',
  np.float32(18.384087))]

In [173]:
from sklearn.neighbors import NearestNeighbors

In [174]:
# Nearest-neighbour model using Euclidean distance, configured for 5 neighbours.
nn=NearestNeighbors(n_neighbors=5,metric="euclidean")

In [175]:
# Fit on the same colour vectors that were added to the FAISS index.
nn.fit(all_colors)

In [179]:
def find_matching_products_nn(query_colors, nn_model, df, top_k=5):
    """Return the products whose colours are closest to the query colours.

    Args:
        query_colors: A single colour vector or a sequence of colour vectors.
            It is converted to a 2-D float32 array before searching.
        nn_model: Fitted model exposing ``kneighbors(X, n_neighbors=...)`` that
            returns a ``(distances, indices)`` pair of arrays with one row per
            query (e.g. scikit-learn ``NearestNeighbors``).
        df: DataFrame whose row positions correspond to the sample positions
            the model was fitted on; it must contain a "Photo produit 1"
            column, which is used as the product identifier.
        top_k: Number of neighbours requested per query colour and maximum
            number of products returned.

    Returns:
        A list of ``(product_id, distance)`` tuples sorted by ascending
        distance, holding at most ``top_k`` entries. Each product appears once,
        with the smallest distance found across all query colours. An empty
        query yields an empty list.
    """
    # Accept a bare [L, a, b] vector as well as a list of vectors.
    query_colors = np.atleast_2d(np.asarray(query_colors, dtype="float32"))
    if query_colors.size == 0:
        return []

    # Request exactly top_k neighbours. Relying on the model's own n_neighbors
    # made any top_k larger than that setting fail with an IndexError below.
    distances, indices = nn_model.kneighbors(query_colors, n_neighbors=top_k)

    # Read the identifier column once instead of building a row Series per hit.
    product_ids = df["Photo produit 1"].to_numpy()

    # Group by product id and keep the best (min) distance.
    product_distances = {}
    for row_distances, row_indices in zip(distances, indices):
        for distance, position in zip(row_distances, row_indices):
            product_id = product_ids[position]
            if product_id not in product_distances or distance < product_distances[product_id]:
                product_distances[product_id] = distance

    # Sort products by distance
    sorted_products = sorted(product_distances.items(), key=lambda x: x[1])
    return sorted_products[:top_k]


In [185]:
# Query with two CIELAB vectors at once; results are merged per product.
# NOTE(review): this calls the FAISS-based `find_matching_products` with `index`,
# not `find_matching_products_nn` with the fitted `nn` model defined just above —
# confirm which backend this cell was meant to exercise.
query_colors = query_color = [[83.13725490196079, 0.0, 0.0], [70.58823529411765, 0.0, 0.0]] # Example CIELAB vectors
matches = find_matching_products(query_colors, index, data, top_k=5)

In [186]:
# Show the ranked (image URL, distance) pairs for the two-colour query.
matches

[('https://wasted.fr/cdn/shop/files/crewneck-negative-617699_2048x2048.jpg?v=1712661665',
  np.float32(0.0)),
 ('https://wasted.fr/cdn/shop/products/jean-casper-snow-feeler-762622.jpg?v=1706147807',
  np.float32(0.0)),
 ('https://wasted.fr/cdn/shop/files/t-shirt-united-930779_2048x2048.jpg?v=1730548101',
  np.float32(0.1537883)),
 ('https://wasted.fr/cdn/shop/products/hoodie-zip-crown-pitcher-736902_2048x2048.jpg?v=1689958848',
  np.float32(0.1537883)),
 ('https://wasted.fr/cdn/shop/files/jean-casper-signature-610785.jpg?v=1737390239',
  np.float32(0.1537883))]

In [187]:
# Look up the first column of the row at position 140 with one positional
# indexer. Chaining `[0]` onto the row Series relied on the deprecated
# "integer key treated as position" fallback and emitted a FutureWarning.
dataset.iloc[140, 0]

  dataset.iloc[140,][0]


'https://wasted.fr/cdn/shop/products/jean-casper-snow-feeler-762622.jpg?v=1706147807'

In [68]:
# Reload the preview workbook written earlier in this notebook.
data_preview=pd.read_excel("data/data_preview.xlsx")

In [69]:
# Drop the "Unnamed: 0" column (the workbook was written with its index).
# errors="ignore" keeps the cell re-runnable: `data_preview` is rebound here,
# so a second execution would otherwise raise KeyError on the missing column.
data_preview = data_preview.drop(columns="Unnamed: 0", errors="ignore")

In [70]:
# Distinct raw category labels found in the preview data.
products=data_preview["Catégorie produit"].unique().tolist()

In [28]:
# Print every raw category label on its own line.
for item in products:
    print(item)

T-shirt
Hoodie
Pull
Jean
Pantalon
Jogging
Veste
Doudoune
Jupe
Bonnet
Casquette
Chapeau
Short
Chemise
Pantalon habillé
Chino
short
Blazer
Robe
Manteau
Blouson
Trench
Cardigan
Col roulé
Top
top
Débardeur
chemise
t shirt 
body
t shirt
jean
pantalon habillé
jupe
sneakers
botte
Chaussures de ville
bottes
escarpin
Bottes
T-Shirt
Écharpe
Portefeuille
Sac bandoulière
Sac à main
Sac de voyage
Lunettes
Sac à dos
Veste 
Bob


In [29]:
import yaml
from pyprojroot import here
# Load the outfit configuration. The encoding is passed explicitly because the
# category labels contain accented characters and the platform default
# encoding is not guaranteed to be UTF-8 (assumes the file is stored as UTF-8).
# NOTE(review): yaml.load with FullLoader can build more than plain data;
# prefer yaml.safe_load if this file could ever come from an untrusted source.
with open(here("colorfact/configs/outfit.yaml"), encoding="utf-8") as cfg:
    outfit = yaml.load(cfg, Loader=yaml.FullLoader)

In [36]:
# Collect the values stored under the "Ontologie" key of the outfit config.
cat=list(outfit["Ontologie"].values())

In [43]:
# Split each value on commas into a list of category names.
cat=[item.split(',') for item in cat]

In [45]:
# Flatten the nested lists into one list and trim surrounding whitespace.
# NOTE(review): `cat` is rebound in place, so re-running this cell on the
# already-flat list iterates over the characters of each name instead.
cat=[item.strip() for sublist in cat for item in sublist]

In [46]:
# Display the flattened list of standard categories.
cat

['T-shirt',
 'Polo',
 'Chemise',
 'Col-roulés',
 'Sweatshirt',
 'Hoodie',
 'Pull',
 'Cardigan',
 'Veste',
 'Blouson',
 'Manteau',
 'Parka',
 'Trench',
 'Pantalons',
 'Jean',
 'Short',
 'Jogging',
 'Chinos',
 'Jupes/Robes',
 'Combinaisons',
 'Costumes',
 'Tailleurs',
 'Pantalon habillé',
 'Blazer',
 'Robe de soirée',
 'Sneakers',
 'Bottes',
 'Chaussures de ville',
 'Escarpins',
 'Talons',
 'Sandales',
 'Sac à main',
 'Sac à dos',
 'Lunettes',
 'Bonnet',
 'Casquettes',
 'Ceinture',
 'Montre']

In [71]:
# Raw labels with no exact match in the standard list; the comparison is
# case- and whitespace-sensitive (e.g. 'Veste ' and 'jean' are reported).
products_not_in=[item for item in products if item not in cat]

In [72]:
# Display the raw labels that still need to be mapped to a standard category.
products_not_in

['Pantalon',
 'Doudoune',
 'Jupe',
 'Casquette',
 'Chapeau',
 'Chino',
 'short',
 'Robe',
 'Col roulé',
 'Top',
 'top',
 'Débardeur',
 'chemise',
 't shirt ',
 'body',
 't shirt',
 'jean',
 'pantalon habillé',
 'jupe',
 'sneakers',
 'botte',
 'bottes',
 'escarpin',
 'T-Shirt',
 'Écharpe',
 'Portefeuille',
 'Sac bandoulière',
 'Sac de voyage',
 'Veste ',
 'Bob']

In [51]:
# Display the full list of raw category labels.
products

['T-shirt',
 'Hoodie',
 'Pull',
 'Jean',
 'Pantalon',
 'Jogging',
 'Veste',
 'Doudoune',
 'Jupe',
 'Bonnet',
 'Casquette',
 'Chapeau',
 'Short',
 'Chemise',
 'Pantalon habillé',
 'Chino',
 'short',
 'Blazer',
 'Robe',
 'Manteau',
 'Blouson',
 'Trench',
 'Cardigan',
 'Col roulé',
 'Top',
 'top',
 'Débardeur',
 'chemise',
 't shirt ',
 'body',
 't shirt',
 'jean',
 'pantalon habillé',
 'jupe',
 'sneakers',
 'botte',
 'Chaussures de ville',
 'bottes',
 'escarpin',
 'Bottes',
 'T-Shirt',
 'Écharpe',
 'Portefeuille',
 'Sac bandoulière',
 'Sac à main',
 'Sac de voyage',
 'Lunettes',
 'Sac à dos',
 'Veste ',
 'Bob']

In [52]:
from thefuzz import process

# Raw category labels as they appear in the data
raw_products = products

# Standardized product categories taken from the ontology
standard_products = cat

# For every raw label keep the best-scoring standard category.
# NOTE(review): no minimum score is enforced, so unrelated labels still get a
# match; the recorded output includes e.g. "Chapeau -> Chaussures de ville",
# "body -> Bottes" and "Portefeuille -> Pull" — review the mapping before use.
matched_products = {
    product: process.extractOne(product, standard_products)[0]
    for product in raw_products
}

# Print the resulting mapping, one "raw -> standard" pair per line
for original, matched in matched_products.items():
    print(original, "->", matched)


T-shirt -> T-shirt
Hoodie -> Hoodie
Pull -> Pull
Jean -> Jean
Pantalon -> Pantalons
Jogging -> Jogging
Veste -> Veste
Doudoune -> Blouson
Jupe -> Jupes/Robes
Bonnet -> Bonnet
Casquette -> Casquettes
Chapeau -> Chaussures de ville
Short -> Short
Chemise -> Chemise
Pantalon habillé -> Pantalon habillé
Chino -> Chinos
short -> Short
Blazer -> Blazer
Robe -> Jupes/Robes
Manteau -> Manteau
Blouson -> Blouson
Trench -> Trench
Cardigan -> Cardigan
Col roulé -> Col-roulés
Top -> T-shirt
top -> T-shirt
Débardeur -> Blazer
chemise -> Chemise
t shirt  -> T-shirt
body -> Bottes
t shirt -> T-shirt
jean -> Jean
pantalon habillé -> Pantalon habillé
jupe -> Jupes/Robes
sneakers -> Sneakers
botte -> Bottes
Chaussures de ville -> Chaussures de ville
bottes -> Bottes
escarpin -> Escarpins
Bottes -> Bottes
T-Shirt -> T-shirt
Écharpe -> Chaussures de ville
Portefeuille -> Pull
Sac bandoulière -> Sac à main
Sac à main -> Sac à main
Sac de voyage -> Sac à dos
Lunettes -> Lunettes
Sac à dos -> Sac à dos
Veste

In [53]:
# Standard category chosen for each raw label (the values of the mapping).
new_products=list(matched_products.values())

In [56]:
# Sanity check: collect any matched value that is not a standard category.
products_not_in=[item for item in new_products if item not in cat]

In [58]:
# Expected to be empty: every matched value comes from the standard list.
products_not_in

[]

In [59]:
import json 
# Save matched products (raw label -> standard category) as UTF-8 JSON;
# ensure_ascii=False writes accented characters as-is instead of \u escapes.
with open(here("colorfact/data/product_mapping.json"), "w", encoding="utf-8") as f:
    json.dump(matched_products, f, ensure_ascii=False, indent=4)

In [61]:
# Load the mapping
with open(here("colorfact/data/product_mapping.json"), "r", encoding="utf-8") as f:
    saved_mapping = json.load(f)

# Use the mapping instead of re-matching: the fuzzy matcher runs only when the
# label is absent from the saved mapping. Passing the fuzzy match as the
# default of dict.get() evaluated it on every lookup, even on a hit.
new_input = "chemise"
if new_input in saved_mapping:
    standardized_output = saved_mapping[new_input]
else:
    standardized_output = process.extractOne(new_input, standard_products)[0]
print(standardized_output)

Chemise


In [67]:
# Inspect the column names of the reloaded preview frame.
data_preview.columns

Index(['Photo produit 1', 'Photo produit 2 ', 'Nom produit ', 'Lien achat',
       'Catégorie produit', 'Genre', 'cielab_colors'],
      dtype='object')

In [73]:
# Standardise each category with the saved mapping. Labels missing from the
# mapping are fuzzy-matched from their raw text here, because a bare .map()
# replaces them with NaN and the raw label is then lost for any later fallback.
# This also keeps the cell re-runnable on already-standardised values.
raw_categories = data_preview['Catégorie produit']
mapped_categories = raw_categories.map(saved_mapping)
needs_fallback = mapped_categories.isna() & raw_categories.notna()
if needs_fallback.any():
    mapped_categories.loc[needs_fallback] = raw_categories.loc[needs_fallback].map(
        lambda label: process.extractOne(label, standard_products)[0]
    )
data_preview['Catégorie produit'] = mapped_categories

In [74]:

# Rebuild the list of standard names from the mapping's values.
# NOTE(review): list(set(...)) has no guaranteed order, so the order of
# `standard_products` may differ between runs.
standard_products = list(set(saved_mapping.values()))  # Ensure unique standard product names

# Fill missing categories with a fuzzy match.
# NOTE(review): the column was already replaced by `.map(saved_mapping)` in the
# previous cell, so the values passed to `extractOne` here are the mapped
# categories (NaN where the mapping had no entry), not the raw labels. The
# fuzzy match is also computed for every row before `fillna` selects from it.
data_preview['Catégorie produit'] = data_preview['Catégorie produit'].fillna(data_preview['Catégorie produit'].apply(lambda x: process.extractOne(x, standard_products)[0]))


In [75]:
# Distinct category labels after standardisation (rebinds `products`).
products=data_preview["Catégorie produit"].unique().tolist()

In [76]:
# Sanity check: collect any standardised label that is not in the ontology list.
products_not_in=[item for item in products if item not in cat]

In [77]:
# Expected to be empty once every category has been standardised.
products_not_in

[]

In [78]:
from dotenv import load_dotenv

In [79]:
# Load environment variables via python-dotenv (the recorded output is True).
load_dotenv()

True