# Do experiments here

# Build a feature matrix

In [23]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import json
from IPython.display import display, Image

In [24]:
DATA_DIR = "../Backend/MigrosData/Migros_case/products/en/"


def _read_json(path):
    """Parse a single JSON file and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 explicitly so non-ASCII
    product text does not depend on the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


# Step 1: Load all JSON files from the directory
# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# everything derived from product_list follows the directory listing.
file_paths = [os.path.join(DATA_DIR, file) for file in os.listdir(DATA_DIR) if file.endswith('.json')]
product_list = [_read_json(file_path) for file_path in file_paths]

In [25]:
# Sanity check: number of product records loaded (one per JSON file read above)
len(product_list)

39056

In [26]:
# Step 2: Extract relevant features
def extract_features(product):
    """Flatten one raw product record into the fields used for similarity.

    Parameters
    ----------
    product : dict
        Decoded product JSON. ``'id'`` and ``'name'`` are required; the
        ``'description'``, ``'brand'``, ``'categories'`` and ``'image'``
        sections are optional.

    Returns
    -------
    dict
        Keys ``id``, ``name``, ``description``, ``brand``, ``categories``
        (list of category names) and ``image_url``. Optional text fields
        default to ``''`` and ``categories`` defaults to ``[]``.

    Raises
    ------
    KeyError
        If ``'id'`` or ``'name'`` is missing, or a category entry has no
        ``'name'``.
    """
    def nested(section_key, field):
        # An optional section may be absent, null, or lack the wanted field;
        # all three cases fall back to the empty string.
        section = product.get(section_key)
        if isinstance(section, dict):
            return section.get(field, '')
        return ''

    # `or []` also covers a categories key that is present but null
    categories = product.get('categories') or []

    return {
        'id': product['id'],
        'name': product['name'],
        'description': nested('description', 'text'),
        'brand': nested('brand', 'name'),
        'categories': [cat['name'] for cat in categories],
        'image_url': nested('image', 'original'),
    }

In [27]:

# Run the extractor over every loaded product, then tabulate the records
# (one row per product, one column per extracted field).
features_list = list(map(extract_features, product_list))
df = pd.DataFrame(data=features_list)


In [28]:
# Step 3: Vectorize textual features using TF-IDF
# Vocabulary cap applied to each text field; kept in one place so both
# vectorizers stay in sync when it is tuned.
MAX_TFIDF_FEATURES = 500

# Separate vectorizers, so product names and descriptions each get their own
# fitted vocabulary.
tfidf_vectorizer_name = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf_vectorizer_description = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
name_matrix = tfidf_vectorizer_name.fit_transform(df['name'])
description_matrix = tfidf_vectorizer_description.fit_transform(df['description'])


In [29]:
# Step 4: One-Hot encode categorical features
# The binarizer consumes one collection of labels per row, so the brand
# column is presented as an (n_rows, 1) array: a single label per product.
# NOTE(review): products without a brand carry '' in this column, which is
# encoded like any other label — confirm that is intended.
brand_rows = df[['brand']].to_numpy()
mlb_brand = MultiLabelBinarizer()
brand_matrix = mlb_brand.fit_transform(brand_rows)

# Categories are already a list of names per product.
mlb_categories = MultiLabelBinarizer()
categories_matrix = mlb_categories.fit_transform(df['categories'])


In [30]:
# Step 5: Concatenate all feature matrices to get the final feature matrix for all products
# .toarray() densifies to a plain ndarray; .todense() would return the legacy
# np.matrix type instead.
# NOTE: every block's columns are numbered from 0, so column labels repeat in
# the result — address columns by position (.iloc), not by label.
final_matrix = pd.concat(
    [
        pd.DataFrame(name_matrix.toarray()),
        pd.DataFrame(description_matrix.toarray()),
        pd.DataFrame(brand_matrix),
        pd.DataFrame(categories_matrix),
    ],
    axis=1,
)

final_matrix.shape


(39056, 2607)

In [31]:
# Count the non-zero entries in each column: astype(bool) maps every non-zero
# value to True, so the column sum is the number of rows where the feature is present.
final_matrix.astype(bool).sum(axis=0)

0      370
1       80
2       57
3      136
4       56
      ... 
630     37
631     65
632     34
633      2
634     30
Length: 2607, dtype: int64

# Get similar items

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_products(product_vector, matrix, product_ids, top_n=5):
    """
    Compute the cosine similarity between the given product vector and all vectors in the matrix.
    Return the top_n most similar product IDs, most similar first.

    Parameters
    ----------
    product_vector : 2-D array-like with a single row
        Feature vector of the query product.
    matrix : 2-D array-like
        Feature vectors to compare against, one row per product.
    product_ids : sequence
        IDs aligned with the rows of ``matrix`` (``product_ids[i]`` belongs to row ``i``).
    top_n : int, default 5
        Maximum number of IDs to return. Values <= 0 yield an empty list.

    Returns
    -------
    list
        Up to ``top_n`` product IDs ordered by decreasing similarity. The
        query product is not filtered out, so it appears in the result when
        its own row is part of ``matrix`` and ranks high enough.
    """
    # Guard: a slice of [-0:] selects *every* index and a negative top_n
    # selects an arbitrary tail, so non-positive requests return nothing.
    if top_n <= 0:
        return []

    # Compute cosine similarities; the result has one row per query vector
    similarities = cosine_similarity(product_vector, matrix)
    scores = similarities[0]

    # argsort is ascending: keep the last top_n indices and flip them so the
    # best match comes first
    top_indices = scores.argsort()[-top_n:][::-1]

    # Get product IDs for the top indices
    return [product_ids[idx] for idx in top_indices]

In [38]:
# Suppose we want to find products similar to the product with ID '111481800500'
product_id = '111481800500'

# Locate the product by *position*: final_matrix is built row-for-row from df,
# so the same positional index addresses both.
id_mask = (df['id'] == product_id).to_numpy()
if not id_mask.any():
    raise KeyError(f"Product ID {product_id!r} not found in df")
product_idx = int(id_mask.argmax())  # position of the first matching row
product_image = df['image_url'].iloc[product_idx]
product_vector = final_matrix.iloc[product_idx].values.reshape(1, -1)

display(Image(url=product_image, width=300))


# Get the top 5 similar products
# The query product's own row is part of final_matrix, so it can show up in
# its own result list.
similar_products = get_similar_products(product_vector, final_matrix, df['id'].tolist(), top_n=5)

for similar_product in similar_products:
    # Filter once per product and reuse the rows for both the table and the image
    matching_rows = df[df['id'] == similar_product]
    display(matching_rows)
    display(Image(url=matching_rows['image_url'].values[0], width=300))


Unnamed: 0,id,name,description,brand,categories,image_url
22937,111487700500,Silserkranz IP-SUISSE,,,"[Brote hell, Brote, Bäckerei, Confiserie & Bac...",https://image.migros.ch/original/b15c8990aa056...


Unnamed: 0,id,name,description,brand,categories,image_url
28571,111443000000,Mailänder Brot IP-SUISSE,,,"[Brote hell, Brote, Bäckerei, Confiserie & Bac...",https://image.migros.ch/original/b7e3f1da12e72...


Unnamed: 0,id,name,description,brand,categories,image_url
14030,111474500000,Taillaule IP-SUISSE,,,"[Brote hell, Brote, Bäckerei, Confiserie & Bac...",https://image.migros.ch/original/6430863930265...


Unnamed: 0,id,name,description,brand,categories,image_url
32710,111481800500,Tessinerbrot IP-SUISSE,,,"[Brote hell, Brote, Bäckerei, Confiserie & Bac...",https://image.migros.ch/original/103ad1c61787e...


Unnamed: 0,id,name,description,brand,categories,image_url
19199,111458100500,Laugentessinerbrot IP-SUISSE,,,"[Brote hell, Brote, Bäckerei, Confiserie & Bac...",https://image.migros.ch/original/8e67660919aa5...
