# Do experiments here

In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import json

In [4]:
DATA_DIR = "../Backend/MigrosData/Migros_case/products/en/"

# Step 1: Load all JSON files from the directory
# os.listdir() returns entries in arbitrary order, so the paths are sorted to keep
# the row order of everything derived from product_list reproducible between runs.
file_paths = sorted(
    os.path.join(DATA_DIR, file_name)
    for file_name in os.listdir(DATA_DIR)
    if file_name.endswith('.json')
)

# Read each file inside a `with` block so the handle is closed right away instead of
# leaking one open file per product; the encoding is explicit so the result does not
# depend on the platform's default locale.
product_list = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as json_file:
        product_list.append(json.load(json_file))

In [5]:
# Sanity check: number of product records loaded (one per JSON file).
len(product_list)

39056

In [6]:
# Step 2: Extract relevant features
def extract_features(product):
    """Flatten one raw product record into the fields used for vectorization.

    Args:
        product: dict parsed from a product JSON file. 'id' and 'name' are
            required; 'description', 'brand' and 'categories' are optional and
            may be missing, None, or lack their nested 'text' / 'name' key.

    Returns:
        dict with keys 'id', 'name', 'description' (str), 'brand' (str) and
        'categories' (list of category names). Missing optional data is
        represented as '' or [] so later steps always receive strings / lists.

    Raises:
        KeyError: if 'id' or 'name' is missing from the record.
    """
    description = product.get('description')
    brand = product.get('brand')
    categories = product.get('categories')

    return {
        'id': product['id'],
        'name': product['name'],
        # `or ''` also maps an explicit null text/name to the empty string.
        'description': (description.get('text') or '') if isinstance(description, dict) else '',
        'brand': (brand.get('name') or '') if isinstance(brand, dict) else '',
        # Skip category entries that carry no name instead of failing the whole record.
        'categories': [
            cat['name']
            for cat in (categories or [])
            if isinstance(cat, dict) and 'name' in cat
        ],
    }

In [7]:

# Apply the extractor to every loaded product and tabulate the results,
# giving one row per product and one column per extracted field.
features_list = list(map(extract_features, product_list))
df = pd.DataFrame(data=features_list)


In [8]:
# Step 3: Vectorize textual features using TF-IDF
# Each text column gets its own vectorizer (and therefore its own vocabulary),
# limited to at most 500 terms per column.

# Product names
tfidf_vectorizer_name = TfidfVectorizer(max_features=500)
name_matrix = tfidf_vectorizer_name.fit_transform(df['name'])

# Product descriptions
tfidf_vectorizer_description = TfidfVectorizer(max_features=500)
description_matrix = tfidf_vectorizer_description.fit_transform(df['description'])


In [9]:
# Step 4: One-Hot encode categorical features
# MultiLabelBinarizer expects one iterable of labels per sample. A brand is a single
# string, so it is wrapped in a one-element list; otherwise the string itself would
# be iterated character by character.
mlb_brand = MultiLabelBinarizer()
brand_labels = [[brand] for brand in df['brand']]
brand_matrix = mlb_brand.fit_transform(brand_labels)

# Categories are already a list of names per product, so they are passed as-is.
mlb_categories = MultiLabelBinarizer()
categories_matrix = mlb_categories.fit_transform(df['categories'])


In [10]:
# Step 5: Concatenate all feature matrices to get the final feature matrix for all products
# .toarray() is used instead of .todense(): it returns a plain ndarray rather than the
# discouraged np.matrix type, and the resulting DataFrame is the same.
# NOTE: each block keeps its own default 0..n-1 column labels, so labels repeat across
# the four groups; use positional access (.iloc) to address a specific feature.
feature_blocks = [
    pd.DataFrame(name_matrix.toarray()),
    pd.DataFrame(description_matrix.toarray()),
    pd.DataFrame(brand_matrix),
    pd.DataFrame(categories_matrix),
]
final_matrix = pd.concat(feature_blocks, axis=1)

final_matrix.shape


(39056, 2607)

In [15]:
# Count the non-zero entries of each column: astype(bool) maps every non-zero value
# to True, so the column-wise sum is the number of products that have that feature.
final_matrix.astype(bool).sum(axis=0)

0      370
1       80
2       57
3      136
4       56
      ... 
630     37
631     65
632     34
633      2
634     30
Length: 2607, dtype: int64