In [8]:
from sys import path
path.append('../..')
from getData import getProducts

def normalizeData():
    products = getProducts()
    justProducts = []
    categories = list(products.keys())
    category_to_id = {category: idx for idx, category in enumerate(categories)}
    
    for category in products:
        category_id = category_to_id[category]
        for name, price in products[category]:
            justProducts.append((category, category_id, name, price))
    
    normalizedProducts = []
    for i in range(len(justProducts)):
        normalizedProducts.append({
            "product_id": i,                      
            "category_name": justProducts[i][0],     
            "category_id": justProducts[i][1],     
            "product_name": justProducts[i][2],     
            "product_price": justProducts[i][3]      
        })
    return normalizedProducts

In [9]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp 

products = normalizeData()
print(products)

[{'product_id': 0, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Baby-Jacke mit Kapuze - geblümt', 'product_price': 22.99}, {'product_id': 1, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Blümchen - 3-in-1-Baby-Jacke mit Kapuze', 'product_price': 29.99}, {'product_id': 2, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Baby-Jacke mit Kapuze - wasserabweisend', 'product_price': 22.99}, {'product_id': 3, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Blümchen - Baby-Jacke mit Kapuze', 'product_price': 22.99}, {'product_id': 4, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Baby-Steppjacke mit Kapuze - geblümt', 'product_price': 19.99}, {'product_id': 5, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Micky Maus - Baby-Jacke mit Kapuze - wasserabweisend', 'product_price': 22.99}, {'product_id': 6, 'category_name': 'jackets', 'category_id': 0, 'product_name': 'Baby-Jacke mit Kapuze - wasserabwei

In [10]:
reformattedProducts = {
    "category_name": [],
    "product_name": [],
    "product_price": []
}

for product in products:
    reformattedProducts["category_name"].append(product["category_name"])
    reformattedProducts["product_name"].append(product["product_name"])
    reformattedProducts["product_price"].append(product["product_price"])
print(reformattedProducts)

{'category_name': ['jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'jackets', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 'sweaters', 's

In [11]:
df = pd.DataFrame(data=reformattedProducts)
df.head()

Unnamed: 0,category_name,product_name,product_price
0,jackets,Baby-Jacke mit Kapuze - geblümt,22.99
1,jackets,Blümchen - 3-in-1-Baby-Jacke mit Kapuze,29.99
2,jackets,Baby-Jacke mit Kapuze - wasserabweisend,22.99
3,jackets,Blümchen - Baby-Jacke mit Kapuze,22.99
4,jackets,Baby-Steppjacke mit Kapuze - geblümt,19.99


In [12]:
ohe = pp.OneHotEncoder(handle_unknown = "ignore", sparse_output = False).set_output(transform = "pandas")
oheTransformed = ohe.fit_transform(df[["category_name"]])
oheTransformed.head()

Unnamed: 0,category_name_accessories,category_name_baby,category_name_dresses,category_name_jackets,category_name_jeans,category_name_other,category_name_pants,category_name_sets,category_name_shirts,category_name_skirts,category_name_sleep,category_name_sweaters,category_name_swimwear,category_name_underwear,category_name_vests
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df = pd.concat([df, oheTransformed], axis=1).drop(columns=["category_name"])
df.head(5)

Unnamed: 0,product_name,product_price,category_name_accessories,category_name_baby,category_name_dresses,category_name_jackets,category_name_jeans,category_name_other,category_name_pants,category_name_sets,category_name_shirts,category_name_skirts,category_name_sleep,category_name_sweaters,category_name_swimwear,category_name_underwear,category_name_vests
0,Baby-Jacke mit Kapuze - geblümt,22.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Blümchen - 3-in-1-Baby-Jacke mit Kapuze,29.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Baby-Jacke mit Kapuze - wasserabweisend,22.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Blümchen - Baby-Jacke mit Kapuze,22.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Baby-Steppjacke mit Kapuze - geblümt,19.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
