In [1]:
import pandas as pd
from urllib.parse import unquote

# --- Load raw scraped products ---
df = pd.read_csv("data/grouped_products_final.csv")


In [2]:
# --- Extract categories ---
categories = (
    df["Product URL"]
    .str.split("en/", n=1).str[1].str.split("/", n=0).str[0]
    .apply(lambda x: unquote(x) if isinstance(x, str) else x)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name="category")
)

categories["id_category"] = categories.index + 1
categories = categories[["id_category", "category"]]
categories.to_csv("categories.csv", index=False)


In [3]:
# --- Extract subcategories ---
df_subcategories = (
    df["Product URL"]
    .drop_duplicates()
    .dropna()
    .str.replace("-", " ")
    .str.split("/")
    .str[7:]  # take from the 8th element onward
    .apply(lambda x: "/".join(x[:-2]) if isinstance(x, list) else "")
    .reset_index(drop=True)
    .to_frame(name="subcategory")
)

# explode multiple subcategories
df_subcategories = (
    df_subcategories["subcategory"]
    .str.split("/")
    .explode()
    .reset_index(drop=True)
    .to_frame(name="subcategory")
)


In [4]:
# normalize names
def normalize_subcat(x: str) -> str:
    if not isinstance(x, str):
        return x
    x_up = x.upper()
    if "DRIVE" in x_up:
        return "DRIVE"
    elif "INVERTER" in x_up:
        return "INVERTER"
    elif "W22" in x_up:
        return "W22"
    elif "INDUSTRIAL CONNECTOR" in x_up:
        return "INDUSTRIAL CONNECTOR"
    elif "INDUSTRIAL WALL" in x_up:
        return "INDUSTRIAL WALL"
    elif "INDUSTRIAL PANEL SOCKET" in x_up:
        return "INDUSTRIAL PANEL SOCKET"
    elif "INDUSTRIAL PLUG PIWD" in x_up:
        return "INDUSTRIAL PLUG PIWD"
    else:
        return " ".join(x.split()[:2])  # fallback: first two words

df_subcategories["subcategory"] = df_subcategories["subcategory"].apply(normalize_subcat)

# assign IDs
df_subcategories = df_subcategories.drop_duplicates(subset=["subcategory"]).reset_index(drop=True)
df_subcategories["id_subcategory"] = (df_subcategories.index + 1).astype("Int64")
df_subcategories = df_subcategories[["id_subcategory", "subcategory"]]
df_subcategories.to_csv("sub_categories.csv", index=False)


In [5]:
# --- Attach category and subcategory to main df ---
df["category"] = (
    df["Product URL"]
    .str.split("en/", n=1).str[1].str.split("/", n=0).str[0]
    .apply(lambda x: unquote(x) if isinstance(x, str) else x)
)

df["subcategory"] = (
    df["Product URL"]
    .dropna()
    .str.replace("-", " ")
    .str.split("/")
    .str[7:]
    .apply(lambda x: "/".join(x[:-2]) if isinstance(x, list) else "")
)

df["subcategory"] = df["subcategory"].apply(normalize_subcat)

# merge with lookup tables to get IDs
df = df.merge(categories, on="category", how="left")
df = df.merge(df_subcategories, on="subcategory", how="left")
df = df.drop(columns=["Product URL"])

In [6]:
# --- Save enriched product table ---
df.to_csv("products_with_categories.csv", index=False)

# Preview
df.head()


Unnamed: 0,Application,Applied product,Basic name,Cable application,Color scale,Connector cross-section,Control method - induction motor,DESCRICAO TINTA,Degree of Protection,Description,...,Starting duty cycle,Subscription time,Supply voltage,Terminal line reference,Terminal type,Type,category,subcategory,id_category,id_subcategory
0,,,,,,,"V/f, VVW, Sensorless, Encoder and VVW PM",,,,...,,,,,,,,DRIVE,1,2
1,,,,,,,"V/f, VVW, Sensorless, Encoder and VVW PM",,,,...,,,,,,,,DRIVE,1,2
2,,,,,,,"V/f, VVW, Sensorless, Encoder and VVW PM",,,,...,,,,,,,,DRIVE,1,2
3,,,,,,,,,,,...,,,,,,,,DRIVE,1,2
4,,,,,,,"V/f, VVW, Sensorless, Encoder and VVW PM",,,,...,,,,,,,,DRIVE,1,2
