In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from pathlib import Path

# All inputs and outputs are resolved relative to the current working directory.
BASE_DIR = Path.cwd()
RAW_CSV = BASE_DIR / "customer_shopping_data.csv"

# Destination files for the star schema: five dimensions and one fact table.
DIM_CUSTOMERS = BASE_DIR / "dim_customers.csv"
DIM_PRODUCTS  = BASE_DIR / "dim_products.csv"
DIM_DATES     = BASE_DIR / "dim_dates.csv"
DIM_PAYMENTS  = BASE_DIR / "dim_payments.csv"
DIM_MALLS     = BASE_DIR / "dim_malls.csv"
FACT_SALES    = BASE_DIR / "fact_sales.csv"

# Fetch the dataset through kagglehub; `path` is the location it returns.
path = kagglehub.dataset_download("mehmettahiraslan/customer-shopping-dataset")

# Prefer a CSV that already sits in BASE_DIR; otherwise read the copy that was
# just downloaded, so the notebook also runs when no local file is present.
# Assumes the dataset ships the CSV under the same file name -- TODO confirm.
raw_csv_source = RAW_CSV if RAW_CSV.exists() else Path(path) / RAW_CSV.name

df = pd.read_csv(raw_csv_source)
print("Datos originales:", df.shape)
display(df.head())

  from .autonotebook import tqdm as notebook_tqdm


Datos originales: (99457, 10)


Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon


In [2]:
# Customer dimension: the distinct (customer_id, gender, age) combinations
# found in the raw data, renumbered from 0 and saved without the index column.
customer_columns = ["customer_id", "gender", "age"]
dim_customers = (
    df.loc[:, customer_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
dim_customers.to_csv(DIM_CUSTOMERS, index=False)
print(f"DimCustomers: {dim_customers.shape[0]} filas")

DimCustomers: 99457 filas


In [3]:
# Product dimension: one row per distinct category, keyed by a 1-based
# surrogate `product_id` assigned in order of first appearance.
dim_products = (
    df[["category"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(product_id=lambda frame: frame.index + 1)
    .loc[:, ["product_id", "category"]]
)
dim_products.to_csv(DIM_PRODUCTS, index=False)

In [4]:

# Payment dimension: one row per distinct payment method, keyed by a 1-based
# surrogate `payment_id` assigned in order of first appearance.
dim_payments = df[["payment_method"]].drop_duplicates().reset_index(drop=True)
# Insert the key as the first column so the column order is (id, label).
dim_payments.insert(0, "payment_id", dim_payments.index + 1)
dim_payments.to_csv(DIM_PAYMENTS, index=False)


In [5]:
# Mall dimension: one row per distinct shopping mall, keyed by a 1-based
# surrogate `mall_id` assigned in order of first appearance.
unique_malls = df[["shopping_mall"]].drop_duplicates().reset_index(drop=True)
dim_malls = unique_malls.assign(mall_id=unique_malls.index + 1)
dim_malls = dim_malls.loc[:, ["mall_id", "shopping_mall"]]
dim_malls.to_csv(DIM_MALLS, index=False)

In [6]:
# Date dimension: one row per distinct calendar day found in the raw invoices.
#
# Parse BEFORE de-duplicating. The raw column mixes padded and unpadded
# spellings (e.g. "5/8/2022" and "16/05/2021"), so de-duplicating the strings
# could keep one day under two spellings and produce a repeated date_id,
# which would multiply rows when the fact table joins on the date.
dim_dates = (
    pd.to_datetime(df["invoice_date"], dayfirst=True)
    .to_frame(name="invoice_date")
    .drop_duplicates()
    .reset_index(drop=True)
)

# date_id is the day written as the integer YYYYMMDD.
dim_dates["date_id"] = dim_dates["invoice_date"].dt.strftime("%Y%m%d").astype(int)
dim_dates["year"] = dim_dates["invoice_date"].dt.year
dim_dates["month"] = dim_dates["invoice_date"].dt.month
dim_dates["day"] = dim_dates["invoice_date"].dt.day
dim_dates = dim_dates[["date_id", "invoice_date", "year", "month", "day"]]
dim_dates.to_csv(DIM_DATES, index=False)

In [8]:
# Build the fact table: replace every descriptive column of the raw sales with
# the surrogate key of the matching dimension row.

# Make sure invoice_date is a datetime in both frames before joining on it.
# The raw strings are day-first (e.g. "16/05/2021") and dim_dates was parsed
# with dayfirst=True, so the raw frame must be parsed the same way. Without
# it, ambiguous dates such as "5/8/2022" are read month-first and join to the
# wrong date_id, and dates with day > 12 can turn into NaT and be dropped.
df["invoice_date"] = pd.to_datetime(df["invoice_date"], dayfirst=True, errors="coerce")
dim_dates["invoice_date"] = pd.to_datetime(dim_dates["invoice_date"], errors="coerce")

# Join each dimension on its natural key to pick up the surrogate key.
fact_sales = (
    df.merge(dim_products, on="category")
      .merge(dim_payments, on="payment_method")
      .merge(dim_malls, on="shopping_mall")
      .merge(dim_dates, on="invoice_date")
)

# These are inner joins: a raw row whose key has no match is dropped, and a
# duplicated dimension key multiplies rows. Report any difference instead of
# letting it pass unnoticed.
if len(fact_sales) != len(df):
    print(
        f"Advertencia: fact_sales tiene {len(fact_sales)} filas, "
        f"pero los datos originales tienen {len(df)}"
    )

# Keep only the keys and the measures.
fact_sales = fact_sales[[
    "invoice_no", "customer_id", "product_id",
    "date_id", "payment_id", "mall_id",
    "quantity", "price"
]]

fact_sales.to_csv(FACT_SALES, index=False)
