In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the three raw CSV exports; file names are used exactly as delivered.
orders, details, targets = (
    pd.read_csv(csv_name)
    for csv_name in ("List of Orders.csv", "Order Details.csv", "Sales target.csv")
)

# Normalise the headers of every frame: trim surrounding whitespace and swap
# spaces for underscores (a header such as "Order ID" becomes "Order_ID").
# Hyphens are left untouched, so "Sub-Category" keeps its name.
for raw_frame in (orders, details, targets):
    raw_frame.columns = [header.strip().replace(" ", "_") for header in raw_frame.columns]

# Coerce column types. Order dates are parsed day-first and the line-item
# measures are forced to numbers; anything unparseable becomes NaT / NaN
# (errors="coerce") rather than raising.
orders["Order_Date"] = pd.to_datetime(orders["Order_Date"], dayfirst=True, errors="coerce")
numeric_cols = ["Amount", "Profit", "Quantity"]
details = details.assign(
    **{name: pd.to_numeric(details[name], errors="coerce") for name in numeric_cols}
)

# Clean text fields: trim surrounding whitespace while keeping missing values
# missing. A bare `.astype(str)` turns NaN into the literal string "nan",
# which grouping and de-duplication would then treat as a real value.
def _strip_text(column):
    """Return `column` as whitespace-stripped strings, leaving missing entries as NaN."""
    return column.astype(str).str.strip().where(column.notna())


for col in ["CustomerName", "State", "City"]:
    # Guarded so the cell still runs if a column is absent from the export.
    if col in orders.columns:
        orders[col] = _strip_text(orders[col])

for col in ["Category", "Sub-Category"]:
    if col in details.columns:
        details[col] = _strip_text(details[col])

# Rows with no Order_ID cannot be joined to anything, so discard them.
# .copy() yields independent frames rather than views of the originals.
orders = orders[orders["Order_ID"].notna()].copy()
details = details[details["Order_ID"].notna()].copy()

# Calendar helpers for monthly grouping: the order's monthly Period and its
# month-end timestamp. MonthEnd(0) rolls a date forward to the end of its
# month and leaves a date that is already a month end unchanged.
orders = orders.assign(
    Order_Month=orders["Order_Date"].dt.to_period("M"),
    Month_End=orders["Order_Date"] + pd.offsets.MonthEnd(0),
)

# Line-level money features. Amount is taken as the line revenue; there are no
# returns/discount columns, so Net_Revenue simply mirrors Revenue.
# Missing Amount / Profit values are counted as 0.
line_revenue = details["Amount"].fillna(0.0)
line_margin = details["Profit"].fillna(0.0)
details = details.assign(
    Revenue=line_revenue,
    Net_Revenue=line_revenue,
    Margin=line_margin,
    # Margin as a share of revenue; NaN whenever revenue is not positive.
    Margin_Pct=(line_margin / line_revenue).where(line_revenue.gt(0)),
    # Simple product key of the form "<Category> | <Sub-Category>".
    Product_Key=details["Category"].astype(str) + " | " + details["Sub-Category"].astype(str),
)

# Attach order-level attributes (dates, geography, customer) to every detail line.
# validate="many_to_one" makes pandas raise a MergeError if an Order_ID appears
# more than once in `orders`; without the check, duplicated order headers would
# silently fan out detail lines and inflate every total computed from them.
details = details.merge(
    orders[["Order_ID", "Order_Date", "Order_Month", "Month_End", "State", "City", "CustomerName"]],
    on="Order_ID", how="left", validate="many_to_one"
)

# Roll the detail lines up to one row per order (the input for fact_orders).
order_level_measures = {
    "Order_Lines": ("Order_ID", "size"),     # number of detail lines in the order
    "Items": ("Quantity", "sum"),
    "Revenue": ("Revenue", "sum"),
    "Net_Revenue": ("Net_Revenue", "sum"),
    "Profit": ("Profit", "sum"),             # raw Profit column
    "Margin": ("Margin", "sum"),             # Profit with missing values filled as 0
}
order_agg = details.groupby("Order_ID", as_index=False).agg(**order_level_measures)

# Start from the order header (date, customer, region) and bring in the per-order totals.
fact_orders = pd.merge(orders, order_agg, on="Order_ID", how="left")

# Orders without any detail line come back as NaN from the left join; report them as zeros.
measure_cols = ["Order_Lines", "Items", "Revenue", "Net_Revenue", "Profit", "Margin"]
fact_orders[measure_cols] = fact_orders[measure_cols].fillna(0)

# Average revenue per item within the order (Revenue / Items); NaN when the
# order has no items. This is a per-item price, not an average order value.
fact_orders["Avg_Item_Price"] = (
    fact_orders["Revenue"].div(fact_orders["Items"]).where(fact_orders["Items"].gt(0))
)

# Build dim_customers: one row per customer name with simple RFM features
# (first / last purchase date, distinct order count, total net revenue).
customer_measures = {
    "First_Purchase": ("Order_Date", "min"),
    "Last_Purchase": ("Order_Date", "max"),
    "Orders": ("Order_ID", "nunique"),
    "Monetary": ("Net_Revenue", "sum"),
}
cust_orders = fact_orders.groupby("CustomerName", as_index=False).agg(**customer_measures)

# Recency is measured in days back from the latest order date in the data.
ref_date = fact_orders["Order_Date"].max()
cust_orders["Recency_Days"] = (ref_date - cust_orders["Last_Purchase"]).dt.days


def _quartile_band(values, labels):
    """Cut `values` into four equal-sized bands; `labels` run from the lowest to the highest values.

    Ranking with method="first" gives every row a distinct rank, so the four
    bins can always be formed. The trade-off is that ties are split by row
    order, so customers with equal values may land in different bands.
    """
    return pd.qcut(values.rank(method="first"), 4, labels=labels)


# Simple RFM banding. Recency labels are reversed because fewer days since the
# last purchase is better: R4 = most recent, F4 = most frequent, M4 = highest spend.
band_specs = {
    "R_Band": ("Recency_Days", ["R4","R3","R2","R1"]),
    "F_Band": ("Orders", ["F1","F2","F3","F4"]),
    "M_Band": ("Monetary", ["M1","M2","M3","M4"]),
}
for band_col, (source_col, band_labels) in band_specs.items():
    cust_orders[band_col] = _quartile_band(cust_orders[source_col], band_labels)

# Concatenate the three band labels into one segment code such as "R4F2M3".
cust_orders["RFM_Segment"] = cust_orders["R_Band"].astype(str).str.cat(
    [cust_orders["F_Band"].astype(str), cust_orders["M_Band"].astype(str)]
)

dim_customers = cust_orders.rename(columns={"CustomerName":"Customer_Name"})

# dim_products: one row per distinct Category / Sub-Category / Product_Key combination.
product_cols = ["Category", "Sub-Category", "Product_Key"]
dim_products = (
    details.loc[:, product_cols]
    .rename(columns={"Sub-Category":"Sub_Category"})
    .drop_duplicates()
    .reset_index(drop=True)
)

# dim_date: one row per calendar day from the first to the last order date.
date_min = fact_orders["Order_Date"].min()
date_max = fact_orders["Order_Date"].max()
date_index = pd.date_range(date_min, date_max, freq="D")

# Each attribute is derived from the Date column of the frame being built.
dim_date = pd.DataFrame({"Date": date_index}).assign(
    Year=lambda cal: cal["Date"].dt.year,
    Month=lambda cal: cal["Date"].dt.month,
    Month_Name=lambda cal: cal["Date"].dt.strftime("%b"),            # abbreviated month name
    Quarter=lambda cal: cal["Date"].dt.to_period("Q").astype(str),   # quarter period rendered as text
    Month_End=lambda cal: cal["Date"] + pd.offsets.MonthEnd(0),      # same convention as orders["Month_End"]
)

# Make sure the output folders exist before writing anything.
for folder in ("data", "data/cleaned"):
    Path(folder).mkdir(parents=True, exist_ok=True)

# Final fact_orders layout: a fixed column selection, with the customer column
# renamed to Customer_Name (the same name dim_customers uses).
fact_order_cols = [
    "Order_ID", "Order_Date", "Month_End", "CustomerName", "State", "City",
    "Order_Lines", "Items", "Revenue", "Net_Revenue", "Profit", "Margin", "Avg_Item_Price",
]
fact_orders_out = fact_orders[fact_order_cols].rename(columns={"CustomerName":"Customer_Name"})

# Write every cleaned table to CSV without the index column.
cleaned_outputs = {
    "data/cleaned/fact_orders.csv": fact_orders_out,
    "data/cleaned/dim_products.csv": dim_products,
    "data/cleaned/dim_customers.csv": dim_customers,
    "data/cleaned/dim_date.csv": dim_date,
}
for csv_path, table in cleaned_outputs.items():
    table.to_csv(csv_path, index=False)

print("Saved: data/cleaned/fact_orders.csv, dim_products.csv, dim_customers.csv, dim_date.csv")


Saved: data/cleaned/fact_orders.csv, dim_products.csv, dim_customers.csv, dim_date.csv


In [2]:
# Quick sanity summary of the cleaned tables: row counts and covered date range.
fact_row_count = len(fact_orders_out)
unique_order_count = fact_orders_out["Order_ID"].nunique()
print("Fact rows:", fact_row_count, "Unique orders:", unique_order_count)
print("Products:", len(dim_products))
print("Customers:", len(dim_customers))

order_dates = fact_orders_out["Order_Date"]
print("Date range:", order_dates.min().date(), "to", order_dates.max().date())


Fact rows: 500 Unique orders: 500
Products: 17
Customers: 332
Date range: 2018-04-01 to 2019-03-31
