In [1]:
import pandas as pd
import numpy as np

# Read the supply-chain CSV into the working DataFrame, decoding it as Latin-1
df = pd.read_csv(
    filepath_or_buffer='DataCoSupplyChainDataset.csv',
    encoding='latin-1',
)

In [2]:
# Number of distinct (non-null) order ids in the raw data
df.loc[:, 'Order Id'].nunique(dropna=True)

65752

In [3]:
# Build the order-level fact table: one row per 'Order Id', aggregated from the
# rows of `df`.

# Parse the order timestamp and keep only the calendar date.
# NOTE: this overwrites the column on `df` itself, so every cell that runs after
# this one sees datetime.date objects here instead of the raw values.
df['order date (DateOrders)'] = pd.to_datetime(df['order date (DateOrders)']).dt.date

# Collapse the rows of each order: numeric measures are summed, descriptive
# attributes take a single row's value, then columns are renamed to the
# FactOrder schema.
fact_order = (
    df.groupby('Order Id')
    .agg({
        'order date (DateOrders)': 'first',
        'Order Item Quantity': 'sum',
        'Order Item Total': 'sum',
        'Order Item Discount': 'sum',
        'Order Profit Per Order': 'sum',
        'Order Status': 'last',  # Retain the last status as the final order status
        'Customer Id': 'first',
        'Type': 'first',
        'Customer Segment': 'first',
        'Sales': 'sum',
        'Market': 'first',
        # NOTE(review): if an order can contain rows from several departments,
        # only the first one is kept here -- confirm this is intended.
        'Department Id': 'first',
    })
    .reset_index()
    # Only columns whose FactOrder name differs from the source name are listed;
    # 'Order Id', 'Order Status', 'Customer Id', 'Customer Segment', 'Market'
    # and 'Department Id' keep their names.
    .rename(columns={
        'order date (DateOrders)': 'Order Date',
        'Order Item Quantity': 'Quantity (items)',
        'Order Item Total': 'Sales (After discount)',
        'Order Item Discount': 'Discount Amount',
        'Order Profit Per Order': 'Total Profit',
        'Type': 'Transaction Type',
        'Sales': 'Sales (before Discount)',
    })
)

# Turn the ids into prefixed string keys: 'O' for orders, 'C' for customers,
# 'D' for departments. Vectorised concatenation replaces the per-row lambda.
fact_order['Order Id'] = 'O' + fact_order['Order Id'].astype(str)
fact_order['Customer Id'] = 'C' + fact_order['Customer Id'].astype(str)
fact_order['Department Id'] = 'D' + fact_order['Department Id'].astype(str)

# Display the first few rows of the fact_order DataFrame
print(fact_order.head())

  Order Id  Order Date  Quantity (items)  Sales (After discount)  \
0       O1  2015-01-01                 1              239.979996   
1       O2  2015-01-01                 7              529.380005   
2       O4  2015-01-01                14              620.870014   
3       O5  2015-01-01                10              987.070007   
4       O7  2015-01-01                 7              525.520004   

   Discount Amount  Total Profit     Order Status Customer Id  \
0        60.000000     88.790001           CLOSED      C11599   
1        50.600000    195.900002  PENDING_PAYMENT        C256   
2        78.980000    124.090000           CLOSED       C8827   
3       142.789999    390.089995         COMPLETE      C11318   
4        54.400000    203.929998         COMPLETE       C4530   

  Transaction Type Customer Segment  Sales (before Discount) Market  \
0             CASH         Consumer               299.980011  LATAM   
1          PAYMENT         Consumer               579.9800

In [4]:
# Write the order fact table to an Excel workbook, leaving out the row index
fact_order.to_excel(excel_writer='ProcessedOrderFact.xlsx', index=False)

# Confirmation message once the export call has returned
print("Order Fact table saved successfully")

Order Fact table saved successfully


In [5]:
# Number of rows in the order fact table
fact_order.shape[0]

65752