In [4]:
import sys
import pandas as pd
sys.path.append('../../src')
from create_synthetic_data import generate_fashion_data_with_brand, inject_anomalies_by_date

In [15]:
# Generate synthetic fashion orders for the configured date range, derive the
# category columns, inject the scheduled anomalies, and persist the result.

# Anomalies to inject, keyed by ISO date string ('YYYY-MM-DD').
# NOTE(review): each tuple looks like (anomaly type, severity, root cause,
# category) -- confirm the field order against inject_anomalies_by_date.
anomaly_schedule = {
    '2023-01-10': ('ExcessiveDiscount', 0.5, 'PricingError', 'Apparel'),
    '2023-06-10': ('COGSOverstatement', -0.8, 'SupplierIssue', 'Footwear'),
    '2023-09-10': ('FulfillmentSpike', -3, 'LogisticsIssue', 'Beauty'),
    '2023-12-10': ('ReturnSurge', 10, 'QualityIssue', 'Accessories'),
}

start_date = '2023-01-01'
end_date = '2023-12-30'
df = generate_fashion_data_with_brand(start_date, end_date)

# MERCHANDISE_HIERARCHY is dot-separated; split it once and reuse the pieces:
# first segment -> Category, second segment -> SubCategory.
hierarchy_parts = df['MERCHANDISE_HIERARCHY'].str.split('.')
df['Category'] = hierarchy_parts.str[0]
df['SubCategory'] = hierarchy_parts.str[1]

# Inject the anomalies and save the row-level data (index column omitted).
df_anomalous = inject_anomalies_by_date(df, anomaly_schedule)
df_anomalous.to_csv('fashion_data_with_anomalies.csv', index=False)

# Cell output: preview of the first rows.
df_anomalous.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,UNIT_COST,ORDERDATE,SALES,DISCOUNT,NET_SALES,STATUS,QTR_ID,...,SHIPPING_REVENUE,PROFIT,PROFIT_MARGIN,IS_MARGIN_NEGATIVE,CUSTOMER_LOYALTY,Category,SubCategory,ANOMALY_TYPE,SEVERITY,ROOT_CAUSE
0,58740,1,150.0,44.22,2023-01-01,150.0,12.0,138.0,Shipped,1,...,0.0,71.76,52.0,False,Loyal,Apparel,Men,,,
1,44805,1,93.93,25.08,2023-01-01,93.93,11.27,82.66,Shipped,1,...,3.0,43.36,52.46,False,New,Apparel,Men,,,
2,61786,1,58.96,17.03,2023-01-01,58.96,12.74,46.22,Shipped,1,...,3.0,14.38,31.11,False,New,Apparel,Men,,,
3,81153,1,35.75,9.28,2023-01-01,35.75,0.0,35.75,Shipped,1,...,3.0,18.13,50.71,False,Loyal,Apparel,Men,,,
4,84024,2,35.13,10.31,2023-01-01,70.26,13.49,56.77,In Process,1,...,6.0,26.05,45.89,False,Loyal,Apparel,Men,,,


In [16]:
# Roll the anomaly-injected orders up to one row per calendar day and save
# the aggregated table to CSV.

# Make sure ORDERDATE is datetime-typed before it is used as a time-based
# grouping key (harmless if the column is already datetime).
df_anomalous['ORDERDATE'] = pd.to_datetime(df_anomalous['ORDERDATE'])

# Categorical dimensions of the data set. They are NOT part of the grouping
# key below (the roll-up is per day only); append them to the groupby key
# list to get a per-day, per-segment breakdown instead.
categorical_variables = [
    "Category",
    "SubCategory",
    "PROMO_CODE",
    "SALES_CHANNEL",
    "CUSTOMER_LOYALTY",
]

# Numerical/fiscal columns to aggregate; this order is also the column order
# of the aggregated output.
numerical_variables = [
    # per-unit figures
    "PRICEEACH",
    "UNIT_COST",
    # volumes and revenue
    "QUANTITYORDERED",
    "SALES",
    "DISCOUNT",
    "NET_SALES",
    # costs
    "FULFILLMENT_COST",
    "MARKETING_COST",
    "RETURN_COST",
    "COST_OF_GOODS_SOLD",
    # remaining revenue / profitability figures
    "SHIPPING_REVENUE",
    "PROFIT",
    "PROFIT_MARGIN",
    "IS_MARGIN_NEGATIVE",
]

# Aggregation method per column: daily totals ("sum") by default ...
aggregation_dict = dict.fromkeys(numerical_variables, "sum")
# ... except for per-unit figures, ratios and flags, which are averaged:
#   PRICEEACH / UNIT_COST -> average price / cost per unit within the day
#   PROFIT_MARGIN         -> plain (unweighted) mean of the per-row margins,
#                            i.e. not PROFIT.sum() / NET_SALES.sum()
#   IS_MARGIN_NEGATIVE    -> share of rows flagged as negative margin
aggregation_dict.update(
    dict.fromkeys(
        ("PRICEEACH", "UNIT_COST", "PROFIT_MARGIN", "IS_MARGIN_NEGATIVE"),
        "mean",
    )
)

# Group by calendar day of ORDERDATE and aggregate every numerical column.
df_final = (
    df_anomalous
    .groupby([pd.Grouper(key='ORDERDATE', freq='D')])
    .agg(aggregation_dict)
    .reset_index()
)
df_final.to_csv('fashion_data_with_anomalies_aggregated.csv', index=False)