In [1]:
# M5 Forecasting - Statistical Data Analysis Script
# -------------------------------------------------
# This script extracts descriptive, distributional, and correlation-based statistics
# from the M5 Forecasting dataset to support Statistical Machine Learning analysis.

# %% [markdown]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from statsmodels.tsa.stattools import adfuller
import os

 ## 1. Load Data

In [2]:
data_path = 'datas/'
calendar = pd.read_csv(data_path + 'calendar.csv')
prices = pd.read_csv(data_path + 'sell_prices.csv')
sales = pd.read_csv(data_path + 'sales_train_validation.csv')

# Melt the sales dataset into long format
sales_long = sales.melt(id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                        var_name='d', value_name='sales')
sales_long = sales_long.merge(calendar[['d', 'date']], on='d', how='left')
sales_long['date'] = pd.to_datetime(sales_long['date'])

 ## 2. Basic Descriptive Statistics

In [3]:
sales_stats = sales_long.groupby(['item_id', 'store_id'])['sales'].agg(
    mean_sales='mean',
    median_sales='median',
    std_sales='std',
    var_sales='var',
    skewness=lambda x: skew(x, nan_policy='omit'),
    kurtosis=lambda x: kurtosis(x, nan_policy='omit'),
    zero_rate=lambda x: (x == 0).mean(),
    cv=lambda x: np.std(x) / (np.mean(x) + 1e-9)
).reset_index()

sales_stats.to_csv('sales_statistics.csv', index=False)
print('Sales statistics saved to sales_statistics.csv')

Sales statistics saved to sales_statistics.csv


 ## 3. Stationarity Check (ADF Test)

In [9]:
sample_series = sales_long[sales_long['item_id'] == sales_long['item_id'].iloc[0]]['sales']
adf_stat, adf_p, _, _, _, _ = adfuller(sample_series)
print(f"ADF Statistic: {adf_stat:.3e}, p-value: {adf_p:.3e}")

ADF Statistic: -7.616e+00, p-value: 2.191e-11


 ## 4. Price Statistics

In [5]:
price_stats = prices.groupby(['item_id'])['sell_price'].agg(
    mean_price='mean',
    std_price='std',
    min_price='min',
    max_price='max',
    volatility=lambda x: np.std(x) / (np.mean(x) + 1e-9)
).reset_index()

price_stats.to_csv('price_statistics.csv', index=False)
print('Price statistics saved to price_statistics.csv')

Price statistics saved to price_statistics.csv


 ## 5. Correlation Analysis (Sales vs Price)
 Merge sales and prices using week info

In [6]:
calendar_prices = calendar[['d', 'wm_yr_wk']]
sales_price = sales_long.merge(calendar_prices, on='d', how='left')
sales_price = sales_price.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

corr_data = sales_price[['sales', 'sell_price']].dropna()
correlation = corr_data.corr().iloc[0, 1]
print(f'Correlation between sales and price: {correlation:.3f}')

Correlation between sales and price: -0.151


 ## 6. Temporal & Categorical Trends

In [7]:
sales_long['weekday'] = sales_long['date'].dt.day_name()
sales_long['month'] = sales_long['date'].dt.month

# Mean sales by weekday
weekday_mean = sales_long.groupby('weekday')['sales'].mean().reindex(
    ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
)
plt.figure(figsize=(8,4))
weekday_mean.plot(kind='bar', color='skyblue')
plt.title('Average Sales by Weekday')
plt.ylabel('Mean Sales')
plt.tight_layout()
plt.savefig('sales_by_weekday.png', dpi=200)
plt.close()

# Mean sales by month
monthly_mean = sales_long.groupby('month')['sales'].mean()
plt.figure(figsize=(8,4))
monthly_mean.plot(kind='line', marker='o', color='coral')
plt.title('Average Sales by Month')
plt.xlabel('Month')
plt.ylabel('Mean Sales')
plt.tight_layout()
plt.savefig('sales_by_month.png', dpi=200)
plt.close()

 ## 7. Correlation Heatmap for Summary Metrics

In [8]:
corr_matrix = sales_stats[['mean_sales','std_sales','cv','zero_rate']].corr()
plt.figure(figsize=(6,4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Sales Statistics')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=200)
plt.close()