In [1]:
# Standard Library
import warnings
from datetime import timedelta
from urllib.parse import quote_plus

# Data Handling and Processing
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from dotenv import dotenv_values

# Machine Learning Models and Evaluation Metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Visualization (Optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress Warnings
warnings.filterwarnings('ignore')


In [2]:
# Load connection settings from the local .env file
config = dotenv_values()

# Define variables for the login (a missing key raises KeyError right here)
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']  # read but not used in this cell; see my_schema below
pg_pass = config['POSTGRES_PASS']

# Set up the PostgreSQL connection URL.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' cannot be mistaken for URL delimiters. The .env file must
# therefore hold the raw (not already percent-encoded) credentials.
url = (
    f'postgresql://{quote_plus(pg_user)}:{quote_plus(pg_pass)}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)

# Create the database engine
engine = create_engine(url, echo=False)
my_schema = 'capstone_barstov_industries'

# Load data directly into a DataFrame.
# The search_path is set on the same connection that runs the SELECT, so the
# unqualified table name resolves inside `my_schema`.
with engine.connect() as conn:
    conn.execute(text(f'SET search_path TO {my_schema};'))
    data = pd.read_sql(text("SELECT * FROM model_data_week;"), conn)

# Check the DataFrame structure
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172977 entries, 0 to 172976
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   product_type_no    172977 non-null  int64  
 1   product_type_name  172977 non-null  object 
 2   colour_group_code  172977 non-null  int64  
 3   colour_group_name  172977 non-null  object 
 4   week               172977 non-null  object 
 5   average_price      172977 non-null  float64
 6   total_units_sold   172977 non-null  int64  
 7   unique_customers   172977 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 10.6+ MB


In [None]:

# Parse 'week' into timezone-naive datetimes
week_as_datetime = pd.to_datetime(data['week']).dt.tz_localize(None)
data['week'] = week_as_datetime

# Keep weeks up to and including 2020-03-01, then order the rows by item
# (product type + colour group) and, within each item, by week
on_or_before_cutoff = data['week'] <= "2020-03-01"
data = (
    data.loc[on_or_before_cutoff]
    .copy()
    .sort_values(by=['product_type_no', 'colour_group_code', 'week'])
)

# Step 1: Data-Driven Peak Season Identification Using Quantile-Based Thresholds
# Total units sold per item (product type + colour group) and week.
demand_clusters = (
    data.groupby(['product_type_no', 'colour_group_code', 'week'])['total_units_sold']
    .sum()
    .reset_index()
)

# A week is flagged as peak when its units reach the item's own 90th percentile.
# NOTE(review): the flag is derived from the same week's total_units_sold, which
# is also the regression target further down, and the percentile is taken over
# every week still in `data` (including the validation/test months). Since
# 'is_peak_season' is later used as a model feature this looks like target
# leakage - confirm the flag is really known at prediction time.
weekly_units_per_item = demand_clusters.groupby(['product_type_no', 'colour_group_code'])['total_units_sold']
demand_clusters['peak_threshold'] = weekly_units_per_item.transform(lambda units: units.quantile(0.90))
demand_clusters['is_peak_season'] = demand_clusters['total_units_sold'].ge(demand_clusters['peak_threshold'])

# Attach the flag to every row of the main dataset
peak_flags = demand_clusters[['product_type_no', 'colour_group_code', 'week', 'is_peak_season']]
data = data.merge(peak_flags, on=['product_type_no', 'colour_group_code', 'week'], how='left')

# If `data` already carried an 'is_peak_season' column, the merge produced
# suffixed duplicates: keep the freshly merged one under the plain name.
stale_flag, fresh_flag = 'is_peak_season_x', 'is_peak_season_y'
if {stale_flag, fresh_flag}.issubset(data.columns):
    data = data.drop(columns=[stale_flag]).rename(columns={fresh_flag: 'is_peak_season'})

# Step 2: Feature Engineering - Lagged Features and Seasonal Features
# Units sold one and two rows earlier within the same item.
# NOTE(review): shift() counts rows, not calendar weeks - if an item has weeks
# without a row, a lag reaches further back than its name suggests; confirm
# that the weekly table has no gaps.
units_per_item = data.groupby(['product_type_no', 'colour_group_code'])['total_units_sold']
lag_one_row = units_per_item.shift(1)
lag_two_rows = units_per_item.shift(2)
data['lag_units_sold_1week'] = lag_one_row
data['lag_units_sold_2weeks'] = lag_two_rows

# The first rows of each item have no history; drop them and renumber the index
lag_columns = ['lag_units_sold_1week', 'lag_units_sold_2weeks']
data = data.dropna(subset=lag_columns).reset_index(drop=True)

# Month feature for seasonality
data['month'] = data['week'].dt.month

# Scale the price (multiplied by a fixed factor of 10)
data['average_price'] = data['average_price'].mul(10)

# Reapply the train, validation, and test splits
def _weeks_in_range(frame, start, end):
    """Return an independent copy of the rows of `frame` with start <= week < end.

    The copy matters: columns can be added to a split afterwards without
    writing into a filtered slice of `frame` (the resulting
    SettingWithCopyWarning would be hidden by the global warnings filter).
    """
    in_range = (frame['week'] >= start) & (frame['week'] < end)
    return frame[in_range].copy()


train_data = _weeks_in_range(data, "2019-01-01", "2020-01-01")       # calendar year 2019
validation_data = _weeks_in_range(data, "2020-01-01", "2020-02-01")  # January 2020
test_data = _weeks_in_range(data, "2020-02-01", "2020-03-01")        # February 2020

# Print sizes for verification
print("Training set size:", train_data.shape)
print("Validation set size:", validation_data.shape)
print("Test set size:", test_data.shape)

# Define feature columns
feature_columns = ['product_type_no', 'colour_group_code', 'average_price',
                   'lag_units_sold_1week', 'lag_units_sold_2weeks', 'month', 'is_peak_season']

# Feature matrix / target vector for each split; the target is the weekly unit count
X_train, y_train = train_data[feature_columns], train_data['total_units_sold']
X_validation, y_validation = validation_data[feature_columns], validation_data['total_units_sold']
X_test, y_test = test_data[feature_columns], test_data['total_units_sold']

# Initialize and train base models
model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
model_lr = LinearRegression()
model_mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Every base model is fitted on the same training features and target
for base_model in (model_rf, model_lr, model_mlp):
    base_model.fit(X_train, y_train)


def _predict_whole_units(model, features):
    """Predict with `model`, round with np.round and cast the result to int."""
    return np.round(model.predict(features)).astype(int)


# Generate predictions and round to integers - validation period
preds_rf_val = _predict_whole_units(model_rf, X_validation)
preds_lr_val = _predict_whole_units(model_lr, X_validation)
preds_mlp_val = _predict_whole_units(model_mlp, X_validation)

# ... and test period
preds_rf_test = _predict_whole_units(model_rf, X_test)
preds_lr_test = _predict_whole_units(model_lr, X_test)
preds_mlp_test = _predict_whole_units(model_mlp, X_test)

# Meta-model predictions
# The rounded base-model predictions become the three input features of the
# stacking (meta) model.
val_predictions_df = pd.DataFrame({'RandomForest': preds_rf_val, 'LinearRegression': preds_lr_val, 'MLPRegressor': preds_mlp_val})
test_predictions_df = pd.DataFrame({'RandomForest': preds_rf_test, 'LinearRegression': preds_lr_test, 'MLPRegressor': preds_mlp_test})

# The meta-model is fitted on the validation-period predictions
final_model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
final_model_rf.fit(val_predictions_df, y_validation)

# Generate and round meta-model predictions
final_preds_val = np.round(final_model_rf.predict(val_predictions_df)).astype(int)
final_preds_test = np.round(final_model_rf.predict(test_predictions_df)).astype(int)

# Calculate metrics
# RMSE is taken as the square root of the MSE explicitly: the `squared=False`
# argument of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas this form works on every version.
# NOTE(review): the validation metrics are measured on the very rows the
# meta-model was fitted on, so they are in-sample; only the test metrics are
# out-of-sample for the meta-model.
mae_val = mean_absolute_error(y_validation, final_preds_val)
rmse_val = np.sqrt(mean_squared_error(y_validation, final_preds_val))
mae_test = mean_absolute_error(y_test, final_preds_test)
rmse_test = np.sqrt(mean_squared_error(y_test, final_preds_test))

print("Extended Validation MAE:", mae_val)
print("Extended Validation RMSE:", rmse_val)
print("Test MAE:", mae_test)
print("Test RMSE:", rmse_test)

# Aggregate inventory needs by product type and color for the test period
item_keys = ['product_type_no', 'colour_group_code']

# assign() returns a new frame, so the prediction column is added safely even
# when `test_data` is a filtered slice of `data`.
test_data = test_data.assign(predicted_inventory_needs=final_preds_test)

# Summarize the overall predicted inventory needs (computed once; the later
# merge only adds columns and does not change these sums)
inventory_needs = test_data.groupby(item_keys)['predicted_inventory_needs'].sum().reset_index()

# Calculate average demand for each product-type and color combination
# NOTE(review): the mean is taken over every row of `data`, which includes the
# test-period actuals - confirm that is intended for a forward-looking threshold.
avg_demand = (
    data.groupby(item_keys)['total_units_sold']
    .mean()
    .reset_index()
    .rename(columns={'total_units_sold': 'avg_demand'})
)

# Merge average demand back into `test_data`
test_data = pd.merge(test_data, avg_demand, on=item_keys, how='left')

# Define multipliers for peak and non-peak seasons
non_peak_season_multiplier = 0.5  # Trigger rebalancing sooner for non-peak items (50% of average demand)
peak_season_multiplier = 1.0      # Allow peak items more space (100% of average demand)

# Calculate the low-demand threshold dynamically based on the seasonality of each item
test_data['low_demand_threshold'] = np.where(
    test_data['is_peak_season'],
    test_data['avg_demand'] * peak_season_multiplier,     # Use peak multiplier for peak season items
    test_data['avg_demand'] * non_peak_season_multiplier  # Use non-peak multiplier for non-peak items
)

# Calculate a rolling 4-week sum of predicted inventory needs to identify low-demand periods
# NOTE(review): with min_periods=1 the first rows of each item sum fewer than
# four weeks, and the sum is compared against a threshold built from a
# per-week average - confirm the intended units.
test_data['low_demand_4weeks'] = test_data.groupby(item_keys)['predicted_inventory_needs'] \
    .transform(lambda weekly_needs: weekly_needs.rolling(window=4, min_periods=1).sum())

# Flag items for rebalancing based on the dynamic low-demand threshold
test_data['rebalance_flag'] = test_data['low_demand_4weeks'] <= test_data['low_demand_threshold']

# Summarize the flagged items for rebalancing
rebalance_needs = (
    test_data[test_data['rebalance_flag']]
    .groupby(item_keys)['predicted_inventory_needs']
    .sum()
    .reset_index()
)
# The predictions are already whole numbers, so their sums only need an int cast
rebalance_needs['rebalance_units'] = rebalance_needs['predicted_inventory_needs'].astype(int)

# Print results
# Per-item tables first, each under its own heading.
report_tables = (
    ("\nPredicted Inventory Needs (Test Period):", inventory_needs),
    ("\nPotential Rebalance Needs (Dynamic Low Demand Thresholds):", rebalance_needs),
)
for heading, table in report_tables:
    print(heading)
    print(table)

# Grand totals for the test period, cast to plain ints for display.
report_totals = (
    ("Total Predicted Inventory Needs (Test Period):", inventory_needs['predicted_inventory_needs'].sum()),
    ("Total Rebalance Needs (Low Demand Adjustments):", rebalance_needs['rebalance_units'].sum()),
    ("Total Actual Units Sold (Test Period):", y_test.sum()),
)
for label, total in report_totals:
    print(label, int(total))
