In [22]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from dotenv import dotenv_values

from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [23]:
config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']


url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'

engine = create_engine(url, echo=False)
my_schema = 'capstone_barstov_industries'

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

with engine.begin() as conn: 
    result = conn.execute(text(f'''
                               SELECT * FROM model_data_week; 
                                '''))
    data = result.all()

### Let's create a dataframe out of that
model_data_week = pd.DataFrame(data) 
model_data_week.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172968 entries, 0 to 172967
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   product_type_no    172968 non-null  int64              
 1   product_type_name  172968 non-null  object             
 2   colour_group_code  172968 non-null  int64              
 3   colour_group_name  172968 non-null  object             
 4   week               172968 non-null  datetime64[ns, UTC]
 5   average_price      172968 non-null  float64            
 6   total_units_sold   172968 non-null  int64              
 7   unique_customers   172968 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(4), object(2)
memory usage: 10.6+ MB


In [28]:
# Inventory Sales Prediction with Ensemble Modeling
# -------------------------------------------------
# This notebook demonstrates the use of an ensemble model with Random Forest as the final estimator
# to predict weekly sales for inventory management.

# Imports
# -------
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load and Preprocess Data
# ------------------------
# Ensure 'week' is a datetime object and remove timezone information
data['week'] = pd.to_datetime(data['week']).dt.tz_localize(None)

# Sort data by date to ensure temporal order
data = data.sort_values(by='week')

# Feature Engineering: Creating Lagged Features for Sales
data['lag_units_sold_1week'] = data.groupby('product_type_no')['total_units_sold'].shift(1)
data['lag_units_sold_2weeks'] = data.groupby('product_type_no')['total_units_sold'].shift(2)

# Drop rows with NaN values created by lagging (occurs at the beginning of each product group)
data = data.dropna(subset=['lag_units_sold_1week', 'lag_units_sold_2weeks']).reset_index(drop=True)

# Adding Cyclic Encoding for Weeks
data['week_of_year'] = data['week'].dt.isocalendar().week
data['week_sin'] = np.sin(2 * np.pi * data['week_of_year'] / 52)
data['week_cos'] = np.cos(2 * np.pi * data['week_of_year'] / 52)

# Train-Validation-Test Split with Date Ranges
# --------------------------------------------

# Define the end date as the maximum date in the dataset
end_date = data['week'].max()

# Define date ranges for train, validation, and test
train_end_date = end_date - pd.Timedelta(weeks=4)  # End training 4 weeks before the end date
validation_start_date = train_end_date + pd.Timedelta(weeks=2)  # Start validation 2 weeks after training ends
validation_end_date = end_date - pd.Timedelta(weeks=2)  # End validation 2 weeks before the last date

# Split data into train, validation, and test sets based on specific date ranges
train_data = data[data['week'] <= train_end_date]
validation_set = data[(data['week'] > train_end_date) & (data['week'] <= validation_end_date)]
test_set = data[data['week'] > validation_end_date]

# Define features and target variable
feature_columns = ['product_type_no', 'colour_group_code', 'average_price', 
                   'lag_units_sold_1week', 'lag_units_sold_2weeks', 'week_sin', 'week_cos']

X_train = train_data[feature_columns]
y_train = train_data['total_units_sold']

X_validation = validation_set[feature_columns]
y_validation = validation_set['total_units_sold']

X_test = test_set[feature_columns]
y_test = test_set['total_units_sold']

# Confirm the split date ranges
print("Training Date Range:", train_data['week'].min(), "to", train_data['week'].max())
print("Validation Date Range:", validation_set['week'].min(), "to", validation_set['week'].max())
print("Test Date Range:", test_set['week'].min(), "to", test_set['week'].max())

# Initialize and Train Base Models
# -------------------------------
model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
model_lr = LinearRegression()
model_mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Fit each base model on the training data
model_rf.fit(X_train, y_train)
model_lr.fit(X_train, y_train)
model_mlp.fit(X_train, y_train)

# Generate predictions for the validation set from each model
preds_rf = model_rf.predict(X_validation)
preds_lr = model_lr.predict(X_validation)
preds_mlp = model_mlp.predict(X_validation)

# Create a new DataFrame to hold the base model predictions
predictions_df = pd.DataFrame({
    'RandomForest': preds_rf,
    'LinearRegression': preds_lr,
    'MLPRegressor': preds_mlp
})

# Final Meta-Model Using Random Forest
# ------------------------------------
final_model_rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
final_model_rf.fit(predictions_df, y_validation)

# Final Predictions and Evaluation
# --------------------------------
final_preds_rf = final_model_rf.predict(predictions_df)

# Evaluate the Random Forest final model on validation set
mae_rf = mean_absolute_error(y_validation, final_preds_rf)
rmse_rf = mean_squared_error(y_validation, final_preds_rf, squared=False)

print("Ensemble with Random Forest as Final Model - MAE:", mae_rf)
print("Ensemble with Random Forest as Final Model - RMSE:", rmse_rf)

# Feature Importance Analysis (for the final model)
# -------------------------------------------------
print("Feature importances from the final meta-model:", final_model_rf.feature_importances_)


Training Date Range: 2018-09-16 22:00:00 to 2020-08-23 22:00:00
Validation Date Range: 2020-08-30 22:00:00 to 2020-09-06 22:00:00
Test Date Range: 2020-09-13 22:00:00 to 2020-09-20 22:00:00
Ensemble with Random Forest as Final Model - MAE: 29.640685494538445
Ensemble with Random Forest as Final Model - RMSE: 100.83484946249463
Feature importances from the final meta-model: [0.91164921 0.03971616 0.04863464]




In [29]:
# Group by 'week' and calculate the total units sold per week
weekly_sales = data.groupby('week')['total_units_sold'].sum()

# Calculate the average units sold per week across all weeks
average_units_sold_per_week = weekly_sales.mean()

# Display the result
print("Average units sold per week:", average_units_sold_per_week)


Average units sold per week: 270376.61320754717
