In [1]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from dotenv import dotenv_values

from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [2]:
# Load environment variables from the .env file into a plain dict
config = dotenv_values()

# Define variables for the login
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']  # read from the config, but the queries below use `my_schema`
pg_pass = config['POSTGRES_PASS']

# Set up the PostgreSQL connection URL.
# The URL is assembled from its components instead of an f-string so that
# special characters in the user name or password (e.g. '@', '/', ':', '#')
# are escaped correctly and cannot corrupt the URL.
from sqlalchemy.engine import URL  # local import: only needed for this cell

url = URL.create(
    drivername='postgresql',
    username=pg_user,
    password=pg_pass,
    host=pg_host,
    port=int(pg_port) if pg_port else None,
    database=pg_db,
)

# Create the database engine
engine = create_engine(url, echo=False)
my_schema = 'capstone_barstov_industries'

# Load both tables directly into DataFrames.
# `search_path` is set on the same connection that runs the queries, so the
# unqualified table names resolve inside `my_schema`.
with engine.connect() as conn:
    conn.execute(text(f'SET search_path TO {my_schema};'))
    seasonal_data = pd.read_sql("SELECT * FROM seasonality_analysis;", conn)
    item_params_final = pd.read_sql("SELECT * FROM items_params_final", conn)

# Check the DataFrame structure



In [3]:
# Total sales per article: one row per (article_id, product_type_name) pair
total_sales_article_id = (
    seasonal_data
    .groupby(['article_id', 'product_type_name'])['total_sales']
    .sum()
    .reset_index()
)

# Rank the articles once, best sellers first. Both names are kept because
# later cells refer to each of them; they point at the same ranking.
total_sales_id = total_sales_article_id.sort_values(by='total_sales', ascending=False)
total_sales_id_sorted = total_sales_id

# Number of articles that make up the top 30% of the ranking (rounded down)
top_30_percent = int(len(total_sales_id) * 0.3)


In [4]:
# Preview the best-selling 30% of articles, ordered by total sales (highest first);
# pandas abbreviates the rendered table.
total_sales_id.head(top_30_percent)

Unnamed: 0,article_id,product_type_name,total_sales
53811,706016001,Trousers,1374.568748
53812,706016002,Trousers,994.142671
15970,568601006,Blazer,809.587383
3086,448509014,Trousers,649.887626
58405,720125001,Leggings/Tights,612.479067
...,...,...,...
59367,723488002,Sunglasses,5.230847
82642,810169020,Trousers,5.230390
82819,810845001,Dress,5.229119
74240,776746005,T-shirt,5.228424


In [5]:
# Parse `sale_date` so the column holds pandas datetimes
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])

# Article IDs of the best sellers: the first `top_30_percent` rows of the sales ranking
top_30_article_ids = total_sales_id['article_id'].head(top_30_percent).values

# Step 1: keep only the rows of `seasonal_data` that belong to one of those articles
is_top_seller = seasonal_data['article_id'].isin(top_30_article_ids)
top_selling_data = seasonal_data.loc[is_top_seller]

In [6]:
# RANDOM FOREST 7 DAYS 5% ITEMS, all 5% are in the supreme Mighty Best Params Set
#
# For every top-5% article that has tuned hyper-parameters in `item_params_final`,
# fit one RandomForestRegressor on the article's sales history (the only feature
# is the running observation index), then write a 7-step forecast and the
# per-date validation results to CSV.
# NOTE(review): this cell is a near-copy of the other Random Forest cells in this
# notebook; consider extracting one shared, parameterised function.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import warnings

warnings.filterwarnings("ignore")  # Suppress non-critical warnings (applies to the whole kernel)

FORECAST_HORIZON = 7     # number of steps forecast per article
HOLDOUT_LENGTH = 56      # trailing observations withheld from training (8 weeks if there is one row per day)
VALIDATION_LENGTH = 28   # trailing observations used as validation actuals (4 weeks if there is one row per day)
MIN_OBSERVATIONS = 60    # articles with fewer rows are skipped

# Ensure `sale_date` is in datetime format and filter for dates up to the end of February 2020
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])
seasonal_data = seasonal_data[seasonal_data['sale_date'] <= '2020-02-29']

# List of top 5% selling article IDs
top_5_percent = int(len(total_sales_id_sorted) * 0.05)
top_sample_article_ids = total_sales_id_sorted.head(top_5_percent)['article_id'].values

# Step 1: Filter `top_sample_article_ids` to include only those present in `item_params_final`.
# A set gives constant-time membership tests; ranking order is preserved.
tuned_article_ids = set(item_params_final['article_id'].tolist())
filtered_sample_ids = [article_id for article_id in top_sample_article_ids if article_id in tuned_article_ids]

# Step 2: Filter `seasonal_data` to include only items in the filtered sample
sampled_data = seasonal_data[seasonal_data['article_id'].isin(filtered_sample_ids)]

# Row positions of every article inside `sampled_data`, computed once instead of
# re-scanning the whole frame with a boolean mask on every loop iteration.
rows_by_article = sampled_data.groupby('article_id').indices

# Initialize containers to store predictions and validation metrics
predictions_7_days = {}
validation_results = []

# Set default parameters for items without specific tuning information (if applicable)
default_params = {
    'max_depth': 10,
    'min_samples_split': 2,
    'n_estimators': 100
}

# Tuned hyper-parameters indexed by article (first row wins if an article occurs more than once)
tuned_params = (
    item_params_final
    .drop_duplicates(subset='article_id', keep='first')
    .set_index('article_id')[list(default_params)]
)

# Loop through each article in the filtered sampled set and apply Random Forest with validation
for article_id in filtered_sample_ids:
    row_positions = rows_by_article.get(article_id)

    # Check for sufficient data points (this also skips articles that have no
    # rows left after the date filter)
    if row_positions is None or len(row_positions) < MIN_OBSERVATIONS:
        continue

    # Data of the current article, indexed and ordered by `sale_date`
    article_data = sampled_data.iloc[row_positions].set_index('sale_date').sort_index()
    article_data['total_sales'] = article_data['total_sales'].fillna(0)

    # Extract the time series for training
    ts_data = article_data['total_sales']

    # Use the article's tuned parameters if they exist, otherwise fall back to the defaults
    if article_id in tuned_params.index:
        tuned = tuned_params.loc[article_id]
        max_depth = int(tuned['max_depth'])
        min_samples_split = int(tuned['min_samples_split'])
        n_estimators = int(tuned['n_estimators'])
    else:
        max_depth = default_params['max_depth']
        min_samples_split = default_params['min_samples_split']
        n_estimators = default_params['n_estimators']

    # Define the RandomForest model with the selected parameters
    rf_model = RandomForestRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
        random_state=42
    )

    # Train on everything except the trailing hold-out; validate on the final observations
    train_data = ts_data.iloc[:-HOLDOUT_LENGTH]
    validation_data = ts_data.iloc[-VALIDATION_LENGTH:]
    rf_validation = validation_data.values      # actual sales in the validation period
    validation_dates = validation_data.index    # dates of the validation period

    # The single feature is the position of each observation in the series
    y_train = train_data.values.reshape(-1, 1)
    rf_model.fit(np.arange(len(y_train)).reshape(-1, 1), y_train.ravel())

    # Forecast the steps that directly follow the training window.
    # NOTE(review): these positions lie inside the hold-out, not after the last
    # observed date - confirm this is the intended forecast origin.
    future_days = np.arange(len(y_train), len(y_train) + FORECAST_HORIZON).reshape(-1, 1)
    forecast_7_days = rf_model.predict(future_days)
    predictions_7_days[article_id] = forecast_7_days

    # Predict at the positions of the final observations so that every
    # prediction is aligned with the actual value and date it is compared to.
    # NOTE(review): with the time index as the only feature, a tree ensemble is
    # expected to return a constant for positions beyond the training range -
    # confirm that a flat forecast is acceptable.
    validation_days = np.arange(len(ts_data) - VALIDATION_LENGTH, len(ts_data)).reshape(-1, 1)
    rf_validation_pred = rf_model.predict(validation_days)

    # Calculate RMSE and MAE on validation set
    rmse = np.sqrt(mean_squared_error(rf_validation, rf_validation_pred))
    mae = mean_absolute_error(rf_validation, rf_validation_pred)

    # Store each validation point with actual and predicted values, along with date
    for date, actual, predicted in zip(validation_dates, rf_validation, rf_validation_pred):
        validation_results.append({
            'article_id': article_id,
            'date': date,
            'total_sales': actual,
            'total_sales_predicted': predicted,
            'rmse': rmse,
            'mae': mae
        })

    print(f"Random Forest validation for article_id {article_id}: RMSE={rmse}, MAE={mae}")

# Convert predictions and validation results to DataFrames.
# Column names are passed explicitly so that an empty result (no article had
# enough data) still yields well-formed, empty tables instead of an error.
forecast_df = pd.DataFrame.from_dict(
    predictions_7_days,
    orient='index',  # each article_id is a row
    columns=[f"day_{i+1}" for i in range(FORECAST_HORIZON)],
)

validation_df = pd.DataFrame(
    validation_results,
    columns=['article_id', 'date', 'total_sales', 'total_sales_predicted', 'rmse', 'mae'],
)

# Save the results to CSV
forecast_df.to_csv("covid_5_rf_7_day_forecasts.csv", index=True)
validation_df.to_csv("covid_5_rf_7_day_validation_results_with_dates.csv", index=False)

print("Random Forest predictions and validation tables with time information created successfully!")


Random Forest validation for article_id 706016002: RMSE=0.549808418080242, MAE=0.3862977507619038
Random Forest validation for article_id 568601006: RMSE=0.38776462276093465, MAE=0.32003002935911057
Random Forest validation for article_id 448509014: RMSE=1.028374212113053, MAE=0.8776318890539858
Random Forest validation for article_id 720125001: RMSE=1.1723946052733083, MAE=1.0341467297999989
Random Forest validation for article_id 399223001: RMSE=0.36394992445390006, MAE=0.31856117152246605
Random Forest validation for article_id 706016003: RMSE=1.246453165835904, MAE=1.1560983521278203
Random Forest validation for article_id 562245046: RMSE=0.31784391253613603, MAE=0.27608948279999945
Random Forest validation for article_id 751471001: RMSE=1.0201915122973442, MAE=0.9483195091285718
Random Forest validation for article_id 661794001: RMSE=1.5301947365688624, MAE=1.5091359147857162
Random Forest validation for article_id 706016015: RMSE=0.2582770120799426, MAE=0.2025072741428571
Random 

In [7]:
# RANDOM FOREST WITH 30 DAYS AND 30% ITEMS
#
# For every top-30% article that has tuned hyper-parameters in `item_params_final`,
# fit one RandomForestRegressor on the article's sales history (the only feature
# is the running observation index), then write a 30-step forecast and the
# per-date validation results to CSV.
# NOTE(review): this cell is a near-copy of the other Random Forest cells in this
# notebook; consider extracting one shared, parameterised function.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import warnings

warnings.filterwarnings("ignore")  # Suppress non-critical warnings (applies to the whole kernel)

FORECAST_HORIZON = 30    # number of steps forecast per article
HOLDOUT_LENGTH = 56      # trailing observations withheld from training (8 weeks if there is one row per day)
VALIDATION_LENGTH = 28   # trailing observations used as validation actuals (4 weeks if there is one row per day)
MIN_OBSERVATIONS = 60    # articles with fewer rows are skipped

# Ensure `sale_date` is in datetime format and filter for dates up to the end of February 2020
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])
seasonal_data = seasonal_data[seasonal_data['sale_date'] <= '2020-02-29']

# List of top 30% selling article IDs
top_30_percent = int(len(total_sales_id_sorted) * 0.3)
top_sample_article_ids = total_sales_id_sorted.head(top_30_percent)['article_id'].values

# Step 1: Filter `top_sample_article_ids` to include only those present in `item_params_final`.
# A set gives constant-time membership tests; ranking order is preserved.
tuned_article_ids = set(item_params_final['article_id'].tolist())
filtered_sample_ids = [article_id for article_id in top_sample_article_ids if article_id in tuned_article_ids]

# Step 2: Filter `seasonal_data` to include only items in the filtered sample
sampled_data = seasonal_data[seasonal_data['article_id'].isin(filtered_sample_ids)]

# Row positions of every article inside `sampled_data`, computed once instead of
# re-scanning the whole frame with a boolean mask on every loop iteration.
rows_by_article = sampled_data.groupby('article_id').indices

# Initialize containers to store predictions and validation metrics
predictions_30_days = {}
validation_results = []

# Set default parameters for items without specific tuning information (if applicable)
default_params = {
    'max_depth': 10,
    'min_samples_split': 2,
    'n_estimators': 100
}

# Tuned hyper-parameters indexed by article (first row wins if an article occurs more than once)
tuned_params = (
    item_params_final
    .drop_duplicates(subset='article_id', keep='first')
    .set_index('article_id')[list(default_params)]
)

# Loop through each article in the filtered sampled set and apply Random Forest with validation
for article_id in filtered_sample_ids:
    row_positions = rows_by_article.get(article_id)

    # Check for sufficient data points (this also skips articles that have no
    # rows left after the date filter)
    if row_positions is None or len(row_positions) < MIN_OBSERVATIONS:
        continue

    # Data of the current article, indexed and ordered by `sale_date`
    article_data = sampled_data.iloc[row_positions].set_index('sale_date').sort_index()
    article_data['total_sales'] = article_data['total_sales'].fillna(0)

    # Extract the time series for training
    ts_data = article_data['total_sales']

    # Use the article's tuned parameters if they exist, otherwise fall back to the defaults
    if article_id in tuned_params.index:
        tuned = tuned_params.loc[article_id]
        max_depth = int(tuned['max_depth'])
        min_samples_split = int(tuned['min_samples_split'])
        n_estimators = int(tuned['n_estimators'])
    else:
        max_depth = default_params['max_depth']
        min_samples_split = default_params['min_samples_split']
        n_estimators = default_params['n_estimators']

    # Define the RandomForest model with the selected parameters
    rf_model = RandomForestRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
        random_state=42
    )

    # Train on everything except the trailing hold-out; validate on the final observations
    train_data = ts_data.iloc[:-HOLDOUT_LENGTH]
    validation_data = ts_data.iloc[-VALIDATION_LENGTH:]
    rf_validation = validation_data.values      # actual sales in the validation period
    validation_dates = validation_data.index    # dates of the validation period

    # The single feature is the position of each observation in the series
    y_train = train_data.values.reshape(-1, 1)
    rf_model.fit(np.arange(len(y_train)).reshape(-1, 1), y_train.ravel())

    # Forecast the steps that directly follow the training window.
    # NOTE(review): these positions lie inside the hold-out, not after the last
    # observed date - confirm this is the intended forecast origin.
    future_days = np.arange(len(y_train), len(y_train) + FORECAST_HORIZON).reshape(-1, 1)
    forecast_30_days = rf_model.predict(future_days)
    predictions_30_days[article_id] = forecast_30_days

    # Predict at the positions of the final observations so that every
    # prediction is aligned with the actual value and date it is compared to.
    # NOTE(review): with the time index as the only feature, a tree ensemble is
    # expected to return a constant for positions beyond the training range -
    # confirm that a flat forecast is acceptable.
    validation_days = np.arange(len(ts_data) - VALIDATION_LENGTH, len(ts_data)).reshape(-1, 1)
    rf_validation_pred = rf_model.predict(validation_days)

    # Calculate RMSE and MAE on validation set
    rmse = np.sqrt(mean_squared_error(rf_validation, rf_validation_pred))
    mae = mean_absolute_error(rf_validation, rf_validation_pred)

    # Store each validation point with actual and predicted values, along with date
    for date, actual, predicted in zip(validation_dates, rf_validation, rf_validation_pred):
        validation_results.append({
            'article_id': article_id,
            'date': date,
            'total_sales': actual,
            'total_sales_predicted': predicted,
            'rmse': rmse,
            'mae': mae
        })

    print(f"Random Forest validation for article_id {article_id}: RMSE={rmse}, MAE={mae}")

# Convert predictions and validation results to DataFrames.
# Column names are passed explicitly so that an empty result (no article had
# enough data) still yields well-formed, empty tables instead of an error.
forecast_df = pd.DataFrame.from_dict(
    predictions_30_days,
    orient='index',  # each article_id is a row
    columns=[f"day_{i+1}" for i in range(FORECAST_HORIZON)],
)

validation_df = pd.DataFrame(
    validation_results,
    columns=['article_id', 'date', 'total_sales', 'total_sales_predicted', 'rmse', 'mae'],
)

# Save the results to CSV
forecast_df.to_csv("covid_30_rf_30_day_forecasts.csv", index=True)
validation_df.to_csv("covid_30_rf_30day_validation_results_with_dates.csv", index=False)

print("Random Forest predictions and validation tables with time information created successfully!")


Random Forest validation for article_id 706016002: RMSE=0.549808418080242, MAE=0.3862977507619038
Random Forest validation for article_id 568601006: RMSE=0.38776462276093465, MAE=0.32003002935911057
Random Forest validation for article_id 448509014: RMSE=1.028374212113053, MAE=0.8776318890539858
Random Forest validation for article_id 720125001: RMSE=1.1723946052733083, MAE=1.0341467297999989
Random Forest validation for article_id 399223001: RMSE=0.36394992445390006, MAE=0.31856117152246605
Random Forest validation for article_id 706016003: RMSE=1.246453165835904, MAE=1.1560983521278203
Random Forest validation for article_id 562245046: RMSE=0.31784391253613603, MAE=0.27608948279999945
Random Forest validation for article_id 751471001: RMSE=1.0201915122973442, MAE=0.9483195091285718
Random Forest validation for article_id 661794001: RMSE=1.5301947365688624, MAE=1.5091359147857162
Random Forest validation for article_id 706016015: RMSE=0.2582770120799426, MAE=0.2025072741428571
Random 

In [8]:
# RANDOM FOREST 7 DAYS 30% ITEMS, supreme Mighty Best Params Set
#
# For every top-30% article that has tuned hyper-parameters in `item_params_final`,
# fit one RandomForestRegressor on the article's sales history (the only feature
# is the running observation index), then write a 7-step forecast and the
# per-date validation results to CSV.
# NOTE(review): this cell is a near-copy of the other Random Forest cells in this
# notebook; consider extracting one shared, parameterised function.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import warnings

warnings.filterwarnings("ignore")  # Suppress non-critical warnings (applies to the whole kernel)

FORECAST_HORIZON = 7     # number of steps forecast per article
HOLDOUT_LENGTH = 56      # trailing observations withheld from training (8 weeks if there is one row per day)
VALIDATION_LENGTH = 28   # trailing observations used as validation actuals (4 weeks if there is one row per day)
MIN_OBSERVATIONS = 60    # articles with fewer rows are skipped

# Ensure `sale_date` is in datetime format and filter for dates up to the end of February 2020
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])
seasonal_data = seasonal_data[seasonal_data['sale_date'] <= '2020-02-29']


# List of top 30% selling article IDs
top_30_percent = int(len(total_sales_id_sorted) * 0.3)
top_sample_article_ids = total_sales_id_sorted.head(top_30_percent)['article_id'].values

# Step 1: Filter `top_sample_article_ids` to include only those present in `item_params_final`.
# A set gives constant-time membership tests; ranking order is preserved.
tuned_article_ids = set(item_params_final['article_id'].tolist())
filtered_sample_ids = [article_id for article_id in top_sample_article_ids if article_id in tuned_article_ids]

# Step 2: Filter `seasonal_data` to include only items in the filtered sample
sampled_data = seasonal_data[seasonal_data['article_id'].isin(filtered_sample_ids)]

# Row positions of every article inside `sampled_data`, computed once instead of
# re-scanning the whole frame with a boolean mask on every loop iteration.
rows_by_article = sampled_data.groupby('article_id').indices

# Initialize containers to store predictions and validation metrics
predictions_7_days = {}
validation_results = []

# Set default parameters for items without specific tuning information (if applicable)
default_params = {
    'max_depth': 10,
    'min_samples_split': 2,
    'n_estimators': 100
}

# Tuned hyper-parameters indexed by article (first row wins if an article occurs more than once)
tuned_params = (
    item_params_final
    .drop_duplicates(subset='article_id', keep='first')
    .set_index('article_id')[list(default_params)]
)

# Loop through each article in the filtered sampled set and apply Random Forest with validation
for article_id in filtered_sample_ids:
    row_positions = rows_by_article.get(article_id)

    # Check for sufficient data points (this also skips articles that have no
    # rows left after the date filter)
    if row_positions is None or len(row_positions) < MIN_OBSERVATIONS:
        continue

    # Data of the current article, indexed and ordered by `sale_date`
    article_data = sampled_data.iloc[row_positions].set_index('sale_date').sort_index()
    article_data['total_sales'] = article_data['total_sales'].fillna(0)

    # Extract the time series for training
    ts_data = article_data['total_sales']

    # Use the article's tuned parameters if they exist, otherwise fall back to the defaults
    if article_id in tuned_params.index:
        tuned = tuned_params.loc[article_id]
        max_depth = int(tuned['max_depth'])
        min_samples_split = int(tuned['min_samples_split'])
        n_estimators = int(tuned['n_estimators'])
    else:
        max_depth = default_params['max_depth']
        min_samples_split = default_params['min_samples_split']
        n_estimators = default_params['n_estimators']

    # Define the RandomForest model with the selected parameters
    rf_model = RandomForestRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
        random_state=42
    )

    # Train on everything except the trailing hold-out; validate on the final observations
    train_data = ts_data.iloc[:-HOLDOUT_LENGTH]
    validation_data = ts_data.iloc[-VALIDATION_LENGTH:]
    rf_validation = validation_data.values      # actual sales in the validation period
    validation_dates = validation_data.index    # dates of the validation period

    # The single feature is the position of each observation in the series
    y_train = train_data.values.reshape(-1, 1)
    rf_model.fit(np.arange(len(y_train)).reshape(-1, 1), y_train.ravel())

    # Forecast the steps that directly follow the training window.
    # NOTE(review): these positions lie inside the hold-out, not after the last
    # observed date - confirm this is the intended forecast origin.
    future_days = np.arange(len(y_train), len(y_train) + FORECAST_HORIZON).reshape(-1, 1)
    forecast_7_days = rf_model.predict(future_days)
    predictions_7_days[article_id] = forecast_7_days

    # Predict at the positions of the final observations so that every
    # prediction is aligned with the actual value and date it is compared to.
    # NOTE(review): with the time index as the only feature, a tree ensemble is
    # expected to return a constant for positions beyond the training range -
    # confirm that a flat forecast is acceptable.
    validation_days = np.arange(len(ts_data) - VALIDATION_LENGTH, len(ts_data)).reshape(-1, 1)
    rf_validation_pred = rf_model.predict(validation_days)

    # Calculate RMSE and MAE on validation set
    rmse = np.sqrt(mean_squared_error(rf_validation, rf_validation_pred))
    mae = mean_absolute_error(rf_validation, rf_validation_pred)

    # Store each validation point with actual and predicted values, along with date
    for date, actual, predicted in zip(validation_dates, rf_validation, rf_validation_pred):
        validation_results.append({
            'article_id': article_id,
            'date': date,
            'total_sales': actual,
            'total_sales_predicted': predicted,
            'rmse': rmse,
            'mae': mae
        })

    print(f"Random Forest validation for article_id {article_id}: RMSE={rmse}, MAE={mae}")

# Convert predictions and validation results to DataFrames.
# Column names are passed explicitly so that an empty result (no article had
# enough data) still yields well-formed, empty tables instead of an error.
forecast_df = pd.DataFrame.from_dict(
    predictions_7_days,
    orient='index',  # each article_id is a row
    columns=[f"day_{i+1}" for i in range(FORECAST_HORIZON)],
)

validation_df = pd.DataFrame(
    validation_results,
    columns=['article_id', 'date', 'total_sales', 'total_sales_predicted', 'rmse', 'mae'],
)

# Save the results to CSV
forecast_df.to_csv("covid_30_rf_7_day_forecasts.csv", index=True)
validation_df.to_csv("covid_30_rf_7_day_validation_results_with_dates.csv", index=False)

print("Random Forest predictions and validation tables with time information created successfully!")


Random Forest validation for article_id 706016002: RMSE=0.549808418080242, MAE=0.3862977507619038
Random Forest validation for article_id 568601006: RMSE=0.38776462276093465, MAE=0.32003002935911057
Random Forest validation for article_id 448509014: RMSE=1.028374212113053, MAE=0.8776318890539858
Random Forest validation for article_id 720125001: RMSE=1.1723946052733083, MAE=1.0341467297999989
Random Forest validation for article_id 399223001: RMSE=0.36394992445390006, MAE=0.31856117152246605
Random Forest validation for article_id 706016003: RMSE=1.246453165835904, MAE=1.1560983521278203
Random Forest validation for article_id 562245046: RMSE=0.31784391253613603, MAE=0.27608948279999945
Random Forest validation for article_id 751471001: RMSE=1.0201915122973442, MAE=0.9483195091285718
Random Forest validation for article_id 661794001: RMSE=1.5301947365688624, MAE=1.5091359147857162
Random Forest validation for article_id 706016015: RMSE=0.2582770120799426, MAE=0.2025072741428571
Random 