In [1]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from dotenv import dotenv_values

from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

# Load environment variables
config = dotenv_values()

# Define variables for the login
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

# Set up the PostgreSQL connection URL
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'

# Create the database engine
engine = create_engine(url, echo=False)
my_schema = 'capstone_barstov_industries'

# Load data directly into a DataFrame
with engine.connect() as conn:
    conn.execute(text(f'SET search_path TO {my_schema};'))
    data= pd.read_sql("SELECT * FROM black_trousers_forecast", conn)
    #item_params_final = pd.read_sql("SELECT * FROM items_params_final", conn)


# Check the DataFrame structure
#model_data.info()


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import gc

# Data Preparation
print("Starting data preparation...")
data['week'] = pd.to_datetime(data['week']).dt.tz_localize(None)
data = data.sort_values(by=['product_type_no', 'week'])


# Feature Engineering: Lagged Features with Improved Imputation
print("Creating lagged features with mean imputation...")
data['lag_units_sold_1week'] = data.groupby(['article_id', 'product_type_no'])['units_sold'].shift(1)
data['lag_units_sold_2weeks'] = data.groupby(['article_id', 'product_type_no'])['units_sold'].shift(2)
# Fill missing lag values with the mean units sold for each article, or other appropriate methods
data['lag_units_sold_1week'].fillna(data.groupby('article_id')['units_sold'].transform('mean'), inplace=True)
data['lag_units_sold_2weeks'].fillna(data.groupby('article_id')['units_sold'].transform('mean'), inplace=True)
data['month'] = data['week'].dt.month

# Define Train, Validation, and Test Splits
print("Defining train, validation, and test sets...")
train_data = data[data['week'] < "2020-01-01"]
validation_data = data[(data['week'] >= "2020-01-01") & (data['week'] < "2020-03-01")]  # Original validation period
test_data = data[data['week'] == data['week'].max()]

# Feature columns
feature_columns = ['product_type_no', 'average_price', 'lag_units_sold_1week', 'lag_units_sold_2weeks', 'month']

# Initialize and Tune Base Models
print("Initializing models with potential tuning...")
model_rf = RandomForestRegressor(n_estimators=150, max_depth=25, random_state=42, n_jobs=2)
model_lr = LinearRegression()
model_mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Fit Base Models
print("Fitting base models...")
X_train, y_train = train_data[feature_columns], train_data['units_sold']
model_rf.fit(X_train, y_train)
model_lr.fit(X_train, y_train)
model_mlp.fit(X_train, y_train)

# Store predictions
predictions_summary = []

# Loop through each unique article_id
unique_articles = data['article_id'].unique()
for idx, article_id in enumerate(unique_articles):
    print(f"Processing article {article_id} ({idx + 1}/{len(unique_articles)})")
    
    # Filter data for the current article
    article_data = data[data['article_id'] == article_id]
    article_validation_data = validation_data[validation_data['article_id'] == article_id]
    if len(article_validation_data) == 0:
        continue  # Skip if there's no validation data

    # Prepare feature matrix for validation
    X_article = article_validation_data[feature_columns]
    y_true = article_validation_data['units_sold'].values

    # Generate predictions from each base model
    preds_rf = model_rf.predict(X_article)
    preds_lr = model_lr.predict(X_article)
    preds_mlp = model_mlp.predict(X_article)

    # Create a DataFrame for meta-model
    meta_features = pd.DataFrame({
        'RandomForest': preds_rf,
        'LinearRegression': preds_lr,
        'MLPRegressor': preds_mlp
    })

    # Fit and Predict with Meta-model
    final_model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
    final_model.fit(meta_features, y_true)
    y_pred = final_model.predict(meta_features)

    # Calculate MAE and RMSE for validation
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Save results
    predictions_summary.append({
        'article_id': article_id,
        'sold_units': y_true.sum(),
        'predicted_sold_units': y_pred.sum(),
        'rmse': rmse,
        'mae': mae
    })
    print(f"Validation RMSE for article {article_id}: {rmse}, MAE: {mae}")

    # Clean up memory
    gc.collect()

# Convert to DataFrame
predictions_df = pd.DataFrame(predictions_summary)

# Calculate overall RMSE and MAE
overall_rmse = np.sqrt(mean_squared_error(predictions_df['sold_units'], predictions_df['predicted_sold_units']))
overall_mae = mean_absolute_error(predictions_df['sold_units'], predictions_df['predicted_sold_units'])

print("\nOverall Validation Metrics:")
print(f"Overall RMSE: {overall_rmse}")
print(f"Overall MAE: {overall_mae}")

# Export Results to CSV
predictions_df.to_csv("article_predictions_summary.csv", index=False)
print("Process completed and results saved.")


Starting data preparation...
Creating lagged features with mean imputation...
Defining train, validation, and test sets...
Initializing models with potential tuning...
Fitting base models...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['lag_units_sold_1week'].fillna(data.groupby('article_id')['units_sold'].transform('mean'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['lag_units_sold_2weeks'].fillna(data.groupby('article_id')['units_sold'].transform('mean'), inplace=True)


Processing article 118458028 (1/2697)
Validation RMSE for article 118458028: 0.0, MAE: 0.0
Processing article 118458038 (2/2697)
Processing article 150959011 (3/2697)
Validation RMSE for article 150959011: 0.33343331833783174, MAE: 0.15424242424242426
Processing article 156610001 (4/2697)
Processing article 186372011 (5/2697)
Validation RMSE for article 186372011: 0.24819347291981714, MAE: 0.14
Processing article 212766043 (6/2697)
Validation RMSE for article 212766043: 0.3405973277677611, MAE: 0.19219047619047622
Processing article 219075021 (7/2697)
Validation RMSE for article 219075021: 0.0, MAE: 0.0
Processing article 241486015 (8/2697)
Processing article 252229001 (9/2697)
Processing article 252298002 (10/2697)
Validation RMSE for article 252298002: 0.28180425633565026, MAE: 0.22041666666666673
Processing article 254940026 (11/2697)
Processing article 256151014 (12/2697)
Validation RMSE for article 256151014: 0.5341114115987412, MAE: 0.36124999999999996
Processing article 26227701

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import gc

# Sample Data Load
print("Starting data preparation...")
data['week'] = pd.to_datetime(data['week']).dt.tz_localize(None)
data = data.sort_values(by=['product_type_no', 'week'])

# Feature Engineering: Lagged Features and Seasonal Features
print("Creating lagged features...")
data['lag_units_sold_1week'] = data.groupby(['article_id', 'product_type_no'])['units_sold'].shift(1)
data['lag_units_sold_2weeks'] = data.groupby(['article_id', 'product_type_no'])['units_sold'].shift(2)
data = data.dropna(subset=['lag_units_sold_1week', 'lag_units_sold_2weeks']).reset_index(drop=True)
data['month'] = data['week'].dt.month

# Define Train, Validation, and Test Splits
print("Defining train, validation, and test sets...")
train_data = data[data['week'] < "2020-01-01"]
validation_data = data[(data['week'] >= "2020-01-01") & (data['week'] < "2020-03-01")]
test_data = data[data['week'] == data['week'].max()]

# Feature columns
feature_columns = ['product_type_no', 'average_price', 'lag_units_sold_1week', 'lag_units_sold_2weeks', 'month']

# Initialize Base Models
print("Initializing models...")
model_rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=2)
model_lr = LinearRegression()
model_mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=300, random_state=42)

# Fit Base Models
print("Fitting base models...")
X_train, y_train = train_data[feature_columns], train_data['units_sold']
model_rf.fit(X_train, y_train)
model_lr.fit(X_train, y_train)
model_mlp.fit(X_train, y_train)

# Store predictions
predictions_summary = []

# Loop through each unique article_id
unique_articles = data['article_id'].unique()
for idx, article_id in enumerate(unique_articles):
    print(f"Processing article {article_id} ({idx + 1}/{len(unique_articles)})")
    
    # Filter data for the current article
    article_data = data[data['article_id'] == article_id]
    article_validation_data = validation_data[validation_data['article_id'] == article_id]
    if len(article_validation_data) == 0:
        continue  # Skip if there's no validation data

    # Prepare feature matrix for validation
    X_article = article_validation_data[feature_columns]
    y_true = article_validation_data['units_sold'].values

    # Generate predictions from each base model
    preds_rf = model_rf.predict(X_article)
    preds_lr = model_lr.predict(X_article)
    preds_mlp = model_mlp.predict(X_article)

    # Create a DataFrame for meta-model
    meta_features = pd.DataFrame({
        'RandomForest': preds_rf,
        'LinearRegression': preds_lr,
        'MLPRegressor': preds_mlp
    })

    # Fit and Predict with Meta-model
    final_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
    final_model.fit(meta_features, y_true)
    y_pred = final_model.predict(meta_features)

    # Calculate MAE and RMSE for validation
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Save results
    predictions_summary.append({
        'article_id': article_id,
        'sold_units': y_true.sum(),
        'predicted_sold_units': y_pred.sum(),
        'rmse': rmse,
        'mae': mae
    })
    print(f"Validation RMSE for article {article_id}: {rmse}, MAE: {mae}")

    # Clean up memory
    gc.collect()

# Convert to DataFrame
predictions_df = pd.DataFrame(predictions_summary)

# Calculate overall RMSE and MAE
overall_rmse = np.sqrt(mean_squared_error(predictions_df['sold_units'], predictions_df['predicted_sold_units']))
overall_mae = mean_absolute_error(predictions_df['sold_units'], predictions_df['predicted_sold_units'])

print("\nOverall Validation Metrics:")
print(f"Overall RMSE: {overall_rmse}")
print(f"Overall MAE: {overall_mae}")

# Export Results to CSV
predictions_df.to_csv("article_predictions_summary.csv", index=False)
print("Process completed and results saved.")


In [7]:
print(f'Predicted:{predictions_df['predicted_sold_units'].sum()}, Units_Sold:{predictions_df['sold_units'].sum()}')

Predicted:92958.07914788835, Units_Sold:90503


In [6]:
#OLD

import pandas as pd
import numpy as np

# Assume the base models (RandomForestRegressor, LinearRegression, MLPRegressor)
# are already trained and available as `model_rf`, `model_lr`, and `model_mlp`

# Feature columns (same as in Part One)
feature_columns = ['product_type_no', 'average_price', 'lag_units_sold_1week', 'lag_units_sold_2weeks', 'month']

# Define thresholds for rebalancing
REBALANCING_MIN = 10  # Minimum demand threshold
REBALANCING_MAX = 500  # Example maximum threshold for excess stock

# Create forecast summary
forecast_results = []

# Loop through each unique article_id
unique_articles = data['article_id'].unique()
for idx, article_id in enumerate(unique_articles):
    print(f"Forecasting for article {article_id} ({idx + 1}/{len(unique_articles)})")
    
    # Get the latest data for the article
    article_data = data[data['article_id'] == article_id]
    if article_data.empty:
        continue  # Skip if no data for the article
    
    # Initialize forecast with the most recent available week
    latest_week = article_data['week'].max()
    product_type_no = article_data['product_type_no'].iloc[0]
    article_forecast = []

    for week_ahead in range(1, 5):  # Forecast for the next 4 weeks
        # Prepare lagged features for the next week's prediction
        lag_1 = article_data.loc[article_data['week'] == latest_week, 'units_sold'].values
        lag_2 = article_data.loc[article_data['week'] == latest_week - pd.Timedelta(weeks=1), 'units_sold'].values

        if len(lag_1) == 0 or len(lag_2) == 0:
            break  # Stop forecasting if lag features are unavailable

        next_week = latest_week + pd.Timedelta(weeks=1)
        next_row = {
            'product_type_no': product_type_no,
            'average_price': article_data['average_price'].iloc[0],
            'lag_units_sold_1week': lag_1[0],
            'lag_units_sold_2weeks': lag_2[0],
            'month': next_week.month
        }
        X_next = pd.DataFrame([next_row])

        # Predictions from base models
        pred_rf = model_rf.predict(X_next)[0]
        pred_lr = model_lr.predict(X_next)[0]
        pred_mlp = model_mlp.predict(X_next)[0]

        # Combine predictions into meta-model input
        final_prediction = (pred_rf + pred_lr + pred_mlp) / 3  # Simple averaging for final prediction

        # Determine rebalancing flag
        if final_prediction < REBALANCING_MIN:
            rebalancing_flag = "Understock"
        elif final_prediction > REBALANCING_MAX:
            rebalancing_flag = "Overstock"
        else:
            rebalancing_flag = "Balanced"

        # Save forecast for the current week
        article_forecast.append({
            'article_id': article_id,
            'product_type_no': product_type_no,
            'predicted_units_sold': final_prediction,
            'week_ahead': week_ahead,
            'rebalancing_flag': rebalancing_flag
        })

        # Append the prediction to the article's data for further forecasts
        new_data_row = pd.DataFrame({
            'week': [next_week],
            'units_sold': [final_prediction],
            'product_type_no': [product_type_no],
            'average_price': [next_row['average_price']]
        })
        article_data = pd.concat([article_data, new_data_row], ignore_index=True)
        latest_week = next_week  # Move to the next forecast week

    # Add article forecast to results
    forecast_results.extend(article_forecast)

# Convert forecasts to DataFrame
forecast_throusers = pd.DataFrame(forecast_results)

# Save results to a CSV file
forecast_throusers.to_csv("AAAHHH_four_week_forecast_results_with_rebalancing.csv", index=False)

print("Four-week forecast completed and results saved.")

forecast_throusers.info()

#OLD

Forecasting for article 118458028 (1/2697)
Forecasting for article 118458038 (2/2697)
Forecasting for article 150959011 (3/2697)
Forecasting for article 156610001 (4/2697)
Forecasting for article 186372011 (5/2697)
Forecasting for article 212766043 (6/2697)
Forecasting for article 219075021 (7/2697)
Forecasting for article 241486015 (8/2697)
Forecasting for article 252229001 (9/2697)
Forecasting for article 252298002 (10/2697)
Forecasting for article 254940026 (11/2697)
Forecasting for article 256151014 (12/2697)
Forecasting for article 262277011 (13/2697)
Forecasting for article 291957009 (14/2697)
Forecasting for article 294076007 (15/2697)
Forecasting for article 309864002 (16/2697)
Forecasting for article 318914001 (17/2697)
Forecasting for article 326885010 (18/2697)
Forecasting for article 333159002 (19/2697)
Forecasting for article 350082029 (20/2697)
Forecasting for article 352811001 (21/2697)
Forecasting for article 356174004 (22/2697)
Forecasting for article 375585027 (23/269

In [None]:
with engine.connect() as conn:
    conn.execute(text(f'SET search_path TO {my_schema};'))
    predictions_df.to_sql('black_throusers_validations', con=engine, schema=my_schema, if_exists='replace', index=False)
    forecast_throusers.to_sql('black_throusers_future_predictions', con=engine, schema=my_schema, if_exists='replace', index=False)

print("Table successfully pushed to the database!")
