In [None]:
import numpy as np
import pandas as pd
from hyperopt import Trials, fmin, hp, space_eval, tpe
from prophet import Prophet
from prophet.diagnostics import cross_validation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Parquet inputs, addressed relative to the notebook's working directory
dataset_1_path = '../Datasets/df_aggregated_month_populated_btp_core.parquet'
dataset_2_path = '../Datasets/df_filtered_active_customers.parquet'

# Read both tables with the pyarrow engine, in the same order as the paths above
df_populated_customers, df_filtered_active_customers = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (dataset_1_path, dataset_2_path)
)

In [None]:
# Custom MAPE function that ignores zero values
def custom_mape(y_true, y_pred):
    """Mean absolute percentage error over observations whose actual value is non-zero.

    Inputs are converted to float NumPy arrays and compared positionally (the
    same convention as the sklearn metrics used next to this function), so
    lists, tuples, arrays and pandas Series are all accepted.

    Parameters
    ----------
    y_true : array-like
        Actual values. Entries equal to zero are excluded, because the
        percentage error is undefined for them.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        MAPE in percent, or NaN when every actual value is zero (or the
        input is empty) and no percentage error can be computed.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to average: return NaN explicitly instead of taking the
        # mean of an empty selection.
        return float('nan')

    relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return float(relative_error.mean() * 100)

In [None]:
# Preprocess the populated-customer data into Prophet's expected column names ('ds', 'y').
# Rename first, then parse: rename() ignores labels that are not present, so re-running
# this cell after the columns are already 'ds'/'y' is a no-op instead of a KeyError on 'DATE'.
df_populated_customers = df_populated_customers.rename(columns={'DATE': 'ds', 'TOTAL_CONSUMPTION_SUM': 'y'})
df_populated_customers['ds'] = pd.to_datetime(df_populated_customers['ds'])

In [None]:
# Restrict the run to the first 1000 distinct customer IDs found in the populated dataset
unique_customer_ids = df_populated_customers['CUSTOMER_ID'].unique()
customer_ids = unique_customer_ids[:1000]

# Hyperparameter tuning function for Prophet
def prophet_objective(params):
    """Hyperopt objective: mean cross-validated RMSE of Prophet over the tuning customers.

    Reads the module-level ``customer_ids`` and ``df_populated_customers``.

    Parameters
    ----------
    params : dict
        One sample from the search space, with keys ``changepoint_prior_scale``,
        ``seasonality_mode``, ``weekly_seasonality`` and ``yearly_seasonality``.

    Returns
    -------
    float
        Average of the per-customer RMSE values (90-day horizon); this is the
        loss that ``fmin`` minimises.

    Raises
    ------
    ValueError
        If no customer has enough history to be cross-validated.
    """
    initial, period, horizon = '540 days', '30 days', '90 days'
    # cross_validation raises ValueError unless the history spans at least
    # initial + horizon; a single short customer would abort the whole search.
    min_history = pd.Timedelta(initial) + pd.Timedelta(horizon)
    forecast_cols = ['yhat', 'yhat_lower', 'yhat_upper']

    rmse_list = []

    for customer_id in customer_ids:
        df_customer = df_populated_customers[df_populated_customers['CUSTOMER_ID'] == customer_id]

        # Measure the span on rows with an observed y, since Prophet fits only on those.
        observed_dates = df_customer.loc[df_customer['y'].notna(), 'ds']
        if len(observed_dates) < 2 or observed_dates.max() - observed_dates.min() < min_history:
            continue

        model = Prophet(
            changepoint_prior_scale=params['changepoint_prior_scale'],
            seasonality_mode=params['seasonality_mode'],
            weekly_seasonality=params['weekly_seasonality'],
            yearly_seasonality=params['yearly_seasonality']
        )
        model.fit(df_customer)

        # Cross-validate to compute metrics
        df_cv = cross_validation(model, initial=initial, period=period, horizon=horizon)
        # Consumption cannot be negative; clip() replaces the deprecated applymap().
        df_cv[forecast_cols] = df_cv[forecast_cols].clip(lower=0)

        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = np.sqrt(mean_squared_error(df_cv['y'], df_cv['yhat']))
        rmse_list.append(rmse)

    if not rmse_list:
        raise ValueError(
            'No customer has enough history for cross-validation '
            f'(needs at least {min_history.days} days).'
        )

    return np.mean(rmse_list)  # Average RMSE across the customers that were evaluated

# Search space sampled by hyperopt: a continuous changepoint prior scale, the
# seasonality mode, and an on/off switch for each built-in seasonality
param_space = {
    'changepoint_prior_scale': hp.uniform('changepoint_prior_scale', 0.001, 0.5),
    'seasonality_mode': hp.choice('seasonality_mode', ['additive', 'multiplicative']),
    **{
        toggle: hp.choice(toggle, [True, False])
        for toggle in ('weekly_seasonality', 'yearly_seasonality')
    },
}

# Run hyperparameter optimization
# NOTE(review): no `rstate` is passed, so the search is not reproducible between runs.
trials = Trials()
best_assignment = fmin(fn=prophet_objective, space=param_space, algo=tpe.suggest, max_evals=50, trials=trials)

# fmin reports hp.choice parameters as indices into their option lists. space_eval
# resolves them against param_space itself, so the decoded values cannot drift out
# of sync with the search-space definition the way hand-copied option lists can.
best_params = space_eval(param_space, best_assignment)

# Prepare a list to store performance metrics
performance_metrics_list = []

# Settings shared by every customer
forecast_cols = ['yhat', 'yhat_lower', 'yhat_upper']
cv_initial = '540 days'
cv_period = '30 days'
horizons_days = [30, 60, 90]

# Fit one model per customer with the tuned parameters and score it at each horizon
for customer_id in customer_ids:
    df_customer = df_populated_customers[df_populated_customers['CUSTOMER_ID'] == customer_id]

    if len(df_customer) < 3:
        continue

    # cross_validation raises ValueError unless the history spans at least
    # initial + horizon, so keep only the horizons this customer can support.
    # The span is measured on rows with an observed y, since Prophet fits only on those.
    observed_dates = df_customer.loc[df_customer['y'].notna(), 'ds']
    if len(observed_dates) < 2:
        continue
    history_span = observed_dates.max() - observed_dates.min()
    usable_horizons = [
        days for days in horizons_days
        if history_span >= pd.Timedelta(cv_initial) + pd.Timedelta(days=days)
    ]
    if not usable_horizons:
        continue

    model = Prophet(
        changepoint_prior_scale=best_params['changepoint_prior_scale'],
        seasonality_mode=best_params['seasonality_mode'],
        weekly_seasonality=best_params['weekly_seasonality'],
        yearly_seasonality=best_params['yearly_seasonality']
    )
    model.fit(df_customer)

    # Forecast the next 3 months; this is not used by the metrics below and is
    # kept so the last customer's forecast can still be inspected after the loop.
    future = model.make_future_dataframe(periods=3, freq='M')
    forecast = model.predict(future)
    # Consumption cannot be negative; clip() replaces the deprecated applymap().
    forecast[forecast_cols] = forecast[forecast_cols].clip(lower=0)

    # Cross-validate to compute metrics for multiple horizons
    for horizon_days in usable_horizons:
        df_cv = cross_validation(model, initial=cv_initial, period=cv_period, horizon=f'{horizon_days} days')
        df_cv[forecast_cols] = df_cv[forecast_cols].clip(lower=0)

        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = np.sqrt(mean_squared_error(df_cv['y'], df_cv['yhat']))
        mae = mean_absolute_error(df_cv['y'], df_cv['yhat'])
        # R² is undefined for a single sample
        r2 = r2_score(df_cv['y'], df_cv['yhat']) if len(df_cv) > 1 else None
        mape = custom_mape(df_cv['y'], df_cv['yhat'])

        # Store metrics for each horizon
        performance_metrics_list.append({
            'Customer_ID': customer_id,
            'Horizon (Days)': horizon_days,
            'RMSE': rmse,
            'MAE': mae,
            'MAPE': mape,
            'R²': r2
        })

# Assemble the per-customer, per-horizon records into one table
performance_metrics_df = pd.DataFrame(performance_metrics_list)

# One 'Average' row per horizon: the mean of every numeric column
average_metrics = (
    performance_metrics_df
    .groupby('Horizon (Days)')
    .mean(numeric_only=True)
    .reset_index()
    .assign(Customer_ID='Average')
)

# Stack the averages beneath the individual rows, then write everything to CSV
performance_metrics_df = pd.concat([performance_metrics_df, average_metrics], ignore_index=True)
performance_metrics_df.to_csv('populated_customer_performance_metrics_horizons.csv', index=False)

print("Performance metrics for each customer and each horizon, including averages, saved to 'populated_customer_performance_metrics_horizons.csv'.")

Active Customer Dataset

In [None]:
# Preprocess the active-customer data into Prophet's expected column names ('ds', 'y').
# Rename first, then parse: rename() ignores labels that are not present, so re-running
# this cell after the columns are already 'ds'/'y' is a no-op instead of a KeyError on 'DATE'.
df_filtered_active_customers = df_filtered_active_customers.rename(columns={'DATE': 'ds', 'TOTAL_CONSUMPTION_SUM': 'y'})
df_filtered_active_customers['ds'] = pd.to_datetime(df_filtered_active_customers['ds'])

In [None]:
# Restrict the run to the first 1000 distinct customer IDs found in the active dataset
unique_customer_ids = df_filtered_active_customers['CUSTOMER_ID'].unique()
customer_ids = unique_customer_ids[:1000]

# Hyperparameter tuning function for Prophet
def prophet_objective(params):
    """Hyperopt objective: mean cross-validated RMSE of Prophet over the tuning customers.

    Reads the module-level ``customer_ids`` and ``df_filtered_active_customers``.

    Parameters
    ----------
    params : dict
        One sample from the search space, with keys ``changepoint_prior_scale``,
        ``seasonality_mode``, ``weekly_seasonality`` and ``yearly_seasonality``.

    Returns
    -------
    float
        Average of the per-customer RMSE values (90-day horizon); this is the
        loss that ``fmin`` minimises.

    Raises
    ------
    ValueError
        If no customer has enough history to be cross-validated.
    """
    initial, period, horizon = '540 days', '30 days', '90 days'
    # cross_validation raises ValueError unless the history spans at least
    # initial + horizon; a single short customer would abort the whole search.
    min_history = pd.Timedelta(initial) + pd.Timedelta(horizon)
    forecast_cols = ['yhat', 'yhat_lower', 'yhat_upper']

    rmse_list = []

    for customer_id in customer_ids:
        df_customer = df_filtered_active_customers[df_filtered_active_customers['CUSTOMER_ID'] == customer_id]

        # Measure the span on rows with an observed y, since Prophet fits only on those.
        observed_dates = df_customer.loc[df_customer['y'].notna(), 'ds']
        if len(observed_dates) < 2 or observed_dates.max() - observed_dates.min() < min_history:
            continue

        model = Prophet(
            changepoint_prior_scale=params['changepoint_prior_scale'],
            seasonality_mode=params['seasonality_mode'],
            weekly_seasonality=params['weekly_seasonality'],
            yearly_seasonality=params['yearly_seasonality']
        )
        model.fit(df_customer)

        # Cross-validate to compute metrics
        df_cv = cross_validation(model, initial=initial, period=period, horizon=horizon)
        # Consumption cannot be negative; clip() replaces the deprecated applymap().
        df_cv[forecast_cols] = df_cv[forecast_cols].clip(lower=0)

        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = np.sqrt(mean_squared_error(df_cv['y'], df_cv['yhat']))
        rmse_list.append(rmse)

    if not rmse_list:
        raise ValueError(
            'No customer has enough history for cross-validation '
            f'(needs at least {min_history.days} days).'
        )

    return np.mean(rmse_list)  # Average RMSE across the customers that were evaluated

# Search space sampled by hyperopt: a continuous changepoint prior scale, the
# seasonality mode, and an on/off switch for each built-in seasonality
param_space = {
    'changepoint_prior_scale': hp.uniform('changepoint_prior_scale', 0.001, 0.5),
    'seasonality_mode': hp.choice('seasonality_mode', ['additive', 'multiplicative']),
    **{
        toggle: hp.choice(toggle, [True, False])
        for toggle in ('weekly_seasonality', 'yearly_seasonality')
    },
}

# Run hyperparameter optimization
# NOTE(review): no `rstate` is passed, so the search is not reproducible between runs.
trials = Trials()
best_assignment = fmin(fn=prophet_objective, space=param_space, algo=tpe.suggest, max_evals=50, trials=trials)

# fmin reports hp.choice parameters as indices into their option lists. space_eval
# resolves them against param_space itself, so the decoded values cannot drift out
# of sync with the search-space definition the way hand-copied option lists can.
best_params = space_eval(param_space, best_assignment)

# Prepare a list to store performance metrics
performance_metrics_list = []

# Settings shared by every customer
forecast_cols = ['yhat', 'yhat_lower', 'yhat_upper']
cv_initial = '540 days'
cv_period = '30 days'
horizons_days = [30, 60, 90]

# Fit one model per customer with the tuned parameters and score it at each horizon
for customer_id in customer_ids:
    df_customer = df_filtered_active_customers[df_filtered_active_customers['CUSTOMER_ID'] == customer_id]

    if len(df_customer) < 3:
        continue

    # cross_validation raises ValueError unless the history spans at least
    # initial + horizon, so keep only the horizons this customer can support.
    # The span is measured on rows with an observed y, since Prophet fits only on those.
    observed_dates = df_customer.loc[df_customer['y'].notna(), 'ds']
    if len(observed_dates) < 2:
        continue
    history_span = observed_dates.max() - observed_dates.min()
    usable_horizons = [
        days for days in horizons_days
        if history_span >= pd.Timedelta(cv_initial) + pd.Timedelta(days=days)
    ]
    if not usable_horizons:
        continue

    model = Prophet(
        changepoint_prior_scale=best_params['changepoint_prior_scale'],
        seasonality_mode=best_params['seasonality_mode'],
        weekly_seasonality=best_params['weekly_seasonality'],
        yearly_seasonality=best_params['yearly_seasonality']
    )
    model.fit(df_customer)

    # Forecast the next 3 months; this is not used by the metrics below and is
    # kept so the last customer's forecast can still be inspected after the loop.
    future = model.make_future_dataframe(periods=3, freq='M')
    forecast = model.predict(future)
    # Consumption cannot be negative; clip() replaces the deprecated applymap().
    forecast[forecast_cols] = forecast[forecast_cols].clip(lower=0)

    # Cross-validate to compute metrics for multiple horizons
    for horizon_days in usable_horizons:
        df_cv = cross_validation(model, initial=cv_initial, period=cv_period, horizon=f'{horizon_days} days')
        df_cv[forecast_cols] = df_cv[forecast_cols].clip(lower=0)

        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = np.sqrt(mean_squared_error(df_cv['y'], df_cv['yhat']))
        mae = mean_absolute_error(df_cv['y'], df_cv['yhat'])
        # R² is undefined for a single sample
        r2 = r2_score(df_cv['y'], df_cv['yhat']) if len(df_cv) > 1 else None
        mape = custom_mape(df_cv['y'], df_cv['yhat'])

        # Store metrics for each horizon
        performance_metrics_list.append({
            'Customer_ID': customer_id,
            'Horizon (Days)': horizon_days,
            'RMSE': rmse,
            'MAE': mae,
            'MAPE': mape,
            'R²': r2
        })

# Assemble the per-customer, per-horizon records into one table
performance_metrics_df = pd.DataFrame(performance_metrics_list)

# One 'Average' row per horizon: the mean of every numeric column
average_metrics = (
    performance_metrics_df
    .groupby('Horizon (Days)')
    .mean(numeric_only=True)
    .reset_index()
    .assign(Customer_ID='Average')
)

# Stack the averages beneath the individual rows, then write everything to CSV
performance_metrics_df = pd.concat([performance_metrics_df, average_metrics], ignore_index=True)
performance_metrics_df.to_csv('active_customer_performance_metrics_horizons.csv', index=False)

print("Performance metrics for each customer and each horizon, including averages, saved to 'active_customer_performance_metrics_horizons.csv'.")