In [0]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

In [0]:
# Relative path of the Parquet file to load
dataset_path = "../Datasets/df_aggregated_month_populated_btp_core.parquet"

# Load the file with the pyarrow engine and print the resulting DataFrame
df_aggregated_month_populated_btp_core = pd.read_parquet(
    path=dataset_path,
    engine="pyarrow",
)
print(df_aggregated_month_populated_btp_core)

In [0]:
# First test: one-step naive baseline over the whole frame
# Order rows by DATE (ties broken by CUSTOMER_ID) so that the per-customer
# shift below looks back at the preceding row in time
df_aggregated_month_populated_btp_core = df_aggregated_month_populated_btp_core.sort_values(
    by=['DATE', 'CUSTOMER_ID']
)

# Naive forecast: every row is predicted by the same customer's previous
# TOTAL_CONSUMPTION_SUM value
previous_consumption = (
    df_aggregated_month_populated_btp_core
    .groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM']
    .shift(1)
)
df_aggregated_month_populated_btp_core['NAIVE_PREDICTION'] = previous_consumption

# Rows without a previous observation have no prediction and cannot be scored
df_naive = df_aggregated_month_populated_btp_core.dropna(subset=['NAIVE_PREDICTION'])

# Actual values and their naive predictions
y_true, y_pred = df_naive['TOTAL_CONSUMPTION_SUM'], df_naive['NAIVE_PREDICTION']

# R², RMSE and MAE are computed on every scored row
r2 = r2_score(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

# MAPE divides by the actual value, so rows whose actual value is zero are left out
non_zero_mask = y_true.ne(0)
y_true_non_zero = y_true[non_zero_mask]
y_pred_non_zero = y_pred[non_zero_mask]
relative_error = (y_true_non_zero - y_pred_non_zero) / y_true_non_zero
mape = np.mean(np.abs(relative_error)) * 100

# Report the metrics
print(f'R²: {r2}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}%')


In [0]:
# Model for each horizon
# Cross-validation splitter configured with 5 splits; it is consumed by the
# evaluation loop further down in this cell.
# NOTE(review): TimeSeriesSplit partitions rows by position, so the folds are
# chronological only if the frame passed to split() is sorted by DATE — confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Metric helper shared by every split and horizon
def calculate_metrics(y_true, y_pred):
    """Score predictions against actual values.

    Parameters
    ----------
    y_true : actual values; must support ``!= 0`` and boolean-mask indexing.
    y_pred : predicted values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae, mape)`` where ``mape`` is expressed in percent and
        is computed only over entries whose actual value is non-zero.
    """
    # Metrics that use every observation
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE divides by the actual value, so zero actuals are masked out first
    keep = y_true != 0
    actual_nz, predicted_nz = y_true[keep], y_pred[keep]
    relative_error = (actual_nz - predicted_nz) / actual_nz
    mape = np.mean(np.abs(relative_error)) * 100

    return r2, rmse, mae, mape

# Forecast horizons (number of rows, i.e. months, to look back) scored by the naive baseline
horizons = range(1, 4)

# One list per metric and horizon; every split appends one value and the
# lists are averaged after the loop
metrics = {
    f'H{h}': {'r2_scores': [], 'rmses': [], 'maes': [], 'mapes': []}
    for h in horizons
}

# Build the lagged predictions ONCE on the whole, chronologically sorted frame.
# Shifting inside each test slice instead would turn the first h rows of every
# customer in every fold into NaN, although the values to carry forward exist
# in the rows preceding the fold; those rows would then be dropped from the
# evaluation. Sorting here also makes the cell independent of the row order of
# the incoming frame (sort_values returns a new frame, the input is untouched).
panel = df_aggregated_month_populated_btp_core.sort_values(by=['DATE', 'CUSTOMER_ID'])
consumption_by_customer = panel.groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM']
panel = panel.assign(
    **{f'NAIVE_PREDICTION_H{h}': consumption_by_customer.shift(h) for h in horizons}
)

# Apply naive forecast using TimeSeriesSplit
for train_index, test_index in tscv.split(panel):
    # The naive baseline needs no fitting; `train` is kept only so that the
    # history of the current fold stays available for inspection
    train = panel.iloc[train_index].copy()
    test = panel.iloc[test_index].copy()

    # Evaluate each horizon on the test fold
    for h in horizons:
        # Only rows with fewer than h earlier observations for their customer
        # have no prediction; they cannot be scored
        test_naive = test.dropna(subset=[f'NAIVE_PREDICTION_H{h}'])

        # Extract true and predicted values for testing
        y_true_test = test_naive['TOTAL_CONSUMPTION_SUM']
        y_pred_test = test_naive[f'NAIVE_PREDICTION_H{h}']

        # Calculate metrics for the current split and horizon
        r2, rmse, mae, mape = calculate_metrics(y_true_test, y_pred_test)

        # Append metrics to the corresponding horizon lists
        metrics[f'H{h}']['r2_scores'].append(r2)
        metrics[f'H{h}']['rmses'].append(rmse)
        metrics[f'H{h}']['maes'].append(mae)
        metrics[f'H{h}']['mapes'].append(mape)

        # Print the metrics for the current split and horizon
        print(f'--- Metrics for Horizon {h} (Time Series Split) ---')
        print(f'R²: {r2}')
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        print(f'MAPE: {mape}%\n')

# Calculate and print average metrics across all splits for each horizon
for h in horizons:
    average_r2 = np.mean(metrics[f'H{h}']['r2_scores'])
    average_rmse = np.mean(metrics[f'H{h}']['rmses'])
    average_mae = np.mean(metrics[f'H{h}']['maes'])
    average_mape = np.mean(metrics[f'H{h}']['mapes'])

    print(f'--- Average Metrics Across All Splits for Horizon {h} ---')
    print(f'Average R²: {average_r2}')
    print(f'Average RMSE: {average_rmse}')
    print(f'Average MAE: {average_mae}')
    print(f'Average MAPE: {average_mape}%\n')


Baseline for customers with an active contract, across all months

In [0]:
# Relative path of the Parquet file to load
dataset_path = "../Datasets/df_filtered_active_customers.parquet"

# Load the file with the pyarrow engine and print the resulting DataFrame
df_filtered_active_customers = pd.read_parquet(
    path=dataset_path,
    engine="pyarrow",
)
print(df_filtered_active_customers)

In [0]:
# First test: one-step naive baseline over the active-customer frame
# Sort by DATE (ties broken by CUSTOMER_ID) so the per-customer shift below
# looks back at the preceding row in time. sort_values returns a NEW frame,
# so the result has to be assigned; calling it without assignment leaves the
# original row order in place.
df_filtered_active_customers = df_filtered_active_customers.sort_values(by=['DATE', 'CUSTOMER_ID'])

# Creating a naive forecast (shifting TOTAL_CONSUMPTION_SUM by 1 to predict next month based on the previous month)
df_filtered_active_customers['NAIVE_PREDICTION'] = df_filtered_active_customers.groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM'].shift(1)

# Dropping the first row of each customer where no previous data exists for prediction
df_naive = df_filtered_active_customers.dropna(subset=['NAIVE_PREDICTION'])

# Extract actual values and naive predictions
y_true = df_naive['TOTAL_CONSUMPTION_SUM']
y_pred = df_naive['NAIVE_PREDICTION']

# Calculate R²
r2 = r2_score(y_true, y_pred)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_true, y_pred)

# MAPE divides by the actual value, so rows whose actual value is zero are excluded
non_zero_mask = y_true != 0
y_true_non_zero = y_true[non_zero_mask]
y_pred_non_zero = y_pred[non_zero_mask]

# Calculate MAPE (in percent) over the non-zero actuals only
mape = np.mean(np.abs((y_true_non_zero - y_pred_non_zero) / y_true_non_zero)) * 100

# Display the metrics
print(f'R²: {r2}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}%')


In [0]:
# Model for each horizon
# Cross-validation splitter configured with 5 splits; it is consumed by the
# evaluation loop further down in this cell.
# NOTE(review): TimeSeriesSplit partitions rows by position, so the folds are
# chronological only if the frame passed to split() is sorted by DATE — confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Metric helper shared by every split and horizon
# NOTE(review): this repeats the calculate_metrics definition from the earlier
# cell of this notebook; the later definition is the one in effect from here on.
def calculate_metrics(y_true, y_pred):
    """Score predictions against actual values.

    Parameters
    ----------
    y_true : actual values; must support ``!= 0`` and boolean-mask indexing.
    y_pred : predicted values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae, mape)`` where ``mape`` is expressed in percent and
        is computed only over entries whose actual value is non-zero.
    """
    # Metrics that use every observation
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE divides by the actual value, so zero actuals are masked out first
    keep = y_true != 0
    actual_nz, predicted_nz = y_true[keep], y_pred[keep]
    relative_error = (actual_nz - predicted_nz) / actual_nz
    mape = np.mean(np.abs(relative_error)) * 100

    return r2, rmse, mae, mape

# Forecast horizons (number of rows, i.e. months, to look back) scored by the naive baseline
horizons = range(1, 4)

# One list per metric and horizon; every split appends one value and the
# lists are averaged after the loop
metrics = {
    f'H{h}': {'r2_scores': [], 'rmses': [], 'maes': [], 'mapes': []}
    for h in horizons
}

# Build the lagged predictions ONCE on the whole, chronologically sorted frame.
# Shifting inside each test slice instead would turn the first h rows of every
# customer in every fold into NaN, although the values to carry forward exist
# in the rows preceding the fold; those rows would then be dropped from the
# evaluation. Sorting here makes the cell independent of the row order of the
# incoming frame, which TimeSeriesSplit and the per-customer shift both rely on
# (sort_values returns a new frame, the input is untouched).
panel = df_filtered_active_customers.sort_values(by=['DATE', 'CUSTOMER_ID'])
consumption_by_customer = panel.groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM']
panel = panel.assign(
    **{f'NAIVE_PREDICTION_H{h}': consumption_by_customer.shift(h) for h in horizons}
)

# Apply naive forecast using TimeSeriesSplit
for train_index, test_index in tscv.split(panel):
    # The naive baseline needs no fitting; `train` is kept only so that the
    # history of the current fold stays available for inspection
    train = panel.iloc[train_index].copy()
    test = panel.iloc[test_index].copy()

    # Evaluate each horizon on the test fold
    for h in horizons:
        # Only rows with fewer than h earlier observations for their customer
        # have no prediction; they cannot be scored
        test_naive = test.dropna(subset=[f'NAIVE_PREDICTION_H{h}'])

        # Extract true and predicted values for testing
        y_true_test = test_naive['TOTAL_CONSUMPTION_SUM']
        y_pred_test = test_naive[f'NAIVE_PREDICTION_H{h}']

        # Calculate metrics for the current split and horizon
        r2, rmse, mae, mape = calculate_metrics(y_true_test, y_pred_test)

        # Append metrics to the corresponding horizon lists
        metrics[f'H{h}']['r2_scores'].append(r2)
        metrics[f'H{h}']['rmses'].append(rmse)
        metrics[f'H{h}']['maes'].append(mae)
        metrics[f'H{h}']['mapes'].append(mape)

        # Print the metrics for the current split and horizon
        print(f'--- Metrics for Horizon {h} (Time Series Split) ---')
        print(f'R²: {r2}')
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        print(f'MAPE: {mape}%\n')

# Calculate and print average metrics across all splits for each horizon
for h in horizons:
    average_r2 = np.mean(metrics[f'H{h}']['r2_scores'])
    average_rmse = np.mean(metrics[f'H{h}']['rmses'])
    average_mae = np.mean(metrics[f'H{h}']['maes'])
    average_mape = np.mean(metrics[f'H{h}']['mapes'])

    print(f'--- Average Metrics Across All Splits for Horizon {h} ---')
    print(f'Average R²: {average_r2}')
    print(f'Average RMSE: {average_rmse}')
    print(f'Average MAE: {average_mae}')
    print(f'Average MAPE: {average_mape}%\n')
