In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np

In [0]:
# Relative location of the aggregated monthly Parquet file inside the Datasets folder
dataset_path = '../Datasets/df_aggregated_month_populated_btp_core.parquet'

# Load the Parquet file through the pyarrow engine, then echo the frame as plain text
df_aggregated_month_populated_btp_core = pd.read_parquet(path=dataset_path, engine='pyarrow')
print(df_aggregated_month_populated_btp_core)

In [None]:
# Custom MAPE function that ignores zero values
def custom_mape(y_true, y_pred):
    """Mean absolute percentage error (in percent) over the non-zero actuals.

    Positions where ``y_true`` is zero are skipped to avoid division by zero.
    Inputs are compared by position, so any array-like works (list, NumPy
    array, pandas Series) regardless of a Series' index labels. Pairs where
    either value is NaN are skipped as well, which matches the NaN-skipping
    mean that a pandas Series input already used.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        MAPE as a percentage, or ``nan`` when no usable pair remains
        (empty input, or every actual is zero / NaN).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Keep only pairs that can produce a finite percentage error
    usable = (actual != 0) & ~np.isnan(actual) & ~np.isnan(predicted)
    if not usable.any():
        # Explicit NaN instead of relying on the mean of an empty selection
        return float('nan')

    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return float(relative_error.mean() * 100)

In [0]:
# Fit one Linear Regression per customer and forecast horizon, collect the error
# metrics, and export them (plus their cross-customer averages) to CSV.
# Work on a copy so the loaded 'df_aggregated_month_populated_btp_core' DataFrame stays untouched.
df_horizon = df_aggregated_month_populated_btp_core.copy()

# Define the feature columns to be used for Linear Regression
all_features = [
    'MONTHLY_CONTRACT_NET_VALUE_SUM',
    'CONTRACT_DURATION_SUM',
    'OVERCONSUMPTION_COUNT',
    'ORDER_COUNT',
    'TOTAL_CONSUMPTION_LAG_1'
]

# Get unique customer IDs
customer_ids = df_horizon['CUSTOMER_ID'].unique()

# Prepare a list to store performance metrics
performance_metrics_list = []

# Define horizons for forecasting
horizons = [1, 2, 3]

# Fixed output schema: every metric column exists even when no customer had enough
# rows for a given horizon, so the averaging step below cannot hit a KeyError.
metric_columns = [
    f'{metric}_horizon_{horizon}'
    for horizon in horizons
    for metric in ('MSE', 'R²', 'MAE', 'MAPE')
]

# Process each customer
for customer_id in customer_ids:
    print(f"Processing Customer ID: {customer_id}")

    # Filter data for a specific customer
    df_customer = df_horizon[df_horizon['CUSTOMER_ID'] == customer_id]

    # Ensure there is enough data for the model (minimum 3 data points per customer)
    if len(df_customer) < 3:
        continue

    # Sort data by 'DATE' to maintain temporal order
    df_customer = df_customer.sort_values(by='DATE')

    # Initialize metrics storage for each horizon
    customer_metrics = {'Customer_ID': customer_id}

    # Process each horizon separately
    for h in horizons:
        # Target = TOTAL_CONSUMPTION_SUM taken h rows later for the same customer.
        # NOTE(review): the shift is by rows, so it equals "h months ahead" only if
        # each customer has one row per consecutive month — TODO confirm.
        target_col = f'TOTAL_CONSUMPTION_{h}'
        # assign() builds a new frame, so df_customer is not mutated between horizons
        df_customer_filtered = (
            df_customer
            .assign(**{target_col: df_customer['TOTAL_CONSUMPTION_SUM'].shift(-h)})
            .dropna(subset=[target_col])  # the last h rows have no future target
        )

        # Ensure enough data remains after dropping NaNs
        if len(df_customer_filtered) < 3:
            continue

        # Define X (input features) and y (target) for the current horizon
        X = df_customer_filtered[all_features]
        y = df_customer_filtered[target_col]

        # Fit the model on the entire dataset for the current customer.
        # NOTE(review): predictions below are made on the same rows used for fitting,
        # so the reported metrics are in-sample, not out-of-sample forecast accuracy.
        model = LinearRegression()
        model.fit(X, y)

        # Make predictions and set negative predictions to 0
        y_pred = model.predict(X)
        y_pred = np.where(y_pred < 0, 0, y_pred)

        # Calculate and store performance metrics for the current horizon
        customer_metrics[f'MSE_horizon_{h}'] = mean_squared_error(y, y_pred)
        customer_metrics[f'R²_horizon_{h}'] = r2_score(y, y_pred)
        customer_metrics[f'MAE_horizon_{h}'] = mean_absolute_error(y, y_pred)
        customer_metrics[f'MAPE_horizon_{h}'] = custom_mape(y, y_pred)

    # Append the customer metrics to the performance list
    performance_metrics_list.append(customer_metrics)

# Create a DataFrame from the performance metrics list (missing metrics become NaN)
performance_metrics_df = pd.DataFrame(
    performance_metrics_list, columns=['Customer_ID'] + metric_columns
)

# Save the metrics to a CSV file
performance_metrics_df.to_csv('linear_regression_customer_performance_metrics.csv', index=False)

# Calculate the averages of the performance metrics **for each horizon**.
# mean() skips NaN, so customers without a given horizon do not affect its average;
# astype(float) keeps this well-defined even when a column holds no values at all.
average_metrics_per_horizon = (
    performance_metrics_df[metric_columns].astype(float).mean().to_dict()
)

# Convert the averages dictionary to a DataFrame and add 'Average' label for Customer_ID
average_metrics_df = pd.DataFrame([average_metrics_per_horizon])
average_metrics_df['Customer_ID'] = 'Average'

# Append the averages to the original performance metrics DataFrame and save it.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
performance_metrics_df = pd.concat(
    [performance_metrics_df, average_metrics_df], ignore_index=True
)
performance_metrics_df.to_csv('linear_regression_customer_performance_metrics_with_averages_per_horizon.csv', index=False)

print("Performance metrics saved to 'linear_regression_customer_performance_metrics_with_averages_per_horizon.csv'.")


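## Active contract dataset

The cells below repeat the per-customer Linear Regression evaluation on the active-customers dataset.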

In [0]:
# Relative location of the active-customers Parquet file inside the Datasets folder
dataset_path = '../Datasets/df_filtered_active_customers.parquet'

# Load the Parquet file through the pyarrow engine, then echo the frame as plain text
df_filtered_active_customers = pd.read_parquet(path=dataset_path, engine='pyarrow')
print(df_filtered_active_customers)

In [0]:
# Custom MAPE function that ignores zero values
def custom_mape(y_true, y_pred):
    """Mean absolute percentage error (in percent) over the non-zero actuals.

    NOTE: a helper with this same name is also defined in an earlier cell of
    this notebook; keep the two in sync or drop one of them.

    Positions where ``y_true`` is zero are skipped to avoid division by zero.
    Inputs are compared by position, so any array-like works (list, NumPy
    array, pandas Series) regardless of a Series' index labels. Pairs where
    either value is NaN are skipped as well, which matches the NaN-skipping
    mean that a pandas Series input already used.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        MAPE as a percentage, or ``nan`` when no usable pair remains
        (empty input, or every actual is zero / NaN).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Keep only pairs that can produce a finite percentage error
    usable = (actual != 0) & ~np.isnan(actual) & ~np.isnan(predicted)
    if not usable.any():
        # Explicit NaN instead of relying on the mean of an empty selection
        return float('nan')

    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return float(relative_error.mean() * 100)

# Fit one Linear Regression per active customer and forecast horizon, collect the
# error metrics, and export them (plus their cross-customer averages) to CSV.
# Work on a copy so the loaded 'df_filtered_active_customers' DataFrame stays untouched.
df_horizon = df_filtered_active_customers.copy()

# Define the feature columns to be used for Linear Regression
all_features = [
    'MONTHLY_CONTRACT_NET_VALUE_SUM',
    'CONTRACT_DURATION_SUM',
    'OVERCONSUMPTION_COUNT',
    'ORDER_COUNT',
    'TOTAL_CONSUMPTION_LAG_1'
]

# Get unique customer IDs
customer_ids = df_horizon['CUSTOMER_ID'].unique()

# Prepare a list to store performance metrics
performance_metrics_list = []

# Define horizons for forecasting
horizons = [1, 2, 3]

# Fixed output schema: every metric column exists even when no customer had enough
# rows for a given horizon, so the averaging step below cannot hit a KeyError.
metric_columns = [
    f'{metric}_horizon_{horizon}'
    for horizon in horizons
    for metric in ('MSE', 'R²', 'MAE', 'MAPE')
]

# Process each customer
for customer_id in customer_ids:
    print(f"Processing Customer ID: {customer_id}")

    # Filter data for a specific customer
    df_customer = df_horizon[df_horizon['CUSTOMER_ID'] == customer_id]

    # Ensure there is enough data for the model (minimum 3 data points per customer)
    if len(df_customer) < 3:
        continue

    # Sort data by 'DATE' to maintain temporal order
    df_customer = df_customer.sort_values(by='DATE')

    # Initialize metrics storage for each horizon
    customer_metrics = {'Customer_ID': customer_id}

    # Process each horizon separately
    for h in horizons:
        # Target = TOTAL_CONSUMPTION_SUM taken h rows later for the same customer.
        # NOTE(review): the shift is by rows, so it equals "h months ahead" only if
        # each customer has one row per consecutive month — TODO confirm.
        target_col = f'TOTAL_CONSUMPTION_{h}'
        # assign() builds a new frame, so df_customer is not mutated between horizons
        df_customer_filtered = (
            df_customer
            .assign(**{target_col: df_customer['TOTAL_CONSUMPTION_SUM'].shift(-h)})
            .dropna(subset=[target_col])  # the last h rows have no future target
        )

        # Ensure enough data remains after dropping NaNs
        if len(df_customer_filtered) < 3:
            continue

        # Define X (input features) and y (target) for the current horizon
        X = df_customer_filtered[all_features]
        y = df_customer_filtered[target_col]

        # Fit the model on the entire dataset for the current customer.
        # NOTE(review): predictions below are made on the same rows used for fitting,
        # so the reported metrics are in-sample, not out-of-sample forecast accuracy.
        model = LinearRegression()
        model.fit(X, y)

        # Make predictions and set negative predictions to 0
        y_pred = model.predict(X)
        y_pred = np.where(y_pred < 0, 0, y_pred)

        # Calculate and store performance metrics for the current horizon
        customer_metrics[f'MSE_horizon_{h}'] = mean_squared_error(y, y_pred)
        customer_metrics[f'R²_horizon_{h}'] = r2_score(y, y_pred)
        customer_metrics[f'MAE_horizon_{h}'] = mean_absolute_error(y, y_pred)
        customer_metrics[f'MAPE_horizon_{h}'] = custom_mape(y, y_pred)

    # Append the customer metrics to the performance list
    performance_metrics_list.append(customer_metrics)

# Create a DataFrame from the performance metrics list (missing metrics become NaN)
performance_metrics_df = pd.DataFrame(
    performance_metrics_list, columns=['Customer_ID'] + metric_columns
)

# Save the metrics to a CSV file.
# The file name carries the 'active_' infix so this export no longer overwrites
# 'linear_regression_customer_performance_metrics.csv' written by the earlier cell.
performance_metrics_df.to_csv('linear_regression_active_customer_performance_metrics.csv', index=False)

# Calculate the averages of the performance metrics **for each horizon**.
# mean() skips NaN, so customers without a given horizon do not affect its average;
# astype(float) keeps this well-defined even when a column holds no values at all.
average_metrics_per_horizon = (
    performance_metrics_df[metric_columns].astype(float).mean().to_dict()
)

# Convert the averages dictionary to a DataFrame and add 'Average' label for Customer_ID
average_metrics_df = pd.DataFrame([average_metrics_per_horizon])
average_metrics_df['Customer_ID'] = 'Average'

# Append the averages to the original performance metrics DataFrame and save it.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
performance_metrics_df = pd.concat(
    [performance_metrics_df, average_metrics_df], ignore_index=True
)
performance_metrics_df.to_csv('linear_regression_active_customer_performance_metrics_with_averages_per_horizon.csv', index=False)

print("Performance metrics saved to 'linear_regression_active_customer_performance_metrics_with_averages_per_horizon.csv'.")
