In [0]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import STATUS_OK, SparkTrials, fmin, hp, tpe
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_validate

In [0]:
# Parquet inputs, given relative to the notebook's working directory.
dataset_1_path = '../Datasets/df_aggregated_month_populated_btp_core.parquet'
dataset_2_path = '../Datasets/df_filtered_active_customers.parquet'

# Read both tables with the pyarrow engine, in the order the paths are listed.
df_aggregated, df_filtered = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (dataset_1_path, dataset_2_path)
)


In [0]:
def custom_mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, ignoring zero targets.

    Both inputs are converted to flat float arrays and compared position by
    position, so lists, NumPy arrays and pandas Series (whatever their index)
    are all accepted.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to 0 are skipped because the
        percentage error is undefined for them.
    y_pred : array-like
        Predicted values, one per entry of ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the non-zero
        targets, or ``nan`` when every target is zero (or the input is empty).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of values.
    """
    # Positional arrays: avoids label alignment between two Series and makes
    # the boolean mask below work for plain Python sequences as well.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )

    non_zero = actual != 0
    if not non_zero.any():
        # Nothing to average; return NaN explicitly instead of relying on
        # np.mean of an empty slice (which warns).
        return float('nan')

    relative_error = np.abs((actual[non_zero] - predicted[non_zero]) / actual[non_zero])
    return float(np.mean(relative_error) * 100)

In [0]:
# Objective function for hyperparameter tuning
def objective(params, X_train, y_train):
    """Score one hyperparameter candidate for hyperopt.

    An ``XGBRegressor`` configured with ``params`` is cross-validated on
    ``(X_train, y_train)`` using a 5-split ``TimeSeriesSplit``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    X_train, y_train
        Features and target passed straight to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where the loss is the square
        root of the mean fold MSE (not the mean of the per-fold RMSEs).
    """
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        enable_categorical=True,
        random_state=42,
        **params,
    )
    # scikit-learn reports MSE negated, so the sign is flipped before the root.
    neg_mse_per_fold = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=TimeSeriesSplit(n_splits=5),
        scoring='neg_mean_squared_error',
    )
    loss = np.sqrt(-neg_mse_per_fold.mean())
    return {'loss': loss, 'status': STATUS_OK}

In [0]:
# Map indices to actual hyperparameter values after optimization
def map_index_to_value(best_params):
    """Translate ``hp.choice`` indices into the hyperparameter values they select.

    The option lists below are indexed by the integers found in
    ``best_params``; they must list the same options, in the same order, as
    the ``hp.choice`` entries of the search space the indices came from.

    Parameters
    ----------
    best_params : dict
        Maps each hyperparameter name to the index of the chosen option.
        It is not modified.

    Returns
    -------
    dict
        A new dict with every known hyperparameter replaced by its value.
        Keys that are not in the lookup table are copied through unchanged.

    Raises
    ------
    KeyError
        If one of the known hyperparameters is missing from ``best_params``.
    IndexError
        If an index is outside the option list.
    """
    options_by_name = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.001, 0.05, 0.1],
        'max_depth': [3, 5, 6],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.5, 1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.5, 1.0],
        'reg_lambda': [1.0, 10.0, 20.0],
    }
    # Work on a copy: mutating the caller's dict made a second call on the
    # same dict index the lists with already-mapped values.
    mapped = dict(best_params)
    for name, options in options_by_name.items():
        mapped[name] = options[best_params[name]]
    return mapped

In [None]:
# Define a function to evaluate the model
def evaluate_model(X_train, y_train, best_params):
    """Fit an XGBoost regressor and report cross-validated error metrics.

    Parameters
    ----------
    X_train, y_train
        Features and target used both for the final fit and for the
        5-split ``TimeSeriesSplit`` cross-validation.
    best_params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns
    -------
    dict
        ``'model'``: the regressor fitted on all of ``X_train``/``y_train``.
        ``'RMSE'``: square root of the mean fold MSE.
        ``'R²'``: mean fold R².
        ``'MAE'``: mean fold MAE, as a positive number.
        ``'MAPE'``: ``custom_mape`` of the fitted model on the training data
        itself (in-sample, unlike the three metrics above).
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', enable_categorical=True, random_state=42, **best_params)

    # Train the model on the full data; cross_validate works on clones, so
    # this fitted instance is what gets returned and used for MAPE.
    model.fit(X_train, y_train)

    # One cross-validation pass for all three scorers instead of three passes.
    tscv = TimeSeriesSplit(n_splits=5)
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=tscv,
        scoring=['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error'],
    )

    # scikit-learn negates error scorers; flip the sign back for reporting.
    rmse = np.sqrt(-cv_results['test_neg_mean_squared_error'].mean())
    r2 = cv_results['test_r2'].mean()
    mae = -cv_results['test_neg_mean_absolute_error'].mean()

    # NOTE(review): this MAPE is measured on the data the model was trained
    # on, so it is not comparable with the out-of-fold metrics above.
    mape = custom_mape(y_train, model.predict(X_train))

    return {
        'model': model,
        'RMSE': rmse,
        'R²': r2,
        'MAE': mae,
        'MAPE': mape
    }


In [None]:
# Feature and target variable setup
# Columns that are cast to pandas 'category' dtype below.
categorical_columns = ['ISS_TEXT', 'GLOBAL_REGION', 'COUNTRY', 'SAP_MASTER_CODE']

# Remaining predictors, kept in their original order.
non_categorical_features = [
    'MONTHLY_CONTRACT_NET_VALUE_SUM', 'LICENSE_COUNT_SUM', 'LATEST_CONTRACT_MIN',
    'CONTRACT_DURATION_SUM', 'CONTRACT_DURATION_MEAN', 'OVERCONSUMPTION_COUNT',
    'ORDER_COUNT', 'BUNDLE_INDICATOR', 'INTEGRATION_SUITE', 'CLOUD_INTEGRATION',
    'ACTIVE_CONTRACT',
    'TOTAL_CONSUMPTION_LAG_1', 'TOTAL_CONSUMPTION_LAG_2', 'TOTAL_CONSUMPTION_LAG_3',
    'TOTAL_CONSUMPTION_ROLLING_3', 'TOTAL_CONSUMPTION_ROLLING_6',
    'MONTH', 'MONTH_SIN', 'MONTH_COS', 'TREND',
    'TOTAL_CONSUMPTION_DIFF_1', 'TOTAL_CONSUMPTION_CUMSUM', 'TOTAL_CONSUMPTION_EMA_3',
    'YEAR',
]

# Full feature list: the categorical columns come last, as before.
features = non_categorical_features + categorical_columns

# Work on a copy so the dtype change does not touch df_aggregated.
X_base = df_aggregated[features].copy()
X_base[categorical_columns] = X_base[categorical_columns].astype('category')

In [0]:
# Forecasting and hyperparameter tuning loop
horizons = [1, 2, 3]
results_overall = {}

# Hyperparameter search space. It does not depend on the horizon, so it is
# built once. The option lists must stay in the same order as the ones in
# map_index_to_value, which turns the indices returned by fmin into values.
space = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
    'learning_rate': hp.choice('learning_rate', [0.001, 0.05, 0.1]),
    'max_depth': hp.choice('max_depth', [3, 5, 6]),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
    'gamma': hp.choice('gamma', [0, 0.5, 1]),
    'subsample': hp.choice('subsample', [0.6, 0.8, 1.0]),
    'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 1.0]),
    'reg_alpha': hp.choice('reg_alpha', [0, 0.5, 1.0]),
    'reg_lambda': hp.choice('reg_lambda', [1.0, 10.0, 20.0])
}

for h in horizons:
    print(f"\nProcessing horizon: {h} month(s) ahead")

    # Align X and y for each horizon: the target of a row is the consumption
    # h rows later. Rows without a target are dropped from X and y with the
    # same mask, so the two stay aligned even if the target contains NaNs.
    # NOTE(review): shift(-h) moves by rows over the whole frame. If the frame
    # stacks several customers, the last h rows of one customer receive the
    # next customer's values - confirm the row ordering or shift per customer.
    y_shifted = df_aggregated['TOTAL_CONSUMPTION_SUM'].shift(-h)
    has_target = y_shifted.notna().to_numpy()
    y = y_shifted.loc[has_target]
    X = X_base.loc[has_target].copy()

    # Hyperparameter tuning with fmin. A fresh SparkTrials per horizon is
    # required: a trials object that already holds max_evals results would
    # make fmin return without evaluating anything for the next horizon.
    print(f"\nRunning hyperparameter tuning for horizon {h} month(s)")
    spark_trials = SparkTrials()
    best_params = fmin(
        # Bind this iteration's data explicitly instead of closing over the loop variables.
        fn=lambda params, X=X, y=y: objective(params, X, y),
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        trials=spark_trials,
    )
    best_params_mapped = map_index_to_value(best_params)

    # Evaluate model with best parameters
    metrics = evaluate_model(X, y, best_params_mapped)
    results_overall[h] = metrics  # Store results for each horizon

# Output results
for h, metrics in results_overall.items():
    print(f"\nResults for horizon {h} month(s):")
    print(f"Mean RMSE: {metrics['RMSE']}")
    print(f"Mean R²: {metrics['R²']}")
    print(f"Mean MAE: {metrics['MAE']}")
    print(f"Mean MAPE: {metrics['MAPE']}%")

XGBoost for the active-contract dataset

In [0]:
# Feature and target variable setup
features = [
    'MONTHLY_CONTRACT_NET_VALUE_SUM', 'LICENSE_COUNT_SUM', 'LATEST_CONTRACT_MIN', 
    'CONTRACT_DURATION_SUM', 'CONTRACT_DURATION_MEAN', 'OVERCONSUMPTION_COUNT', 
    'ORDER_COUNT', 'BUNDLE_INDICATOR', 'INTEGRATION_SUITE', 'CLOUD_INTEGRATION',
    'TOTAL_CONSUMPTION_LAG_1', 'TOTAL_CONSUMPTION_LAG_2', 
    'TOTAL_CONSUMPTION_LAG_3', 'TOTAL_CONSUMPTION_ROLLING_3', 'TOTAL_CONSUMPTION_ROLLING_6', 
    'MONTH', 'MONTH_SIN', 'MONTH_COS', 'TREND', 'TOTAL_CONSUMPTION_DIFF_1', 
    'TOTAL_CONSUMPTION_CUMSUM', 'TOTAL_CONSUMPTION_EMA_3', 'YEAR', 
    'ISS_TEXT', 'GLOBAL_REGION', 'COUNTRY', 'SAP_MASTER_CODE'
]
# Work on a copy so the dtype change does not touch df_filtered.
X_base = df_filtered[features].copy()
categorical_columns = ['ISS_TEXT', 'GLOBAL_REGION', 'COUNTRY', 'SAP_MASTER_CODE']
X_base[categorical_columns] = X_base[categorical_columns].astype('category')

# Forecasting and hyperparameter tuning loop
horizons = [1, 2, 3]
results_overall = {}

# Hyperparameter search space. It does not depend on the horizon, so it is
# built once. The option lists must stay in the same order as the ones in
# map_index_to_value, which turns the indices returned by fmin into values.
space = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
    'learning_rate': hp.choice('learning_rate', [0.001, 0.05, 0.1]),
    'max_depth': hp.choice('max_depth', [3, 5, 6]),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
    'gamma': hp.choice('gamma', [0, 0.5, 1]),
    'subsample': hp.choice('subsample', [0.6, 0.8, 1.0]),
    'colsample_bytree': hp.choice('colsample_bytree', [0.6, 0.8, 1.0]),
    'reg_alpha': hp.choice('reg_alpha', [0, 0.5, 1.0]),
    'reg_lambda': hp.choice('reg_lambda', [1.0, 10.0, 20.0])
}

for h in horizons:
    print(f"\nProcessing horizon: {h} month(s) ahead")

    # Align X and y for each horizon: the target of a row is the consumption
    # h rows later. Rows without a target are dropped from X and y with the
    # same mask, so the two stay aligned even if the target contains NaNs.
    # NOTE(review): shift(-h) moves by rows over the whole frame. If the frame
    # stacks several customers, the last h rows of one customer receive the
    # next customer's values - confirm the row ordering or shift per customer.
    y_shifted = df_filtered['TOTAL_CONSUMPTION_SUM'].shift(-h)
    has_target = y_shifted.notna().to_numpy()
    y = y_shifted.loc[has_target]
    X = X_base.loc[has_target].copy()

    # Hyperparameter tuning with fmin. A fresh SparkTrials per horizon is
    # required: a trials object that already holds max_evals results would
    # make fmin return without evaluating anything for the next horizon.
    print(f"\nRunning hyperparameter tuning for horizon {h} month(s)")
    spark_trials = SparkTrials()
    best_params = fmin(
        # Bind this iteration's data explicitly instead of closing over the loop variables.
        fn=lambda params, X=X, y=y: objective(params, X, y),
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        trials=spark_trials,
    )
    best_params_mapped = map_index_to_value(best_params)

    # Evaluate model with best parameters
    metrics = evaluate_model(X, y, best_params_mapped)
    results_overall[h] = metrics  # Store results for each horizon

# Output results
for h, metrics in results_overall.items():
    print(f"\nResults for horizon {h} month(s):")
    print(f"Mean RMSE: {metrics['RMSE']}")
    print(f"Mean R²: {metrics['R²']}")
    print(f"Mean MAE: {metrics['MAE']}")
    print(f"Mean MAPE: {metrics['MAPE']}%")