See https://www.kaggle.com/code/carlmcbrideellis/regression-prediction-intervals-with-mapie/notebook

In [None]:
import numpy as np
import pandas as pd
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from mapie.regression import MapieQuantileRegressor

In [None]:
# Lift the column display cap so wide DataFrames are rendered in full.
pd.set_option("display.max_columns", None)

In [None]:
# Training split: preprocessed feature table and its matching target table.
X_train, y_train = (
    pd.read_csv("split_data/train_features_preprocessed.csv"),
    pd.read_csv("split_data/train_target_preprocessed.csv"),
)

In [None]:
# Display the (rows, columns) shape of the training feature table.
X_train.shape

In [None]:
# Calibration split: held out from training and later passed to MAPIE as
# X_calib / y_calib.
X_calib, y_calib = (
    pd.read_csv("split_data/calib_features_preprocessed.csv"),
    pd.read_csv("split_data/calib_target_preprocessed.csv"),
)

In [None]:
# Display the (rows, columns) shape of the calibration feature table.
X_calib.shape

In [None]:
# Validation split: used only to evaluate the fitted prediction intervals.
X_val, y_val = (
    pd.read_csv("split_data/val_features_preprocessed.csv"),
    pd.read_csv("split_data/val_target_preprocessed.csv"),
)

In [None]:
# Level passed as `alpha` to the first LGBMRegressor defined below.
# NOTE: `alpha` is a notebook-wide name that later cells rebind.
alpha = 0.1 # for 90% target coverage

In [None]:

# First LightGBM quantile-objective regressor; `alpha` is the level defined in
# the preceding cell.
regressor = LGBMRegressor(
    objective='quantile',
    alpha=alpha,
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    min_child_samples=8,
    random_state=42,
    verbose=50,
)

In [None]:
# Tune a LightGBM point-prediction model (regression objective, MAE metric)
# with an exhaustive, cross-validated grid search.
lgb_model = LGBMRegressor(objective='regression', metric='mae', verbose=50)

# Hyperparameter grid. 'n_estimators' used to appear twice in this literal; a
# repeated key silently overwrites the earlier one, so it is listed once.
param_grid = {
    # NOTE: a tree limited to max_depth levels has at most 2**max_depth leaves,
    # so num_leaves=50 cannot be reached when max_depth=5 (2**5 = 32).
    'num_leaves': [30, 50],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000],
    'max_depth': [5, 7],
    'min_child_samples': [5, 8],
}

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # negated MAE: sklearn maximises the score
    cv=5,                               # 5-fold cross-validation
    n_jobs=-1,                          # use all CPU cores
)

# The target is loaded from CSV as a DataFrame; flatten it to 1-D, exactly as
# the MAPIE fit further down does, instead of passing a column-shaped y.
grid_search.fit(X_train, np.ravel(y_train))

print("Best parameters found: ", grid_search.best_params_)

In [None]:
# Display the hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Rebind `alpha` for the regressor and MapieQuantileRegressor configured below.
# NOTE(review): 1 - 0.125 = 87.5% nominal coverage — confirm this is the intended target.
alpha = 0.125 

In [None]:
# LightGBM quantile-objective regressor handed to MAPIE as the base estimator.
regressor = LGBMRegressor(
    objective='quantile',
    alpha=alpha,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=50,
    min_child_samples=8,
    random_state=16,
)

In [None]:
# Wrap the quantile regressor in MAPIE, fit on the training split and calibrate
# on the held-out calibration split.
mapie = MapieQuantileRegressor(
    estimator=regressor,
    cv="split",
    alpha=alpha,
)
mapie.fit(
    X_train,
    np.ravel(y_train),  # targets flattened to 1-D
    X_calib=X_calib,
    y_calib=np.ravel(y_calib),
)

# Point predictions and prediction intervals for the validation features.
y_pred, y_pis = mapie.predict(X_val)

In [None]:
# Flatten the validation target into a 1-D NumPy array and display it.
predictions = np.ravel(y_val)
predictions

In [None]:
# `predictions` is a NumPy array after the np.ravel cell and has no `.columns`
# attribute; inspect the column labels on the source DataFrame instead.
y_val.columns

In [None]:
# Build a per-row results table from the validation target and MAPIE output.
# Work on a copy: assigning y_val directly would rename and extend y_val itself,
# which breaks re-running this cell and changes what np.ravel(y_val) returns.
predictions = y_val.copy()
predictions.columns = ['y_true']
predictions["point prediction"] = y_pred
# y_pis is reshaped to two columns per row: [:, 0] is the lower bound and
# [:, 1] the upper bound.
predictions["lower"] = y_pis.reshape(-1, 2)[:, 0]
predictions["upper"] = y_pis.reshape(-1, 2)[:, 1]
predictions["abs_error"] = (predictions["point prediction"] - predictions["y_true"]).abs()
# take a quick look
predictions

In [None]:
# Level used for the WIS evaluation (set by the assignment instructions); the
# notebook-wide `alpha` is rebound to the same value.
alpha_WIS = alpha = 0.20

class ParticipantVisibleError(Exception):
    """Raised when an interval bound or target value is NaN or infinite."""


def WIS_and_coverage(y_true, lower, upper, alpha):
    """Return the interval score and coverage flag for a single observation.

    The score is the interval width plus a penalty of ``2 / alpha`` times the
    distance by which ``y_true`` falls outside the interval. The bounds may be
    given in either order; the smaller one is treated as the lower edge.

    Parameters
    ----------
    y_true : scalar
        Observed value.
    lower, upper : scalar
        Interval bounds.
    alpha : scalar
        Level used in the ``2 / alpha`` penalty factor.

    Returns
    -------
    tuple
        ``(score, coverage)`` where ``coverage`` is 1 if ``y_true`` lies inside
        the closed interval and 0 otherwise.

    Raises
    ------
    ParticipantVisibleError
        If ``lower``, ``upper`` or ``y_true`` is NaN or infinite.
    """
    # Validate in the fixed order lower -> upper -> y_true (NaN before inf).
    # The y_true checks should not trigger in a competition setting.
    for label, value in (
        ("lower interval value", lower),
        ("upper interval value", upper),
        ("y_true", y_true),
    ):
        if np.isnan(value):
            raise ParticipantVisibleError(f"{label} contains NaN value(s)")
        if np.isinf(value):
            raise ParticipantVisibleError(f"{label} contains inf values(s)")

    # Tolerate swapped bounds by ordering them once up front.
    low = np.minimum(upper, lower)
    high = np.maximum(upper, lower)

    # Start from the interval width, then penalise a miss on either side.
    score = np.abs(upper - lower)
    coverage = 1
    if y_true < low:
        score += (2 / alpha) * (low - y_true)
        coverage = 0
    elif y_true > high:
        score += (2 / alpha) * (y_true - high)
        coverage = 0
    return score, coverage

# vectorize the function
# np.vectorize applies the scalar WIS_and_coverage element-wise over
# array inputs and returns one array per output: (scores, coverage flags).
v_WIS_and_coverage = np.vectorize(WIS_and_coverage)

In [None]:

def _as_float_array(values):
    """Convert *values* to float, also accepting plain sequences and scalars."""
    if hasattr(values, "astype"):
        # Series / ndarray path: same conversion as before.
        return values.astype(float)
    return np.asarray(values, dtype=float)


def score(y_true, lower, upper, alpha):
    """Return the mean interval score and the empirical coverage.

    Parameters
    ----------
    y_true, lower, upper : array-like
        Observed values and interval bounds, aligned by position. pandas
        Series, NumPy arrays, lists and tuples are accepted.
    alpha : float
        Level forwarded to the per-row interval score.

    Returns
    -------
    tuple of float
        ``(MWIS, coverage)``: the mean of the per-row scores and the fraction
        of rows whose observed value lies inside its interval.
    """
    y_true = _as_float_array(y_true)
    lower = _as_float_array(lower)
    upper = _as_float_array(upper)

    # Element-wise scores and 0/1 coverage flags.
    WIS_score, coverage = v_WIS_and_coverage(y_true, lower, upper, alpha)

    MWIS = float(np.mean(WIS_score))
    # Mean of the 0/1 flags is the covered fraction; for 1-D input this is
    # identical to sum / shape[0].
    coverage = float(np.mean(coverage))

    return MWIS, coverage

In [None]:
# Evaluate the validation intervals: mean interval score and empirical coverage.
MWIS, coverage = score(
    y_true=predictions["y_true"],
    lower=predictions["lower"],
    upper=predictions["upper"],
    alpha=alpha,
)

MWIS

In [None]:
# Mean absolute error of the point predictions on the validation split.
predictions["abs_error"].mean()

In [None]:
# Persist the fitted MAPIE model to disk.
with open('models/mapie.pkl', mode='wb') as pickle_file:
    pickle.dump(mapie, pickle_file)