See https://www.kaggle.com/code/carlmcbrideellis/regression-prediction-intervals-with-mapie/notebook

In [None]:
import numpy as np
import pandas as pd
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from mapie.regression import MapieQuantileRegressor

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.set_option('display.max_columns', None) # show all columns in a df

In [None]:
X_train = pd.read_csv("split_data/train_features_preprocessed.csv")
y_train = pd.read_csv("split_data/train_target_preprocessed.csv")

In [None]:
X_train.shape

In [None]:
X_calib = pd.read_csv("split_data/calib_features_preprocessed.csv")
y_calib = pd.read_csv("split_data/calib_target_preprocessed.csv")

In [None]:
y_calib.shape

In [None]:
X_val = pd.read_csv("split_data/val_features_preprocessed.csv")
y_val = pd.read_csv("split_data/val_target_preprocessed.csv")

In [None]:
y_val.shape

In [None]:
alpha = 0.1 # for 90% target coverage

In [None]:

# LightGBM regressor configured with the quantile objective; `alpha` is the
# notebook-level value set in the cell above.
# NOTE(review): `regressor` is reassigned with different settings further down,
# before any fit call uses it, so this configuration appears to be unused.
regressor = LGBMRegressor( n_estimators       = 1000,
                           learning_rate      = 0.05, 
                           max_depth          = 7, 
                           min_child_samples  = 8,
                           random_state       = 42,
                           objective          = 'quantile',
                           alpha              = alpha,
                           verbose = 50
                         )

In [None]:
# Baseline LightGBM point-prediction model, tuned for mean absolute error.
lgb_model = LGBMRegressor(objective='regression', metric='mae', verbose = 50)

# Define hyperparameters grid.
# Each key must appear exactly once: in a dict literal a repeated key silently
# replaces the earlier entry ('n_estimators' used to be listed twice).
param_grid = {
    'num_leaves': [30, 50], # not ok: should be <= 2^max_depth
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000],
    'max_depth': [5, 7],
    'min_child_samples': [5, 8],
}

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Use negative MAE because sklearn wants higher values to be better
    cv=5,                               # 5-fold cross-validation
    n_jobs=-1                           # Use all CPU cores for faster computation
)

# y_train is read from CSV as a single-column DataFrame; flatten it to 1-D
# (as the MAPIE cell does) so the estimator receives a plain target vector.
grid_search.fit(X_train, np.ravel(y_train))

print("Best parameters found: ", grid_search.best_params_)

In [None]:
grid_search.best_params_

In [None]:
alpha = 0.125 

In [None]:
# Quantile LightGBM base learner that is handed to MAPIE in the next cell.
regressor = LGBMRegressor(
    objective='quantile',
    alpha=alpha,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=50,
    max_depth=7,
    min_child_samples=8,
    random_state=16,
)

In [None]:
# Wrap the LightGBM quantile regressor in MAPIE's quantile regressor. With
# cv="split" the calibration data is supplied explicitly via X_calib / y_calib.
mapie = MapieQuantileRegressor(estimator=regressor, cv="split", alpha=alpha)
# Targets are read from CSV as DataFrames, so they are flattened to 1-D arrays here.
mapie.fit(X_train, np.ravel(y_train), X_calib=X_calib, y_calib=np.ravel(y_calib))
# y_pred: point predictions for the validation features.
# y_pis: interval bounds; reshaped to two columns (lower, upper) further down.
y_pred, y_pis = mapie.predict(X_val)

In [None]:
predictions_mapie = np.ravel(y_val)
predictions_mapie

In [None]:
y_pred.shape

In [None]:
# predictions_mapie is a plain NumPy array at this point (output of np.ravel),
# which has no `.columns` attribute; inspect the validation target frame instead.
y_val.columns

In [None]:
# Build a fresh results frame instead of aliasing y_val: adding columns through
# an alias would modify y_val itself and break later cells that reuse it.
# The target is taken from the first column of y_val and stored as "y_true",
# the name the abs_error line and the scoring cell read.
predictions_mapie = pd.DataFrame({"y_true": y_val.iloc[:, 0].to_numpy()})
predictions_mapie["point prediction"] = y_pred
# After the reshape, column 0 is used as the lower bound and column 1 as the upper bound.
predictions_mapie["lower"] = y_pis.reshape(-1,2)[:,0]
predictions_mapie["upper"] = y_pis.reshape(-1,2)[:,1]
predictions_mapie["abs_error"] = abs(predictions_mapie["point prediction"] - predictions_mapie["y_true"])
# take a quick look
predictions_mapie

In [None]:

class ParticipantVisibleError(Exception):
    """Raised when an interval bound or a target value is NaN or infinite."""


def WIS_and_coverage(y_true, lower, upper, alpha):
    """Return the Winkler interval score and the coverage flag for one row.

    The score is the interval width plus, when ``y_true`` falls outside the
    interval, ``2 / alpha`` times the distance to the nearest bound. The bounds
    may be passed in either order; the smaller one is treated as the lower bound.

    Parameters
    ----------
    y_true, lower, upper : scalar
        Observed value and the two interval bounds.
    alpha : float
        Miscoverage level used to weight the out-of-interval penalty.

    Returns
    -------
    (score, coverage)
        ``coverage`` is 1 when ``y_true`` lies inside the interval, else 0.

    Raises
    ------
    ParticipantVisibleError
        If any of ``lower``, ``upper`` or ``y_true`` is NaN or infinite.
    """
    if np.isnan(lower):
        raise ParticipantVisibleError("lower interval value contains NaN value(s)")
    if np.isinf(lower):
        raise ParticipantVisibleError("lower interval value contains inf values(s)")
    if np.isnan(upper):
        raise ParticipantVisibleError("upper interval value contains NaN value(s)")
    if np.isinf(upper):
        raise ParticipantVisibleError("upper interval value contains inf values(s)")
    # These should not occur in a competition setting
    if np.isnan(y_true):
        raise ParticipantVisibleError("y_true contains NaN value(s)")
    if np.isinf(y_true):
        raise ParticipantVisibleError("y_true contains inf values(s)")

    # Order the bounds once so swapped inputs are handled consistently.
    low_bound = np.minimum(upper, lower)
    high_bound = np.maximum(upper, lower)

    # WIS for a single interval
    score = np.abs(upper - lower)
    if y_true < low_bound:
        score += ((2/alpha) * (low_bound - y_true))
    if y_true > high_bound:
        score += ((2/alpha) * (y_true - high_bound))

    # coverage for one single row
    coverage = 0 if (y_true < low_bound) or (y_true > high_bound) else 1
    return score, coverage

# vectorize the function
v_WIS_and_coverage = np.vectorize(WIS_and_coverage)

In [None]:

def score(y_true, lower, upper, alpha):
    """Return the mean Winkler interval score and the empirical coverage.

    ``y_true``, ``lower`` and ``upper`` are cast to float and scored row by
    row with ``v_WIS_and_coverage``. The per-row scores are averaged, and the
    coverage is the fraction of rows whose coverage flag is 1.

    Returns
    -------
    (MWIS, coverage) : tuple of float
    """
    per_row_wis, per_row_hit = v_WIS_and_coverage(
        y_true.astype(float),
        lower.astype(float),
        upper.astype(float),
        alpha,
    )
    mean_wis = float(np.mean(per_row_wis))
    hit_rate = float(per_row_hit.sum() / per_row_hit.shape[0])
    return mean_wis, hit_rate

In [None]:
# Mean Winkler interval score and empirical coverage of the MAPIE intervals.
# NOTE(review): the score is evaluated with alpha = .20, while MAPIE was
# configured with the notebook-level alpha (0.125) — confirm which miscoverage
# level is intended. `coverage` is computed here but not displayed.
MWIS, coverage = score(predictions_mapie["y_true"], predictions_mapie["lower"], predictions_mapie["upper"], alpha = .20)

MWIS

In [None]:
predictions_mapie["abs_error"].mean()

In [None]:
with open('models/mapie.pkl', 'wb') as file:
    pickle.dump(mapie, file)

## Gradient boosting regressor (sklearn)

https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-quantile-py

Alternative to MAPIE:

https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-quantile-py


In [None]:
y_train = np.ravel(y_train)

In [None]:
# gbmodel = GradientBoostingRegressor(loss = 'absolute_error')

all_models = {}
common_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=2,
    min_samples_leaf=9,
    min_samples_split=9,
)
for alpha in [0.10, 0.5, 0.90]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **common_params)
    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)

In [None]:
# NOTE(review): this cell repeats the preceding cell line for line, and
# `all_models` is reset to {} and refilled in the following cell (quantiles
# 0.05 / 0.5 / 0.95), so the models fitted here are discarded.
# The loop variable also rebinds the notebook-level `alpha`.
# gbmodel = GradientBoostingRegressor(loss = 'absolute_error')

all_models = {}
common_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=2,
    min_samples_leaf=9,
    min_samples_split=9,
)
for alpha in [0.10, 0.5, 0.90]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **common_params)
    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)

In [None]:
# gbmodel = GradientBoostingRegressor(loss = 'absolute_error')

# {'learning_rate': 0.1,
#  'max_depth': 10,
#  'min_samples_leaf': 20,
#  'min_samples_split': 60}

# Fit one gradient-boosting model per quantile (5%, 50%, 95%) with shared
# hyperparameters; models are stored under keys such as "q 0.05".
all_models = {}
common_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=20,
    min_samples_split=60,
)
# Use a dedicated loop variable: iterating with `alpha` would overwrite the
# notebook-level `alpha` that configures MAPIE, so re-running the MAPIE cells
# afterwards would silently use the last quantile instead.
for quantile in [0.05, 0.5, 0.95]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=quantile, **common_params)
    all_models["q %1.2f" % quantile] = gbr.fit(X_train, y_train)

In [None]:
gbr_mae = GradientBoostingRegressor(loss="absolute_error", **common_params)
all_models["mae"] = gbr_mae.fit(X_train, y_train)

In [None]:
y_pred = all_models["mae"].predict(X_calib)
y_lower = all_models["q 0.05"].predict(X_calib)
y_upper = all_models["q 0.95"].predict(X_calib)
y_med = all_models["q 0.50"].predict(X_calib)

In [None]:
# predictions = y_calib
# predictions.columns = ['y_true']
predictions = pd.DataFrame()
predictions['y_true'] = y_calib
predictions["point prediction"] = y_pred
predictions["med"] = y_med
predictions["lower"] = y_lower
predictions["upper"] = y_upper
predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

In [None]:
predictions["abs_error_med"].mean()

In [None]:
predictions["abs_error_mid"].mean()

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
predictions['point prediction'] = np.where(
    (predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']),
    predictions['point prediction'],
    predictions['midpoint']
)
predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
predictions["abs_error"].mean()

In [None]:
gbr = GradientBoostingRegressor(loss="absolute_error", verbose = 50)

# Define hyperparameters grid
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[5, 10, 15, 20],
    min_samples_leaf=[1, 5, 10, 20, 25],
    min_samples_split=[20, 30, 50, 60, 70],
)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Use negative MAE because sklearn wants higher values to be better
    cv=5,                               # 5-fold cross-validation                 
    n_jobs=-1                           # Use all CPU cores for faster computation
)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

In [None]:
grid_search.best_params_

In [None]:
with open('models/gbr_sklearn_models.pkl', 'wb') as file:
    pickle.dump(all_models, file)

In [None]:
y_pred = all_models["mae"].predict(X_val)
y_lower = all_models["q 0.05"].predict(X_val)
y_upper = all_models["q 0.95"].predict(X_val)
y_med = all_models["q 0.50"].predict(X_val)

In [None]:
# predictions = y_val
#predictions.columns = ['y_true']
predictions = pd.DataFrame()
# Take the target from the first column of y_val as a 1-D array. Assigning the
# whole y_val frame to one column fails as soon as y_val holds more than one
# column, which happens when the MAPIE cell above has added columns to it.
predictions['y_true'] = y_val.iloc[:, 0].to_numpy()
predictions["point prediction"] = y_pred
predictions["med"] = y_med
predictions["lower"] = y_lower
predictions["upper"] = y_upper

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

In [None]:
predictions["abs_error_med"].mean()

## With lat/lon instead of province

In [None]:
X_train_2 = pd.read_csv("split_data/train_features_preprocessed_2.csv")
y_train_2 = pd.read_csv("split_data/train_target_preprocessed_2.csv")

In [None]:
X_train_2.shape

In [None]:
y_train_2 = np.ravel(y_train_2)

In [None]:
gbr = GradientBoostingRegressor(loss="absolute_error", verbose = 50)

# Define hyperparameters grid
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[5, 10, 15, 20, 30, 40],
    min_samples_leaf=[10, 20, 25, 30, 40],
    min_samples_split=[20, 30, 50, 60, 70],
)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Use negative MAE because sklearn wants higher values to be better
    cv=5,                               # 5-fold cross-validation                 
    n_jobs=-1                           # Use all CPU cores for faster computation
)

grid_search.fit(X_train_2, y_train_2)

print("Best parameters found: ", grid_search.best_params_)

In [None]:
grid_search.best_params_

In [None]:

# Quantile models (5%, 50%, 95%) for the lat/lon feature set, stored under
# keys such as "q 0.05".
all_models = {}
common_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=30,
    min_samples_leaf=40,
    min_samples_split=20,
)
# Dedicated loop variable so the notebook-level `alpha` (used to configure
# MAPIE) is not overwritten by the last quantile of this loop.
for quantile in [0.05, 0.5, 0.95]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=quantile, **common_params)
    all_models["q %1.2f" % quantile] = gbr.fit(X_train_2, y_train_2)

In [None]:
gbr_mae = GradientBoostingRegressor(loss="absolute_error", **common_params)
all_models["mae"] = gbr_mae.fit(X_train_2, y_train_2)

In [None]:
X_calib_2 = pd.read_csv("split_data/calib_features_preprocessed_2.csv")
y_calib_2 = pd.read_csv("split_data/calib_target_preprocessed_2.csv")

In [None]:
y_pred = all_models["mae"].predict(X_calib_2)
y_lower = all_models["q 0.05"].predict(X_calib_2)
y_upper = all_models["q 0.95"].predict(X_calib_2)
y_med = all_models["q 0.50"].predict(X_calib_2)

In [None]:
# predictions = y_calib
# predictions.columns = ['y_true']
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_2
predictions["point prediction"] = y_pred
predictions["med"] = y_med
predictions["lower"] = y_lower
predictions["upper"] = y_upper
predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
predictions['point prediction'] = np.where(
    (predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']),
    predictions['point prediction'],
    predictions['midpoint']
)
predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

In [None]:
with open('models/gbr_sklearn_models_2.pkl', 'wb') as file:
    pickle.dump(all_models, file)

## Attempt 3

Not uploaded because the results were worse than in attempt 2.

- Area is imputed taking number of bedrooms into account in addition to subtype
- A few cases with latitudes and longitudes outside of Belgium are removed --> improved quality of imputed (mean) lat and lon
- Statbel data no longer used (external source that needs to be updated): take median price per area (!) per type (is apartment Y/N) per zipcode from training data
- <s>Flag price drop added</s>

In [None]:
X_train_3 = pd.read_csv("split_data/train_features_preprocessed_3.csv")
y_train_3 = pd.read_csv("split_data/train_target_preprocessed_3.csv")

In [None]:
X_train_3.shape

In [None]:
# Remove the features that are not used in attempt 3. errors='ignore' keeps the
# cell idempotent: re-running it after the columns are already gone would
# otherwise raise a KeyError.
X_train_3 = X_train_3.drop(columns=['price_dropped', 'lon_missing'], errors='ignore')

In [None]:
y_train_3 = np.ravel(y_train_3)

In [None]:
grid_search.best_params_

In [None]:
gbr = GradientBoostingRegressor(loss="absolute_error", verbose = 50)

# Define hyperparameters grid
# param_grid = dict(
#     learning_rate=[0.05, 0.1, 0.2],
#     max_depth=[5, 10, 15, 20, 25, 30, 35],
#     min_samples_leaf=[20, 30, 40, 50],
#     min_samples_split=[20, 30, 50, 60, 70],
# )

param_grid = {
    'max_depth': [15, 20, 25, 30, 35],
    'min_samples_split': [10, 15, 20, 25],
    'min_samples_leaf': [5, 10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 150]
}

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Use negative MAE because sklearn wants higher values to be better
    cv=5,                               # 5-fold cross-validation                 
    n_jobs=-1                           # Use all CPU cores for faster computation
)

grid_search.fit(X_train_3, y_train_3)

print("Best parameters found: ", grid_search.best_params_)

In [None]:
grid_search.best_params_

In [None]:
# Quantile models (5%, 50%, 95%) for attempt 3, stored under keys such as "q 0.05".
all_models = {}
common_params = dict(
    learning_rate=0.05,
    n_estimators=150,
    max_depth=35,
    min_samples_leaf=20,
    min_samples_split=15,
)
# Dedicated loop variable so the notebook-level `alpha` (used to configure
# MAPIE) is not overwritten by the last quantile of this loop.
for quantile in [0.05, 0.5, 0.95]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=quantile, **common_params)
    all_models["q %1.2f" % quantile] = gbr.fit(X_train_3, y_train_3)

In [None]:
gbr_mae = GradientBoostingRegressor(loss="absolute_error", **common_params)
all_models["mae"] = gbr_mae.fit(X_train_3, y_train_3)

In [None]:
# Evaluation data for attempt 3 (read from the calibration split files).
X_valid_3 = pd.read_csv("split_data/calib_features_preprocessed_3.csv")
y_valid_3 = pd.read_csv("split_data/calib_target_preprocessed_3.csv")

# Drop the same columns that were removed from X_train_3 so the feature set
# matches what the models were fitted on; previously only 'price_dropped' was
# removed here. errors='ignore' tolerates a file in which a column is absent.
X_valid_3 = X_valid_3.drop(columns=['price_dropped', 'lon_missing'], errors='ignore')

In [None]:
y_pred = all_models["mae"].predict(X_valid_3)
y_lower = all_models["q 0.05"].predict(X_valid_3)
y_upper = all_models["q 0.95"].predict(X_valid_3)
y_med = all_models["q 0.50"].predict(X_valid_3)

In [None]:
predictions = pd.DataFrame()
predictions['y_true'] = y_valid_3
predictions["point prediction"] = y_pred
predictions["med"] = y_med
predictions["lower"] = y_lower
predictions["upper"] = y_upper
predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
predictions['point prediction'] = np.where(
    (predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']),
    predictions['point prediction'],
    predictions['midpoint']
)
predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

Worst predictions on most expensive (outliers?) houses

predictions.sort_values("abs_error", ascending = False).head(30)

In [None]:
X_valid_3[y_valid_3['price'] == 999000]

## Attempt 3b

Same changes as in attempt 3, with the following differences:
- Use Statbel data again instead of median prices calculated based on training data
- Median price per area: calculated per group of first 3 zipcode digits (more granular) instead of first 2
- Flag indicating whether the last 2 (but not 3) or the last 3 zipcode digits are 0 (a possible indication of bigger cities?)

Last two changes are attempts to predict higher prices better (regional differences related to price)

In [None]:
X_train_3b = pd.read_csv("split_data/train_features_preprocessed_3b.csv")
y_train_3b = pd.read_csv("split_data/train_target_preprocessed_3b.csv")

In [None]:
y_train_3b = np.ravel(y_train_3b)
X_train_3b.shape

In [None]:
grid_search.best_params_

In [None]:
gbr = GradientBoostingRegressor(loss="absolute_error", verbose = 50)

# param_grid = {
#     'max_depth': [25, 30, 35],
#     'min_samples_split': [10, 15, 20, 25],
#     'min_samples_leaf': [5, 10, 15, 20],
#     'learning_rate': [0.01, 0.05, 0.1]
#     #, 'n_estimators': [100, 150]
# }

param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators = [100, 150, 200],
    max_depth=[20, 25, 30, 35],
    min_samples_leaf=[20, 30, 40],
    min_samples_split=[15, 20, 25, 30]    
)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Use negative MAE because sklearn wants higher values to be better
    cv=5,                               # 5-fold cross-validation                 
    n_jobs=-1                           # Use all CPU cores for faster computation
)

grid_search.fit(X_train_3b, y_train_3b)

print("Best parameters found: ", grid_search.best_params_)

In [None]:
grid_search.best_params_

In [None]:
# Quantile models (5%, 50%, 95%) for attempt 3b, stored under keys such as "q 0.05".
all_models = {}

common_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=30,
    min_samples_split=20,
)

# try params of model 2
# common_params = dict(
#     learning_rate=0.1,
#     n_estimators=200,
#     max_depth=30,
#     min_samples_leaf=40,
#     min_samples_split=20,
# )

# Dedicated loop variable so the notebook-level `alpha` (used to configure
# MAPIE) is not overwritten by the last quantile of this loop.
for quantile in [0.05, 0.5, 0.95]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=quantile, **common_params)
    all_models["q %1.2f" % quantile] = gbr.fit(X_train_3b, y_train_3b)

In [None]:
gbr_mae = GradientBoostingRegressor(loss="absolute_error", **common_params)
all_models["mae"] = gbr_mae.fit(X_train_3b, y_train_3b)

In [None]:
X_calib_3b = pd.read_csv("split_data/calib_features_preprocessed_3b.csv")
y_calib_3b = pd.read_csv("split_data/calib_target_preprocessed_3b.csv")

In [None]:
y_pred = all_models["mae"].predict(X_calib_3b)
y_lower = all_models["q 0.05"].predict(X_calib_3b)
y_upper = all_models["q 0.95"].predict(X_calib_3b)
y_med = all_models["q 0.50"].predict(X_calib_3b)

In [None]:
# predictions = y_calib
# predictions.columns = ['y_true']
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_3b
predictions["point prediction"] = y_pred
predictions["med"] = y_med
predictions["lower"] = y_lower
predictions["upper"] = y_upper
predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
predictions['point prediction'] = np.where(
    (predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']),
    predictions['point prediction'],
    predictions['midpoint']
)
predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])

In [None]:
invalid_rows = predictions[~((predictions['point prediction'] >= predictions['lower']) & (predictions['point prediction'] <= predictions['upper']))]
display(invalid_rows)

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

In [None]:
predictions.sort_values("abs_error", ascending = False).head(30)

In [None]:
X_calib_3b[y_calib_3b['price'] == 835000]

## Test TabPFN

In [None]:
import os
os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "1"

In [None]:
import pandas as pd
import numpy as np

In [None]:
from tabpfn import TabPFNRegressor

In [None]:
X_train_3_1 = pd.read_csv("split_data/train_features_preprocessed_3_1.csv")
y_train_3_1 = pd.read_csv("split_data/train_target_preprocessed_3_1.csv")

In [None]:
y_train_3_1 = np.ravel(y_train_3_1)

In [None]:
y_train_3_1.shape

In [None]:
# help(TabPFNRegressor)

In [None]:
regressor = TabPFNRegressor(device = "cpu", fit_mode = "low_memory")  
regressor.fit(X_train_3_1, y_train_3_1)

In [None]:
X_calib_2 = pd.read_csv("split_data/calib_features_preprocessed_2.csv")
y_calib_2 = pd.read_csv("split_data/calib_target_preprocessed_2.csv")

In [None]:
# Point predictions of the TabPFN regressor on the calibration features.
# NOTE(review): `regressor` was fitted on the *_3_1 training features, whereas
# X_calib_2 is read from the *_2 calibration file — confirm that both files
# share the same feature columns.
predictions = regressor.predict(X_calib_2)

In [None]:
y_pred = predictions
y_pred

In [None]:
# predictions = y_calib
# predictions.columns = ['y_true']
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_2
predictions["point prediction"] = y_pred
#predictions["med"] = y_med
#predictions["lower"] = y_lower
#predictions["upper"] = y_upper
#predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
#predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
#predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
# Predict quantiles
quantiles = [0.05, 0.5, 0.95]
quantile_predictions = regressor.predict(
    X_calib_2,
    output_type="quantiles",
    quantiles=quantiles,
)
#for q, q_pred in zip(quantiles, quantile_predictions):
#    print(f"Quantile {q} MAE:", mean_absolute_error(y_test, q_pred))

In [None]:
quantile_predictions[1]

In [None]:
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_2
predictions["point prediction"] = y_pred
predictions["med"] = quantile_predictions[1]
predictions["lower"] = quantile_predictions[0]
predictions["upper"] = quantile_predictions[2]
predictions["midpoint"] = (quantile_predictions[0]+quantile_predictions[2])/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()

### With larger (almost 10000) training set

In [None]:
X_train_4_1 = pd.read_csv("split_data/train_features_preprocessed_4_1.csv")
y_train_4_1 = pd.read_csv("split_data/train_target_preprocessed_4_1.csv")

In [None]:
from datetime import datetime

In [None]:
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
y_train_4_1 = np.ravel(y_train_4_1)

In [None]:
# The following cells call regressor4.predict, so the model must exist after a
# kernel restart; with the fit commented out they fail with a NameError.
# The fit is only run when regressor4 is not already in the kernel, so
# re-running this cell does not retrain the model (presumably the fit was
# commented out because it is slow on CPU).
if "regressor4" not in globals():
    regressor4 = TabPFNRegressor(device = "cpu")
    regressor4.fit(X_train_4_1, y_train_4_1)

In [None]:
pred_y = regressor4.predict(X_calib_2)
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
pred_y

In [None]:
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
# predictions = y_calib
# predictions.columns = ['y_true']
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_2
predictions["point prediction"] = pred_y
#predictions["med"] = y_med
#predictions["lower"] = y_lower
#predictions["upper"] = y_upper
#predictions["midpoint"] = (y_upper+y_lower)/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
#predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
#predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
predictions["abs_error"].mean()

In [None]:
quantiles = [0.05, 0.5, 0.95]
quantile_predictions4 = regressor4.predict(
    X_calib_2,
    output_type="quantiles",
    quantiles=quantiles,
)

print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
quantiles

In [None]:
predictions = pd.DataFrame()
predictions['y_true'] = y_calib_2
predictions["point prediction"] = pred_y
predictions["med"] = quantile_predictions4[1]
predictions["lower"] = quantile_predictions4[0]
predictions["upper"] = quantile_predictions4[2]
predictions["midpoint"] = (quantile_predictions4[0]+quantile_predictions4[2])/2

predictions["abs_error"] = abs(predictions["point prediction"] - predictions["y_true"])
predictions["abs_error_med"] = abs(predictions["med"] - predictions["y_true"])
predictions["abs_error_mid"] = abs(predictions["midpoint"] - predictions["y_true"])
# take a quick look
predictions

In [None]:
MWIS, coverage = score(predictions["y_true"], predictions["lower"], predictions["upper"], alpha = .20)

MWIS

In [None]:
predictions["abs_error"].mean()