In [21]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for HCMC survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "a.buch@stud.uni-heidelberg.de"

## Feature selection done by eXtreme Gradient Boosting (XGBoost)


In [22]:
import os, sys
import copy as cp
from glob import glob
import numpy as np
import pandas as pd


import pickle
import joblib

from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import f1_score, confusion_matrix, mean_absolute_error
from sklearn.preprocessing import QuantileTransformer, quantile_transform, PowerTransformer, power_transform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold, cross_val_score, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import PredictionErrorDisplay 
from sklearn.pipeline import Pipeline
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns


sys.path.insert(0, "../../../")
import utils.utils_feature_selection as fs
import utils.utils_evaluation as e
import utils.utils_figures as f
import utils.settings as s
import utils.pipelines_continous as p

s.init()
seed = s.seed

pd.set_option('display.max_columns', None)
plt.figure(figsize=(20, 10))


import warnings
warnings.filterwarnings('ignore')


transformation = True ##False
targets = ["Target_contentloss_euro", "Target_relative_contentloss_euro", "Target_businessreduction"]
target = targets[0]

<Figure size 2000x1000 with 0 Axes>

In [23]:
df_candidates = pd.read_excel("../../../input_survey_data/input_data_contentloss.xlsx")
#df_candidates = pd.read_excel("../../../input_survey_data/input_data_businessreduction.xlsx")


In [24]:
# ### use only relative loss as target or only absolute loss as target
#df_candidates = df_candidates.drop(["Target_contentloss_euro", "shp_content_value_euro"], axis=1)
df_candidates = df_candidates.drop(["Target_relative_contentloss_euro"], axis=1)


# print(f"Removing {df_candidates.loc[df_candidates[target]==0,:].shape[0]} zero loss records")
# df_candidates = df_candidates.loc[df_candidates[target]!=0,:]

# print(f"Keeping {df_candidates.shape} damage cases for model training and evaluation")


### Test remove further features

In [25]:

# df_candidates.columns
# df_candidates = df_candidates[[
#     'Target_contentloss_euro', 'inundation_duration_h', 'water_depth_cm',
#     'contaminations.0', 'flowvelocity', 
#     'emergency_measures.1','emergency_measures.2', 
#     #'emergency_measures.3', 'emergency_measures.4',
#     #'emergency_measures.6', 'emergency_measures.7', 'emergency_measures.8',
#     'emergency_measures.9', 'overall_problem_house',
#     #'protect_valuables_impl', 'water_barriers_impl',
#     #'pumping_equipment_impl', 'elevation_building_impl',
#     #'resistant_material_building_impl', 'electricity_higher_impl',
#     #'flood_protections_impl', 'flood_experience',
#     'elevation_building_height_cm', 'elevation_rel2surrounding_cat', 'bage',
#     'b_area', 'hh_monthly_income_cat', 'shp_owner', 'shp_sector',
#     'shp_employees', 'shp_avgmonthly_sale_cat', 'shp_finance_investments',
#     #'shp_risk_tolerance', 'shp_monetary_resources4prevention',
#     #'resilience_city_protection', 'resilience_left_alone',
#     #'resilience_neighbor_management',
#     #'perception_who_responsible4protection.Rank1', 'contaminations_light',
#     #'contaminations_heavy', '
#      'shp_content_value_euro',
#     'shp_registered_capital_euro'
# ]]

df_candidates = df_candidates.drop([
        "contaminations_light", "contaminations_heavy", "emergency_measures.9",
        "emergency_measures.6", "emergency_measures.7", # BETTER DROP test rmv da nur binary
        "elevation_rel2surrounding_cat",  # TEST
        #"shp_profits_last5years", # TEST
        "shp_monetary_resources4prevention", # TEST
        'shp_finance_investments',
        'shp_risk_tolerance',
        'resilience_city_protection', 'resilience_left_alone',
        'resilience_neighbor_management', 
        # already in d_cleaning RM 'resilience_more_future_affected','resilience_govern_careing', 'resilience_govern_careing_increases',
        'perception_who_responsible4protection.Rank1',
        # ALREADY RM in d_cleaning: 'perception_private_economy_future' 
        ], axis=1)


In [26]:
df_candidates.columns


## delete features with more than 10% missing values
print("Percentage of missing values per feature [%]\n", df_candidates.isna().mean().sort_values(ascending=False)[:15]  * 100) 
#df_candidates = df_candidates[df_candidates.columns[df_candidates.isna().mean() < 0.10]]  # drop feautres with more than 10% missing values
#print(df_candidates.isna().sum(axis=0).sort_values(ascending=False))
## --> drops content values if threshold == 15%

# print(df_candidates.Target_relative_contentloss_euro.describe())
# print(df_candidates.Target_relative_contentloss_euro.isna().sum())

# df_candidates["Target_relative_contentloss_log_euro"] = np.log1p(df_candidates.Target_relative_contentloss_euro) # natural log
# #df_candidates["Target_relative_contentloss_log_euro"] = np.lognormal(df_candidates.Target_relative_contentloss_euro)
# #
# df_candidates["Target_relative_contentloss_log_euro"].describe()



Percentage of missing values per feature [%]
 elevation_building_height_cm           15.869018
shp_content_value_euro                 15.869018
resilience_govern_careing_increases    13.602015
shp_registered_capital_euro            11.838791
bage                                    6.801008
hh_monthly_income_cat                   6.045340
Target_contentloss_euro                 3.778338
inundation_duration_h                   2.267003
b_area                                  0.503778
water_depth_cm                          0.251889
emergency_measures.3                    0.000000
emergency_measures.2                    0.000000
contaminations.0                        0.000000
shp_avgmonthly_sale_cat                 0.000000
shp_employees                           0.000000
dtype: float64


## Fit model 

In [27]:
learning_rate = [ 0.0001, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8] # store outside, for plotting
#n_estimators = [ 50, 100, 200, 300, 500, 800] # 30, 50, 70, 100, 200, 300, 400], 

param_grid = {
              'model__n_estimators': [3, 5, 8, 10, 12, 15, 18, 20 ,30 ,40], # get only low train scores with this
              'model__max_depth': [1, 2, 3, 5, 7, 10, 15],              #'model__max_leaves': [0, 3, 5],
            # 'model__colsample_bytree': [0.1, 0.3, 0.5, 0.7, 1.0 ], # Percentage of columns to be randomly samples for each tree
            # 'model__colsample_bynode': [0.1, 0.3, 0.5, 0.7, 1.0], # nbr of feautres for each split point
             #'model__learning_rate': learning_rate,  # == eta
             # 'model__gamma': [0.1, 0.2, 0.3, 0.5 ] , # min_split_loss -  larger gamma is, the more conservative the algorithm is
            #'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
             # 'model__reg_alpha': [0.0, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
             # 'model__reg_lambda': [0.0,  0.05, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
              #'model__min_child_weight': [0, 1, 2, 3, 4,],
            #   "model__max_delta_step":  [0, 3, 5, 6, 7],           # for LogisticReg good to solve imbalance 
              'model__objective': [None, 'reg:squarederror', 'reg:logistic', 'reg:absoluteerror'],#'multi:softprob,'reg:squarederror','reg:models_trained'],
          #  # 'model__tree_method': ["hist", "gpu_hist"],
           'model__booster': ["gblinear"], # [None, "gblinear", "gbtree"],
           "model__validate_parameters":[True],
              }

# from sklearn.ensemble import RandomForestRegressor, XGBRFRegressor, GradientBoostingRegressor

param_grid_transform = {
              'model__regressor__n_estimators': [3, 5, 8, 10, 12, 15, 18, 20 ,30 ,40], # get only low train scores with this
              'model__regressor__max_depth': [1, 2, 3, 5, 7, 10, 15],              #'model__max_leaves': [0, 3, 5],
            # 'model__colsample_bytree': [0.1, 0.3, 0.5, 0.7, 1.0 ], # Percentage of columns to be randomly samples for each tree
            # 'model__colsample_bynode': [0.1, 0.3, 0.5, 0.7, 1.0], # nbr of feautres for each split point
             #'model__learning_rate': learning_rate,  # == eta
             # 'model__gamma': [0.1, 0.2, 0.3, 0.5 ] , # min_split_loss -  larger gamma is, the more conservative the algorithm is
            #'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
             # 'model__reg_alpha': [0.0, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
             # 'model__reg_lambda': [0.0,  0.05, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
              #'model__min_child_weight': [0, 1, 2, 3, 4,],
            #   "model__max_delta_step":  [0, 3, 5, 6, 7],           # for LogisticReg good to solve imbalance 
              'model__regressor__objective': [None, 'reg:squarederror', 'reg:logistic', 'reg:absoluteerror'],#'multi:softprob,'reg:squarederror','reg:models_trained'],
          #  # 'model__tree_method': ["hist", "gpu_hist"],
           'model__regressor__booster': ["gblinear"], # [None, "gblinear", "gbtree"],
           "model__regressor__validate_parameters":[True],
              }

# "model__regressor__booster"
# Objective candidate: multi:softmax
# Objective candidate: multi:softprob
# Objective candidate: reg:squarederror
# Objective candidate: reg:squaredlogerror
# Objective candidate: reg:logistic
## Objective candidate: reg:linear
# Objective candidate: reg:pseudohubererror
# Objective candidate: reg:gamma
# Objective candidate: reg:absoluteerror

## DOC: https://xgboost.readthedocs.io/en/stable/parameter.html



In [28]:
X_train.describe()

Unnamed: 0,inundation_duration_h,water_depth_cm,contaminations.0,flowvelocity,emergency_measures.1,emergency_measures.2,emergency_measures.3,emergency_measures.4,emergency_measures.8,overall_problem_house,protect_valuables_impl,water_barriers_impl,pumping_equipment_impl,elevation_building_impl,resistant_material_building_impl,electricity_higher_impl,flood_protections_impl,flood_experience,elevation_building_height_cm,bage,b_area,hh_monthly_income_cat,shp_owner,shp_sector,shp_employees,shp_avgmonthly_sale_cat,resilience_govern_careing_increases,shp_content_value_euro,shp_registered_capital_euro
count,281.0,285.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,239.0,268.0,285.0,268.0,286.0,286.0,286.0,286.0,245.0,240.0,254.0
mean,0.046026,0.23351,0.08042,0.492133,0.402098,0.426573,0.706294,0.534965,0.045455,0.909091,0.477273,0.551573,0.628497,0.442308,0.958042,0.746503,0.975524,0.734266,0.243724,0.193507,0.077115,0.333156,0.029178,0.109118,0.027972,0.19021,0.285714,0.074413,0.00869
std,0.110663,0.181609,0.272418,0.310177,0.491181,0.495446,0.456257,0.49965,0.208664,0.287984,0.484324,0.470356,0.469566,0.413789,0.187284,0.418839,0.150481,0.249942,0.148839,0.145611,0.09846,0.160124,0.130201,0.202263,0.067409,0.192969,0.286299,0.083064,0.064125
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.006263,0.09396,0.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.75,1.0,0.6,0.153846,0.1,0.030364,0.285714,0.0,0.0,0.0,0.0,0.0,0.031078,0.000474
50%,0.010438,0.194631,0.0,0.5,0.0,0.0,1.0,1.0,0.0,1.0,0.25,0.75,1.0,0.75,1.0,1.0,1.0,0.8,0.230769,0.17,0.062753,0.285714,0.017241,0.077922,0.030303,0.2,0.25,0.058501,0.001
75%,0.02714,0.328859,0.0,0.75,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1.0,0.346154,0.24,0.089069,0.428571,0.022989,0.13961,0.030303,0.4,0.5,0.085923,0.003105
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
## iterate over piplines. Each piplines contains precrosseing methods and several  classifier
pipelines = ["pipe_xgb"]
eval_set_list = []

for pipe_name in pipelines:

    print( f"\nApply XGBoost on {target}, with pipeline {pipe_name}:")

    ## load sinlge pipeline
    pipe = joblib.load(f'./pipelines/{pipe_name}.pkl')
        

    df_candidates_t = df_candidates

    ## for Box-cox transformation target values have to be > 0.0, therefore a constant is added to all target values
    ## due that it is a constant, it doesnt influence model performance
    df_candidates_t[target] = df_candidates_t[target] + 0.0001

    ## drop samples where target is nan
    #df_candidates_t = df_candidates_t.dropna()

    #print("Amount of missing target values should be zero: ", df_candidates_t[target].isna().sum())
    print("Uses ", df_candidates_t.shape[0], " records, from those have ", 
        { (df_candidates_t[target][df_candidates_t[target]==0.0]).count() }, f" records zero {target.split('_')[1]}")

    ## drop samples where target is nan
    print(f"Dropping {df_candidates_t[f'{target}'].isna().sum()} records from entire dataset due that these values are nan in target variable")
    df_candidates_t = df_candidates_t[ ~df_candidates_t[f"{target}"].isna()]




    X_unscaled = df_candidates_t.drop(target, axis=1)  # remove  target from X
    y = df_candidates_t[target]
 
    ## test train split
    X_train, X_test, y_train, y_test = train_test_split(
        X_unscaled, y, test_size=0.25, 
        random_state=seed, shuffle=True)
    
    ## save evaluation set for later usage in feature importance
    eval_set =  pd.concat([y_test, X_test], axis=1) #[(X_test, y_test)]
    eval_set_list.append({pipe_name : eval_set})
      
    print("Training set size", X_train.shape[0])
    print("Test set size", X_test.shape[0])

    ## normalize data 
    X_train, X_test = fs.normalize_X(X_train, X_test)
        
    ## Hyperparmaters and CV
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=seed)
    model_cv = RandomizedSearchCV(
        estimator= pipe,
        param_distributions=param_grid, #
        #param_distributions=param_bag_grid, 
        cv=cv, 
        scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
        refit=True,   ## Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting.
                        ## If refit=False, clf.fit() will have no effect because the GridSearchCV object inside the pipeline will be reinitialized after fit().
                        ## ! When refit=True, the GridSearchCV will be refitted with the best scoring parameter combination on the whole data that is passed in fit()
        random_state=seed
    )
    ## Fit model
    model_cv.fit(X_train, y_train)   
    # ## Fit best model on training set
    # model_cv.fit(
    #     X_train, y_train,
    #     model__early_stopping_rounds=3,
    #     model__eval_metric="mae",
    #     model__eval_set=[(X_test, y_test)],
    #     model__verbose=False
    #     )
    model =  model_cv.best_estimator_

    if transformation:
       
        model_log_pipe = joblib.load(f'./pipelines/pipe_xgb_log.pkl')
        model_log_cv = RandomizedSearchCV(
            estimator= model_log_pipe,
            param_distributions=param_grid_transform,
            cv=cv, 
            scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
            refit=True,        
            random_state=seed
        )
        ## Fit model with log-transformed target
        model_log_cv.fit(X_train, y_train)   
        model_log = model_log_cv.best_estimator_


        ## quantile transformed target
        model_quantile_pipe = joblib.load(f'./pipelines/pipe_xgb_quantile.pkl')
        model_quantile_cv = RandomizedSearchCV(
            estimator= model_quantile_pipe,
            param_distributions=param_grid_transform, 
            cv=cv, 
            scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
            refit=True,        
            random_state=seed
        )
        ## Fit model with qunatile-transformed target
        model_quantile_cv.fit(X_train, y_train)   
        model_quantile = model_quantile_cv.best_estimator_


        ## Box cox 
        model_boxcox_pipe = joblib.load(f'./pipelines/pipe_xgb_boxcox.pkl')
        model_boxcox_cv = RandomizedSearchCV(
            estimator= model_boxcox_pipe,
            param_distributions=param_grid_transform,
            cv=cv, 
            scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
            refit=True,        
            random_state=seed
        )
        ## Fit model with reciprocal-transformed target
        model_boxcox_cv.fit(X_train, y_train)   
        model_boxcox = model_boxcox_cv.best_estimator_


        ## square root 
        model_sqrt_pipe = joblib.load(f'./pipelines/pipe_xgb_sqrt.pkl')
        model_sqrt_cv = RandomizedSearchCV(
            estimator= model_sqrt_pipe,
            param_distributions=param_grid_transform,
            cv=cv, 
            scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
            refit=True,        
            random_state=seed
        )
        ## Fit model with reciprocal-transformed target
        model_sqrt_cv.fit(X_train, y_train)   
        model_sqrt = model_sqrt_cv.best_estimator_


    ## Evaluate
    ## print evaluation report + check for overfitting 
    print("\nEvaluation without transformed target:")
    print("\nTraining set")
    y_pred_train = model.predict(X_train)
    e.evaluation_report(y_train, y_pred_train)

    print("\nTesting set")
    y_pred = model.predict(X_test)
    e.evaluation_report(y_test, y_pred)
    #print(e.compute_score(y_test, y_pred))

    if transformation:
        for model in [{model_log:"natural log"}, {model_quantile: "quantile"}, {model_boxcox:"box-cox"}, {model_sqrt:"sqrt"}]:
            transf_type = list(model.values())[0]
            model = list(model.keys()) [0]
            #transf_type = str(model).split("transformer=")[1].split("Transformer")[0]
            print(f"\nEvaluation with {transf_type}-transfomred target:")
            print("\nTraining set")
            y_pred_train = model.predict(X_train)
            e.evaluation_report(y_train, y_pred_train)

            print("\nTesting set")
            y_pred = model.predict(X_test)
            e.evaluation_report(y_test, y_pred)
            #print(e.compute_score(y_test, y_pred))



Apply XGBoost on Target_contentloss_euro, with pipeline pipe_xgb:
Uses  397  records, from those have  {226}  records zero contentloss
Dropping 15 records from entire dataset due that these values are nan in target variable
Training set size 286
Test set size 96
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Param

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\compose\_target.py", line 250, in fit
    self._fit_transformer(y_2d)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\compose\_target.py", line 184, in _fit_transformer
    self.transformer_.fit(y)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\preprocessing\_data.py", line 3123, in fit
    self._fit(X, y=y, force_transform=False)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\preprocessing\_data.py", line 3147, in _fit
    X = self._check_input(X, in_fit=True, check_positive=True)
  File "c:\Anaconda\envs\py396_c3\lib\site-packages\sklearn\preprocessing\_data.py", line 3397, in _check_input
    raise ValueError(
ValueError: The Box-Cox transformation can only be applied to strictly positive data


In [None]:
print(stats.describe(y_test), "\n")
print(stats.describe(model.predict(X_test)))
print(stats.describe(model_log.predict(X_test)))
print(stats.describe(model_quantile.predict(X_test)))
print(stats.describe(model_boxcox.predict(X_test)))

# kurtosis= peak from gaussian distrib, Negative kurtosis means that the data has fewer extreme outliers than a normal distribution.

## TODO make a boxplot

In [None]:
# importances = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

# df_importance = pd.DataFrame(
#     {"importances" : importances[0]},
#     index=X_train.columns.to_list(),
#     ) 
# df_importance = df_importance.sort_values("importances", ascending=False)  # get most important features to the top
# print("Most important features:", df_importance.iloc[:5].index.to_list())
# #df_importance = df_importance.loc[df_importance.importances >= 0.000000, : ]

# df_importance

In [215]:
model_cv.best_params_

y_train

##  MAE: + in target unit +  less likely to be affected by extreme values.
## MAPE: r (MAPE) quantifies the average absolute difference between 
## the anticipated and observed values of the target variable as a percentage of the observed value. 
## The method works well for assessing models where the target variable spans a broad range of scales

# 75 % in train mit 300 trress, subsam0.8, maxdep=3, aber bad teest R2


## R²-Score: 0.46
# {'model__validate_parameters': True,
#  'model__n_estimators': 800,
#  'model__max_depth': 10,
#  'model__eta': 0.2}


5         0.0
180       0.0
48      793.2
252       0.0
79       36.8
        ...  
74        0.0
110    1189.8
278       0.0
361    1473.0
106       0.0
Name: Target_contentloss_euro, Length: 324, dtype: float64

## Evaluation

In [179]:
## reload models

# fi_cols =  df_importance.loc[df_importance.importances >= 0.0000001 , : ].index.to_list()#.shape
# print(len(fi_cols))
# print(fi_cols)

# # model_cv = RandomizedSearchCV(
# #     estimator=XGBRegressor(), 
# #     param_distributions=param_grid, #
# #     #param_distributions=param_bag_grid, 
# #     cv=cv, 
# #     scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
# #     refit=False,   ## Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting.
# #                     ## If refit=False, clf.fit() will have no effect because the GridSearchCV object inside the pipeline will be reinitialized after fit().
# #                     ## ! When refit=True, the GridSearchCV will be refitted with the best scoring parameter combination on the whole data that is passed in fit()
# #     random_state=seed
# # )
# # Fit model
# #model =XGBRegressor()# model_cv.best_estimator_
# model.fit(X_train.loc[:, fi_cols], y_train)   

# #print('Train R^2 Score : %.3f'%model_cv.best_estimator_.score(X_train.loc[:, fi_cols], y_train))
# #print('Test R^2 Score : %.3f'%model_cv.best_estimator_.score(X_test.loc[:, fi_cols], y_test))
# #print("CV score: ", model_cv.best_score_ ,  model_cv.best_estimator_.score(X_train, y_train),  model_cv.best_estimator_.score(X_test, y_test))
# ## Evaluate
# ## print evaluation report + check for overfitting 
# print("\nTraining set")
# y_pred_train = model.predict(X_train.loc[:, fi_cols])
# #y_pred_train = model_cv.best_estimator_.predict(X_train)
# e.evaluation_report(y_train, y_pred_train)

# print("\nTesting set")
# #y_pred = model_cv.best_estimator_.predict(X_test)
# y_pred = model.predict(X_test.loc[:, fi_cols])
# e.evaluation_report(y_test, y_pred)


In [None]:


##pipelines = ["pipe_bag_en"]#, 
pipelines = ["pipe_en" ]#
pipe_name = pipelines[0]

#model_eval = pickle.load(open(f"./models_trained/best_elasticnet_{target}_{pipe_name}.sav", 'rb'))
#elastic_net_eval.get_params()



In [180]:
print('R^2 training set', round(model.score(X_train, y_train)*100, 2), ' %')
print('R^2 test set', round(model.score(X_test, y_test)*100, 2))

learning_rate = [0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]



## Plot tagret vs its log-transformed version

## Plot tagret vs its log-transformed version

In [181]:
## Plot learning rate see if    
means = model_cv.cv_results_['mean_test_score']
stds = model_cv.cv_results_['std_test_score']
params = model_cv.cv_results_['params']

# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

# plot results
# scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
# for i, value in enumerate(learning_rate):
#     plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
# for mean, stdev, param in zip(means, stds, params):
# 	print("%f (%f) with: %r" % (mean, stdev, param))
# plot
#plt.use('Agg')
import matplotlib
matplotlib.use('Agg')


plt.errorbar(learning_rate, means, yerr=stds)
plt.legend()
plt.xlabel('learning_rate')
plt.ylabel('Log Loss')
plt.savefig('log_loss_vs_learning_rate.png')
plt.show()
# scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
# for i, value in enumerate(learning_rate):
#     plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
# plt.legend()
# plt.xlabel('learning_rate')
# plt.ylabel('Log Loss')
# plt.savefig('n_estimators_vs_learning_rate.png')

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


In [182]:
#plt.savefig(f"./models_trained/FI_{target}.png", bbox_inches='tight')
#sns_plot.figure.savefig("output.png")
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))


0.6694820470445055
-0.008858364862553536


### Plot prediction error 

In [183]:
#y_pred_log_train = model_log.predict(X_train)
y_pred = model.predict(X_test)
y_pred_log = model_log.predict(X_test)
y_pred_quantile = model_quantile.predict(X_test)
y_pred_boxcox = model_boxcox.predict(X_test)

f, (ax0, ax1) = plt.subplots( 2, 4,  sharey="row", figsize=(15, 8)) # 


PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred,
    kind="actual_vs_predicted",
    ax=ax0[0],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_log,
    kind="actual_vs_predicted",
    ax=ax0[1],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_quantile,
    kind="actual_vs_predicted",
    ax=ax0[2],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_boxcox,
    kind="actual_vs_predicted",
    ax=ax0[3],
    scatter_kwargs={"alpha": 0.5},
)
ax0[0].set_title("XGBoost regression \n without target transformation")
ax0[1].set_title("XGBoost regression \n with target log-transformation")
ax0[2].set_title("XGBoost regression \n with target quantile-transformation")
ax0[3].set_title("XGBoost regression \n with target boxcox-transformation")

ax0[0].set_ylim(0, 300)


# Add the score in the legend of each axis
for ax, y_pred in zip([ax0[0], ax0[1], ax0[2],ax0[3]], [y_pred, y_pred_log, y_pred_quantile, y_pred_boxcox]):
    for name, score in e.compute_score(y_test, y_pred).items():
        ax.plot([], [], " ", label=f"{name}={score}")
    ax.legend(loc="upper left")


# plot the residuals vs the predicted values
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred,
    kind="residual_vs_predicted",
    ax=ax1[0],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_log,
    kind="residual_vs_predicted",
    ax=ax1[1],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_quantile,
    kind="residual_vs_predicted",
    ax=ax1[2],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_boxcox,
    kind="residual_vs_predicted",
    ax=ax1[3],
    scatter_kwargs={"alpha": 0.5},
)
ax1[0].set_title("XGBoost regression \n without target transformation")
ax1[1].set_title("XGBoost regression \n with target log-transformation")
ax1[2].set_title("XGBoost regression \n with target quantile-transformation")
ax1[3].set_title("XGBoost regression \n with target boxcox-transformation")

#ax1[0].set_ylim(0,200)


#f.suptitle("Synthetic data", y=1.05)
plt.tight_layout()


## reload models


#model_eval = pickle.load(open(f"./models_trained/xgboost_{target}_{pipe_name}.sav", 'rb'))
#model_eval.get_params()
#dir(model_eval)#.feature_importances_[model_eval.feature_importances_>0.015].shape

In [185]:
model

### Feature importance
Have the same feature importance method across all applied ML models


In [152]:

# df_importance_package = pd.DataFrame({
#     "name" : X_train.columns.to_list(),
#     "importances" : model.feature_importances_,
#      }) 
# df_importance_package.sort_values("importances", ascending=False)[:10]

In [201]:
# ## Permuation feature importance
# result = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

# df_importance = pd.DataFrame({
#     "name" : X_train.columns.to_list(),
#     "importances" : result[0],
# #    "importances" : np.abs(result[0]),
#      }) 
# df_importance = df_importance.sort_values("importances", ascending=True)  # get most important features to the top
# df_importance.tail(18)

Unnamed: 0,name,importances
3,flowvelocity,0.002089
21,hh_monthly_income_cat,0.00329
22,shp_owner,0.003907
11,water_barriers_impl,0.00408
17,flood_experience,0.005342
5,emergency_measures.2,0.005879
27,resilience_more_future_affected,0.006793
26,shp_profits_last5years,0.011007
0,inundation_duration_h,0.015523
25,shp_avgmonthly_sale_cat,0.018028


# plot feature importance


In [None]:
# reload evalation set
eval_set = eval_set_list[0][pipe_name]


importances = e.permutation_feature_importance(model_boxcox, 
#importances = e.permutation_feature_importance(model, 
                                               X_test,
                                               y_test,
                                               #eval_set.drop(target, axis=1), 
                                               #eval_set[target], 
                                               repeats=5, seed=seed
                                               )

## feature importance scores
df_importance = pd.DataFrame({
    "name" : X_unscaled.columns.to_list(),
    "importances" : importances[0],
     }) 

# drop features which dont reduce the loss
df_importance = df_importance.loc[df_importance.importances > 0.0000, : ] 
df_importance = df_importance.sort_values("importances", ascending=True)


#fig, ax = plt.subplots(figsize=(12,5))
plt.figure(figsize=(8, 10))
plt.barh(df_importance.name, df_importance.importances)
#plt.barh(df_importance.name[-18:], df_importance.importances[-18:])

plt.xticks(
    #ticks = range(len(selected_feat)),
    #labels = X_unscaled.iloc[:,selected_feat],
    rotation = 90
    )
plt.title(f"XGBoost: Feature Importances for {target}")
plt.show()

## save importnace scores  and figure
filepath = f'./models_evaluation/best_xgb_importance_scores_{target}_{pipe_name}'
#if not glob(filepath):
#    df_importance.to_csv(filename, index = False)


#plt.savefig(f'../../../figures/best_en_feature_importance_{target}_{pipe_name}.png', bbox_inches='tight')





## left overs

##### Hierarchical clustering on Spearman rank correlation

Select only feautres with low collienarity to solve disadvantage of perumation feature importance.
Randomizing one feature would lead to only small importance score - the model performance wouldnt be move influenced - due that the information is included in other correlated features. Removing one feature keeps the similar inforamtion in the other feautres unchanged and the model learns from the correlated feature. Therefore apply hierachical clustering to select less correlated features

See also:
- Brill 2020 (dissertation)
- https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html # code example

In [None]:
# import scipy.cluster.hierarchy as shc

# plt.figure(figsize=(15, 10))
# plt.title("Customers Dendrogram")

# # Selecting Annual Income and Spending Scores by index
# selected_data = X_train.dropna()
# selected_data = selected_data.T # only possible with out nan
# clusters = shc.linkage(selected_data, 
#             method='ward', optimal_ordering=False,
#             metric="euclidean")
# shc.dendrogram(Z=clusters, 
#                #p=20, # p -> value for truncation mode
#                orientation="right",
#                labels=X_train.columns
#                ) 
# plt.show()

# ## TODO adapt with spearman rank order



In [None]:

# from scipy.stats import spearmanr
# from scipy.spatial.distance import squareform
# from scipy.cluster.hierarchy import ward, dendrogram

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# corr = spearmanr(X_unscaled_no_nan).correlation

# # Ensure the correlation matrix is symmetric
# corr = (corr + corr.T) / 2
# np.fill_diagonal(corr, 1)

# # We convert the correlation matrix to a distance matrix before performing
# # hierarchical clustering using Ward's linkage.
# distance_matrix = 1 - np.abs(corr)
# dist_linkage = ward(distance_matrix, checks=False )
# dendro = dendrogram(
#     dist_linkage, labels=X_unscaled_no_nan.columns.tolist(), ax=ax1, leaf_rotation=90
# )
# dendro_idx = np.arange(0, len(dendro["ivl"]))

# ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
# ax2.set_xticks(dendro_idx)
# ax2.set_yticks(dendro_idx)
# ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
# ax2.set_yticklabels(dendro["ivl"])
# fig.tight_layout()

# cluster_ids = shc.fcluster(dist_linkage, 1, criterion="distance")
# cluster_id_to_feature_ids = defaultdict(list)
# for idx, cluster_id in enumerate(cluster_ids):
#     cluster_id_to_feature_ids[cluster_id].append(idx)
# selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

# X_train_sel = X_train[:, selected_features]
# X_test_sel = X_test[:, selected_features]



In [None]:
# closs hyperapram , no model__early_stopping_rounds, repeatedcv wit h10 folds
## best train R2: ntree=30, max_depth =1, no furhter params

# learning_rate = [ 0.00001, 0.0001, 0.001, 0.1, 0.2]#, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9] # store outside, for plotting
# n_estimators = [ 50, 100, 200, 300, 500, 800]

# param_grid = {'model__n_estimators': n_estimators,
#     #'model__n_estimators': [ 3, 5, 10, 20], # get only low train scores with this
#               'model__max_depth': [1, 2, 3, 5, 7, 8, 10, 15],
#               #'model__max_leaves': [0, 3, 5],
#              # 'model__colsample_bytree': [ 0.3, 0.5, 0.7, 1.0 ], # Percentage of columns to be randomly samples for each tree
#              # 'model__colsample_bynode': [ 0.3, 0.5, 0.7, 1.0], # nbr of feautres for each split point
#              # 'model__eta': learning_rate,  # == eta
#             #   'model__gamma': [0.2, 0.3, 0.5, 0.8, 1, 3] , # min_split_loss -  larger gamma is, the more conservative the algorithm is
#               'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
#             #  'model__reg_alpha': [0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
#             #  'model__reg_lambda': [0.0, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
#             #   'model__min_child_weight': [0, 1, 2, 3, 4,],
#             #   "model__max_delta_step":  [0, 3, 5, 6, 7],           # for LogisticReg good to solve imbalance 
#           #   'model__objective': [None, 'reg:absoluteerror'],#'multi:softprob,'reg:squarederror','reg:models_trained'],
#           #  # 'model__tree_method': ["hist", "gpu_hist"],
#           #   'model__booster': [None, "gblinear", "gbtree"],
#             "model__validate_parameters":[True],
#               }

# # 'model__scale_pos_weight': [0.0, 0.3, 0.5, 0.7, 0.9, 1.0],  # only  for clasifcation: handle imbalance, ratio between negative and positive examples

# # Objective candidate: multi:softmax
# # Objective candidate: multi:softprob
# # Objective candidate: reg:squarederror
# # Objective candidate: reg:squaredlogerror
# # Objective candidate: reg:logistic
# ## Objective candidate: reg:linear
# # Objective candidate: reg:pseudohubererror
# # Objective candidate: reg:gamma
# # Objective candidate: reg:absoluteerror

# ## DOC: https://xgboost.readthedocs.io/en/stable/parameter.html
