In [187]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for HCMC survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "a.buch@stud.uni-heidelberg.de"

## Feature selection done by eXtreme Gradient Boosting (XGBoost)


In [188]:
import os, sys
import copy as cp
from glob import glob
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, confusion_matrix, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold, cross_val_score, cross_validate
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.pipeline import Pipeline

import pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


sys.path.insert(0, "../../../")
import utils.utils_feature_selection as fs
import utils.utils_evaluation as e
import utils.utils_figures as f
import utils.settings as s
import utils.pipelines_continous as p

s.init()
seed = s.seed

pd.set_option('display.max_columns', None)
plt.figure(figsize=(20, 10))


import warnings
warnings.filterwarnings('ignore')


In [203]:
df_candidates = pd.read_excel("../../../input_survey_data/input_data_contentloss.xlsx")
#df_candidates = pd.read_excel("../../../input_survey_data/input_data_businessreduction.xlsx")

targets = ["Target_contentloss_euro", "Target_businessreduction"]
target = targets[0]

df_candidates.tail(2)


Unnamed: 0,Target_contentloss_euro,inundation_duration_h,water_depth_cm,contaminations.0,flowvelocity,emergency_measures.1,emergency_measures.2,emergency_measures.3,emergency_measures.4,emergency_measures.6,emergency_measures.7,emergency_measures.8,emergency_measures.9,overall_problem_house,protect_valuables_impl,water_barriers_impl,pumping_equipment_impl,elevation_building_impl,resistant_material_building_impl,electricity_higher_impl,flood_protections_impl,flood_experience,elevation_building_height_cm,elevation_rel2surrounding_cat,bage,b_area,hh_monthly_income_cat,shp_owner,shp_sector,shp_employees,shp_avgmonthly_sale_cat,shp_finance_investments,shp_risk_tolerance,shp_monetary_resources4prevention,resilience_city_protection,resilience_left_alone,resilience_neighbor_management,perception_who_responsible4protection.Rank1,contaminations_light,contaminations_heavy,shp_content_value_euro,shp_registered_capital_euro
395,0.0,4.0,70.0,0,1,1,0,1,0,1,0,0,0,1,1,5,1,1,5,5,5,5,70.0,1,,130.0,,1,17,2,3,1,3.0,3.0,1.0,5,1.0,2.0,1,0,,11047.7
396,0.0,3.0,100.0,0,1,1,0,1,0,0,0,0,0,0,1,5,5,5,5,5,5,4,,0,5.0,33.0,1.0,1,11,2,3,1,3.0,4.0,,5,,3.0,1,0,,736.5


In [204]:
## delete features with more than 10% missing values

print("Percentage of missing valeus per feature\n", df_candidates.isna().mean().sort_values(ascending=False)[:15] ) 
## --> kepp threshold by 15% less would delete important features e.g. content values, registerd capitaletc.

Percentage of missing valeus per feature
 elevation_building_height_cm                   0.158690
shp_content_value_euro                         0.158690
shp_registered_capital_euro                    0.118388
perception_who_responsible4protection.Rank1    0.070529
shp_risk_tolerance                             0.070529
bage                                           0.068010
hh_monthly_income_cat                          0.060453
shp_monetary_resources4prevention              0.045340
resilience_city_protection                     0.037783
Target_contentloss_euro                        0.037783
resilience_neighbor_management                 0.027708
inundation_duration_h                          0.022670
b_area                                         0.005038
water_depth_cm                                 0.002519
emergency_measures.7                           0.000000
dtype: float64


## select only damage cases

In [205]:

# print(f"Removing {df_candidates.loc[df_candidates[target]==0,:].shape[0]} zero loss records")
# df_candidates = df_candidates.loc[df_candidates[target]!=0,:]

# print(f"Keeping {df_candidates.shape} damage cases for model training and evaluation")


In [206]:
# df_candidates.columns
# df_candidates = df_candidates[[
#     'Target_contentloss_euro', 'inundation_duration_h', 'water_depth_cm',
#     'contaminations.0', 'flowvelocity', 
#     'emergency_measures.1','emergency_measures.2', 
#     #'emergency_measures.3', 'emergency_measures.4',
#     #'emergency_measures.6', 'emergency_measures.7', 'emergency_measures.8',
#     'emergency_measures.9', 'overall_problem_house',
#     #'protect_valuables_impl', 'water_barriers_impl',
#     #'pumping_equipment_impl', 'elevation_building_impl',
#     #'resistant_material_building_impl', 'electricity_higher_impl',
#     #'flood_protections_impl', 'flood_experience',
#     'elevation_building_height_cm', 'elevation_rel2surrounding_cat', 'bage',
#     'b_area', 'hh_monthly_income_cat', 'shp_owner', 'shp_sector',
#     'shp_employees', 'shp_avgmonthly_sale_cat', 'shp_finance_investments',
#     #'shp_risk_tolerance', 'shp_monetary_resources4prevention',
#     #'resilience_city_protection', 'resilience_left_alone',
#     #'resilience_neighbor_management',
#     #'perception_who_responsible4protection.Rank1', 'contaminations_light',
#     #'contaminations_heavy', '
#      'shp_content_value_euro',
#     'shp_registered_capital_euro'
# ]]

df_candidates = df_candidates.drop([
        "contaminations_light", "contaminations_heavy", "emergency_measures.9",
        "emergency_measures.6", "emergency_measures.7", # BETTER DROP test rmv da nur binary
        "elevation_rel2surrounding_cat",  # TEST
        #"shp_profits_last5years", # TEST
        "shp_monetary_resources4prevention", # TEST
        'shp_finance_investments',
        'shp_risk_tolerance',
        'resilience_city_protection', 'resilience_left_alone',
        'resilience_neighbor_management', 
        # already in d_cleaning RM 'resilience_more_future_affected','resilience_govern_careing', 'resilience_govern_careing_increases',
        'perception_who_responsible4protection.Rank1',
        # ALREADY RM in d_cleaning: 'perception_private_economy_future' 
        ], axis=1)


In [207]:
df_candidates.columns

Index(['Target_contentloss_euro', 'inundation_duration_h', 'water_depth_cm',
       'contaminations.0', 'flowvelocity', 'emergency_measures.1',
       'emergency_measures.2', 'emergency_measures.3', 'emergency_measures.4',
       'emergency_measures.8', 'overall_problem_house',
       'protect_valuables_impl', 'water_barriers_impl',
       'pumping_equipment_impl', 'elevation_building_impl',
       'resistant_material_building_impl', 'electricity_higher_impl',
       'flood_protections_impl', 'flood_experience',
       'elevation_building_height_cm', 'bage', 'b_area',
       'hh_monthly_income_cat', 'shp_owner', 'shp_sector', 'shp_employees',
       'shp_avgmonthly_sale_cat', 'shp_content_value_euro',
       'shp_registered_capital_euro'],
      dtype='object')

#### test remove three highest targets

In [208]:
# print("Highest content damages: \n", df_candidates[target].nlargest(5) )#.index

# highest_closs_obs = df_candidates[target].nlargest(3).index.to_list()
# df_candidates = df_candidates.drop(highest_closs_obs, axis=0)
# print("Dropped three records with the three highest content losses")

### Target varibale distribution

In [209]:
# df_candidates[target].hist(bins=10, figsize=(8, 8))
# plt.ylim(1, 20)
# plt.xlim(1, 20000)
# plt.show()

#df_candidates[target].describe()


In [210]:
# plt.figure(figsize=(8, 8))

# print(df_candidates[target].describe())

# closses = df_candidates[target] #df_candidates[target][df_candidates[target] != 0.0 ]
# closses.hist(bins=2000, figsize=(8, 8))
# plt.ylim(1, 20)
# plt.xlim(1, 20000)
# plt.show()

## Fit model 

In [211]:
# importances = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

# df_importance = pd.DataFrame(
#     {"importances" : importances[0]},
#     index=X_train.columns.to_list(),
#     ) 
# df_importance = df_importance.sort_values("importances", ascending=False)  # get most important features to the top
# print("Most important features:", df_importance.iloc[:5].index.to_list())
# #df_importance = df_importance.loc[df_importance.importances >= 0.000000, : ]

# df_importance

In [212]:
learning_rate = [ 0.0001, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8] # store outside, for plotting
#n_estimators = [ 50, 100, 200, 300, 500, 800] # 30, 50, 70, 100, 200, 300, 400], 

param_grid = {#'model__n_estimators': n_estimators,
              'model__n_estimators': [3, 5, 8, 10, 12, 15, 18, 20 ,30 ,40], # get only low train scores with this
              'model__max_depth': [1, 2, 3, 5, 7, 10, 15],              #'model__max_leaves': [0, 3, 5],
            # 'model__colsample_bytree': [0.1, 0.3, 0.5, 0.7, 1.0 ], # Percentage of columns to be randomly samples for each tree
            # 'model__colsample_bynode': [0.1, 0.3, 0.5, 0.7, 1.0], # nbr of feautres for each split point
             #'model__learning_rate': learning_rate,  # == eta
             # 'model__gamma': [0.1, 0.2, 0.3, 0.5 ] , # min_split_loss -  larger gamma is, the more conservative the algorithm is
            #'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
             # 'model__reg_alpha': [0.0, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
             # 'model__reg_lambda': [0.0,  0.05, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
              #'model__min_child_weight': [0, 1, 2, 3, 4,],
            #   "model__max_delta_step":  [0, 3, 5, 6, 7],           # for LogisticReg good to solve imbalance 
              'model__objective': [None, 'reg:squarederror', 'reg:logistic', 'reg:absoluteerror'],#'multi:softprob,'reg:squarederror','reg:models_trained'],
          #  # 'model__tree_method': ["hist", "gpu_hist"],
           'model__booster': ["gblinear"], # [None, "gblinear", "gbtree"],
            "model__validate_parameters":[True],
              }

# from sklearn.ensemble import RandomForestRegressor, XGBRFRegressor, GradientBoostingRegressor

# param_grid = {#'model__n_estimators': n_estimators,
#               'n_estimators': [3, 5, 8, 10, 12, 15, 18, 20 ,30 ,40], # 30, 50, 70, 100, 200, 300, 400], # get only low train scores with this
#               'max_depth': [1, 2, 3, 5, 7, 10, 15],              #'model__max_leaves': [0, 3, 5],
#               #'min_samples_split': np.arange(0.0, 1.0, 0.1)
#              #'model__learning_rate': learning_rate,  # == eta
#             #'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
#              # 'model__reg_alpha': [0.0, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
#              # 'model__reg_lambda': [0.0,  0.05, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
#             #   'model__min_child_weight': [0, 1, 2, 3, 4,],
#             #"model__validate_parameters":[True],
#               }
# 'model__scale_pos_weight': [0.0, 0.3, 0.5, 0.7, 0.9, 1.0],  # only  for clasifcation: handle imbalance, ratio between negative and positive examples

# Objective candidate: multi:softmax
# Objective candidate: multi:softprob
# Objective candidate: reg:squarederror
# Objective candidate: reg:squaredlogerror
# Objective candidate: reg:logistic
## Objective candidate: reg:linear
# Objective candidate: reg:pseudohubererror
# Objective candidate: reg:gamma
# Objective candidate: reg:absoluteerror

## DOC: https://xgboost.readthedocs.io/en/stable/parameter.html


In [213]:
## iterate over both targets and store results 

fi_threshold = 0.000
eval_set_list = []

#importances_threshold = {"target_contentloss_euro": 0.000, "Target_businessreduction": 0.000 }

plt.ioff()  # Prevent plt showing stuff

print(target)


## iterate over piplines. Each piplines contains precrosseing methods and several  classifier
pipelines = ["pipe_xgb"]#, "pipe_ximput_xgb"]
    

for pipe_name in pipelines:

    print( f"\nApply XGBoost on {target}, with pipeline {pipe_name}:")

    ## load sinlge pipeline
    pipe = joblib.load(f'./pipelines/{pipe_name}.pkl')
    
    
    df_candidates_t = df_candidates

    ## TEST run xgb with and without nan in X
    ## clean df from remaining records containg nan
    #df_candidates_t = df_candidates_t.dropna()
    #df_candidates_t = df_candidates_t[df_candidates_t[target]!=0.0]

    #print("Amount of missing target values should be zero: ", df_candidates_t[target].isna().sum())
    print("Uses ", df_candidates_t.shape[0], " records, from those have ", 
        { (df_candidates_t[target][df_candidates_t[target]==0.0]).count() }, f" records zero {target.split('_')[1]}")


    ## drop samples where target is nan
    print(f"Dropping {df_candidates_t[f'{target}'].isna().sum()} records from entire dataset due that these values are nan in target variable")
    df_candidates_t = df_candidates_t[ ~df_candidates_t[f"{target}"].isna()]


    # if pipe_name != "pipe_ximput_xgb":
    #     pass
    #     # ## drop instances where target is nan
    #     # print("Before dropping records with nan", df_candidates_t.shape)
    #     # df_candidates_t = df_candidates_t.dropna()
    #     # print("After dropping records with nan", df_candidates_t.shape)
    # else:
    #     ##impute nans in X
    #     for c in df_candidates_t.drop(targets, axis=1).columns: 
    #         #df_candidates_t[f"{c}"].fillna(value=np.nanmedian(df_candidates_t[f"{c}"]), inplace=True)
    #         df_candidates_t[c].fillna(df_candidates_t[c].median(), inplace=True)
    
    # split into predictors and target variable
    X_unscaled = df_candidates_t.drop([target], axis=1)  # remove targets from X
    y = df_candidates_t[target]
    
    ## test train split
    X_train, X_test, y_train, y_test = train_test_split(
        X_unscaled, y, test_size=0.15, 
        random_state=seed, shuffle=True
    )
    
    ## save evaluation set for later usage in feature importance
    eval_set =  pd.concat([y_test, X_test], axis=1) #[(X_test, y_test)]
    eval_set_list.append({pipe_name : eval_set})
    
    print("Training set size", X_train.shape[0])
    print("Test set size", X_test.shape[0])

    ## normalize data 
    X_train, X_test = fs.normalize_X(X_train, X_test)

    ## Hyperparmaters and CV
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=seed) 
    #cv = RepeatedKFold(n_splits=20, n_repeats=2, random_state=seed)

    model_cv = RandomizedSearchCV(   #         #GridSearchCV(
        estimator=pipe, 
        param_distributions=param_grid, 
        random_state=seed,
        cv=cv,
        scoring= "neg_mean_absolute_error",
        #best_ntree_score(XGBRegressor, X_train, y_train) , #"neg_mean_absolute_error",   #
        refit=True,   ## Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting.
                        ## If refit=False, clf.fit() will have no effect because the GridSearchCV object inside the pipeline will be reinitialized after fit().
                        ## ! When refit=True, the GridSearchCV will be refitted with the best scoring parameter combination on the whole data that is passed in fit()
        verbose=False,
    )
    #         # Nested CV with parameter optimization
    # nested_score = cross_val_score(model_cv, X=X_train, y=y_train, cv=outer_cv)
    # nested_scores = nested_score.mean()
    # print("nested_scores", nested_scores)

    ## Fit best model on training set
    model_cv.fit(
        X_train, y_train,
        model__early_stopping_rounds=3,
        model__eval_metric="mae",
        model__eval_set=[(X_test, y_test)],
        model__verbose=False
        )

    print(f"Best hyperparams: {model_cv.best_params_}")
    # print("Train R^2 Score : %.1f" %model_cv.best_estimator_.score(X_train, y_train))
    #print("MAE of best model: %.1f" %model_cv.best_score_," on iteration ", model_cv.best_estimator_.best_iteration)  

    # fit model again with best hyperparams
    print("Create new XGBoost model based on best hyperparameters")
        ## ** Need to train new model to obtin feautre importance scores **
    #model = model_cv.best_estimator_
    model = XGBRegressor(
                n_estimators = model_cv.best_params_['model__n_estimators'], 
                max_depth = model_cv.best_params_['model__max_depth'],
                #max_leaves = model_cv.best_params_['model__max_leaves'],
                # colsample_bynode = model_cv.best_params_['model__colsample_bynode'],
                # colsample_bytree = model_cv.best_params_['model__colsample_bytree'],
                #learning_rate = model_cv.best_params_['model__learning_rate'],
                # gamma = model_cv.best_params_['model__gamma'],
                # reg_alpha = model_cv.best_params_['model__reg_alpha'],  # Lasso Regularization term on weights 
                # reg_lambda = model_cv.best_params_['model__reg_lambda'],
                # min_child_weight = model_cv.best_params_['model__min_child_weight'],
                # scale_pos_weight = model_cv.best_params_['model__scale_pos_weight'],
                # max_delta_step = model_cv.best_params_['model__max_delta_step'],
                # #objective = model_cv.best_params_['model__objective'],
                #model__booster = model_cv.best_params_['model__booster'],
                random_state=seed,
            )
    model.fit(X_train, y_train,
        early_stopping_rounds=50,
        eval_metric="mae",
        eval_set=[(X_test, y_test)],
        verbose=False
    ) # no early stop due that done before

    ## store best trained model for evaluation
    filename = f'./models_trained/xgboost_{target}_{pipe_name}.sav'
    #pickle.dump(model_cv.best_estimator_, open(filename, 'wb'))
    pickle.dump(model, open(filename, 'wb'))

    ## Evaluate model
    # print(f"Training set score (R^2): {round(model.score(X_train, y_train), 2)}")  # how well did the model on the training set
    # print(f"Test set score (R^2): {model_cv.score(X_test, y_test)}")   # .. compared to the unseen test set for overfitting - acutal not needed
    # r2 = variance explained by model / total variance --> higher r2= better fitted model

    ## get signifcant features based on absolute coeff values
    print("\nSelect features based on permutation feature importance")

    # ## select significant features byPermuation feature importance
    importances = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

    df_importance = pd.DataFrame(
        {"importances" : importances[0]},
        index=X_train.columns.to_list(),
        ) 
    df_importance = df_importance.sort_values("importances", ascending=False)  # get most important features to the top
    print("Most important features:", df_importance.iloc[:5].index.to_list())
    df_importance = df_importance.loc[df_importance.importances >= 0.000000, : ]
    #df_importance.head(5)
    # ## write selected predictors and response to disk
    fs.save_selected_features(
        X_train, 
        pd.DataFrame(y_train, columns=[target]), 
        df_importance.T.columns, 
        filename=f"../../../input_survey_data/selected_predictors/fs_xgboost_{target.split('_')[1]}_{pipe_name}.xlsx"
    )

    ## Evaluate
    ## print evaluation report + check for overfitting 
    print("\nTraining set")
    y_pred_train = model.predict(X_train)
    #y_pred_train = model_cv.best_estimator_.predict(X_train)
    e.evaluation_report(y_train, y_pred_train,
                        filepath=f"./models_evaluation/xgboost/eval_train_{target.split('_')[1]}_{pipe_name}.csv")

    print("\nTesting set")
    #y_pred = model_cv.best_estimator_.predict(X_test)
    y_pred = model.predict(X_test)
    e.evaluation_report(y_test, y_pred, 
                        filepath=f"./models_evaluation/xgboost/eval_test_{target.split('_')[1]}_{pipe_name}.csv")


Target_contentloss_euro

Apply XGBoost on Target_contentloss_euro, with pipeline pipe_xgb:
Uses  397  records, from those have  {226}  records zero contentloss
Dropping 15 records from entire dataset due that these values are nan in target variable
Training set size 324
Test set size 58
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth

In [215]:
model_cv.best_params_

y_train

##  MAE: + in target unit +  less likely to be affected by extreme values.
## MAPE: r (MAPE) quantifies the average absolute difference between 
## the anticipated and observed values of the target variable as a percentage of the observed value. 
## The method works well for assessing models where the target variable spans a broad range of scales

# 75 % in train mit 300 trress, subsam0.8, maxdep=3, aber bad teest R2


## R²-Score: 0.46
# {'model__validate_parameters': True,
#  'model__n_estimators': 800,
#  'model__max_depth': 10,
#  'model__eta': 0.2}

5         0.0
180       0.0
48      793.2
252       0.0
79       36.8
        ...  
74        0.0
110    1189.8
278       0.0
361    1473.0
106       0.0
Name: Target_contentloss_euro, Length: 324, dtype: float64

#### Save best model, its hyperparamters, and evlation scores and selected features

In [177]:
# ## Stored best xgb model for business reduction: best_xgboost_Target_businessreduction_pipe_xgb.sav
# ## Its hyperaprams xgb_best_param_model_cv.best_params_  # early_stop=3

# ## best model and hyperparams (here early stoping =3)
# filename = f'./models_trained/best_xgb_{target}_{pipe_name}.sav'
# if not glob(filename):
#     pickle.dump(model, open(filename, 'wb'))

# best_params = model_cv.best_params_
# best_params = pd.DataFrame({"hyperparameter":best_params.keys(), "value":best_params.values()})
# filepath = f'./models_evaluation/best_xgb_hyperparams_{target}_{pipe_name}.sav'
# if not glob(filepath):
#     best_params.to_csv(filename, index = False)

# ## eval report
# y_pred_train = model.predict(X_train)
# e.evaluation_report(y_train, y_pred_train,
#                     X_unscaled.shape[1], 
#                     filepath=f"./models_evaluation/xgboost/best_eval_train_{target.split('_')[1]}_{pipe_name}.csv")
# y_pred = model.predict(X_test)
# e.evaluation_report(y_test, y_pred, 
#                     X_unscaled.shape[1], 
#                     filepath=f"./models_evaluation/xgboost/best_eval_test_{target.split('_')[1]}_{pipe_name}.csv")

# ## selected features
# fs.save_selected_features(
#         X_train, 
#         pd.DataFrame(y_train, columns=[target]), 
#         df_importance.T.columns, 
#         filename=f"../../../input_survey_data/selected_predictors/best_fs_xgb_{target.split('_')[1]}_{pipe_name}.xlsx"
#     )

model_cv.best_estimator_

In [200]:
df_importance.sort_values("importances", ascending=False)

Unnamed: 0,importances
water_depth_cm,0.287608
bage,0.138476
shp_registered_capital_euro,0.107294
perception_private_economy_future,0.0709
resistant_material_building_impl,0.04024
elevation_building_height_cm,0.035991
elevation_building_impl,0.032262
shp_content_value_euro,0.021485
shp_avgmonthly_sale_cat,0.018028
inundation_duration_h,0.015523


In [179]:
# fi_cols =  df_importance.loc[df_importance.importances >= 0.0000001 , : ].index.to_list()#.shape
# print(len(fi_cols))
# print(fi_cols)

# # model_cv = RandomizedSea
# rchCV(
# #     estimator=XGBRegressor(), 
# #     param_distributions=param_grid, #
# #     #param_distributions=param_bag_grid, 
# #     cv=cv, 
# #     scoring= "neg_mean_absolute_error",##"neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
# #     refit=False,   ## Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting.
# #                     ## If refit=False, clf.fit() will have no effect because the GridSearchCV object inside the pipeline will be reinitialized after fit().
# #                     ## ! When refit=True, the GridSearchCV will be refitted with the best scoring parameter combination on the whole data that is passed in fit()
# #     random_state=seed
# # )
# # Fit model
# #model =XGBRegressor()# model_cv.best_estimator_
# model.fit(X_train.loc[:, fi_cols], y_train)   

# #print('Train R^2 Score : %.3f'%model_cv.best_estimator_.score(X_train.loc[:, fi_cols], y_train))
# #print('Test R^2 Score : %.3f'%model_cv.best_estimator_.score(X_test.loc[:, fi_cols], y_test))
# #print("CV score: ", model_cv.best_score_ ,  model_cv.best_estimator_.score(X_train, y_train),  model_cv.best_estimator_.score(X_test, y_test))
# ## Evaluate
# ## print evaluation report + check for overfitting 
# print("\nTraining set")
# y_pred_train = model.predict(X_train.loc[:, fi_cols])
# #y_pred_train = model_cv.best_estimator_.predict(X_train)
# e.evaluation_report(y_train, y_pred_train)

# print("\nTesting set")
# #y_pred = model_cv.best_estimator_.predict(X_test)
# y_pred = model.predict(X_test.loc[:, fi_cols])
# e.evaluation_report(y_test, y_pred)


# ## fit model again with best hyperparams
# print("Create new XGB model based on selected features:")


In [180]:
learning_rate = [0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


In [181]:
## Plot learning rate see if    
means = model_cv.cv_results_['mean_test_score']
stds = model_cv.cv_results_['std_test_score']
params = model_cv.cv_results_['params']

# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

# plot results
# scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
# for i, value in enumerate(learning_rate):
#     plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
# for mean, stdev, param in zip(means, stds, params):
# 	print("%f (%f) with: %r" % (mean, stdev, param))
# plot
#plt.use('Agg')
import matplotlib
matplotlib.use('Agg')


plt.errorbar(learning_rate, means, yerr=stds)
plt.legend()
plt.xlabel('learning_rate')
plt.ylabel('Log Loss')
plt.savefig('log_loss_vs_learning_rate.png')
plt.show()
# scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
# for i, value in enumerate(learning_rate):
#     plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
# plt.legend()
# plt.xlabel('learning_rate')
# plt.ylabel('Log Loss')
# plt.savefig('n_estimators_vs_learning_rate.png')

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


In [182]:
#plt.savefig(f"./models_trained/FI_{target}.png", bbox_inches='tight')
#sns_plot.figure.savefig("output.png")
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))


0.6694820470445055
-0.008858364862553536


## Reload model

In [183]:
## reload models


#model_eval = pickle.load(open(f"./models_trained/xgboost_{target}_{pipe_name}.sav", 'rb'))
#model_eval.get_params()
#dir(model_eval)#.feature_importances_[model_eval.feature_importances_>0.015].shape

In [185]:
model

### Feature importance
Have the same feature importance method across all applied ML models


In [152]:

# df_importance_package = pd.DataFrame({
#     "name" : X_train.columns.to_list(),
#     "importances" : model.feature_importances_,
#      }) 
# df_importance_package.sort_values("importances", ascending=False)[:10]

In [201]:
## Permuation feature importance
result = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

df_importance = pd.DataFrame({
    "name" : X_train.columns.to_list(),
    "importances" : result[0],
#    "importances" : np.abs(result[0]),
     }) 
df_importance = df_importance.sort_values("importances", ascending=True)  # get most important features to the top
df_importance.tail(18)

Unnamed: 0,name,importances
3,flowvelocity,0.002089
21,hh_monthly_income_cat,0.00329
22,shp_owner,0.003907
11,water_barriers_impl,0.00408
17,flood_experience,0.005342
5,emergency_measures.2,0.005879
27,resilience_more_future_affected,0.006793
26,shp_profits_last5years,0.011007
0,inundation_duration_h,0.015523
25,shp_avgmonthly_sale_cat,0.018028


In [202]:
# plot feature importance

# f.plot_feature_importance(df_importance.importances, n=10, figure_size=(20, 15), target=target)

fig, ax = plt.subplots(figsize=(12,10))

# drop features which dont reduce the loss
df_importance = df_importance.loc[df_importance.importances >= 0.0000, : ] 
#sorted_idx = model_eval.feature_importances_.argsort()
#plt.barh(df_importance.name[-5:], df_importance.importances[-5:])
plt.barh(df_importance.name[-18:], df_importance.importances[-18:])
#plt.bar(X_train.columns[sorted_idx[:15]], model_eval.feature_importances_[sorted_idx[:15]])
plt.xticks(
   # ticks = range(len(X_train.columns[sorted_idx[:15]])),
   # labels =X_train.columns[sorted_idx[:15],],
    rotation = 90
    )
plt.title(f"Feature Importances for {target}")

filepath = f'./models_evaluation/best_xgb_importance_scores_{target}_{pipe_name}'
if not glob(filepath):
    df_importance.to_csv(filename, index = False)

plt.savefig(f'../../../figures/best_xgb_feature_importance_{target}_{pipe_name}2.png', bbox_inches='tight')

plt.show()


##### Hierarchical clustering on Spearman rank correlation

Select only feautres with low collienarity to solve disadvantage of perumation feature importance.
Randomizing one feature would lead to only small importance score - the model performance wouldnt be move influenced - due that the information is included in other correlated features. Removing one feature keeps the similar inforamtion in the other feautres unchanged and the model learns from the correlated feature. Therefore apply hierachical clustering to select less correlated features

See also:
- Brill 2020 (dissertation)
- https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html # code example

In [None]:
# import scipy.cluster.hierarchy as shc

# plt.figure(figsize=(15, 10))
# plt.title("Customers Dendrogram")

# # Selecting Annual Income and Spending Scores by index
# selected_data = X_train.dropna()
# selected_data = selected_data.T # only possible with out nan
# clusters = shc.linkage(selected_data, 
#             method='ward', optimal_ordering=False,
#             metric="euclidean")
# shc.dendrogram(Z=clusters, 
#                #p=20, # p -> value for truncation mode
#                orientation="right",
#                labels=X_train.columns
#                ) 
# plt.show()

# ## TODO adapt with spearman rank order



In [None]:

# from scipy.stats import spearmanr
# from scipy.spatial.distance import squareform
# from scipy.cluster.hierarchy import ward, dendrogram

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# corr = spearmanr(X_unscaled_no_nan).correlation

# # Ensure the correlation matrix is symmetric
# corr = (corr + corr.T) / 2
# np.fill_diagonal(corr, 1)

# # We convert the correlation matrix to a distance matrix before performing
# # hierarchical clustering using Ward's linkage.
# distance_matrix = 1 - np.abs(corr)
# dist_linkage = ward(distance_matrix, checks=False )
# dendro = dendrogram(
#     dist_linkage, labels=X_unscaled_no_nan.columns.tolist(), ax=ax1, leaf_rotation=90
# )
# dendro_idx = np.arange(0, len(dendro["ivl"]))

# ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
# ax2.set_xticks(dendro_idx)
# ax2.set_yticks(dendro_idx)
# ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
# ax2.set_yticklabels(dendro["ivl"])
# fig.tight_layout()


## Reste

In [None]:
# closs hyperapram , no model__early_stopping_rounds, repeatedcv wit h10 folds
## best train R2: ntree=30, max_depth =1, no furhter params

# learning_rate = [ 0.00001, 0.0001, 0.001, 0.1, 0.2]#, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9] # store outside, for plotting
# n_estimators = [ 50, 100, 200, 300, 500, 800]

# param_grid = {'model__n_estimators': n_estimators,
#     #'model__n_estimators': [ 3, 5, 10, 20], # get only low train scores with this
#               'model__max_depth': [1, 2, 3, 5, 7, 8, 10, 15],
#               #'model__max_leaves': [0, 3, 5],
#              # 'model__colsample_bytree': [ 0.3, 0.5, 0.7, 1.0 ], # Percentage of columns to be randomly samples for each tree
#              # 'model__colsample_bynode': [ 0.3, 0.5, 0.7, 1.0], # nbr of feautres for each split point
#              # 'model__eta': learning_rate,  # == eta
#             #   'model__gamma': [0.2, 0.3, 0.5, 0.8, 1, 3] , # min_split_loss -  larger gamma is, the more conservative the algorithm is
#               'model__subsample': [0.0, 0.2, 0.5, 0.6, 0.8, 0.9],  # define subsample of train st prior to growing trees, prevent overfitting
#             #  'model__reg_alpha': [0.5, 1.0, 2.0, 4.0, 5.0, 6.0 ,7.0],   # Lasso Regularization term on weights , higher values = more consrvative 
#             #  'model__reg_lambda': [0.0, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0],  # Ridge Regularization term on weights ,  higher values = more consrvative
#             #   'model__min_child_weight': [0, 1, 2, 3, 4,],
#             #   "model__max_delta_step":  [0, 3, 5, 6, 7],           # for LogisticReg good to solve imbalance 
#           #   'model__objective': [None, 'reg:absoluteerror'],#'multi:softprob,'reg:squarederror','reg:models_trained'],
#           #  # 'model__tree_method': ["hist", "gpu_hist"],
#           #   'model__booster': [None, "gblinear", "gbtree"],
#             "model__validate_parameters":[True],
#               }

# # 'model__scale_pos_weight': [0.0, 0.3, 0.5, 0.7, 0.9, 1.0],  # only  for clasifcation: handle imbalance, ratio between negative and positive examples

# # Objective candidate: multi:softmax
# # Objective candidate: multi:softprob
# # Objective candidate: reg:squarederror
# # Objective candidate: reg:squaredlogerror
# # Objective candidate: reg:logistic
# ## Objective candidate: reg:linear
# # Objective candidate: reg:pseudohubererror
# # Objective candidate: reg:gamma
# # Objective candidate: reg:absoluteerror

# ## DOC: https://xgboost.readthedocs.io/en/stable/parameter.html


In [None]:
cluster_ids = shc.fcluster(dist_linkage, 1, criterion="distance")
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

X_train_sel = X_train[:, selected_features]
X_test_sel = X_test[:, selected_features]


In [None]:
## permutation based FI (build in func from skilearn)

# perm_importance = permutation_importance(xgb, X_test, y_test)
# The visualization of the importance:

# sorted_idx = perm_importance.importances_mean.argsort()
# plt.barh(boston.feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")