In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for HCMC survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "a.buch@stud.uni-heidelberg.de"


# Feature selection done by elastic net 

Before a lasso regression was applied, but lead to not profound set of predictors, each model run lead to a nearly complete new selection of feautres. Therefore and in regard to strong multicollinearity (see pearson Correlation plot) a Elastic Net is tested.

Elastic Net combines feature elimination from Lasso and feature coefficient reduction from the Ridge model to improve your model’s predictions.

*Sources*
Geron 2019: https://learning.oreilly.com/library/view/hands-on-machine-learning/9781492032632/ch04.html#idm45022190228392

In [2]:
import sys
import numpy as np
import pandas as pd

import pickle
import joblib

from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, RepeatedStratifiedKFold, RepeatedKFold, cross_val_score, cross_validate
from sklearn.compose import TransformedTargetRegressor

from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

sys.path.insert(0, "../../../")
import utils.utils_feature_selection as fs
import utils.utils_evaluation as e
import utils.utils_figures as f
import utils.settings as s
import utils.pipelines_continous as p

s.init()
seed = s.seed

import warnings
warnings.filterwarnings('ignore')

%matplotlib

In [None]:
df_candidates = pd.read_excel("../../../input_survey_data/input_data_businessreduction.xlsx")
#df_candidates = pd.read_excel("../../../input_survey_data/input_data_contentloss.xlsx")

print(df_candidates.shape)
df_candidates.tail(2)


(397, 50)


Unnamed: 0,Target_businessreduction,inundation_duration_h,water_depth_cm,contaminations.0,flowvelocity,warning_time_h,emergency_measures.1,emergency_measures.2,emergency_measures.3,emergency_measures.4,...,resilience_left_alone,resilience_neighbor_management,perception_who_responsible4protection.Rank1,perception_private_economy_future,contaminations_light,contaminations_heavy,shp_suppliers_HCMC,shp_content_value_euro,elevation_m,shp_registered_capital_euro
395,,4.0,70.0,0,1,,1,0,1,0,...,5,1.0,2.0,3.0,1,0,1,,1.83886,11047.7
396,0.0,3.0,100.0,0,1,,1,0,1,0,...,5,,3.0,3.0,1,0,1,,1.87277,736.5


## Clean dataset from features and records with too many missing values


In [None]:
print("Percentage of missing valeus per feature\n", df_candidates.isna().mean().sort_values(ascending=False)[:15] ) 

Percentage of missing valeus per feature
shp_content_value_euro                         0.158690
elevation_building_height_cm                   0.158690
shp_registered_capital_euro                    0.118388
Target_businessreduction                       0.090680
shp_risk_tolerance                             0.070529
perception_who_responsible4protection.Rank1    0.070529
bage                                           0.068010
perception_private_economy_future              0.065491
hh_monthly_income_cat                          0.060453
resilience_govern_careing                      0.057935
shp_monetary_resources4prevention              0.045340
resilience_more_future_affected                0.037783
shp_profits_last5years                         0.037783
dtype: float64


In [None]:
## delete features with more than 10% missing values
print("Percentage of missing valeus per feature\n", df_candidates.isna().mean().sort_values(ascending=False)[:15] ) 
#df_candidates = df_candidates[df_candidates.columns[df_candidates.isna().mean() < 0.10]]  # drop feautres with more than 10% missing values
#print(df_candidates.isna().sum(axis=0).sort_values(ascending=False))
## --> drops content values if threshold == 15%

# drop warning time due to 77% nan
df_candidates = df_candidates.drop("warning_time_h", axis=1)

Percentage of missing valeus per feature
shp_content_value_euro                         0.158690
elevation_building_height_cm                   0.158690
shp_registered_capital_euro                    0.118388
Target_businessreduction                       0.090680
shp_risk_tolerance                             0.070529
perception_who_responsible4protection.Rank1    0.070529
bage                                           0.068010
perception_private_economy_future              0.065491
hh_monthly_income_cat                          0.060453
resilience_govern_careing                      0.057935
shp_monetary_resources4prevention              0.045340
resilience_more_future_affected                0.037783
shp_profits_last5years                         0.037783
dtype: float64


In [None]:
# ## TODO fix missing values by filling or removing them:
# ## Idea: remove nan in targets (->get 2 ds) and fill missing values in candidate predictors
# ## or think about to remove predictors with very many missing values , even they are significant correlated to target eg. warning time

## number of missing values for each predictor and for each case
# df_candidates_t.isna().sum().sort_values(ascending=False)
#t = df_candidates_t.drop(["perception_govern_support_future", "warning_time_h"], axis=1)
# t.isna().sum().sum()

# df_candidates.isna().sum(axis=1).sort_values(ascending=False) # nan per case


## select only damage cases

In [None]:
targets = ["Target_contentloss_euro", "Target_businessreduction"]
target = targets[1]


In [None]:
print(f"Removing {df_candidates.loc[df_candidates[target]==0,:].shape[0]} zero loss records")
df_candidates = df_candidates.loc[df_candidates[target]!=0,:]

print(f"Keeping {df_candidates.shape} damage cases for model training and evaluation")

Removing 149 zero loss records
Keeping (248, 49) damage cases for model training and evaluation


## Fit model and select features

In [None]:
## Hyperparameter ranges to test

param_grid = {
    "model__alpha": [ 0.01, 0.1, 1, 2, 3, 4],      # alpha: constant mulitplies penality term, alpha = 0 is equivalent to an OLS solved by the LinearRegression
    "model__max_iter": [1,2,3,4,5, 10],
    "model__l1_ratio": [0.0, 0.25, 0.5, 0.75, 1.0],     # r = 0, equivalent to Ridge Regression (=L2),  r = 1 equivalent to Lasso Regression (=L1) 
    "model__tol": [0.5, 0.7, 1.0, 2.0 ,5.0, 10.0],
    "model__selection": ["cyclic", "random"],
    "model__positive": [True],
    "model__random_state": [seed],
}

## TODO fix  this current workaround with beginning of names for pipes with BaggingRegressor
param_bag_grid = { k.replace('model', 'bagging__estimator') : v for (k, v) in param_grid.items()}



param_bagging = {
    'bootstrap': [True, False],
    'random_state': [seed]
    # 'bootstrap_features': [True, False],
    # 'n_estimators': [20,50,100],
    # 'max_samples': [0.5,1.0, X_train.shape[0]//2,],
    # 'max_features': [0.5,1.0, X_train.shape[1]//2,],
    # oob_score
}
## TODO add hyperparams for Bagging: 
## current defaults: n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False


In [None]:
## iterate over both targets and store results 

eval_set_list = []

#targets = ["Target_contentloss_euro", "Target_businessreduction"]
print(target)

## iterate over piplines. Each piplines contains precrosseing methods and several  classifier
pipelines = ["pipe_bag_en"]#, 
#pipelines = ["pipe_en" ]#, "pipe_ximput_logr"]


for pipe_name in pipelines:

    print( f"\nApply Elastic Net on {target}, with pipeline {pipe_name}:")

    ## load sinlge pipeline
    pipe = joblib.load(f'./pipelines/{pipe_name}.pkl')
        

    df_candidates_t = df_candidates

   
    ## clean df from remaining records containg nan
    df_candidates_t = df_candidates_t.dropna()

    ## clean df from nan in target
    #df_candidates_t = df_candidates_t[ ~df_candidates_t[f"{target}"].isna()]

    # ##impute nans in X
    # for c in df_candidates_t.drop(target, axis=1): 
    #     #df_candidates_t[f"{c}"].fillna(value=np.nanmedian(df_candidates_t[f"{c}"]), inplace=True)
    #     df_candidates_t[c].fillna(df_candidates_t[c].median(), inplace=True)
 
     # print("Uses ", df_candidates_t.shape[0], " records, from those have ", 
    #       { (df_candidates_t[target][df_candidates_t[target]==0.0]).count() }, f" records zero {target.split('_')[1]}")

        
    X_unscaled = df_candidates_t.drop(target, axis=1)  # remove bth targets from X
    y = df_candidates_t[target]
 
    ## test train split
    X_train, X_test, y_train, y_test = train_test_split(
        X_unscaled, y, test_size=0.15, 
        random_state=seed, shuffle=True)
    ## save evaluation set for later usage in feature importance
    eval_set =  pd.concat([y_test, X_test], axis=1) #[(X_test, y_test)]
    eval_set_list.append({pipe_name : eval_set})

    ## normalize data 
    X_train, X_test = fs.normalize_X(X_train, X_test)
        
    print("Training size", X_train.shape)
    print("Test size", X_test.shape)
    
    # model = {'model': BaggingRegressor,   # default bootstrap=True
    #     'kwargs': {'estimator': ElasticNet()},  # TODO: pass 'random_state':seed to baggingregressor
    #     'parameters': param_grid,
    #     }
    ## Hyperparmaters and CV
    #cv = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)        #  StratifiedKFold = fold contains same percantega of class as in orignal training set, addresees imbalancing
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)        #  StratifiedKFold = fold contains same percantega of class as in orignal training set, addresees imbalancing
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=seed)
    model_cv = RandomizedSearchCV(
        estimator=pipe, 
        param_distributions=param_bag_grid, #param_bag_grid, 
        cv=cv, 
        scoring="neg_mean_absolute_error", #"neg_mean_squared_error",#"r2" ,#"neg_mean_absolute_error",   #TODO classifcation: test also e.g "f1" or recall or "f1_micro", "neg_mean_absolute_error",
        refit=True,   ## Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting.
                        ## If refit=False, clf.fit() will have no effect because the GridSearchCV object inside the pipeline will be reinitialized after fit().
                        ## ! When refit=True, the GridSearchCV will be refitted with the best scoring parameter combination on the whole data that is passed in fit()
        #random_state=seed
    )
    ## Fit model
    model_cv.fit(X_train, y_train)   
    
    print('Train R^2 Score : %.3f'%model_cv.best_estimator_.score(X_train, y_train))
    print('Test R^2 Score : %.3f'%model_cv.best_estimator_.score(X_test, y_test))
    print("CV score: ", model_cv.best_score_ ,  model_cv.best_estimator_.score(X_train, y_train),  model_cv.best_estimator_.score(X_test, y_test))

 
    ## fit model again with best hyperparams
    print("Create new Elastic Net model based on best hyperparameters")
    model =  model_cv.best_estimator_
    model.fit(X_train, y_train)

    ## store best trained model for evaluation
    filename = f'./models_trained/elasticnet_{target}_{pipe_name}.sav'
    pickle.dump(model, open(filename, 'wb'))


    ## predict unseen data
    y_pred = model.predict(X_test)

    print('Train R^2 Score : %.3f'%model.score(X_train, y_train)) # how well did the model on the training set
    print('Test R^2 Score : %.3f'%model.score(X_test, y_test)) # .. compared to the unseen test set for overfitting 
    #print("MAE of best model: %.3f" % elastic_net_cv.best_score_)  # TODO check why MAE nan 
    rmse = np.square(np.subtract(np.array(y_test).reshape(-1), y_pred)).mean()#np.sqrt(mean_squared_error(y_test, y_pred))
    #np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE:  {:.2f}".format(rmse), "euros or in %") 


    ## Feature importance + selection
    importances = e.permutation_feature_importance(model, X_test, y_test, repeats=5, seed=seed)

    df_importance = pd.DataFrame(
        {"importances" : importances[0]},
        index=X_train.columns.to_list(),
        ) 
    df_importance = df_importance.sort_values("importances", ascending=False)  # get most important features to the top
    print("Most important features:", df_importance.iloc[:5].index.to_list())
    df_importance = df_importance.loc[df_importance.importances >= 0.000 , : ]
    #df_importance.head(5)
    # ## write selected predictors and response to disk
    fs.save_selected_features(
        X_train, 
        pd.DataFrame(y_train, columns=[target]), 
        df_importance.T.columns, 
        filename=f"../../../input_survey_data/selected_predictors/fs_elasticnet_{target.split('_')[1]}_{pipe_name}.xlsx"
    )

    ## Evaluate
    ## print evaluation report + check for overfitting 
    print("\nTraining set")
    y_pred_train = model.predict(X_train)
    #y_pred_train = model_cv.best_estimator_.predict(X_train)
    e.evaluation_report(y_train, y_pred_train,
                         X_unscaled.shape[1], 
                         filepath=f"./models_evaluation/elastic_net/eval_train_{target.split('_')[1]}_{pipe_name}.csv")

    print("\nTesting set")
    y_pred = model.predict(X_test)
    e.evaluation_report(y_test, y_pred, 
                        X_unscaled.shape[1], 
                        filepath=f"./models_evaluation/elastic_net/eval_test_{target.split('_')[1]}_{pipe_name}.csv")
      


Target_businessreduction

Apply Elastic Net on Target_businessreduction, with pipeline pipe_bag_en:
Training size (99, 48)
Test size (18, 48)
Train R^2 Score : 0.377
Test R^2 Score : 0.018
CV score:  -18.397567897453758 0.377119846841124 0.018313742895916585
Create new Elastic Net model based on best hyperparameters
Train R^2 Score : 0.410
Test R^2 Score : 0.089
RMSE:  645.47 euros or in %
Most important features: ['emergency_measures.7', 'resilience_neighbor_management', 'resilience_govern_careing', 'emergency_measures.6', 'elevation_building_impl']
total features: 48
selected features: 33
dropped features: 15
selected features: 

Saving model to disk: ../../../input_survey_data/selected_predictors/fs_elasticnet_businessreduction_pipe_bag_en.xlsx

Training set

    Model Performance:
        Mean Squared Error: 403.7
        Root Mean Square Error: 20.1
        Mean Absolute Error: 15.4
        Mean Absolute Percentage Error: 92.9
        R²-Score: 0.41
        Adjusted R²-Score: -0.1

In [None]:
# #Snippet from: https://ubc-cs.github.io/cpsc330/lectures/08_hyperparameter-optimization.html#exhaustive-grid-search-sklearn-model-selection-gridsearchcv
#pd.DataFrame(elastic_net_cv.cv_results_).set_index("rank_test_score").sort_index().T
pd.DataFrame(model_cv.cv_results_)[
        [                                   # only important cv evaluation metrics 
            "mean_test_score",
            "mean_fit_time",
            "rank_test_score",
            "param_bagging__estimator__alpha",
            "param_bagging__estimator__l1_ratio",
            "param_bagging__estimator__max_iter"
        ]
    ].set_index("rank_test_score").sort_index().T


rank_test_score,1,2,3,4,5,6,7,8,9,10
mean_test_score,-18.542947,-18.555561,-18.643039,-18.828993,-19.410316,-19.860513,-20.008853,-20.185228,-20.255587,-20.290236
mean_fit_time,0.034736,0.037187,0.033127,0.045321,0.031629,0.043003,0.031868,0.031609,0.038029,0.037148
param_bagging__estimator__alpha,0.1,0.01,0.01,1.0,1.0,1.0,4.0,3.0,4.0,4.0
param_bagging__estimator__l1_ratio,0.5,0.5,0.0,1.0,0.75,0.0,0.0,0.25,0.25,1.0
param_bagging__estimator__max_iter,1.0,2.0,5.0,3.0,4.0,2.0,10.0,2.0,5.0,4.0


## Evaluation

In [None]:
## reload models

print(targets)
target = targets[1]

pipelines = ["pipe_bag_en"]#, 
#pipelines = ["pipe_en" ]#
pipe_name = pipelines[0]

model_eval = pickle.load(open(f"./models_trained/elasticnet_{target}_{pipe_name}.sav", 'rb'))
#elastic_net_eval.get_params()


['Target_contentloss_euro', 'Target_businessreduction']


In [None]:
print('R^2 training set', round(model_eval.score(X_train, y_train)*100, 2), ' %')
# print('R^2 test set', round(elastic_net.score(X_test, y_test)*100, 2))

R^2 training set 36.12  %


### Feature importance

In [None]:
#model_eval.get_params()
model_eval

In [None]:
print("if alpha < 0.5 used more Ridge regularization: \n" , model_eval.get_params())

if alpha < 0.5 used more Ridge regularization: 
 {'memory': None, 'steps': [('bagging', BaggingRegressor(estimator=ElasticNet(alpha=0.1, max_iter=1, positive=True,
                                      random_state=42, tol=0.7)))], 'verbose': False, 'bagging': BaggingRegressor(estimator=ElasticNet(alpha=0.1, max_iter=1, positive=True,
                                      random_state=42, tol=0.7)), 'bagging__base_estimator': 'deprecated', 'bagging__bootstrap': True, 'bagging__bootstrap_features': False, 'bagging__estimator__alpha': 0.1, 'bagging__estimator__copy_X': True, 'bagging__estimator__fit_intercept': True, 'bagging__estimator__l1_ratio': 0.5, 'bagging__estimator__max_iter': 1, 'bagging__estimator__positive': True, 'bagging__estimator__precompute': False, 'bagging__estimator__random_state': 42, 'bagging__estimator__selection': 'cyclic', 'bagging__estimator__tol': 0.7, 'bagging__estimator__warm_start': False, 'bagging__estimator': ElasticNet(alpha=0.1, max_iter=1, positive=True,

In [91]:
# plot feature importance
#fig, ax = plt.subplots(figsize=(12,5))

# relaod evalation set
eval_set = eval_set_list[0][pipe_name]

importances = e.permutation_feature_importance(model_eval, 
                                               eval_set.drop(target, axis=1), 
                                               eval_set[target], 
                                               repeats=5, seed=seed
                                               )

## feature importance scores
df_importance = pd.DataFrame({
    "name" : X_unscaled.columns.to_list(),
    "importances" : importances[0], #np.absolute(model_eval.coef_),
     }) 

# drop features which dont reduce the loss
df_importance = df_importance.loc[df_importance.importances > 11, : ] 
df_importance = df_importance.sort_values("importances", ascending=True)

plt.figure(figsize=(8, 10))
plt.barh(df_importance.name, df_importance.importances)
plt.xticks(
    #ticks = range(len(selected_feat)),
    #labels = X_unscaled.iloc[:,selected_feat],
    rotation = 90
    )
plt.title(f"Feature Importances for {target}")
plt.show()

In [None]:
# coef = pd.Series(elastic_net.coef_, index = X_train.columns)
# important_features = pd.concat([coef.sort_values().head(10),
#                      coef.sort_values().tail(10)])
# important_features.plot(kind = "barh")
# plt.title("Coefficients in the ElasticNet Model")

### Plot optimal number of features

In [None]:
elastic_net_eval.c

In [None]:
# cv results
cv_results = pd.DataFrame(elastic_net_eval.cv_results_)
cv_results


AttributeError: 'ElasticNet' object has no attribute 'cv_results_'

In [None]:

# plotting cv results
plt.figure(figsize=(16,6))

plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')