In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for HCMC survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "a.buch@stud.uni-heidelberg.de"


# Feature selection done by Conditional Inference Trees 

CIT uses p-value as one-a-split criterion instead of using homogeneity. The algorithm will pick the feature with the least p-value and will start splitting from it. Then it will keep going until it no longer finds statistically significant p-value or some other criteria have met such as minimum node size or max split. 

In [3]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score# , confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedStratifiedKFold, cross_val_score

import matplotlib.pyplot as plt

from utils_feature_selection import r_ctree_statistics, save_selected_features


seed = np.random.seed(11)

import warnings
warnings.filterwarnings('ignore')


# ruff check ./model_preprocessing/Feature_selection/utils_feature_selection.py --fix

##### install Rpackage for ctree in python

In [4]:
# load r library initally, enables the %%R magic
# %load_ext rpy2.ipython

import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr, data
import rpy2.robjects.packages as rpackages


# get basic R packages
utils = importr('utils')
base = importr('base')
dplyr = importr('dplyr')
stats = importr("stats")

# pandas.DataFrames to R dataframes 
from rpy2.robjects import pandas2ri, Formula
pandas2ri.activate()

# print r df in html
import rpy2.ipython.html
rpy2.ipython.html.init_printing()


# get partykit library containing ctree , ctree_controls etc
partykit = importr('partykit')
partykit



rpy2.robjects.packages.Package as a <module 'partykit'>

### input data

In [5]:
df_candidates = pd.read_excel("../../input_survey_data/input_data_business_2.xlsx")
print(df_candidates.shape)
df_candidates.tail(2)


(397, 60)


Unnamed: 0,Target_contentloss_euro,Target_businessreduction,inundation_duration_h,water_depth_cm,contaminations.0,contaminations.1,contaminations.2,contaminations.3,contaminations.4,flowvelocity,...,resilience_govern_careing,resilience_govern_careing_increases,resilience_left_alone,resilience_neighbor_management,perception_who_responsible4protection.Rank1,perception_govern_support_future,perception_private_economy_future,shp_content_value_euro,shp_registered_capital_euro,elevation_m
395,0.0,,4.0,70.0,0,1,0,0,1,1,...,1.0,1.0,5,1.0,2.0,1.0,3.0,,11047.7,1.83886
396,0.0,0.0,3.0,100.0,0,1,0,0,1,1,...,,,5,,3.0,,3.0,,736.5,1.87277


In [6]:
targets = ["Target_contentloss_euro", "Target_businessreduction"]
target = targets[0]

## TODO make entire wokflow as loop over both target variables
# ## iterate over both targets and store results 
# for target in ["Target_contentloss_euro", "Target_businessreduction"]:

#     print( f"Apply Elastic Net on {target}:\n")
#     y = df_candidates[target]


## remove cases where target information is missing
df_candidates = df_candidates[ ~df_candidates[f"{target}"].isna()]
print(df_candidates.shape)


X = df_candidates.drop(targets, axis=1)
y = df_candidates[target]



(386, 60)


#### Split Data

In [7]:
## test train split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, 
    random_state=seed, shuffle=False
)

train = pd.concat([y_train, X_train], axis=1)
test = pd.concat([y_test, X_test], axis=1)
#print(train.head(2))



### Model fit

In [8]:
###############  CV with gridSearch TODO   ###################
# ## specify model
# cit_model = partykit.ctree(Formula('Target_contentloss_euro ~ .'),  
#                                 data=train
#                           )
# ## hyperparameter tunning 
# param_dist = [{'mincriterion': 0.95}]
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# cit_model_cv = GridSearchCV(estimator = cit_model, 
#                          param_grid = param_dist
#                          )#, scoring= 'r2', cv = folds, verbose = 1,return_train_score=True)      

# # # fit the model
# cit_model_cv.fit(X_train, y_train)
  

#################### without Hyperparameter tunning  #####################
# fit ctree
## Minciterion = confidence level (smaller values => larger trees; e.g mincriterion=0.8, p-value must be smaller than 0.2 in order for a node to split)
cit_model = partykit.ctree(Formula(f'{target} ~ .'),  
                                data=train,
                                control = partykit.ctree_control(mincriterion = 0.8)
                          )


## store trained model for evaluation
filename = f'./models_trained/cit_{target}'
pickle.dump(cit_model, open(filename, 'wb'))

# keras = install.packages("keras")
# # Save the model
# keras.save_model_hdf5(model, "model.h5")
# # Recreate the exact same model purely from the file
# new_model <- load_model_hdf5("model.h5")



## Feature selection

In [9]:
# ## actual p-values (without log)
# strucchange = importr("strucchange")
# strucchange.sctest(cit_model, node = 1)[1]  # p values

In [10]:
## get statisitcs to obtain important feautres

cit_stats = r_ctree_statistics(cit_model)
cit_stats.columns = X.columns
print(cit_stats)

           inundation_duration_h  water_depth_cm  contaminations.0   
statistic               2.080154       11.993719          0.143349  \
p_value                 0.999915        0.030494          1.000000   
criterion              -9.373248       -0.030969        -70.800235   

           contaminations.1  contaminations.2  contaminations.3   
statistic          0.149846          0.087360          0.030447  \
p_value            1.000000          1.000000          1.000000   
criterion        -69.576261        -84.629148       -114.650836   

statistic          1.199385      3.896394        1.645725  \
p_value            1.000000      0.943685        0.999998   
criterion        -18.527556     -2.876792      -12.909114   

           emergency_measures.1  ...  resilience_govern_careing   
statistic              0.462363  ...                   1.487844  \
p_value                1.000000  ...                   1.000000   
criterion            -39.800464  ...                 -14.600782  

The log rather than the p-value is used because it is numerically much more stable when used for comparisons, computing the minimal value, etc. Note that the p-values can become extremely small when significant. 


statistic DEF: 

citrerion DEF: 



In [11]:
## get signifcant features
selected_feat = cit_stats.loc[:, cit_stats.loc["p_value",:]<= 0.05]

## write selected predictors to disk
save_selected_features(X_train, y_train, selected_feat.columns, filename=f"../../input_survey_data/fs_cit_{target}.xlsx")


total features: 58
selected features: 10
dropped features: 48
selected features: 
['water_depth_cm', 'b_area', 'hh_monthly_income_cat', 'shp_sector', 'shp_employees', 'shp_suppliers_location.3', 'shp_suppliers_location.4', 'perception_who_responsible4protection.Rank1', 'shp_content_value_euro', 'shp_registered_capital_euro']

Saving model to disk: ../../input_survey_data/fs_cit_Target_contentloss_euro.xlsx


(386, 58)

In [174]:
## Predict
cit_pred = stats.predict(cit_model, test, type="response") #  type = "prob" # conditional class probabilities
cit_pred  = base.round(cit_pred)

## get back to python dtypes
cit_pred = np.array(cit_pred)
y_test = np.array(y_test)
y_test = np.array(y_test)
y_test = np.array(y_test)

#np.mean((y_test - cit_pred)**2)


## Model evaluation

In [178]:
cit_model_p = pickle.load(open(f"./models_trained/cit_{target}", 'rb'))
type(cit_model_p)
#cit_model_p.__dir__()

['ro',
 'rx',
 'rx2',
 '__module__',
 '__doc__',
 '_vector',
 '_html_template',
 '__init__',
 '_iter_repr',
 '__repr__',
 '_repr_html_',
 'from_length',
 '__parameters__',
 '__abstractmethods__',
 '_abc_impl',
 '_add_rops',
 '__add__',
 '__getitem__',
 '__setitem__',
 'names',
 'items',
 'sample',
 'repr_format_elt',
 '_iter_formatted',
 '__repr_content__',
 '__annotations__',
 '__rname__',
 '_RObjectMixin__tempfile',
 '_RObjectMixin__file',
 '_RObjectMixin__fifo',
 '_RObjectMixin__sink',
 '_RObjectMixin__close',
 '_RObjectMixin__readlines',
 '_RObjectMixin__unlink',
 '_RObjectMixin__show',
 '_RObjectMixin__print',
 '_RObjectMixin__slots',
 'slots',
 '__str__',
 '__getstate__',
 '__setstate__',
 'r_repr',
 'rclass',
 '__dict__',
 '__weakref__',
 '__slots__',
 '__hash__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__new__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '__init_subclass__',
 '__format_

In [187]:
cit_model_p.names
cit_model_p = np.array(cit_model_p.rx(3))
cit_model_p

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (6,) + inhomogeneous part.

In [163]:
# define the model evaluation by k-fold CV
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(cit_model_p, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

InvalidParameterError: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got <rpy2.robjects.vectors.ListVector object at 0x00000282FCAEBDC0> [RTYPES.VECSXP]
R classes: ('constparty', 'party')
[Lis..., Lis..., Lis..., Lan..., NUL..., Lis..., Sex..., Sex...]
  1: <class 'rpy2.rinterface.ListSexpVector'>
  <rpy2.rinterface.ListSexpVector object at 0x00000282DF5FD0C0> [RTYPES.VECSXP]
  2: <class 'rpy2.rinterface.ListSexpVector'>
  <rpy2.rinterface.ListSexpVector object at 0x00000282DF5FA0C0> [RTYPES.VECSXP]
  3: <class 'rpy2.rinterface.ListSexpVector'>
  <rpy2.rinterface.ListSexpVector object at 0x00000282DF5FD0C0> [RTYPES.VECSXP]
  4: <class 'rpy2.rinterface.LangSexpVector'>
  <rpy2.rinterface.LangSexpVector object at 0x00000282DF5FA0C0> [RTYPES.LANGSXP]
  5: <class 'rpy2.rinterface_lib.sexp.NULLType'>
  <rpy2.rinterface_lib.sexp.NULLType object at 0x00000282DEFB6780> [RTYPES.NILSXP]
  6: <class 'rpy2.rinterface.ListSexpVector'>
  <rpy2.rinterface.ListSexpVector object at 0x00000282DF5FA0C0> [RTYPES.VECSXP]
  7: <class 'rpy2.rinterface.SexpClosure'>
  <rpy2.rinterface.SexpClosure object at 0x00000282DF5FD0C0> [RTYPES.CLOSXP]
  8: <class 'rpy2.rinterface.SexpClosure'>
  <rpy2.rinterface.SexpClosure object at 0x00000282F202A5C0> [RTYPES.CLOSXP] instead.

In [None]:
# compare r2 for train and test sets (for all polynomial fits)
print("R-squared values: \n")

for i, degree in enumerate(degrees):
    train_r2 = round(sklearn.metrics.r2_score(y_train, y_train_pred[:, i]), 2)
    test_r2 = round(sklearn.metrics.r2_score(y_test, y_test_pred[:, i]), 2)
    print("Polynomial degree {0}: train score={1}, test score={2}".format(degree, 
                                                                         train_r2, 
                                                                         test_r2))

In [None]:
# param_dist = {'n_estimators': [10, 100, 200, 500],
#               'max_depth': [1, 3, 5, 10,20],
#               'colsample_bynode': [0.1, 0.3] # nbr of feautres for each split point
#               #'subsample': 0.8  # define subsample of train st, xgb has not bootstrapping
#               }



(265,)

In [None]:
## iterate over both targets and store results 

for target in ["Target_contentloss_euro", "Target_businessreduction"]:

    print( f"Apply Elastic Net on {target}:\n")
    y = df_candidates[target]

    ## normalize data 
    scaler = MinMaxScaler() 
    X = scaler.fit_transform(pd.DataFrame(X_unscaled))
    y = scaler.fit_transform(pd.DataFrame(y))
    
    ## test train split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, 
        random_state=seed, shuffle=True
    )
    X_train = pd.DataFrame(X_train)

    # classes have to start from zero on for lasso regression, make continous variables as categorical 
    #le = LabelEncoder()
    #y_train = le.fit_transform(y_train)

    ## set up model
    ## TODO adapt ratio (l1_ratio) between ridge and lasso reg: 
    # r = 0, equivalent to Ridge Regression,  r = 1 equivalent to Lasso Regression
    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=seed)
    elastic_net = SelectFromModel(elastic_net) 
    elastic_net.fit(X_train, y_train) 
    #elastic_net.predict([[1.5]])


    print("Elastic Net:")
    selected_feat = X_train.columns[(elastic_net.get_support())]
    not_selected_feat = X_train.columns[~(elastic_net.get_support())]

    print("total features: {}".format((X_train.shape[1])))
    print("selected features: {}".format(len(selected_feat)))
    print("dropped features: \n{}\n".format(X_unscaled.columns[not_selected_feat].to_list()))
    ## print("features with coefficients shrank to zero: {}".format(np.sum(elastic_net.estimator_.coef_ == 0)))
    #print(f"Selected features: \n{X_unscaled.columns[selected_feat]}")
    #X_train[(ridge_.estimator_.coef_ == 0)#]

    ## store trained model for evaluation
    filename = f'./models_trained/elastic_net{target}.sav'
    pickle.dump(elastic_net, open(filename, 'wb'))


    
    ## write selected features from training set to disk
    train = pd.concat([y_train, X_train], axis=1)
    df_elastic_net = train[[target] + X_unscaled.columns[selected_feat].to_list()]
    #df_elastic_net.info()
    df_elastic_net.to_excel(f"../../input_survey_data/fs_elasticnet_{target}.xlsx", index=False)



    # ## predict unseen X_test set
    # y_lasso_pred = elastic_net.predict(X_test)


Apply Elastic net on Target_contentloss_euro:



ValueError: Input X contains NaN.
ElasticNet does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values