In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for HCMC survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "a.buch@stud.uni-heidelberg.de"


# Feature selection done by Conditional Inference Trees 

CIT uses p-value as one-a-split criterion instead of using homogeneity. The algorithm will pick the feature with the least p-value and will start splitting from it. Then it will keep going until it no longer finds statistically significant p-value or some other criteria have met such as minimum node size or max split. 

In [2]:
import numpy as np
import pandas as pd

import pickle

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import accuracy_score# , confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split, RepeatedStratifiedKFold, cross_val_score

import matplotlib.pyplot as plt


seed = np.random.seed(11)

import warnings
warnings.filterwarnings('ignore')



##### install Rpackage for ctree in python

In [3]:
# load r library initally, enables the %%R magic
# %load_ext rpy2.ipython

import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr, data
import rpy2.robjects.packages as rpackages


# get basic R packages
utils = importr('utils')
base = importr('base')
dplyr = importr('dplyr')

# pandas.DataFrames to R dataframes 
from rpy2.robjects import pandas2ri, Formula
pandas2ri.activate()

# print r df in html
import rpy2.ipython.html
rpy2.ipython.html.init_printing()


# get partykit library containing ctree , ctree_controls etc
partykit = rpackages.importr('partykit')
partykit



rpy2.robjects.packages.Package as a <module 'partykit'>

In [4]:
## list attributes for partykit package
partykit.__dir__()

['__name__',
 '__doc__',
 '__package__',
 '__loader__',
 '__spec__',
 '_env',
 '__rname__',
 '_translation',
 '_rpy2r',
 '_exported_names',
 '_symbol_r2python',
 '_symbol_resolve',
 '___NAMESPACE___',
 '___S3MethodsTable___',
 '_create_cond_list',
 '_ctree_select',
 '_ctree_split',
 '_ctree_test',
 '_ctree_test_1d',
 '_ctree_test_2d',
 '_ctree_test_internal',
 '_deparse_variables',
 '_extree_node',
 '_extree_surrogates',
 '_get_path',
 '_get_psplits',
 '_get_term_labels',
 '_list_rules_party',
 '_logrank_trafo',
 '_make_formatinfo_simpleparty',
 '_median_survival_time',
 '_mfluc_test',
 '_mob_grow_getlevels',
 '_names_party',
 '_nobs_party',
 '_objfun_select',
 '_objfun_split',
 '_objfun_test',
 '_onLoad',
 '_packageName',
 '_partysplit',
 '_perturb',
 '_plot_node',
 '_pred_density',
 '_pred_ecdf',
 '_pred_factor',
 '_pred_factor_response',
 '_pred_numeric_response',
 '_pred_quantile',
 '_pred_Surv',
 '_pred_Surv_response',
 '_predict_party_constparty',
 '_resample',
 '_response_class'

### input data

In [5]:
df_candidates = pd.read_excel("../../input_survey_data/input_data_business_2.xlsx")
print(df_candidates.shape)
df_candidates.tail(2)


(397, 60)


Unnamed: 0,Target_contentloss_euro,Target_businessreduction,inundation_duration_h,water_depth_cm,contaminations.0,contaminations.1,contaminations.2,contaminations.3,contaminations.4,flowvelocity,...,resilience_govern_careing,resilience_govern_careing_increases,resilience_left_alone,resilience_neighbor_management,perception_who_responsible4protection.Rank1,perception_govern_support_future,perception_private_economy_future,shp_content_value_euro,shp_registered_capital_euro,elevation_m
395,0.0,,4.0,70.0,0,1,0,0,1,1,...,1.0,1.0,5,1.0,2.0,1.0,3.0,,11047.7,1.83886
396,0.0,0.0,3.0,100.0,0,1,0,0,1,1,...,,,5,,3.0,,3.0,,736.5,1.87277


In [6]:
targets = ["Target_contentloss_euro", "Target_businessreduction"]
target = targets[0]

## remove cases where target information is missing
df_candidates = df_candidates[ ~df_candidates[f"{target}"].isna()]
print(df_candidates.shape)


X = df_candidates.drop(targets, axis=1)
y = df_candidates[target]



(386, 60)


#### Split Data

In [7]:
## test train split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, 
    random_state=seed, shuffle=False
)

train = pd.concat([y_train, X_train], axis=1)
test = pd.concat([y_test, X_test], axis=1)
#print(train.head(2))


## fit ctree
## Minciterion = confidence level
cit_model = partykit.ctree(Formula('Target_contentloss_euro ~ .'),  
                                data=train,
                                control = partykit.ctree_control(mincriterion = 0.95)
                          )


In [10]:
cit_model.__dir__();
#partykit.predict_party();

In [18]:
## Predict

cit_pred = round(partykit.predict_party(cit_model, test, type="response"))

##### Confusion Matrix #####
dplyr.table(test["Target_contentloss_euro"], cit_pred, dnn=c("Actual", "Predicted"))

R[write to console]: Error in names(party)[id] : invalid subscript type 'list'



RRuntimeError: Error in names(party)[id] : invalid subscript type 'list'


In [None]:
# param_dist = {'n_estimators': [10, 100, 200, 500],
#               'max_depth': [1, 3, 5, 10,20],
#               'colsample_bynode': [0.1, 0.3] # nbr of feautres for each split point
#               #'subsample': 0.8  # define subsample of train st, xgb has not bootstrapping
#               }



(265,)

In [None]:
## iterate over both targets and store results 

for target in ["Target_contentloss_euro", "Target_businessreduction"]:

    print( f"Apply Elastic Net on {target}:\n")
    y = df_candidates[target]

    ## normalize data 
    scaler = MinMaxScaler() 
    X = scaler.fit_transform(pd.DataFrame(X_unscaled))
    y = scaler.fit_transform(pd.DataFrame(y))
    
    ## test train split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, 
        random_state=seed, shuffle=True
    )
    X_train = pd.DataFrame(X_train)

    # classes have to start from zero on for lasso regression, make continous variables as categorical 
    #le = LabelEncoder()
    #y_train = le.fit_transform(y_train)

    ## set up model
    ## TODO adapt ratio (l1_ratio) between ridge and lasso reg: 
    # r = 0, equivalent to Ridge Regression,  r = 1 equivalent to Lasso Regression
    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=seed)
    elastic_net = SelectFromModel(elastic_net) 
    elastic_net.fit(X_train, y_train) 
    #elastic_net.predict([[1.5]])


    print("Elastic Net:")
    selected_feat = X_train.columns[(elastic_net.get_support())]
    not_selected_feat = X_train.columns[~(elastic_net.get_support())]

    print("total features: {}".format((X_train.shape[1])))
    print("selected features: {}".format(len(selected_feat)))
    print("dropped features: \n{}\n".format(X_unscaled.columns[not_selected_feat].to_list()))
    ## print("features with coefficients shrank to zero: {}".format(np.sum(elastic_net.estimator_.coef_ == 0)))
    #print(f"Selected features: \n{X_unscaled.columns[selected_feat]}")
    #X_train[(ridge_.estimator_.coef_ == 0)#]

    ## store trained model for evaluation
    filename = f'./models_trained/elastic_net{target}.sav'
    pickle.dump(elastic_net, open(filename, 'wb'))


    
    ## write selected features from training set to disk
    train = pd.concat([y_train, X_train], axis=1)
    df_elastic_net = train[[target] + X_unscaled.columns[selected_feat].to_list()]
    #df_elastic_net.info()
    df_elastic_net.to_excel(f"../../input_survey_data/fs_elasticnet_{target}.xlsx", index=False)



    # ## predict unseen X_test set
    # y_lasso_pred = elastic_net.predict(X_test)


Apply Elastic net on Target_contentloss_euro:



ValueError: Input X contains NaN.
ElasticNet does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values