## Flushot Boruta with Multi-Step Column-Transformer

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.svm import SVC

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

### Reading the data:

In [2]:
# read the csv-files and take the respondent_id column as index:

X_train_df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
y_train_df = pd.read_csv("training_set_labels.csv", index_col="respondent_id")
X_test_df = pd.read_csv("test_set_features.csv", index_col="respondent_id")

Check the columns and their data-types:

In [3]:
# for convenience the code producing the column-name : dtype output of the data-frame is commented out:

#for i,t in zip(X_train_df.dtypes.index,X_train_df.dtypes):
#    print(f"{i} : {t}")

# Result:
""" 
h1n1_concern : float64
h1n1_knowledge : float64
behavioral_antiviral_meds : float64
behavioral_avoidance : float64
behavioral_face_mask : float64
behavioral_wash_hands : float64
behavioral_large_gatherings : float64
behavioral_outside_home : float64
behavioral_touch_face : float64
doctor_recc_h1n1 : float64
doctor_recc_seasonal : float64
chronic_med_condition : float64
child_under_6_months : float64
health_worker : float64
health_insurance : float64
opinion_h1n1_vacc_effective : float64
opinion_h1n1_risk : float64
opinion_h1n1_sick_from_vacc : float64
opinion_seas_vacc_effective : float64
opinion_seas_risk : float64
opinion_seas_sick_from_vacc : float64
age_group : object
education : object
race : object
sex : object
income_poverty : object
marital_status : object
rent_or_own : object
employment_status : object
hhs_geo_region : object
census_msa : object
household_adults : float64
household_children : float64
employment_industry : object
employment_occupation : object

"""

' \nh1n1_concern : float64\nh1n1_knowledge : float64\nbehavioral_antiviral_meds : float64\nbehavioral_avoidance : float64\nbehavioral_face_mask : float64\nbehavioral_wash_hands : float64\nbehavioral_large_gatherings : float64\nbehavioral_outside_home : float64\nbehavioral_touch_face : float64\ndoctor_recc_h1n1 : float64\ndoctor_recc_seasonal : float64\nchronic_med_condition : float64\nchild_under_6_months : float64\nhealth_worker : float64\nhealth_insurance : float64\nopinion_h1n1_vacc_effective : float64\nopinion_h1n1_risk : float64\nopinion_h1n1_sick_from_vacc : float64\nopinion_seas_vacc_effective : float64\nopinion_seas_risk : float64\nopinion_seas_sick_from_vacc : float64\nage_group : object\neducation : object\nrace : object\nsex : object\nincome_poverty : object\nmarital_status : object\nrent_or_own : object\nemployment_status : object\nhhs_geo_region : object\ncensus_msa : object\nhousehold_adults : float64\nhousehold_children : float64\nemployment_industry : object\nemployment

From the data-types and the meaning of the columns, we select the appropriate transformations to convert data into numeric form.

### Defining the columns on which to perform what:

In [4]:
ohe_columns = ["race", "sex", "marital_status", "rent_or_own", "employment_status", "hhs_geo_region", "census_msa", "employment_industry", "employment_occupation"]
ordinal_columns = ['age_group','education', 'income_poverty']
numeric_columns = X_train_df.columns[X_train_df.dtypes == "float64"]

### Use a Pipeline for multi-step Transformation of a Column-Set:

If there is more than one transformation to be executed on one and the same set of columns, like here numeric_columns, then these steps have to be
collected inside a Pipeline - it does not work to do these step one after the other directly in the column-transformer, as every step will augment the data-frame with columns containing the result of the transformation-step - which could be expected, since a ColumnTransformer is not a pipeline, i.e. does not know anything about first-step, second-step, third-step...

Defining the Pipeline:

In [5]:
num_pipe = Pipeline([
    ("SimpleImputer", SimpleImputer(strategy="median", missing_values=np.nan)),
    ("PostImp_StandardScaler", StandardScaler(copy=False))
])

Defining the Column-Transformer: <br>
<br>
We will use a one-hot-encoder on the data-frame before we will send the data to the column-transformer, but we can allready define the column-transformer here.

In [6]:
from sklearn.preprocessing import OrdinalEncoder


full_columnTransformer = ColumnTransformer(
    transformers= [
        ("numerical_pipeline", num_pipe, numeric_columns),
        ("ordinal_preprocessing", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, encoded_missing_value=-1), ordinal_columns),
    ],
    remainder="passthrough"
)

One-hot-encoding the training- and the test-data: <br>
We print out the shape of the data-frame to see if something happend with the data-frame.

In [7]:
print(f"X_train_df.shape: {X_train_df.shape}")
X_train_df = pd.get_dummies(data=X_train_df, columns=ohe_columns, dummy_na=True)
print(f"X_train_df.shape: {X_train_df.shape}")
X_test_df = pd.get_dummies(data=X_test_df, columns=ohe_columns, dummy_na=True)

X_train_df.shape: (26707, 35)
X_train_df.shape: (26707, 105)


Obviously some columns have been added to the data-frame under the one-hot-encoding...

Save the column-names of the data-frame to because the column-transformer will only return a numpy-array and we might re-construct our pandas data-frame from this later on.

In [8]:
X_columns = X_train_df.columns # contains the columns including those after one-hot-encoding

X_train_dfnp = full_columnTransformer.fit_transform(X_train_df)
X_test_dfnp = full_columnTransformer.transform(X_test_df)
print(f"X_train_dfnp.shape: {X_train_dfnp.shape}")
print(f"X_train_df.shape: {X_train_df.shape}")

X_train_dfnp.shape: (26707, 105)
X_train_df.shape: (26707, 105)


Check if all missing values have been removed by imputation and ordinal-encoding:

In [9]:
np.isnan(X_train_dfnp).sum().sum()

0

### Transform the multi-lable classification values to single lable:

The multiclass labels have to be encoded as single class labels, because the Boruta Algorithm expects a 1d array as input.

In [10]:
# transform the multi-lable classification values to single lable:

singleY={"[0 0]": 0, "[0 1]": 1, "[1 0]":2, "[1 1]": 3}
y_train_single = np.array([singleY[str(y)] for y in y_train_df.values])
y_train_single.shape

(26707,)

In [11]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=5)

boruta_selector = BorutaPy(clf, random_state=42, verbose=2)
#sel = boruta_selector.fit_transform(X_train_dfnp, y_train_single)
print(X_train_dfnp.shape)
boruta_selector.fit(X_train_dfnp, y_train_single)

# time: 46.4s - 
# time - when all columns are ordinal encoded: 2m8.1s

(26707, 105)
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	105
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	9 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	10 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	11 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	12 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	13 / 100
Confirmed: 	46
Tentative: 	3
Rejected: 	56
Iteration: 	14 / 100
Confirmed: 	46
Tentative: 	2
Rejected: 	57
Iteration: 	15 / 100
Confirmed: 	46
Tentative: 	2
Rejected: 	57
Iteration: 	16 / 100
Confirmed: 	46


In [12]:
print(f"Ranks of the features: {boruta_selector.ranking_}")
print(f"Boruta proposed selection of features: {boruta_selector.support_}")

Ranks of the features: [ 1  1  9  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1 22  1 58  1  1 58  1  1  6  1  1  8  1  1  1  4 29 10 34 20
 26 24  7 28 13 35 58 14 25 17 58 32 23 48 50  1  1 20 42 44 54 37 55  5
 56  1 17 12 49 14  1 19  1 41 44  1  1 46 30 17 37 52 46 35 32 53  1 39
 10 39  3 43  2 30 27 51  1]
Boruta proposed selection of features: [ True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True False  True  True False  True  True
 False  True  True False  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False  True  True False False False False False False False
 False  True False False False False  True False  True False False  True
  True False False False False False False False False False  True False
 False False False False False Fa

In [13]:
# keep only the columns that have ranking==1:
X_train_df_boruta = boruta_selector.transform(X_train_dfnp)
X_columns_boruta = X_columns[boruta_selector.support_]
X_test_df_boruta = boruta_selector.transform(X_test_dfnp)
print(f"With Boruta ranking we retain {X_train_df_boruta.shape[1]} features from {X_train_dfnp.shape[1]}")

With Boruta ranking we retain 46 features from 105


In [16]:
X_columns_boruta

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'income_poverty', 'household_adults', 'household_children',
       'race_Black', 'race_Hispanic', 'race_White', 'sex_Female', 'sex_Male',
       'marital_status_Married', 'marital_status_Not Married',
       'rent_or_own_Own', 'rent_or_own_Rent', 'employment_status_Employed',
       'employment_status_Not in Labor Force', 'employment_status_Unemployed',
       'employment_industry_fcxhlnwr', 'employment_industry_haxffmxo'

In [14]:
# save the boruta-selected data:
X_train_Boruta_df = pd.DataFrame(X_train_df_boruta, columns=X_columns_boruta)
X_train_Boruta_df.to_csv("X_train_Boruta_df.csv", index_label="respondent_id")
X_test_Boruta_df = pd.DataFrame(X_test_df_boruta, columns=X_columns_boruta)
X_test_Boruta_df.to_csv("X_test_Boruta_df.csv", index_label="respondent_id")

XB_train, XB_eval, yb_train, yb_eval = train_test_split(X_train_Boruta_df, y_train_single, test_size=0.33, shuffle=True, stratify=y_train_single, random_state=42)

In [15]:
clf = LogisticRegression(random_state=42 ,penalty="l2", C=1)
clf.fit(XB_train, yb_train)
clf.score(XB_eval, yb_eval)
# Result:
# 0.6786929884275017

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6786929884275017

In [None]:
'''
# make your own selection, which columns to keep:

# select the first features with rank <= max_rank:


def get_boruta_selected_features(max_rank, df, boruta_ranking):
    """select the first features with Boruta-rank <= max_rank
    return these features of df as a pandas dataframe
    """
    features_df = pd.DataFrame(df.columns, columns=['feature'])
    features_df['rank']= boruta_ranking
    features_df.sort_values('rank', inplace=True, ascending=True)

    # selection:
    selected = features_df[features_df["rank"] <= max_rank]
    return df[selected["feature"].values]

rank = 2
db = get_boruta_selected_features(max_rank=rank, df=X_train_dfnp, boruta_ranking= boruta_selector.ranking_)
db.head()
'''

In [15]:
'''
# save the boruta-selected data:
X_trainB_df = get_boruta_selected_features(max_rank=1, df=X_train_df, boruta_ranking= boruta_selector.ranking_)
X_trainB_df.to_csv("X_train_df_boruta.csv", index_label="respondent_id")
X_testB_df = get_boruta_selected_features(max_rank=1, df=X_test_df, boruta_ranking= boruta_selector.ranking_)
X_testB_df.to_csv("X_test_df_boruta.csv", index_label="respondent_id")

XB_train, XB_eval, yb_train, yb_eval = train_test_split(X_trainB_df, y_train_single, test_size=0.33, shuffle=True, stratify=y_train_single, random_state=42)
'''