## Tree-based Model Feature Selection

In [7]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc, _scorer
)
from sklearn.tree import export_graphviz
import xgboost as xgb
from xgboost import XGBClassifier as xgbclass
from scipy import stats
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score,
    classification_report
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFECV
import pickle

root_path = "../../Data/GoogleDrive/"

In [3]:
# Import pickled models
with open(root_path + "rf_model_full.pkl", "rb") as f:
    rf_model = pickle.load(f)

with open(root_path + "xgboost_model_full.pkl", "rb") as f:
    xgb_model = pickle.load(f)

In [57]:
# Load data from parquet files
X_train = pd.read_parquet(root_path + "X_train.parquet")
X_test = pd.read_parquet(root_path + "X_test.parquet")
y_train = pd.read_parquet(root_path + "y_train.parquet")
y_test = pd.read_parquet(root_path + "y_test.parquet")

### Random Forest

In [49]:
# check the models
# print(rf_model)
# print(xgb_model)

# check features
print(X_train.columns)

Index(['onehot__State_Alabama', 'onehot__State_Alaska',
       'onehot__State_Arizona', 'onehot__State_Arkansas',
       'onehot__State_California', 'onehot__State_Colorado',
       'onehot__State_Connecticut', 'onehot__State_Delaware',
       'onehot__State_District of Columbia', 'onehot__State_Florida',
       ...
       'GeneralHealth_label__GeneralHealth',
       'LastCheckupTime_label__LastCheckupTime',
       'RemovedTeeth_label__RemovedTeeth', 'SmokerStatus_label__SmokerStatus',
       'ECigaretteUsage_label__ECigaretteUsage',
       'remainder__PhysicalHealthDays', 'remainder__MentalHealthDays',
       'remainder__SleepHours', 'remainder__HeightInMeters',
       'remainder__WeightInKilograms'],
      dtype='object', length=121)


In [58]:
# Feature Selection for RF

# best rf model
best_rf = rf_model.best_estimator_

y_train = y_train.values.ravel()
print(y_train.shape)

best_rf = best_rf.fit(X_train, y_train)

rf_model_rfecv = RFECV(estimator=best_rf, 
                       # cv=StratifiedKFold(5), 
                       scoring='neg_mean_squared_error',
                       n_jobs = 8,
                       verbose = 1)

rf_model_rfecv.fit(X_train, y_train)

# # check the features
print(rf_model_rfecv.support_)

# # check the number of features
print(rf_model_rfecv.n_features_)

(121584,)
Fitting estimator with 121 features.
Fitting estimator with 121 features.
Fitting estimator with 121 features.
Fitting estimator with 121 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 119 features.
Fitting estimator with 119 features.
Fitting estimator with 119 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 118 features.
Fitting estimator with 118 features.
Fitting estimator with 118 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 117 features.
Fitting estimator with 117 features.
Fitting estimator with 117 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 116 f

In [68]:
# Features of the rf model
rf_features = X_train.columns[best_rf.feature_importances_ > 0]
print(X_train.columns[best_rf.feature_importances_ > 0])

# Features of the best feature selected RF model
fs_rf_features = X_train.columns[rf_model_rfecv.support_]
print(X_train.columns[rf_model_rfecv.support_])
print(len(X_train.columns[rf_model_rfecv.support_]))

# Features removed
removed = [i for i in rf_features if i not in fs_rf_features]
# for i in rf_features:
#     if i not in fs_rf_features:
#         print(i)
print(removed)



Index(['onehot__State_Alabama', 'onehot__State_Alaska',
       'onehot__State_Arizona', 'onehot__State_Arkansas',
       'onehot__State_California', 'onehot__State_Colorado',
       'onehot__State_Connecticut', 'onehot__State_Delaware',
       'onehot__State_District of Columbia', 'onehot__State_Florida',
       ...
       'GeneralHealth_label__GeneralHealth',
       'LastCheckupTime_label__LastCheckupTime',
       'RemovedTeeth_label__RemovedTeeth', 'SmokerStatus_label__SmokerStatus',
       'ECigaretteUsage_label__ECigaretteUsage',
       'remainder__PhysicalHealthDays', 'remainder__MentalHealthDays',
       'remainder__SleepHours', 'remainder__HeightInMeters',
       'remainder__WeightInKilograms'],
      dtype='object', length=121)
Index(['onehot__State_Arizona', 'onehot__State_Arkansas',
       'onehot__State_Florida', 'onehot__State_Georgia',
       'onehot__State_Indiana', 'onehot__State_Iowa', 'onehot__State_Kansas',
       'onehot__State_Maine', 'onehot__State_Maryland',
     

In [69]:
# Save rf_model_rfecv model
with open(root_path + "rf_model_rfecv.pkl", "wb") as f:
    pickle.dump(rf_model_rfecv, f)

# Save rf_features
with open(root_path + "rf_features.pkl", "wb") as f:
    pickle.dump(rf_features, f)

In [70]:
# Fit the best feature selected RF model
rf_model_rfecv.estimator_.fit(X_train[fs_rf_features], y_train)

In [71]:
# Save the best feature selected RF model
with open(root_path + "rf_model_rfecv_best.pkl", "wb") as f:
    pickle.dump(rf_model_rfecv.estimator_, f)

In [72]:
# Accuracy
y_pred = rf_model_rfecv.estimator_.predict(X_test[fs_rf_features])

rfecv_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", rfecv_accuracy)

# Precision
rfecv_precision = precision_score(y_test, y_pred)
print("Precision:", rfecv_precision)

# Recall
rfecv_recall = recall_score(y_test, y_pred)
print("Recall:", rfecv_recall)

# F1 Score
rfecv_f1 = f1_score(y_test, y_pred)
print("F1 Score:", rfecv_f1)

# ROC AUC
rfecv_roc_auc = roc_auc_score(y_test, y_pred)
print("ROC AUC:", rfecv_roc_auc)

# Classification Report
rfecv_classification_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(rfecv_classification_report)


Accuracy: 0.8004589419660482
Precision: 0.6475256095801472
Recall: 0.4429530201342282
F1 Score: 0.5260505186661196
ROC AUC: 0.6812903013554414
Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87     30396
         1.0       0.65      0.44      0.53     10132

    accuracy                           0.80     40528
   macro avg       0.74      0.68      0.70     40528
weighted avg       0.79      0.80      0.79     40528



### XGBoost

In [19]:
best_xgb = xgb_model.best_estimator_

# Feature Selection for XGBoost
best_xgb = best_xgb.fit(X_train, y_train)

In [39]:
xgb_params = xgb_model.get_params()

Accuracy:  0.8011004737465456
Sensitivity:  0.4716739044611133
AUC:  0.8417133978217364


In [36]:
from numpy import sort
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd

thresholds = sort(best_xgb.feature_importances_)

results_df = pd.DataFrame(columns=['Threshold', 'Accuracy', 'Sensitivity', 'AUC'])
print(type(results_df))

for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(best_xgb, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    
    # train model
    selection_model = XGBClassifier(**xgb_params)
    selection_model.fit(select_X_train, y_train)
    
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    sensitivity = recall_score(y_test, predictions)
    auc = roc_auc_score(y_test, predictions)
    # save to dataframe
    # Create a DataFrame with the current results
    current_results = pd.DataFrame([[thresh, accuracy, sensitivity, auc]],
                                   columns=['Threshold', 'Accuracy', 'Sensitivity', 'AUC'])
    
    # Concatenate the current results with the existing DataFrame
    results_df = pd.concat([results_df, current_results], ignore_index=True)

<class 'pandas.core.frame.DataFrame'>


Parameters: { "cv", "error_score", "estimator", "estimator__enable_categorical", "estimator__missing", "estimator__objective", "param_grid", "pre_dispatch", "refit", "return_train_score", "scoring", "verbose" } are not used.

  results_df = pd.concat([results_df, current_results], ignore_index=True)
Parameters: { "cv", "error_score", "estimator", "estimator__enable_categorical", "estimator__missing", "estimator__objective", "param_grid", "pre_dispatch", "refit", "return_train_score", "scoring", "verbose" } are not used.

Parameters: { "cv", "error_score", "estimator", "estimator__enable_categorical", "estimator__missing", "estimator__objective", "param_grid", "pre_dispatch", "refit", "return_train_score", "scoring", "verbose" } are not used.

Parameters: { "cv", "error_score", "estimator", "estimator__enable_categorical", "estimator__missing", "estimator__objective", "param_grid", "pre_dispatch", "refit", "return_train_score", "scoring", "verbose" } are not used.

Parameters: { "cv", "

In [40]:
# print accuracy, sensitivity, AUC for xgb_model
y_pred_xgb = best_xgb.predict(X_test)
y_pred_proba_xgb = best_xgb.predict_proba(X_test)[:, 1]

print("Accuracy: ", accuracy_score(y_test, y_pred_xgb))
print("Sensitivity: ", recall_score(y_test, y_pred_xgb))
print("AUC: ", roc_auc_score(y_test, y_pred_proba_xgb))

# print top 5 results with the best sensitivity
print(results_df.sort_values(by='Sensitivity', ascending=False).head(5))

Accuracy:  0.8011004737465456
Sensitivity:  0.4716739044611133
AUC:  0.8417133978217364
    Threshold  Accuracy  Sensitivity       AUC
87   0.003053  0.799472     0.481741  0.693562
72   0.002371  0.798658     0.480853  0.692723
76   0.002634  0.798781     0.480359  0.692640
90   0.003235  0.798238     0.479866  0.692114
86   0.002957  0.798954     0.479372  0.692427
