# AutoML feature selector

## Drive

In [1]:
# Mount Google Drive at /content/drive so the project folder on Drive can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Show the current working directory before changing into the project folder.
!pwd

/content


In [3]:
# Change into the project folder on the mounted Drive; the data file is later
# read through a relative path, so presumably it lives in this folder.
# NOTE(review): this path is specific to one user's Drive layout - adjust it
# when running the notebook from another account.
%cd "drive/MyDrive/22-23/DS 301 Advanced Data Science/DS 301 Final Project"

/content/drive/.shortcut-targets-by-id/1L_EikC2ryU5sSu8xLqkBA2WHnRs7BMt-/DS 301 Final Project


In [4]:
# Confirm that the directory change took effect.
!pwd

/content/drive/.shortcut-targets-by-id/1L_EikC2ryU5sSu8xLqkBA2WHnRs7BMt-/DS 301 Final Project


## Import

In [5]:
# importing the libraries

import pandas as pd
import numpy as np
import copy

# for visualization

import plotly.express as px
import matplotlib.pyplot as plt


In [6]:
# models

import statsmodels.api as sm
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import xgboost

In [7]:
# Lookup table: full state / territory name -> two-letter abbreviation.
# The choropleth helper uses it to turn the entries of `states` into the
# location codes expected by plotly's "USA-states" location mode.

us_state_to_abbrev = {
    # --- the 50 states ---
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    # --- federal district ---
    "District of Columbia": "DC",
    # --- territories ---
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR", "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}


## Reading data/helper functions

In [8]:
# Load the pre-processed training table (path is relative to the working
# directory selected above).
data = pd.read_csv('state_data_2019_processed_training.csv')

# Distinct values of the `sitecode` column, in order of first appearance;
# every per-state structure below is aligned with this array.
states = data['sitecode'].unique()

In [9]:
# Creating the training/validation/testing sets
#
# Every state is split on its own, stratified on the label (the last column):
# 60% train, then the remaining 40% is halved into validation and test.
# The per-state pieces are kept (the visualisation helpers need them) and the
# train/validation pieces are also pooled across states for model fitting.

state_data_t = []       # per state: [X_train, X_val, y_train, y_val]

state_data_test = []    # per state: (X_test, y_test)

seed = 100

for state in states:

    state_data = data[data.sitecode == state]
    X_s = state_data.iloc[:,:-1]
    y_s = state_data.iloc[:,-1]
    # First cut: 60% train vs. 40% held out.
    X_tr, X_hold, y_tr, y_hold = sklearn.model_selection.train_test_split(
        X_s, y_s, test_size=0.4, random_state=seed, stratify=y_s)
    # Second cut: held-out part split evenly into validation and test.
    X_v, X_te, y_v, y_te = sklearn.model_selection.train_test_split(
        X_hold, y_hold, test_size=0.5, random_state=seed, stratify=y_hold)
    state_data_t.append([X_tr, X_v, y_tr, y_v])
    state_data_test.append((X_te, y_te))

# Pool the per-state pieces with one concat per output instead of growing the
# frames inside a loop (which re-copies all accumulated rows on every step).
# Row order and index labels are the same as with the incremental version.
X_train = pd.concat([split[0] for split in state_data_t])
X_val = pd.concat([split[1] for split in state_data_t])
y_train = pd.concat([split[2] for split in state_data_t])
y_val = pd.concat([split[3] for split in state_data_t])

In [10]:
# functions for visualizing the results

def map(model, race = 'all', exclude = None, data = state_data_t):
    """Compute the per-state ROC AUC of ``model`` and draw it on a US choropleth.

    NOTE: the name shadows Python's built-in ``map`` for the rest of the
    notebook; it is kept unchanged so existing calls keep working.

    Parameters
    ----------
    model :
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score.
    race : {'all', 'black', 'hispanic'}
        'all' scores every row. 'black' / 'hispanic' keep only rows whose
        matching ``race4_*`` indicator column equals 1.
    exclude : column label or list of labels, optional
        Columns dropped from the features before scoring.
    data : list
        Per-state ``[_, X, _, y]`` entries aligned with ``states``. Only
        positions 1 and 3 are read; with the default these are the validation
        features and labels.

    Returns
    -------
    (fig, auc_all)
        The plotly choropleth figure and a DataFrame with columns
        ``state`` (abbreviation) and ``auc``.

    Raises
    ------
    ValueError
        If ``race`` is not one of the supported values (previously this
        surfaced as an unbound-variable error inside the loop).
    """
    race_columns = {
        'black': 'race4_Black or African American',
        'hispanic': 'race4_Hispanic/Latino',
    }
    if race != 'all' and race not in race_columns:
        raise ValueError(
            "race must be 'all', 'black' or 'hispanic', got %r" % (race,))

    auc_all = pd.DataFrame({'state': [us_state_to_abbrev[s] for s in states], 'auc': np.zeros(len(states))})

    for i, state in enumerate(states):
        _, X_test, _, y_test = data[i]
        if race != 'all':
            index = X_test[race_columns[race]] == 1
            X_test = X_test.loc[index]
            y_test = y_test.loc[index]
        # `is not None` instead of `!= None`: the latter is evaluated
        # element-wise when `exclude` is an array-like.
        if exclude is not None:
            X_test = X_test.drop(columns = exclude)
        # The first four columns are skipped, matching the slice used when the
        # training features are built elsewhere in the notebook.
        # NOTE(review): the slice is positional and applied after `exclude`,
        # so excluding one of the first four columns shifts it - confirm.
        y_pred = model.predict_proba(X_test.iloc[:,4:])
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred[:,1])
        auc_all.iloc[i,1] = metrics.auc(fpr,tpr)

    fig = px.choropleth(auc_all,
                        locations='state',
                        locationmode="USA-states",
                        scope="usa",
                        color='auc',
                        color_continuous_scale="Viridis_r",
                        range_color = [0.6,1]
                        )
    return fig, auc_all

def count_race(data = state_data_t):
    """Tally rows per race category for every entry of ``states``.

    For each state the frame at position 1 of the matching ``data`` entry
    (the validation features when the default is used) is summed over its
    ``race4_*`` indicator columns. ``missing`` is the number of non-null rows
    in ``race4_All other races`` minus the four category totals.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``states`` with columns ``others``, ``black``,
        ``hispanic``, ``white`` and ``missing`` (initialised with float zeros).
    """
    indicator_columns = [
        'race4_All other races',
        'race4_Black or African American',
        'race4_Hispanic/Latino',
        'race4_White',
    ]
    labels = ['others', 'black', 'hispanic', 'white', 'missing']
    race = pd.DataFrame(
        {label: np.zeros(len(states)) for label in labels}, index = states)

    for row in range(len(states)):
        _, X_test, _, _ = data[row]
        for col, indicator in enumerate(indicator_columns):
            race.iloc[row, col] = X_test[indicator].sum()
        # Whatever is not attributed to one of the four categories.
        remainder = X_test['race4_All other races'].count()
        for col in range(len(indicator_columns)):
            remainder = remainder - race.iloc[row, col]
        race.iloc[row, 4] = remainder

    return race

## AutoML

### Install

In [25]:
!pip install requests
!pip install tabulate
!pip install future

!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
Collecting h2o
  Downloading h2o-3.38.0.3.tar.gz (177.5 MB)
[K     |████████████████████████████████| 177.5 MB 38 kB/s 
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.38.0.3-py2.py3-none-any.whl size=177541401 sha256=a099ec496d01ff0ad6fd9a013bb50c81a1ab1775ce8b861a8e7475c905cd822d
  Stored in directory: /root/.cache/pip/wheels/16/f8/f4/69e1ff6a0d1cb61bdbc0d9888ee8437a1acf1eb1c6ffb8be20
Successfully built h2o
Installing collec

### Running AutoML

In [26]:
import h2o

# Start a local H2O cluster, or attach to one that is already running.
h2o.init()

# Name of the label column and the feature names handed to AutoML.
# The first four columns of the feature tables are skipped throughout.
response = 'qn28'
predictors = X_train.columns[4:].to_list()

# Build the H2O training / validation frames: features (minus the first four
# columns) side by side with the label cast to a pandas categorical.
train, valid = (
    h2o.H2OFrame(
        pd.concat([features.iloc[:,4:], target.astype('category')], axis = 1))
    for features, target in ((X_train, y_train), (X_val, y_val))
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.17" 2022-10-18; OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu218.04); OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu218.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.8/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp8pekapif
  JVM stdout: /tmp/tmp8pekapif/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp8pekapif/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.3
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_unknownUser_go1z46
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [27]:
# Convert the label column to a factor (categorical) column in both H2O frames.
for frame in (train, valid):
    frame['qn28'] = frame['qn28'].asfactor()

In [29]:
import h2o
from h2o.automl import H2OAutoML

# Attaches to the cluster started earlier if it is still running.
h2o.init()

# At most 20 models, fixed seed; nfolds = 0 means no cross-validation folds -
# a separate validation frame is supplied to train() instead.
aml = H2OAutoML(max_models=20, seed=1, nfolds = 0)
aml.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Author's note: this cell took about 24 min to run.

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,24 mins 01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.3
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_unknownUser_go1z46
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.026 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


AutoML progress: |
19:35:24.939: _train param, Dropping bad and constant columns: [stheight_1.19]

██
19:36:15.533: _train param, Dropping bad and constant columns: [stheight_1.19]

███████████████████
19:37:05.264: _train param, Dropping bad and constant columns: [stheight_1.19]
19:38:59.294: _train param, Dropping bad and constant columns: [stheight_1.19]

██
19:39:22.370: _train param, Dropping bad and constant columns: [stheight_1.19]

███████████
19:42:02.531: _train param, Dropping bad and constant columns: [stheight_1.19]
19:43:01.824: _train param, Dropping bad and constant columns: [stheight_1.19]

█████
19:44:11.584: _train param, Dropping bad and constant columns: [stheight_1.19]

██████
19:45:39.95: _train param, Dropping bad and constant columns: [stheight_1.19]

█
19:45:56.256: _train param, Dropping bad and constant columns: [stheight_1.19]


19:46:10.417: _train param, Dropping bad and constant columns: [stheight_1.19]

███
19:47:05.386: _train param, Dropping bad and c

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,60.0,60.0,50021.0,6.0,6.0,6.0,43.0,64.0,61.733334

Unnamed: 0,0,1,Error,Rate
0,65568.0,3157.0,0.0459,(3157.0/68725.0)
1,3069.0,3963.0,0.4364,(3069.0/7032.0)
Total,68637.0,7120.0,0.0822,(6226.0/75757.0)

metric,threshold,value,idx
max f1,0.2557138,0.5600622,199.0
max f2,0.1270918,0.6191954,270.0
max f0point5,0.4236611,0.6394159,136.0
max accuracy,0.4236611,0.9320195,136.0
max precision,0.9927894,1.0,0.0
max recall,0.0087739,1.0,397.0
max specificity,0.9927894,1.0,0.0
max absolute_mcc,0.3274021,0.5210559,170.0
max min_per_class_accuracy,0.1005715,0.8124289,289.0
max mean_per_class_accuracy,0.1005715,0.8128787,289.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100057,0.798988,10.7163292,10.7163292,0.994723,0.8728931,0.994723,0.8728931,0.1072241,0.1072241,971.6329197,971.6329197,0.1071659
2,0.0200114,0.6622472,9.5366802,10.1265047,0.8852243,0.7277365,0.9399736,0.8003148,0.0954209,0.2026451,853.6680227,912.6504712,0.2013209
3,0.0300038,0.5494407,8.0407484,9.4318644,0.7463672,0.6032367,0.8754949,0.7346799,0.080347,0.282992,704.0748426,843.1864365,0.2788742
4,0.0400095,0.4708803,6.4098998,8.676124,0.5949868,0.508536,0.8053448,0.6781253,0.0641354,0.3471274,540.9899825,767.6123975,0.3385425
5,0.050002,0.4099448,5.4648626,8.0343804,0.5072655,0.4403212,0.7457761,0.6306021,0.0546075,0.4017349,446.4862647,703.4380357,0.3877226
6,0.100004,0.2422906,3.5692557,5.801818,0.3313094,0.3121475,0.5385428,0.4713748,0.1784699,0.5802048,256.9255698,480.1818028,0.5293354
7,0.1500059,0.1676246,2.2467825,4.6168062,0.2085533,0.2011623,0.4285463,0.381304,0.1123436,0.6925484,124.6782472,361.6806176,0.5980558
8,0.2000079,0.1254463,1.3907299,3.8102871,0.1290919,0.1447619,0.3536827,0.3221685,0.0695392,0.7620876,39.072991,281.0287109,0.6195921
9,0.2999987,0.0783721,0.9329645,2.851264,0.0866007,0.0991831,0.2646632,0.2478465,0.0932878,0.8553754,-6.7035523,185.1263968,0.6122034
10,0.4000026,0.051293,0.601512,2.2888074,0.0558342,0.0641013,0.2124542,0.2019087,0.0601536,0.915529,-39.8487984,128.8807419,0.5682755

Unnamed: 0,0,1,Error,Rate
0,21169.0,1739.0,0.0759,(1739.0/22908.0)
1,1083.0,1265.0,0.4612,(1083.0/2348.0)
Total,22252.0,3004.0,0.1117,(2822.0/25256.0)

metric,threshold,value,idx
max f1,0.2103866,0.4727205,213.0
max f2,0.1274864,0.5619726,263.0
max f0point5,0.3634841,0.4961649,148.0
max accuracy,0.4860892,0.9168514,109.0
max precision,0.9459906,0.9,3.0
max recall,0.0088043,1.0,396.0
max specificity,0.9854747,0.9999563,0.0
max absolute_mcc,0.2103866,0.4151861,213.0
max min_per_class_accuracy,0.0894058,0.7749694,296.0
max mean_per_class_accuracy,0.082988,0.776523,302.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100174,0.7536643,8.1204355,8.1204355,0.7549407,0.844403,0.7549407,0.844403,0.0813458,0.0813458,712.0435523,712.0435523,0.0786393
2,0.0200348,0.6339241,6.33479,7.2276128,0.5889328,0.69375,0.6719368,0.7690765,0.0634583,0.1448041,533.4790016,622.7612769,0.1375577
3,0.0300127,0.5417481,6.4026122,6.9533382,0.5952381,0.5875867,0.646438,0.7087395,0.0638842,0.2086882,540.2612152,595.3338158,0.1969893
4,0.0400301,0.4632933,5.5695134,6.6070398,0.5177866,0.4994628,0.6142433,0.6563686,0.0557922,0.2644804,456.9513369,560.7039769,0.2474558
5,0.0500079,0.4041167,4.7806171,6.2426229,0.4444444,0.4317861,0.5803642,0.6115588,0.0477002,0.3121806,378.0617074,524.2622889,0.2890446
6,0.1000158,0.2409181,3.5002974,4.8714602,0.3254157,0.3108968,0.4528899,0.4612278,0.1750426,0.4872232,250.0297418,387.1460153,0.4268949
7,0.1500238,0.168879,2.307982,4.0169674,0.2145685,0.2019315,0.3734495,0.3747957,0.1154174,0.6026405,130.7981996,301.6967434,0.4990086
8,0.2000317,0.1271272,1.754407,3.4513273,0.1631037,0.1461189,0.320863,0.3176265,0.0877342,0.6903748,75.4406978,245.132732,0.5406018
9,0.3000079,0.0799125,1.0905487,2.6646088,0.1013861,0.1015172,0.2477234,0.2456091,0.109029,0.7994037,9.0548687,166.4608824,0.5505824
10,0.4000238,0.0522332,0.7920381,2.1964198,0.0736342,0.0650766,0.2041968,0.2004715,0.0792164,0.8786201,-20.7961898,119.6419806,0.527651

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2022-12-18 19:46:10,0.009 sec,0.0,0.2901844,0.3090213,0.5,0.0928231,1.0,0.9071769,0.2903876,0.3093516,0.5,0.092968,1.0,0.907032
,2022-12-18 19:46:15,4.843 sec,5.0,0.2666505,0.2540864,0.8580263,0.4874232,9.465617,0.0967435,0.2701442,0.2597431,0.8456392,0.4217571,8.0779202,0.1154973
,2022-12-18 19:46:19,9.334 sec,10.0,0.2562009,0.2342073,0.8654713,0.5092409,9.7782951,0.0965191,0.2619237,0.2426467,0.8527566,0.4355234,8.0354048,0.1219908
,2022-12-18 19:46:24,13.843 sec,15.0,0.2507,0.2237647,0.8708161,0.5251904,9.9488467,0.0969811,0.258287,0.2348837,0.8544208,0.4429499,8.1629509,0.1064697
,2022-12-18 19:46:28,18.363 sec,20.0,0.2471284,0.2170103,0.8758598,0.5393578,10.1193984,0.0922819,0.2563949,0.2305133,0.8572506,0.4483761,8.1629509,0.110548
,2022-12-18 19:46:33,22.891 sec,25.0,0.2444079,0.2118338,0.8800335,0.5519285,10.2330995,0.0888499,0.2553818,0.2278675,0.8588366,0.4509958,8.3330124,0.10742
,2022-12-18 19:46:37,27.447 sec,30.0,0.2422014,0.2080154,0.8837748,0.5641266,10.3610132,0.0905659,0.2548574,0.2264858,0.860313,0.4523859,8.1204355,0.1003326
,2022-12-18 19:46:42,31.994 sec,35.0,0.2402975,0.2046403,0.8871691,0.5748916,10.3894385,0.0918463,0.2545918,0.2254865,0.8613732,0.4533294,8.2479816,0.1087266
,2022-12-18 19:46:46,36.542 sec,40.0,0.2385552,0.2017979,0.8899837,0.5854377,10.4747143,0.0931399,0.2544844,0.2249159,0.862343,0.4529657,8.1204355,0.1125673
,2022-12-18 19:46:51,41.038 sec,45.0,0.2368267,0.1990782,0.8928481,0.596765,10.5315649,0.0871339,0.2545359,0.2246283,0.8629696,0.4516406,8.1204355,0.1075388

variable,relative_importance,scaled_importance,percentage
qn25_2.0,2496.5175781,1.0,0.2591823
qn32_2.0,421.6687317,0.1689028,0.0437766
qn22_1.0,379.0716248,0.1518402,0.0393543
qn49_1.0,290.4299927,0.1163340,0.0301517
qn24_2.0,269.5497742,0.1079703,0.0279840
qn20_1.0,243.4151154,0.0975019,0.0252708
qn19_1.0,233.3875122,0.0934852,0.0242297
qn53_1.0,226.9490204,0.0909062,0.0235613
qn52_1.0,160.5544281,0.0643114,0.0166684
qn15_1.0,147.6155548,0.0591286,0.0153251


In [30]:
# Show the parameter dictionary of the AutoML leader model
# (each entry lists 'default', 'actual' and 'input' values).
aml.leader.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'GBM_5_AutoML_2_20221218_193524',
   'type': 'Key<Model>',
   'URL': '/3/Models/GBM_5_AutoML_2_20221218_193524'},
  'input': None},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_2_20221218_193524_training_py_1_sid_96fe',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_2_20221218_193524_training_py_1_sid_96fe'},
  'input': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_2_20221218_193524_training_py_1_sid_96fe',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_2_20221218_193524_training_py_1_sid_96fe'}},
 'validation_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type':

In [31]:
# Save the leader model to disk and print the path returned by H2O.
# force=True is passed so an existing file of the same name is presumably overwritten.
# NOTE(review): /tmp/mymodel is outside the Drive project folder selected
# earlier - presumably it is lost when the Colab runtime is recycled; confirm.
model_path = h2o.save_model(model=aml.leader, path="/tmp/mymodel", force=True)
print(model_path)

/tmp/mymodel/GBM_5_AutoML_2_20221218_193524


### Examining final model

In [32]:
# Pool the per-state held-out test pieces into a single feature table and a
# single label series. One concat per output replaces the previous
# concat-inside-a-loop, which re-copied all accumulated rows on every step;
# row order and index labels are unchanged.
X_test = pd.concat([X_part for X_part, _ in state_data_test])
y_test = pd.concat([y_part for _, y_part in state_data_test])

In [33]:
# Drop the first four columns, mirroring the `iloc[:,4:]` slice applied to the
# training and validation features.
# NOTE(review): X_test is overwritten with its own slice, so running this cell
# twice removes four more columns - re-create X_test before re-running.
X_test = X_test.iloc[:,4:]

In [None]:
# Convert the pandas test features into an H2OFrame for scoring.
# NOTE(review): the name X_test changes type here (DataFrame -> H2OFrame), so
# re-running the cell passes an H2OFrame back into the constructor.
X_test = h2o.H2OFrame(X_test)

In [35]:
# Score the pooled test features with the AutoML leader model; the result
# includes a 'p1' column that the following cells use as the score.
y_pred = aml.leader.predict(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [42]:
# Preview the 'p1' column (presumably the predicted probability of class 1).
y_pred['p1']

p1
0.344594
0.123808
0.081307
0.142592
0.0764039
0.172917
0.434619
0.328944
0.117184
0.406527


In [44]:
# ROC curve points for the leader model on the pooled test set; the H2O column
# is pulled back into pandas before being handed to scikit-learn.
fpr,tpr, _ = metrics.roc_curve(y_test, y_pred['p1'].as_data_frame())

In [45]:
# Area under the ROC curve of the leader model on the pooled test set.
metrics.roc_auc_score(y_test, y_pred['p1'].as_data_frame())

0.8640388868734653

## Feature Engineering

In [11]:
!pip install autofeat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autofeat
  Downloading autofeat-2.0.10-py3-none-any.whl (24 kB)
Collecting pint
  Downloading Pint-0.20.1-py3-none-any.whl (269 kB)
[K     |████████████████████████████████| 269 kB 7.1 MB/s 
Installing collected packages: pint, autofeat
Successfully installed autofeat-2.0.10 pint-0.20.1


In [23]:
from autofeat import FeatureSelector, AutoFeatRegressor

# Draw a 20% stratified subsample of the pooled training data for feature
# selection (first four columns skipped, as elsewhere). random_state reuses the
# notebook-wide `seed`, so the subsample - and therefore the selected
# features - are reproducible across runs.
X, _, y, _ = sklearn.model_selection.train_test_split(
    X_train.iloc[:,4:], y_train, train_size = 0.2, stratify=y_train,
    random_state=seed)

# Remove columns that hold a single distinct value in the subsample.
nunique = X.nunique()
cols_to_drop = nunique[nunique == 1].index
X = X.drop(columns = cols_to_drop)

In [24]:
# Run autofeat's feature selector (classification mode) on the subsample and
# keep the transformed table.
# NOTE(review): the recorded run was stopped with a KeyboardInterrupt during
# selection run 1/5, so `new_X` was never assigned in that session.
fsel = FeatureSelector(verbose=1, problem_type="classification")
new_X = fsel.fit_transform(X, y)

[featsel] Scaling data...done.
[featsel] Feature selection run 1/5


KeyboardInterrupt: ignored

In [None]:
# Compare the column set before and after feature selection.
# Requires the feature-selection cell above to have run to completion,
# otherwise `new_X` is undefined.
print(X.columns)
print(new_X.columns)