# Analyzing the performance of models on the same dataset

In [None]:
!nvidia-smi

Wed Dec 21 19:23:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install ray

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray
  Downloading ray-2.2.0-cp38-cp38-manylinux2014_x86_64.whl (57.4 MB)
[K     |████████████████████████████████| 57.4 MB 1.2 MB/s 
[?25hCollecting virtualenv>=20.0.24
  Downloading virtualenv-20.17.1-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 64.1 MB/s 
Collecting distlib<1,>=0.3.6
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 46.9 MB/s 
Installing collected packages: distlib, virtualenv, ray
Successfully installed distlib-0.3.6 ray-2.2.0 virtualenv-20.17.1


In [12]:
# importing the libraries

import pandas as pd
import numpy as np
import copy

# for visualization

import plotly.express as px
import matplotlib.pyplot as plt

# hypeparameter tuning

import ray
from ray import tune
from ray.tune.schedulers import HyperBandScheduler

In [13]:
# models

import statsmodels.api as sm
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import xgboost

In [14]:
# dict used to convert state to abbrev

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}


In [16]:
# Reading the training data

data = pd.read_csv('state_data_2019_processed_training.csv')

states = data.sitecode.unique()

In [17]:
# Creating the training/testing sets

state_data_t = []

state_data_test = []

seed = 100

for state in states:

    state_data = data[data.sitecode == state]
    X_s = state_data.iloc[:,:-1]
    y_s = state_data.iloc[:,-1]
    X_tr, X_val, y_tr, y_val = sklearn.model_selection.train_test_split(
        X_s, y_s, test_size=0.4, random_state=seed, stratify=y_s)
    X_val, X_te, y_val, y_te = sklearn.model_selection.train_test_split(
        X_val, y_val, test_size=0.5, random_state=seed, stratify=y_val)
    state_data_t.append([X_tr, X_val, y_tr, y_val])
    state_data_test.append((X_te, y_te))

X_train, X_val, y_train, y_val = state_data_t[0]

for i in range(1, len(state_data_t)):
    X_t,X_v,y_t,y_v = state_data_t[i]
    X_train = pd.concat([X_train, X_t])
    y_train = pd.concat([y_train, y_t])
    X_val = pd.concat([X_val, X_v])
    y_val = pd.concat([y_val, y_v])

In [18]:
state_data_t

state_data_test


[(     sitecode   weight  stratum  PSU  age_12 years old or younger  \
  4      Alaska  21.7647        2    2                            0   
  1193   Alaska   2.7846       11    7                            0   
  1438   Alaska   2.7164        7   12                            0   
  1742   Alaska   4.2704        7   11                            0   
  75     Alaska   2.8208        7   14                            0   
  ...       ...      ...      ...  ...                          ...   
  1066   Alaska   2.9296        7    1                            0   
  1649   Alaska   5.4174       13    5                            0   
  370    Alaska   5.8406       21    3                            0   
  248    Alaska  17.9232        2    2                            0   
  545    Alaska  33.0762       19    1                            0   
  
        age_13 years old  age_14 years old  age_15 years old  age_16 years old  \
  4                    0                 0                 0   

In [None]:
# functions for visualizing the results

def map(model, race = 'all', exclude = None, data = state_data_t):

    auc_all = pd.DataFrame({'state': [us_state_to_abbrev[s] for s in states], 'auc': np.zeros(len(states))})

    for i, state in enumerate(states):
        _, X_test, _, y_test = data[i]
        if(race != 'all'):
            if(race == 'black'):
                index = X_test['race4_Black or African American'] == 1
            if(race == 'hispanic'):
                index = X_test['race4_Hispanic/Latino'] == 1
            X_test = X_test.loc[index]
            y_test = y_test.loc[index]
        if(exclude != None):
            X_test = X_test.drop(columns = exclude)
        y_pred = model.predict_proba(X_test.iloc[:,4:])
        fpr, tpr,_ = metrics.roc_curve(y_test, y_pred[:,1])
        auc = metrics.auc(fpr,tpr)
        auc_all.iloc[i,1] = auc

    fig = px.choropleth(auc_all,
                        locations='state', 
                        locationmode="USA-states", 
                        scope="usa",
                        color='auc',
                        color_continuous_scale="Viridis_r",
                        range_color = [0.6,1] 
                        )
    return fig, auc_all

def count_race(data = state_data_t):
    race = pd.DataFrame({
        'others': np.zeros(len(states)),
        'black': np.zeros(len(states)),
        'hispanic': np.zeros(len(states)),
        'white': np.zeros(len(states)),
        'missing': np.zeros(len(states))
    }, index = states)
    for i, state in enumerate(states):
        _, X_test, _, _ = data[i]
        race.iloc[i,0] = X_test['race4_All other races'].sum()
        race.iloc[i,1] = X_test['race4_Black or African American'].sum()
        race.iloc[i,2] = X_test['race4_Hispanic/Latino'].sum()
        race.iloc[i,3] = X_test['race4_White'].sum()
        race.iloc[i,4] = X_test['race4_All other races'].count() - \
            race.iloc[i,0] - race.iloc[i,1] -race.iloc[i,2] -race.iloc[i,3]
    
    return race

## Logisitc Model

In [None]:
logistic_model = LogisticRegression(
    penalty='l2',
    max_iter=200
)
logistic_model = logistic_model.fit(X_train.iloc[:,4:], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
logistic_model_map, logistic_auc = map(logistic_model)

In [None]:
logistic_model_map.show()

## Random Forest Model

In [None]:
!pip install tune_sklearn
!pip install hpbandster ConfigSpace

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tune_sklearn
  Downloading tune_sklearn-0.4.5-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 125 kB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 14.7 MB/s 
Installing collected packages: tensorboardX, tune-sklearn
Successfully installed tensorboardX-2.5.1 tune-sklearn-0.4.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hpbandster
  Downloading hpbandster-0.7.4.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 144 kB/s 
[?25hCollecting ConfigSpace
  Downloading ConfigSpace-0.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.7 MB)
[K     |████████████████████████████████| 5.7 MB 35.6 MB/s 
[?25hCollecting Pyro4
  Downloading Pyro4-4.82-py2.py3-none-any.whl (89 k

In [None]:
# Hyperparameter tuning using grid search

from tune_sklearn import TuneSearchCV
from tune_sklearn import TuneGridSearchCV

params = {
    'max_depth': [10,20,30,40],
    'min_samples_leaf' : [5,10,15,20],
    'min_samples_split': [10,20,30,40],
    'max_features' : [40,80,120,160],
}

In [None]:
tune_grid_search = TuneGridSearchCV(
    RandomForestClassifier(
        n_estimators = 200,
        random_state = seed, 
        criterion = 'gini'
        ),
    params,
    scoring = 'roc_auc',
    verbose=2,
    n_jobs = -1,
    use_gpu=True,
    early_stopping="MedianStoppingRule",
    max_iters=10
)

result_grid = tune_grid_search.fit(X_train.iloc[:,4:], y_train)

2022-12-21 19:28:23,000	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Current time:,2022-12-21 22:58:18
Running for:,03:29:49.42
Memory:,2.8/12.7 GiB

Trial name,status,loc,max_depth,max_features,min_samples_leaf,min_samples_split,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_a3761_00157,RUNNING,172.28.0.12:4264,20,160,10,30,,,,,
_Trainable_a3761_00000,PAUSED,172.28.0.12:4264,10,40,5,10,4.0,71.0115,0.747724,0.819298,0.847283
_Trainable_a3761_00001,PAUSED,172.28.0.12:4264,20,40,5,10,3.0,78.9071,0.740719,0.80693,0.839777
_Trainable_a3761_00002,PAUSED,172.28.0.12:4264,30,40,5,10,3.0,83.9231,0.736987,0.806285,0.839776
_Trainable_a3761_00003,PAUSED,172.28.0.12:4264,40,40,5,10,3.0,84.6289,0.737918,0.806538,0.839999
_Trainable_a3761_00004,PAUSED,172.28.0.12:4264,10,80,5,10,3.0,84.9807,0.73379,0.814938,0.843574
_Trainable_a3761_00005,PAUSED,172.28.0.12:4264,20,80,5,10,2.0,92.9576,0.726765,0.800979,0.835654
_Trainable_a3761_00006,PAUSED,172.28.0.12:4264,30,80,5,10,2.0,98.9647,0.719146,0.800184,0.838892
_Trainable_a3761_00158,PENDING,,30,160,10,30,,,,,
_Trainable_a3761_00159,PENDING,,40,160,10,30,,,,,


Trial name,average_test_score,objective,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
_Trainable_a3761_00000,0.820195,0.820195,0.747724,0.819298,0.847283,0.806523,0.880147
_Trainable_a3761_00001,0.809556,0.809556,0.740719,0.80693,0.839777,0.789074,0.871279
_Trainable_a3761_00002,0.808084,0.808084,0.736987,0.806285,0.839776,0.788677,0.868694
_Trainable_a3761_00003,0.808606,0.808606,0.737918,0.806538,0.839999,0.788767,0.869806
_Trainable_a3761_00004,0.812495,0.812495,0.73379,0.814938,0.843574,0.789195,0.880977
_Trainable_a3761_00005,0.801305,0.801305,0.726765,0.800979,0.835654,0.77252,0.870605
_Trainable_a3761_00006,0.80162,0.80162,0.719146,0.800184,0.838892,0.780082,0.869795
_Trainable_a3761_00007,0.803198,0.803198,0.728192,0.797745,0.838094,0.782902,0.869055
_Trainable_a3761_00008,0.807904,0.807904,0.725033,0.812814,0.84325,0.777842,0.880582
_Trainable_a3761_00009,0.790211,0.790211,0.722688,0.793151,0.822074,0.750228,0.862917



Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid va

In [4]:
pip install tune_search

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement tune_search (from versions: none)[0m
[31mERROR: No matching distribution found for tune_search[0m


In [3]:
from tune_search import TuneSearchCV



ModuleNotFoundError: ignored

In [1]:
tune_search = TuneSearchCV(
    RandomForestClassifier(
        n_estimators = 200,
        random_state = seed, 
        criterion = 'gini'
        ),
    params,
    scoring = 'roc_auc',
    verbose=2,
    n_jobs = -1,
    early_stopping="MedianStoppingRule",
    n_trials=20,
    max_iters=10,
    search_optimization="bohb"
)

result = tune_search.fit(X_train.iloc[:,4:], y_train)

NameError: ignored

In [None]:
tune_grid_search.best_params

In [None]:
tune_search.best_params

In [None]:
rf_model = RandomForestClassifier(
    max_depth=30,
    max_features=40, 
    min_samples_leaf=20,
    min_samples_split=10,
    n_estimators=200,
    random_state = seed, 
    criterion = 'gini'
)
rf_model.fit(X_train.iloc[:,4:], y_train)

In [None]:
rf_model_map, rf_auc = map(rf_model)

In [None]:
rf_model_map.show()

## XGBoost Model

### Hyperparameter tuning

In [6]:
pip install ray 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray
  Downloading ray-2.2.0-cp38-cp38-manylinux2014_x86_64.whl (57.4 MB)
[K     |████████████████████████████████| 57.4 MB 17.3 MB/s 
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.17.1-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 25.8 MB/s 
Collecting distlib<1,>=0.3.6
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 40.9 MB/s 
Installing collected packages: distlib, virtualenv, ray
Successfully installed distlib-0.3.6 ray-2.2.0 virtualenv-20.17.1


In [7]:
pip install tune

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tune
  Downloading tune-0.1.2-py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 4.2 MB/s 
[?25hCollecting fugue>=0.7.0
  Downloading fugue-0.7.3-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 38.3 MB/s 
Collecting qpd>=0.3.1
  Downloading qpd-0.3.4-py3-none-any.whl (187 kB)
[K     |████████████████████████████████| 187 kB 62.4 MB/s 
Collecting fugue-sql-antlr>=0.1.1
  Downloading fugue-sql-antlr-0.1.1.tar.gz (153 kB)
[K     |████████████████████████████████| 153 kB 60.7 MB/s 
[?25hCollecting triad>=0.6.9
  Downloading triad-0.7.0-py3-none-any.whl (68 kB)
[K     |████████████████████████████████| 68 kB 7.7 MB/s 
[?25hCollecting adagio>=0.2.4
  Downloading adagio-0.2.4-py3-none-any.whl (26 kB)
Collecting antlr4-python3-runtime<4.12,>=4.11.1
  Downloading antlr4_python3_runtime-4.11.1-py3-none-any.whl (144 kB)
[K     |███

In [10]:
import tune
import random
import xgboost as xgb
from ray.tune.integration.xgboost import TuneReportCheckpointCallback

def data_loader():
    return (X_train.iloc[:,4:].values,y_train.values), (X_val.iloc[:,4:].values,y_val.values)
def train_data(config,data):
    t1, t2 = data
    train_set = xgb.DMatrix(t1[0], label = t1[1])
    val_set = xgb.DMatrix(t2[0], label = t2[1])
    results = {}
    bst = xgb.train(
        config,
        train_set,
        num_boost_round = 50,
        evals = [(val_set, 'eval')],
        evals_result = results,
        verbose_eval = False,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")]
    )

config = {
          "objective": "binary:logistic",
          "tree_method": "gpu_hist",
          "eval_metric": ["auc"],
          "max_depth": tune.randint(5,12),
          "min_child_weight": tune.randint(1,5),
          "colsample_bytree": tune.uniform(0.5, 1.0),
          "eta": tune.loguniform(1e-3, 1e-1),
          "reg_lambda": tune.uniform(0.1, 5),
          "reg_alpha": tune.uniform(0.1, 5),
          "n_estimators" : 200,
          "seed": 0
}

AttributeError: ignored

In [11]:
t1,t2 = data_loader()

analysis = tune.run(
    tune.with_parameters(train_data, data = (t1,t2)),
    resources_per_trial = {"gpu":1},
    config = config,
    num_samples = 50,
    metric='eval-auc',
    mode="max",
    stop={
        "eval-auc": 0.90,
        "training_iteration": 50
    },
)

NameError: ignored

In [None]:
t1,t2 = data_loader()

from ray.tune.schedulers import HyperBandScheduler

hyperband_scheduler = HyperBandScheduler(
    time_attr='training_iteration',
    metric='eval-auc',
    mode='max',
    max_t=50,
    reduction_factor=3)


analysis_hyper = tune.run(
    tune.with_parameters(train_data, data = (t1,t2)),
    resources_per_trial = {"gpu":1},
    config = config,
    num_samples = 50,
    scheduler = hyperband_scheduler
)

In [None]:
analysis.get_best_trial(metric = 'eval-auc', mode = 'max').last_result

In [None]:
analysis_hyper.get_best_trial(metric = 'eval-auc', mode = 'max').last_result

### Helper Function

In [None]:
### Helper function

def map_xgb(model, race = 'all', exclude = None, data = state_data_t):

    auc_all = pd.DataFrame({'state': [us_state_to_abbrev[s] for s in states], 'auc': np.zeros(len(states))})

    for i, state in enumerate(states):
        _, X_test, _, y_test = data[i]
        if(race != 'all'):
            if(race == 'black'):
                index = X_test['race4_Black or African American'] == 1
            if(race == 'hispanic'):
                index = X_test['race4_Hispanic/Latino'] == 1
            X_test = X_test.loc[index]
            y_test = y_test.loc[index]
        if(exclude != None):
            X_test = X_test.drop(columns = exclude)
        X_test = xgb.DMatrix(X_test.iloc[:,4:].values)
        y_pred = model.predict(X_test)
        fpr, tpr,_ = metrics.roc_curve(y_test, y_pred)
        auc = metrics.auc(fpr,tpr)
        auc_all.iloc[i,1] = auc

    fig = px.choropleth(auc_all,
                        locations='state', 
                        locationmode="USA-states", 
                        scope="usa",
                        color='auc',
                        color_continuous_scale="Viridis_r",
                        range_color = [0.6,1] 
                        )
    return fig, auc_all

### Model

In [None]:
config =  {{'objective': 'binary:logistic',
  'tree_method': 'gpu_hist',
  'eval_metric': ['auc'],
  'max_depth': 10,
  'min_child_weight': 3,
  'colsample_bytree': 0.5020551988135382,
  'eta': 0.029222812929186416,
  'reg_lambda': 0.36756709262675025,
  'reg_alpha': 3.7137701473201687,
  'n_estimators': 200,
  'seed': 0}}

In [None]:
train_set = xgb.DMatrix(X_train.iloc[:,4:].values,y_train.values)
val_set = xgb.DMatrix(X_val.iloc[:,4:].values,y_val.values)
results = {}
xgb_model = xgb.train(
    config,
    train_set,
    num_boost_round = 50,
    evals = [(val_set, 'eval')],
    evals_result = results,
    early_stopping_rounds=2,
)

In [None]:
xgb_model_map, xgb_auc = map_xgb(xgb_model)

In [None]:
xgb_model_map

## Neural Network Model

### Finding the best hyperparameters & structure

In [None]:
#Hyperparamter tuning

import tensorflow as tf
import math

from keras.models import Sequential
from keras.layers import Dense
from keras import initializers
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping

from ray.tune.integration.keras import TuneReportCallback
from ray.tune.schedulers import HyperBandScheduler

def data_loader():
    return (X_train.iloc[:,4:].values,y_train.values), (X_val.iloc[:,4:].values,y_val.values)

def lr_step_decay(epoch, lr):
    drop_rate = 0.5
    epochs_drop = 5.0
    return 0.01 * math.pow(drop_rate, math.floor(epoch/epochs_drop))

def train_data(config,data):

    t1, t2 = data

    X_train = t1[0]
    y_train = t1[1]
    X_test = t2[0]
    y_test = t2[1]

    n_units = config['units']
    n_layers = config['layers']
    activation = config['activation']
    if(activation == 'tanh' or activation == 'sigmoid'):
        initializer = 'glorot_uniform'
    else:
        initializer = 'he_normal'

    model = Sequential()

    model.add(Dense(units = n_units, 
                    input_dim=X_train.shape[1], 
                    activation= activation,
                    kernel_initializer= initializer, 
                    name='h1'))
    
    for i in range(2, n_layers + 1):
        model.add(Dense(units= n_units, 
                        activation= activation,
                        kernel_initializer= initializer,  
                        name='h{}'.format(i)))
        
    model.add(Dense(units=1, activation='sigmoid', kernel_initializer=initializer, name='o'))

    model.compile(
        loss="binary_crossentropy", 
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.01), 
        metrics=['AUC'])

    model.fit(
        X_train,
        y_train,
        batch_size=128,
        epochs=50,
        verbose=0,
        validation_data=(X_test, y_test),
        callbacks=[TuneReportCallback({
            "mean_auc": "val_auc"
            }),
            LearningRateScheduler(
                lr_step_decay, verbose=0
            ),
            EarlyStopping(
                monitor='val_auc', 
                patience=6
            )
        ])


### GridSearch

In [None]:
t1,t2 = data_loader()

config = {
          'units': tune.grid_search([10, 20, 40, 80]),
          'layers': tune.grid_search([2, 4, 6, 8, 10]),
          'activation': tune.grid_search(['ReLU','tanh','sigmoid'])
}

analysis = tune.run(
    tune.with_parameters(train_data, data = (t1,t2)),
    resources_per_trial = {"gpu":1},
    config = config,
    metric='mean_auc',
    mode="max"
)

In [None]:
analysis.get_best_trial(metric = 'mean_auc', mode = 'max').last_result

### Hyperband

In [None]:
t1,t2 = data_loader()

from ray.tune.schedulers import HyperBandScheduler

hyperband_scheduler = HyperBandScheduler(
    time_attr='training_iteration',
    metric='mean_auc',
    mode='max',
    max_t=50,
    reduction_factor=3)

config = {
          'units': tune.grid_search([10, 20, 40, 80]),
          'layers': tune.grid_search([2, 4, 6, 8, 10]),
          'activation': tune.grid_search(['ReLU','tanh','sigmoid'])
}

analysis_hyper = tune.run(
    tune.with_parameters(train_data, data = (t1,t2)),
    resources_per_trial = {"gpu":1},
    config = config,
    scheduler = hyperband_scheduler
)

In [None]:
analysis_hyper.get_best_trial(metric = 'mean_auc', mode = 'max').last_result

### Model

AutoML

In [None]:
import h2o

# Start an H2O cluster
h2o.init()

# Load the data into an H2OFrame
data = h2o.import_file("/content/state_data_2019_processed_training.csv")

# Split the data into training and testing sets
train, test = data.split_frame(ratios=[0.8])

# Specify the target and predictor columns
x = data.columns[:-1]
y = data.columns[-1]

# Run the AutoML function
aml = h2o.automl.H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train, validation_frame=test)

# View the AutoML leaderboard
lb = aml.leaderboard
print(lb)

# Get the best model
best_model = h2o.get_model(lb[0, "model_id"])

# Make predictions on the test set using the best model
predictions = best_model.predict(test)
