In [23]:
import os
import sys
import pickle
from copy import deepcopy
from collections import defaultdict

import numpy as np
import pandas as pd

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
import shap

In [2]:
# start a h2o instance to fit GLM models
# pick settings so things run fast but don't use all system resources
h2o.init(nthreads = 3, max_mem_size = "12G")

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)56-b12)
  Starting server from S:\ProgramData\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\AARONB~1\AppData\Local\Temp\tmpj0okuy9f
  JVM stdout: C:\Users\AARONB~1\AppData\Local\Temp\tmpj0okuy9f\h2o_Aaron_Blackwell_started_from_python.out
  JVM stderr: C:\Users\AARONB~1\AppData\Local\Temp\tmpj0okuy9f\h2o_Aaron_Blackwell_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Australia/Sydney
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,16 days
H2O cluster name:,H2O_from_python_Aaron_Blackwell_qcaavi
H2O cluster total nodes:,1
H2O cluster free memory:,10.67 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,3


In [48]:
def import_data(data):    
    return pd.read_csv("train_test_data/" + data + ".csv", index_col="uid").drop("Unnamed: 0", axis="columns")        
X_train = import_data("X_train")
X_test = import_data("X_test")
y_train = import_data("y_train")
y_test = import_data("y_test")

y_train = y_train[[e for e in y_train.columns if 'post' in e]]
y_test  =  y_test[[e for e in y_test.columns  if 'post' in e]]
train_cols = X_train.columns.tolist()

In [49]:
# remove rows where targets all all NA
train = pd.concat([y_train, X_train], axis = 1)
train = train.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])
test  = pd.concat([y_test, X_test], axis = 1)
test = test.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])

In [50]:
# h2o likes to convert mostly na values into categories. so
# we copy the pandas type mapping across
col_types = dict(train.dtypes)
replacements = {'float64': 'real',
                'int64': 'int'}
for e in col_types:
    col_types[e] = replacements[str(col_types[e])]
    
train_h2o = h2o.H2OFrame(train, column_types = col_types)
test_h2o  = h2o.H2OFrame(test, column_types = col_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
all_models = dict()
best_models = dict()
# restrict search to only imputated cases where imputation actually occurs
targets = ['flourishing_scale_raw_class_post', 
           'flourishing_scale_raw_post', 
           'panas_neg_raw_class_post', 
           'panas_neg_raw_post',
           'panas_pos_raw_class_post',
           'panas_pos_raw_post',
           'panas_pos_imp_class_post',
           'panas_pos_imp_post',
          ]
for target in targets:
    print(f"searching for best model for target {target}")
    if 'class' in target: 
        families = ['binomial']
        metric_name = 'logloss'
    else:
        #TODO: tweedie_variance_power and tweedie_link_power (for tweedie) to work
        families = ["gaussian", "tweedie", "gamma", "poisson", "negativebinomial"]
        metric_name = 'mse'
    output_models = defaultdict(pd.DataFrame)
    best_metic_value = np.Inf
    best_family = None
    best_model = None
    for features in ['all', 'wk_10', 'wk_9-10']:
        if features == 'all':
            x_cols = train_cols
        elif features == 'wk_10':
            x_cols = [e for e in train_cols if 'wk_10' in e]
        elif features == 'wk_9-10':
            x_cols = [e for e in train_cols if 'wk_9' in e or 'wk_10' in e]
        else:
            raise ValueError('feature set not encoded')
        for family in families:
            print(f"searching for best model in {family} family")
            hyper_parameters = {'alpha': list(np.arange(0,1.1,0.1))}

            # h2o grid search doesn't support searching tweedie distribution over the 
            # space of canonical link functions so we define a custom search to support
            # this
            if family == "tweedie":
                # define a simple space (noting that both Guassian, Poisson and Gamma)
                # are already covered in other cases
                tweedie_variance_powers = [1.1, 1.3, 1.5, 1.7, 1.9]
            else:
                tweedie_variance_powers = [0]
            if family == "negativebinomial":
                hyper_parameters['theta'] = [1e-10, 1e-8, 1e-4, 1e-2, 0.1, 0.5, 1]

            for vp in tweedie_variance_powers:
                h2o_glm = H2OGeneralizedLinearEstimator(family = family, nfolds = 5, seed = 20191106, 
                                                        # tweedie parameters are ignored if not tweedie distn.
                                                        tweedie_variance_power = vp,
                                                        tweedie_link_power = 1.0 - vp)
                gs = H2OGridSearch(h2o_glm, hyper_parameters)

                gs.train(y = target, x = x_cols, training_frame = train_h2o)
                glm_grid_models = gs.get_grid(sort_by = 'mse')

                num_models = len(list(glm_grid_models.get_grid()))

                model_results = {
                    'response': target,
                    'family': family,
                    'alpha': [glm_grid_models.get_hyperparams(e)[0] for e in range(num_models)],
                    'metric_name': metric_name,
                    'metric_value': list(glm_grid_models.get_grid(sort_by="mse").mse(xval=True).values()),
                    'features': features
                }
                if family == "tweedie":
                    model_results['tweedie_power'] = vp
                elif family == "negativebinomial":
                    model_results['theta'] = [glm_grid_models.get_hyperparams(e)[1] for e in range(num_models)]
                # keep track of all models
                output_models[family] = output_models[family].append(pd.DataFrame(model_results), ignore_index = True)

                family_best_model = glm_grid_models.models[0]

                if 'class' in target:
                    if family_best_model.logloss(xval=True) < best_metic_value:
                        print(f"!! new best model is {family} with {features} features !!")
                        best_model = family_best_model
                        best_metic_value = family_best_model.logloss(xval=True)
                        best_family = family
                else:
                    if family_best_model.mse(xval=True) < best_metic_value:
                        print(f"!! new best model is {family} with {features} features !!")
                        best_model = family_best_model
                        best_metic_value = family_best_model.mse(xval=True)
                        best_family = family
    all_models[target] = deepcopy(output_models)
    h2o.save_model(model=best_model, path=f"./fitted_models/h2o_glm/{target}", force=True)
    best_models[target] = {'best_model': best_model,
                           'metric_value': metric_name,
                           'best_metic_value': best_metic_value,
                           'best_family': best_family,
                           'features': 'features'}

searching for best model for target flourishing_scale_raw_class_post
searching for best model in binomial family
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
!! new best model is binomial with all features !!
searching for best model in binomial family
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
!! new best model is binomial with wk_10 features !!
searching for best model in

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

In [None]:
with open('fitted_models/h2o_glm/best_models.pkl', 'wb') as out_file:
    pickle.dump(best_models, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# all_models = pd.concat(all_models, ignore_index = True)
# all_models.sort_values(by=['metric_value', 'best_metric_value', inplace = True)
for e in all_models:
    all_models[e] = pd.concat(all_models[e], ignore_index = True)
all_models = pd.concat(all_models, ignore_index = True,sort=False)
all_models.sort_values(by=['response', 'metric_value'], inplace = True)
all_models.to_csv("./fitted_models/h2o_glm/glm_cv_results.csv")
all_models                      

In [None]:
for model in best_models:
    print(f'--------------- {model} ---------------')
    print(best_models[model]['best_model'])

In [None]:
# make predictions for all models
all_predictions = dict()
for model in best_models:
    all_predictions[model] = best_models[model]['best_model'].predict(test_h2o)

In [None]:
# large number of models are just the constant model
# that is using linear regression we can't beat a straight line 
# without further feature engineering
# this makes sense (since non-linear behaviour is expected)
for model in best_models:
    print(f'--------------- {model} ---------------')
    print(all_predictions[model])

In [None]:
for target in best_models:
    print(f'--------------- {target} ---------------')
    print(best_models[target]['best_model'].model_performance(test_data=test_h2o))

In [None]:
all_coefs = list()
for target in best_models:
    all_coefs.append(
        pd.DataFrame.from_dict(best_models[target]['best_model'].coef(),
                               orient='index', 
                               columns = [target])
    )
    
all_coefs = pd.concat(all_coefs, axis = 1)
all_coefs.to_csv("./fitted_models/h2o_glm/glm_coefs.csv")
all_coefs

In [None]:
# TO DO create summary metrics and stuff
# only need to compare hyper-parameters for raw vs imputated at a high level

In [None]:
# get top 10 parameters per model
# auc curve
# pvo
# confusion matrix