In [1]:
import os
import sys
from copy import deepcopy
import numpy as np
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
import shap

In [2]:
# start a h2o instance to fit GLM models
# pick settings so things run fast but don't use all system resources
h2o.init(nthreads = 3, max_mem_size = "8G")

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_222"; OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10); OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
  Starting server from /home/aaron/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp8kau7bn7
  JVM stdout: /tmp/tmp8kau7bn7/h2o_aaron_started_from_python.out
  JVM stderr: /tmp/tmp8kau7bn7/h2o_aaron_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Australia/Sydney
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,10 days
H2O cluster name:,H2O_from_python_aaron_cs4g0y
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,3


In [3]:
def import_data(data):    
    return pd.read_csv("train_test_data/" + data + ".csv", index_col="uid").drop("Unnamed: 0", axis="columns")        
X_train = import_data("X_train")
X_test = import_data("X_test")
y_train = import_data("y_train")
y_test = import_data("y_test")

y_train = y_train[[e for e in y_train.columns if 'post' in e]]
y_test  =  y_test[[e for e in y_test.columns  if 'post' in e]]
train_cols = X_train.columns.tolist()

In [None]:
train = pd.concat([y_train, X_train], axis = 1)
train = train.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])
test  = pd.concat([y_test, X_test], axis = 1)
test = test.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])

In [None]:
# h2o likes to convert mostly na values into categories. so
# we copy the pandas type mapping across
col_types = dict(train.dtypes)
replacements = {'float64': 'real',
                'int64': 'int'}
for e in col_types:
    col_types[e] = replacements[str(col_types[e])]
 
train_h2o = h2o.H2OFrame(train, column_types = col_types)
test_h2o  = h2o.H2OFrame(test, column_types = col_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
all_models = dict()
best_models = dict()
for target in y_train.columns:
    print(f"searching for best model for target {target}")
    if 'class' in target: 
        families = ['binomial']
        metric_name = 'logloss'
    else:
        #TODO: tweedie_variance_power and tweedie_link_power (for tweedie) to work
        families = ["gaussian", "gamma"]
        metric_name = 'mse'
    output_models = dict()
    best_metic_value = np.Inf
    best_family = None
    best_model = None
    for family in families:
        print(f"searching in family {family}")
        if family == "tweedie":
            hyper_parameters = {'alpha': list(np.arange(0,1.05,0.05)),
    #                            'tweedie_variance_power': list(np.arange(1.05, 2, 0.5))
                               }
        if family == "negativebinomial":
            hyper_parameters = {'alpha': list(np.arange(0,1.05,0.05)),
    #                            'theta': list(np.arange(1.05, 2, 0.5))
                               }
        else:
            hyper_parameters = {'alpha': list(np.arange(0,1.05,0.05))}

        gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family = family, nfolds = 5, seed = 20191106), 
                           hyper_parameters)

        gs.train(y = target, x = train_cols, training_frame = train_h2o)
        glm_grid_models = gs.get_grid(sort_by = 'mse')
        # keep track of all models
        if family == "tweedie":
            output_models[family] = pd.DataFrame(
                {'response': target,
                 'family': family,
                 'alpha': [glm_grid_models.get_hyperparams(e)[0] for e in range(len(hyper_parameters['alpha']))],
    #              'tweedie_power': [glm_grid_models.get_hyperparams(e)[0] for e in range(len(hyper_parameters['tweedie_power']))],
                 'metric_name': metric_name,
                 'metric_value': list(glm_grid_models.get_grid(sort_by="mse").mse(xval=True).values())})
        else:
            output_models[family] = pd.DataFrame(
                {'response': target,
                 'family': family,
                 'alpha': [glm_grid_models.get_hyperparams(e)[0] for e in range(len(hyper_parameters['alpha']))],
                 'metric_name': metric_name,
                 'metric_value': list(glm_grid_models.get_grid(sort_by="mse").mse(xval=True).values())})
    #     sys.stdout = sys.__stdout__
        # keep track of best model

        family_best_model = glm_grid_models.models[0]

        if 'class' in target:
            if family_best_model.logloss(xval=True) < best_metic_value:
                print(f"new best model is {family}")
                best_model = family_best_model
                best_metic_value = family_best_model.logloss(xval=True)
                best_family = family
        else:
            if family_best_model.mse(xval=True) < best_metic_value:
                print(f"new best model is {family}")
                best_model = family_best_model
                best_metic_value = family_best_model.mse(xval=True)
                best_family = family
    h2o.save_model(model=best_model, path=f"./h2o_models/{target}", force=True)
    all_models[target] = deepcopy(output_models)
    best_models[target] = {'best_model': best_model,
                           'metric_value': metric_value,
                           'best_metic_value': best_metic_value,
                           'best_family': best_family}

searching for best model for target panas_pos_raw_post
searching in family gaussian
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
new best model is gaussian
searching in family gamma
glm Grid Build progress: |███████████████

In [None]:
h2o.save_model(model=best_model, path=f"./h2o_models/{target}", force=True)

In [None]:
target

In [None]:
# all_models = pd.concat(all_models, ignore_index = True)
# all_models.sort_values(by=['metric_value', 'best_metric_value', inplace = True)
for e in all_models:
    all_models[e] = pd.concat(all_models[e], ignore_index = True)
all_models = pd.concat(all_models, ignore_index = True)
all_models.sort_values(by=['response', 'best_metric_value'], inplace = True)
all_models                      

In [None]:
all_models