In [53]:
import os
import sys
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
import shap

In [2]:
# start a h2o instance to fit GLM models
# pick settings so things run fast but don't use all system resources
h2o.init(nthreads = 3, max_mem_size = "8G")

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_222"; OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10); OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
  Starting server from /home/aaron/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp8kau7bn7
  JVM stdout: /tmp/tmp8kau7bn7/h2o_aaron_started_from_python.out
  JVM stderr: /tmp/tmp8kau7bn7/h2o_aaron_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Australia/Sydney
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,10 days
H2O cluster name:,H2O_from_python_aaron_cs4g0y
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,3


In [3]:
def import_data(data):    
    return pd.read_csv("train_test_data/" + data + ".csv", index_col="uid").drop("Unnamed: 0", axis="columns")        
X_train = import_data("X_train")
X_test = import_data("X_test")
y_train = import_data("y_train")
y_test = import_data("y_test")

y_train = y_train[[e for e in y_train.columns if 'post' in e]]
y_test  =  y_test[[e for e in y_test.columns  if 'post' in e]]
train_cols = X_train.columns.tolist()

In [4]:
train = pd.concat([y_train, X_train], axis = 1)
train = train.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])
test  = pd.concat([y_test, X_test], axis = 1)
test = test.dropna(how = 'all', subset = ['panas_pos_imp_post', 'panas_neg_imp_post', 'panas_pos_imp_post'])

In [5]:
# h2o likes to convert mostly na values into categories. so
# we copy the pandas type mapping across
col_types = dict(train.dtypes)
replacements = {'float64': 'real',
                'int64': 'int'}
for e in col_types:
    col_types[e] = replacements[str(col_types[e])]
    
train_h2o = h2o.H2OFrame(train, column_types = col_types)
test_h2o  = h2o.H2OFrame(test, column_types = col_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [93]:
all_models = dict()
best_models = dict()
# restrict search to only imputated cases where imputation actually occurs
targets = ['flourishing_scale_raw_class_post', 'flourishing_scale_raw_post', 
           'panas_neg_raw_class_post', 'panas_neg_raw_post',
           'panas_pos_raw_class_post', 'panas_pos_raw_post',
           'panas_pos_imp_class_post', 'panas_pos_imp_post',
          ]
for target in targets:
    print(f"searching for best model for target {target}")
    if 'class' in target: 
        families = ['binomial']
        metric_name = 'logloss'
    else:
        #TODO: tweedie_variance_power and tweedie_link_power (for tweedie) to work
        families = ["gaussian", "tweedie", "gamma", "poisson", "negativebinomial"]
        metric_name = 'mse'
    output_models = dict()
    best_metic_value = np.Inf
    best_family = None
    best_model = None
    for family in families:
        print(f"searching for best model in {family} family")
        hyper_parameters = {'alpha': list(np.arange(0,1.1,0.1))}
        
        # h2o grid search doesn't support searching tweedie distribution over the 
        # space of canonical link functions so we define a custom search to support
        # this
        if family == "tweedie":
            # define a simple space (noting that both Guassian, Poisson and Gamma)
            # are already covered in other cases
            tweedie_variance_powers = [1.1, 1.3, 1.5, 1.7, 1.9]
        else:
            tweedie_variance_powers = [0]
        if family == "negativebinomial":
            hyper_parameters['theta'] = [1e-10, 1e-8, 1e-4, 1e-2, 0.1, 0.5, 1]
            
        for vp in tweedie_variance_powers:
            h2o_glm = H2OGeneralizedLinearEstimator(family = family, nfolds = 5, seed = 20191106, 
                                                    # tweedie parameters are ignored if not tweedie distn.
                                                    tweedie_variance_power = vp,
                                                    tweedie_link_power = 1.0 - vp)
            gs = H2OGridSearch(h2o_glm, hyper_parameters)

            gs.train(y = target, x = train_cols, training_frame = train_h2o)
            glm_grid_models = gs.get_grid(sort_by = 'mse')

            num_models = len(list(glm_grid_models.get_grid()))

            model_results = {
                'response': target,
                'family': family,
                'alpha': [glm_grid_models.get_hyperparams(e)[0] for e in range(num_models)],
                'metric_name': metric_name,
                'metric_value': list(glm_grid_models.get_grid(sort_by="mse").mse(xval=True).values())
            }
            if family == "tweedie":
                model_results['tweedie_power'] = vp
            elif family == "negativebinomial":
                model_results['theta'] = [glm_grid_models.get_hyperparams(e)[1] for e in range(num_models)]
            # keep track of all models
            output_models[family] = pd.DataFrame(model_results)

            family_best_model = glm_grid_models.models[0]

            if 'class' in target:
                if family_best_model.logloss(xval=True) < best_metic_value:
                    print(f"!! new best model is {family} !!")
                    best_model = family_best_model
                    best_metic_value = family_best_model.logloss(xval=True)
                    best_family = family
            else:
                if family_best_model.mse(xval=True) < best_metic_value:
                    print(f"!! new best model is {family} !!")
                    best_model = family_best_model
                    best_metic_value = family_best_model.mse(xval=True)
                    best_family = family
            all_models[target] = deepcopy(output_models)
    h2o.save_model(model=best_model, path=f"./fitted_models/h2o_glm/{target}", force=True)
    best_models[target] = {'best_model': best_model,
                           'metric_value': metric_name,
                           'best_metic_value': best_metic_value,
                           'best_family': best_family}

searching for best model for target flourishing_scale_raw_class_post
searching for best model in binomial family
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
!! new best model is binomial !!
searching for best model for target flourishing_scale_raw_post
searching for best model in gaussian family
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
!! new best model is gaussian !!
se

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
searching for best model for target panas_neg_raw_class_post
searching for best model in binomial family
glm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [alpha]
Hyperparameters: [al

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperparameters: [alpha, theta]
Hyperpar

In [94]:
with open('fitted_models/h2o_glm/best_models.pkl', 'wb') as out_file:
    pickle.dump(best_models, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [97]:
# all_models = pd.concat(all_models, ignore_index = True)
# all_models.sort_values(by=['metric_value', 'best_metric_value', inplace = True)
for e in all_models:
    all_models[e] = pd.concat(all_models[e], ignore_index = True)
all_models = pd.concat(all_models, ignore_index = True,sort=False)
all_models.sort_values(by=['response', 'metric_value'], inplace = True)
all_models.to_csv("./fitted_models/h2o_glm/glm_cv_results.csv")
all_models                      

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,response,family,alpha,metric_name,metric_value,theta,tweedie_power
0,flourishing_scale_raw_class_post,binomial,1.0,logloss,0.317614,,
1,flourishing_scale_raw_class_post,binomial,0.9,logloss,0.330378,,
2,flourishing_scale_raw_class_post,binomial,0.8,logloss,0.348314,,
3,flourishing_scale_raw_class_post,binomial,0.7,logloss,0.363921,,
4,flourishing_scale_raw_class_post,binomial,0.3,logloss,0.369614,,
...,...,...,...,...,...,...,...
281,panas_pos_raw_post,gaussian,0.0,mse,819.968224,,
282,panas_pos_raw_post,gaussian,0.4,mse,888.237635,,
283,panas_pos_raw_post,gaussian,0.3,mse,971.459380,,
284,panas_pos_raw_post,gaussian,0.2,mse,1044.628552,,


In [98]:
# make predictions for all models
all_predictions = dict()
for model in best_models:
    all_predictions[model] = best_models[model]['best_model'].predict(test_h2o)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [99]:
# large number of models are just the constant model
# that is using linear regression we can't beat a straight line 
# without further feature engineering
# this makes sense (since non-linear behaviour is expected)
for model in best_models:
    all_predictions[model]

In [114]:
for target in best_models:
    print(best_models[target]['best_model'].model_performance(test_data=test_h2o))


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.2399059607462585
RMSE: 0.4898019607415414
LogLoss: 0.7003707236880794
Null degrees of freedom: 8
Residual degrees of freedom: -8
Null deviance: 12.476649250079015
Residual deviance: 12.606673026385431
AIC: 46.60667302638543
AUC: 0.65
pr_auc: 0.39940476190476193
Gini: 0.30000000000000004

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08692383768381724: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,1.0,4.0,0.8,(4.0/5.0)
1,1,0.0,4.0,0.0,(0.0/4.0)
2,Total,1.0,8.0,0.4444,(4.0/9.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.086924,0.666667,7.0
1,max f2,0.086924,0.833333,7.0
2,max f0point5,0.997231,0.625,0.0
3,max accuracy,0.997231,0.666667,0.0
4,max precision,0.997231,1.0,0.0
5,max recall,0.086924,1.0,7.0
6,max specificity,0.997231,1.0,0.0
7,max absolute_mcc,0.997231,0.395285,0.0
8,max min_per_class_accuracy,0.41613,0.6,4.0
9,max mean_per_class_accuracy,0.41613,0.675,4.0



Gains/Lift Table: Avg response rate: 44.44 %, avg score: 44.23 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.111111,0.971536,2.25,2.25,1.0,0.997231,1.0,0.997231,0.25,0.25,125.0,125.0
1,,2,0.111111,0.945841,0.0,2.25,0.0,0.0,1.0,0.997231,0.0,0.25,-100.0,125.0
2,,3,0.111111,0.920146,0.0,2.25,0.0,0.0,1.0,0.997231,0.0,0.25,-100.0,125.0
3,,4,0.111111,0.89445,0.0,2.25,0.0,0.0,1.0,0.997231,0.0,0.25,-100.0,125.0
4,,5,0.111111,0.868755,0.0,2.25,0.0,0.0,1.0,0.997231,0.0,0.25,-100.0,125.0
5,,6,0.111111,0.740279,0.0,2.25,0.0,0.0,1.0,0.997231,0.0,0.25,-100.0,125.0
6,,7,0.222222,0.671845,0.0,1.125,0.0,0.676041,0.5,0.836636,0.0,0.25,-100.0,12.5
7,,8,0.222222,0.663454,0.0,1.125,0.0,0.0,0.5,0.836636,0.0,0.25,-100.0,12.5
8,,9,0.333333,0.577175,2.25,1.5,1.0,0.655064,0.666667,0.776112,0.25,0.5,125.0,50.0
9,,10,0.444444,0.4515,0.0,1.125,0.0,0.460342,0.5,0.69717,0.0,0.5,-100.0,12.5





ModelMetricsRegressionGLM: glm
** Reported on test data. **

MSE: 51.01091711130338
RMSE: 7.142192738319471
MAE: 6.130434585083499
RMSLE: 0.17871916681387506
R^2: -0.03972931203210339
Mean Residual Deviance: 0.8691133274931793
Null degrees of freedom: 8
Residual degrees of freedom: 8
Null deviance: 7.823884417539089
Residual deviance: 7.822019947438613
AIC: NaN


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.4091211019711439
RMSE: 0.6396257514915608
LogLoss: 1.2808710696762313
Null degrees of freedom: 8
Residual degrees of freedom: -12
Null deviance: 12.369824569664342
Residual deviance: 23.05567925417216
AIC: 65.05567925417216
AUC: 0.3
pr_auc: 0.3579365079365079
Gini: -0.4

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.010718189592843463: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,0.0,4.0,1.0,(4.0/4.0)
1,1,0.0,5.0,0.0,(0.0/5.0)
2,Total,0.0,9.0,0.4444,(4.0/9.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.010718,0.714286,8.0
1,max f2,0.010718,0.862069,8.0
2,max f0point5,0.010718,0.609756,8.0
3,max accuracy,0.959637,0.555556,0.0
4,max precision,0.959637,1.0,0.0
5,max recall,0.010718,1.0,8.0
6,max specificity,0.959637,1.0,0.0
7,max absolute_mcc,0.575647,0.632456,5.0
8,max min_per_class_accuracy,0.749538,0.4,3.0
9,max mean_per_class_accuracy,0.959637,0.6,0.0



Gains/Lift Table: Avg response rate: 55.56 %, avg score: 58.51 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.111111,0.945498,1.8,1.8,1.0,0.959637,1.0,0.959637,0.2,0.2,80.0,80.0
1,,2,0.111111,0.931358,0.0,1.8,0.0,0.0,1.0,0.959637,0.0,0.2,-100.0,80.0
2,,3,0.111111,0.917219,0.0,1.8,0.0,0.0,1.0,0.959637,0.0,0.2,-100.0,80.0
3,,4,0.111111,0.90308,0.0,1.8,0.0,0.0,1.0,0.959637,0.0,0.2,-100.0,80.0
4,,5,0.111111,0.888941,0.0,1.8,0.0,0.0,1.0,0.959637,0.0,0.2,-100.0,80.0
5,,6,0.111111,0.818245,0.0,1.8,0.0,0.0,1.0,0.959637,0.0,0.2,-100.0,80.0
6,,7,0.222222,0.777379,0.0,0.9,0.0,0.782897,0.5,0.871267,0.0,0.2,-100.0,-10.0
7,,8,0.222222,0.766343,0.0,0.9,0.0,0.0,0.5,0.871267,0.0,0.2,-100.0,-10.0
8,,9,0.333333,0.753,0.0,0.6,0.0,0.755307,0.333333,0.832614,0.0,0.2,-100.0,-40.0
9,,10,0.444444,0.724361,1.8,0.9,1.0,0.749538,0.5,0.811845,0.2,0.4,80.0,-10.0





ModelMetricsRegressionGLM: glm
** Reported on test data. **

MSE: 51.38315829368675
RMSE: 7.16820467716197
MAE: 5.36174149749776
RMSLE: 0.2826889069067607
R^2: -0.13903552867778446
Mean Residual Deviance: 0.8531411710651938
Null degrees of freedom: 8
Residual degrees of freedom: -7
Null deviance: 6.388652186803928
Residual deviance: 7.6782705395867445
AIC: NaN


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.2545415994912288
RMSE: 0.5045211586159978
LogLoss: 0.7066624491993022
Null degrees of freedom: 8
Residual degrees of freedom: -114
Null deviance: 12.694335777760172
Residual deviance: 12.71992408558744
AIC: 258.7199240855874
AUC: 0.6666666666666666
pr_auc: 0.25833333333333336
Gini: 0.33333333333333326

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5340793928403734: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,3.0,3.0,0.5,(3.0/6.0)
1,1,0.0,3.0,0.0,(0.0/3.0)
2,Total,3.0,6.0,0.3333,(3.0/9.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.534079,0.666667,5.0
1,max f2,0.534079,0.833333,5.0
2,max f0point5,0.984292,0.714286,0.0
3,max accuracy,0.984292,0.777778,0.0
4,max precision,0.984292,1.0,0.0
5,max recall,0.534079,1.0,5.0
6,max specificity,0.984292,1.0,0.0
7,max absolute_mcc,0.984292,0.5,0.0
8,max min_per_class_accuracy,0.615425,0.5,4.0
9,max mean_per_class_accuracy,0.534079,0.75,5.0



Gains/Lift Table: Avg response rate: 33.33 %, avg score: 56.61 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.111111,0.975141,3.0,3.0,1.0,0.984292,1.0,0.984292,0.333333,0.333333,200.0,200.0
1,,2,0.111111,0.965989,0.0,3.0,0.0,0.0,1.0,0.984292,0.0,0.333333,-100.0,200.0
2,,3,0.111111,0.956837,0.0,3.0,0.0,0.0,1.0,0.984292,0.0,0.333333,-100.0,200.0
3,,4,0.111111,0.947685,0.0,3.0,0.0,0.0,1.0,0.984292,0.0,0.333333,-100.0,200.0
4,,5,0.111111,0.938534,0.0,3.0,0.0,0.0,1.0,0.984292,0.0,0.333333,-100.0,200.0
5,,6,0.111111,0.892775,0.0,3.0,0.0,0.0,1.0,0.984292,0.0,0.333333,-100.0,200.0
6,,7,0.222222,0.834663,0.0,1.5,0.0,0.869896,0.5,0.927094,0.0,0.333333,-100.0,50.0
7,,8,0.222222,0.764198,0.0,1.5,0.0,0.0,0.5,0.927094,0.0,0.333333,-100.0,50.0
8,,9,0.333333,0.690188,0.0,1.0,0.0,0.693734,0.333333,0.849307,0.0,0.333333,-100.0,0.0
9,,10,0.444444,0.670981,0.0,0.75,0.0,0.684869,0.25,0.808198,0.0,0.333333,-100.0,-25.0





ModelMetricsRegressionGLM: glm
** Reported on test data. **

MSE: 37.88941736028551
RMSE: 6.155438031552711
MAE: 4.616858237547905
RMSLE: 0.252307092948434
R^2: -0.3701083956174642
Mean Residual Deviance: 1.0772310095981033
Null degrees of freedom: 8
Residual degrees of freedom: 8
Null deviance: 9.695079086382645
Residual deviance: 9.695079086382929
AIC: NaN


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.28558566527315876
RMSE: 0.534402156875474
LogLoss: 0.7958080455453375
Null degrees of freedom: 8
Residual degrees of freedom: -112
Null deviance: 12.917332581097009
Residual deviance: 14.324544819816074
AIC: 256.3245448198161
AUC: 0.6666666666666666
pr_auc: 0.25833333333333336
Gini: 0.33333333333333326

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5281399483892456: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,3.0,3.0,0.5,(3.0/6.0)
1,1,0.0,3.0,0.0,(0.0/3.0)
2,Total,3.0,6.0,0.3333,(3.0/9.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.52814,0.666667,5.0
1,max f2,0.52814,0.833333,5.0
2,max f0point5,0.985678,0.714286,0.0
3,max accuracy,0.985678,0.777778,0.0
4,max precision,0.985678,1.0,0.0
5,max recall,0.52814,1.0,5.0
6,max specificity,0.985678,1.0,0.0
7,max absolute_mcc,0.985678,0.5,0.0
8,max min_per_class_accuracy,0.565381,0.5,4.0
9,max mean_per_class_accuracy,0.52814,0.75,5.0



Gains/Lift Table: Avg response rate: 33.33 %, avg score: 57.84 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.111111,0.979347,3.0,3.0,1.0,0.985678,1.0,0.985678,0.333333,0.333333,200.0,200.0
1,,2,0.111111,0.973015,0.0,3.0,0.0,0.0,1.0,0.985678,0.0,0.333333,-100.0,200.0
2,,3,0.111111,0.966684,0.0,3.0,0.0,0.0,1.0,0.985678,0.0,0.333333,-100.0,200.0
3,,4,0.111111,0.960353,0.0,3.0,0.0,0.0,1.0,0.985678,0.0,0.333333,-100.0,200.0
4,,5,0.111111,0.954022,0.0,3.0,0.0,0.0,1.0,0.985678,0.0,0.333333,-100.0,200.0
5,,6,0.111111,0.922366,0.0,3.0,0.0,0.0,1.0,0.985678,0.0,0.333333,-100.0,200.0
6,,7,0.222222,0.875374,0.0,1.5,0.0,0.906538,0.5,0.946108,0.0,0.333333,-100.0,50.0
7,,8,0.222222,0.813046,0.0,1.5,0.0,0.0,0.5,0.946108,0.0,0.333333,-100.0,50.0
8,,9,0.333333,0.733488,0.0,1.0,0.0,0.750718,0.333333,0.880978,0.0,0.333333,-100.0,0.0
9,,10,0.444444,0.679192,0.0,0.75,0.0,0.707644,0.25,0.837644,0.0,0.333333,-100.0,-25.0





ModelMetricsRegressionGLM: glm
** Reported on test data. **

MSE: 43.2113298836071
RMSE: 6.573532527006092
MAE: 5.53893289366263
RMSLE: 0.2576161210145584
R^2: -0.5625525538268603
Mean Residual Deviance: 0.08718413720498781
Null degrees of freedom: 8
Residual degrees of freedom: -9
Null deviance: 0.7658634361768761
Residual deviance: 0.7846572348448904
AIC: NaN



In [102]:
best_models.keys()

dict_keys(['flourishing_scale_raw_class_post', 'flourishing_scale_raw_post', 'panas_neg_raw_class_post', 'panas_neg_raw_post', 'panas_pos_raw_class_post', 'panas_pos_raw_post', 'panas_pos_imp_class_post', 'panas_pos_imp_post'])

In [125]:
all_coefs = list()
for target in best_models:
    all_coefs.append(
        pd.DataFrame.from_dict(best_models[target]['best_model'].coef(),
                               orient='index', 
                               columns = [target])
    )
    
all_coefs = pd.concat(all_coefs, axis = 1)
all_coefs.to_csv("./fitted_models/h2o_glm/glm_coefs.csv")
all_coefs

Unnamed: 0,flourishing_scale_raw_class_post,flourishing_scale_raw_post,panas_neg_raw_class_post,panas_neg_raw_post,panas_pos_raw_class_post,panas_pos_raw_post,panas_pos_imp_class_post,panas_pos_imp_post
Intercept,-16.20789,0.686243,-3.766069,0.399438,20.073097,0.713342,22.570902,-0.007686
chargetime_count_wk_1,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
chargetime_count_wk_2,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
chargetime_count_wk_3,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
chargetime_count_wk_4,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
locktime_q3_wk_6,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
locktime_q3_wk_7,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
locktime_q3_wk_8,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
locktime_q3_wk_9,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# TO DO create summary metrics and stuff
# only need to compare hyper-parameters for raw vs imputated at a high level

In [None]:
# get top 10 parameters per model
# auc curve
# pvo
# confusion matrix