## Importing the required libraries

In [77]:
import h2o
import sys
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

## Initializing H2O 

In [78]:
# Connect this Python session to the H2O cluster; the cluster details are
# printed as the cell output.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 hours 22 mins
H2O cluster version:,3.14.0.3
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_avkashchauhan_jqfer7
H2O cluster total nodes:,1
H2O cluster free memory:,3.276 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


## Importing dataset from a public URL

In [79]:
# NOTE: despite its name, local_url points at a remote CSV hosted on GitHub.
local_url = "https://raw.githubusercontent.com/h2oai/sparkling-water/master/examples/smalldata/prostate/prostate.csv"

# Have H2O load and parse the CSV into a frame.
df = h2o.import_file(path=local_url)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [80]:
# Preview the imported frame.
df

ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
1,0,65,1,2,1,1.4,0.0,6
2,0,72,1,3,2,6.7,0.0,7
3,0,70,1,1,2,4.9,0.0,6
4,0,76,2,2,1,51.2,20.0,7
5,0,69,1,1,1,12.3,55.9,6
6,1,71,1,3,2,3.3,0.0,8
7,0,68,2,4,2,31.9,0.0,7
8,0,61,2,4,2,66.7,27.2,7
9,0,69,1,1,1,3.9,24.0,7
10,0,68,2,1,2,13.0,0.0,6




In [81]:
# Summarise every column: type, min/mean/max, sigma, and zero/missing counts.
df.describe()

Rows:380
Cols:9




Unnamed: 0,ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
type,int,int,int,int,int,int,real,real,int
mins,1.0,0.0,43.0,0.0,1.0,1.0,0.3,0.0,0.0
mean,190.5,0.402631578947,66.0394736842,1.08684210526,2.27105263158,1.10789473684,15.4086315789,15.8129210526,6.38421052632
maxs,380.0,1.0,79.0,2.0,4.0,2.0,139.7,97.6,9.0
sigma,109.840793879,0.491074338963,6.52707126917,0.308773258025,1.00010761815,0.310656449351,19.9975726686,18.3476199673,1.09195337443
zeros,0,227,0,3,0,0,0,167,2
missing,0,0,0,0,0,0,0,0,0
0,1.0,0.0,65.0,1.0,2.0,1.0,1.4,0.0,6.0
1,2.0,0.0,72.0,1.0,3.0,2.0,6.7,0.0,7.0
2,3.0,0.0,70.0,1.0,1.0,2.0,4.9,0.0,6.0


## Setting up the feature set and response column for machine learning

In [82]:
# Response column to model.
y = "CAPSULE"

# Every other column is used as a predictor. index() raises ValueError if the
# response column is missing from the frame.
all_columns = df.col_names
response_position = all_columns.index(y)
feature_names = all_columns[:response_position] + all_columns[response_position + 1:]

# CAPSULE is left numeric (no asfactor() conversion), so the GLM trained on it
# is fit as a regression model.
# NOTE(review): feature_names still contains the ID column, which looks like a
# row identifier - confirm it is meant to be a predictor.

## Splitting the loaded dataset into training, validation and test datasets

In [83]:
# Fixed seed so the random split (and every metric derived from it) is
# reproducible across runs.
SPLIT_SEED = 1234

# Two ratios yield three frames: roughly 80% for training, roughly 10% for
# validation, and the remainder as the test set.
df_train, df_valid, df_test = df.split_frame(ratios=[0.8, 0.1], seed=SPLIT_SEED)

# Report the (rows, columns) of each split.
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(304, 9)
(40, 9)
(36, 9)


## Setting up the H2O GLM Estimator with the default GLM configuration

In [84]:
# Only the model id is set here; every other GLM option keeps its default.
prostate_glm = H2OGeneralizedLinearEstimator(
    model_id="prostate_glm",
)

## Now starting the machine learning process to build a GLM model

In [85]:
# Fit on the training split and pass the validation split along so the model
# reports metrics for both.
prostate_glm.train(
    x=feature_names,
    y=y,
    training_frame=df_train,
    validation_frame=df_valid,
)

glm Model Build progress: |███████████████████████████████████████████████| 100%


## Taking a look at our GLM model details and its metrics

In [86]:
# Show the model summary, including its training and validation metrics.
prostate_glm

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  prostate_glm


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 0.171055588988
RMSE: 0.413588671252
MAE: 0.358631540065
RMSLE: 0.294993978043
R^2: 0.277468197269
Mean Residual Deviance: 0.171055588988
Null degrees of freedom: 303
Residual degrees of freedom: 295
Null deviance: 71.9703947368
Residual deviance: 52.0008990525
AIC: 345.921553363

ModelMetricsRegressionGLM: glm
** Reported on validation data. **

MSE: 0.152229266166
RMSE: 0.390165690657
MAE: 0.344024011817
RMSLE: 0.270362488566
R^2: 0.35048846436
Mean Residual Deviance: 0.152229266166
Null degrees of freedom: 39
Residual degrees of freedom: 31
Null deviance: 9.37889542936
Residual deviance: 6.08917064662
AIC: 58.2203800769
Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iterations,negative_log_likelihood,objective
,2017-10-18 22:50:21,0.000 sec,0,71.9703947,0.2367447




## Getting model performance on all kinds of input data (training, validation and test)

In [87]:
# Score the trained model on each split separately.
train_performance = prostate_glm.model_performance(test_data=df_train)
valid_performance = prostate_glm.model_performance(test_data=df_valid)
test_performance = prostate_glm.model_performance(test_data=df_test)

## R2 (R^2) Metrics

In [95]:
# Printed in order: training, validation, test, then r2() called directly on
# the model.
r2_values = [
    train_performance.r2(),
    valid_performance.r2(),
    test_performance.r2(),
    prostate_glm.r2(),
]
for r2_value in r2_values:
    print(r2_value)

0.277468197269
0.35048846436
0.154125855881
0.277468197269


## Above you can see that calling r2() directly on the model reports the performance on the training dataset (the last value matches the first)

##  RMSE  Metrics

In [94]:
# Printed in order: training, validation, test, then rmse() called directly on
# the model.
rmse_values = [
    train_performance.rmse(),
    valid_performance.rmse(),
    test_performance.rmse(),
    prostate_glm.rmse(),
]
for rmse_value in rmse_values:
    print(rmse_value)

0.413588671252
0.390165690657
0.453425197832
0.413588671252


## Getting predictions on the test dataset

In [96]:
# Score the held-out test split with the trained model.
predictions = prostate_glm.predict(test_data=df_test)

glm prediction progress: |████████████████████████████████████████████████| 100%


In [97]:
# Preview the predicted values.
predictions

predict
0.239852
0.182971
0.250637
0.629209
0.258934
0.417876
0.486641
0.332939
0.399963
0.351592




## This is how R2 (R^2) is calculated for a GLM model: R^2 = 1 - SSE/SST

In [100]:
# Sum of squared errors: the squared differences between the predicted and the
# actual response values on the test split, added up.
residuals = predictions - df_test[y]
SSE = (residuals ** 2).sum()
print(SSE)

7.40139876104


In [101]:
# Mean of the observed response on the test split. mean() hands back a
# one-element list here, so the scalar is read from index 0.
# NOTE: despite the name, y_hat is the mean of the actual values (usually
# written y-bar), not a model prediction.
test_response = df_test[y]
y_hat = test_response.mean()
print(y_hat)
print(y_hat[0])

[0.5833333333333334]
0.583333333333


In [102]:
# Total sum of squares: squared deviations of the actual response from its
# mean (y_hat[0]), added up.
deviations = df_test[y] - y_hat[0]
SST = (deviations ** 2).sum()
print(SST)

8.75


In [103]:
# Manual R^2: one minus the ratio of the error sum of squares to the total sum
# of squares.
1-SSE/SST

0.15412585588091998

In [104]:
# R^2 as reported by H2O for the test split, for comparison with the manual value.
print(test_performance.r2())

0.154125855881


## Above you can see that the R2 metric reported for test performance is the same as the value we calculated by hand