In [None]:
import h2o
# Connect to an H2O cluster using the default connection settings.
# NOTE(review): presumably a cluster must already be running for this to succeed — confirm.
h2o.connect()

In [None]:
import pandas as pd

In [None]:
# Load the Titanic passenger list into a pandas DataFrame.
# NOTE(review): absolute, user-specific path — this cell only works where this exact file exists.
titanic_df = pd.read_csv('/Users/avkashchauhan/learn/comcast/titanic_list.csv')

In [None]:
# (rows, columns) of the pandas DataFrame.
titanic_df.shape

In [None]:
# Number of non-null values in each column.
titanic_df.count()

In [None]:
# Converting Pandas Frame to H2O Frame
titanic = h2o.H2OFrame(titanic_df)

In [None]:
# Display the converted H2OFrame.
titanic

In [None]:
# Note: the pandas-style count() call is not expected to work here,
# because `titanic` is an H2OFrame and not a pandas DataFrame.
# The failure is caught and printed so that "Restart & Run All" does not
# stop at this cell; the demonstration (the error message) is still shown.
try:
    titanic.count()
except Exception as err:  # broad on purpose: the exact exception type is not pinned down here
    print("{}: {}".format(type(err).__name__, err))

In [None]:
# The Other option to import data directly is to use H2O.
# NOTE(review): this path uses a different directory (seattle-workshop) than the
# pandas read_csv path earlier in the notebook (comcast) — confirm both point at the same data.
titanic_data = h2o.import_file('/Users/avkashchauhan/learn/seattle-workshop/titanic_list.csv')

In [None]:
# (rows, columns) of the imported H2OFrame.
titanic_data.shape

In [None]:
titanic_data.summary

In [None]:
titanic_data.describe

In [None]:
# NOTE(review): `table` is referenced without being called, so this cell shows the
# bound method rather than any counts — confirm whether a call on specific column(s) was intended.
titanic_data.table

In [None]:
# Tabulate the values of the `pclass` column.
titanic_data['pclass'].table()

In [None]:
# Tabulate the values of the `age` column.
titanic_data['age'].table()

In [None]:
# Tabulate the values of the `sex` column.
titanic_data['sex'].table()

In [None]:
# Loading Estimators
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# set this to True if interactive (matplotlib) plots are desired
import matplotlib
interactive = False
# Select the non-interactive Agg backend before pyplot is imported.
# The former `warn=False` keyword is intentionally not passed: it was deprecated
# and then removed from matplotlib.use(), where it now raises a TypeError.
if not interactive:
    matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [None]:
# List the column names of the imported frame.
titanic_data.col_names

In [None]:
# Name of the response column; passed as `y` when the models are trained.
response = "survived"

In [None]:
# Columns of interest: pclass, survived, sex, age, sibsp, parch, fare, embarked.
# `survived` is the response, so the remaining seven columns are the predictors.
# (Alternative: take every column but the last with titanic_data.columns[:-1].)
predictors = [
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked"
]

In [None]:
# Display the predictor column list chosen for training.
predictors

In [None]:
#Now setting factors to specific columns
titanic_data["pclass"] = titanic_data["pclass"].asfactor()
titanic_data["sex"] = titanic_data["sex"].asfactor()
titanic_data["embarked"] = titanic_data["embarked"].asfactor()


In [None]:
titanic_data.describe()

In [None]:
# Spliting the data set for training and validation
titanic_train, titanic_valid = titanic_data.split_frame(ratios=[0.9])

In [None]:
print(titanic_train.shape)
print(titanic_valid.shape)

# Creating GLM Model

In [None]:
titanic_glm = H2OGeneralizedLinearEstimator(alpha = .25)

In [None]:
titanic_glm.train(x = predictors, y = response, training_frame = titanic_train, validation_frame = titanic_valid)

In [None]:
# Print the validation-set metrics of the GLM.
# print() with one pre-formatted string produces the same text on Python 2 and
# Python 3; the former Python 2 print statements are a SyntaxError on Python 3.
print("mse:  {}".format(titanic_glm.mse(valid=True)))
print("r2 :  {}".format(titanic_glm.r2(valid=True)))
print("rmse: {}".format(titanic_glm.rmse(valid=True)))

# Note: Look for titanic_glm.[TAB] for the values you are interested in

# Adding Grid Search now

In [None]:
# grid over `alpha`
# import Grid Search
from h2o.grid.grid_search import H2OGridSearch


In [2]:
?

Object `H2OGeneralizedLinearEstimator` not found.


In [None]:
# Candidate `alpha` values for the grid search.
# NOTE(review): the last value is 0.1; given the 0 / .25 / .5 / .75 progression
# it may have been meant as 1 — confirm before relying on the grid's coverage.
hyper_params = dict(alpha=[0, 0.25, 0.5, 0.75, 0.1])

In [None]:
# this example uses cartesian grid search because the search space is small
# and we want to see the performance of all models. For a larger search space use
# random grid search instead: {'strategy': "RandomDiscrete"}
# initialize the GLM estimator
titanic_glm_hype = H2OGeneralizedLinearEstimator()


In [None]:
# build grid search with previously made GLM and hyperparameters
titanitc_grid = H2OGridSearch(model = titanic_glm_hype, hyper_params = hyper_params,
                     search_criteria = {'strategy': "Cartesian"})


In [None]:
# train using the grid
titanitc_grid.train(x = predictors, y = response, training_frame = titanic_train, validation_frame = titanic_valid)


In [None]:
# sort the grid models by mse
titanic_sorted_grid = titanitc_grid.get_grid(sort_by='mse', decreasing=False)
print(titanic_sorted_grid)

In [None]:
# If you want to sort by r2 then try this
titanic_sorted_grid = titanitc_grid.get_grid(sort_by='r2', decreasing=False)
print(titanic_sorted_grid)

# Adding multiple hyperparameters

In [None]:
# Now adding alpha and lambda together
hyper_params = {'alpha': [0, .25, .5, .75, .1], 'lambda': [0, .1, .01, .001, .0001]}

In [None]:
titanic_glm_hype = H2OGeneralizedLinearEstimator()
titanitc_grid = H2OGridSearch(model = titanic_glm_hype, hyper_params = hyper_params,
                     search_criteria = {'strategy': "Cartesian"})
titanitc_grid.train(x = predictors, y = response, training_frame = titanic_train, validation_frame = titanic_valid)


In [None]:
# If you want to sort by r2 then try this.
# r2 is higher-is-better, so sort in decreasing order to list the best model first.
titanic_sorted_grid = titanitc_grid.get_grid(sort_by='r2', decreasing=True)
print(titanic_sorted_grid)

In [None]:
h2o.ls()

In [None]:
h2o.remove_all()

In [None]:
h2o.ls()

In [None]:
# Fetch an existing model from the cluster by its key.
# NOTE(review): the id is hard-coded and no visible cell creates a model with this key;
# it must already exist on the connected cluster (and still be there after the
# earlier h2o.remove_all() cell) — confirm.
newmodel = h2o.get_model("gbm-75df84b0-abdd-4896-bf1e-6802bf132325")

In [None]:
# Display the fetched model.
newmodel

In [None]:
# Variable importances of the fetched model.
newmodel.varimp()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()
fig, ax = plt.subplots()
variables = newmodel._model_json['output']['variable_importances']['variable']
y_pos = np.arange(len(variables))

In [None]:
scaled_importance = newmodel._model_json['output']['variable_importances']['scaled_importance']
ax.barh(y_pos, scaled_importance, align='center', color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(variables)
ax.invert_yaxis()
ax.set_xlabel('Scaled Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
# Display the fetched model again.
newmodel

In [None]:
# Variable-importance table read directly from the model's JSON payload.
newmodel._model_json['output']['variable_importances']

In [None]:
# Scoring history of the fetched model.
newmodel.score_history()

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [None]:
titanic_gbm = H2OGradientBoostingEstimator()

In [None]:
titanic_gbm.train(x = predictors, y = response, training_frame = titanic_train, 
                  validation_frame = titanic_valid, model_id = "new_GBM_Model")

In [None]:
myGbm = h2o.get_model("new_GBM_Model")