In [None]:
# Import H2O and other libraries that will be used in this tutorial
import matplotlib as plt
import pandas as pd
%matplotlib inline
from h2o.automl import H2OAutoML

In [None]:
import os
import h2o

# Start/stop helper scripts; whether the startup script is present on this
# machine decides which server URL is used below.
startup = '/home/h2o/bin/aquarium_startup'
shutdown = '/home/h2o/bin/aquarium_stop'

# Flag recording whether the startup script was found.
aquarium = os.path.exists(startup)
if aquarium:
    # Run the startup script before any connection attempt is made.
    os.system(startup)

# With the script present the server is addressed under the /h2o path,
# otherwise at the bare host:port.
local_url = 'http://localhost:54321/h2o' if aquarium else 'http://localhost:54321'

In [None]:
# Initialise the H2O client against the URL held in local_url.
h2o.init(url=local_url)

In [None]:
# Import the loan-level dataset (a CSV hosted on S3) through H2O.
loan_level = h2o.import_file("https://s3.amazonaws.com/data.h2o.ai/H2O-3-Tutorials/loan_level_50k.csv")

In [None]:
# Preview the first rows of the imported frame.
loan_level.head()

In [None]:
# Display H2O's summary of the frame.
loan_level.describe()

In [None]:
# Tabulate the values of the DELINQUENT column and how often each occurs.
loan_level["DELINQUENT"].table()

In [None]:
# Plot a histogram of the ORIGINAL_INTEREST_RATE column.
loan_level["ORIGINAL_INTEREST_RATE"].hist()

In [None]:
# Split into two frames: ratio 0.8 for `train`, the remainder for `test`.
# The seed is fixed at 42 so the split can be reproduced.
train, test = loan_level.split_frame([0.8], seed=42)

In [None]:
# Report the row count of each split.
print("train:%d test:%d" % (train.nrows, test.nrows))

## Classification Use Case

In [None]:
# Response column for the classification task.
y = "DELINQUENT"

# Columns kept out of the predictor list (the response itself is among them).
ignore = [
    "DELINQUENT",
    "PREPAID",
    "PREPAYMENT_PENALTY_MORTGAGE_FLAG",
    "PRODUCT_TYPE",
]

# Predictors: every training column that is not listed in `ignore`.
x = list(
    set(train.names) - set(ignore)
)

In [None]:
# Value counts of the response column within the test split.
test["DELINQUENT"].table()

In [None]:
# Configure AutoML for the classification task: 300-second runtime budget,
# fixed seed, and class balancing switched on.
aml = H2OAutoML(max_runtime_secs=300, seed=42, project_name='classification', balance_classes=True)
# %time reports how long the training call takes.
%time aml.train(x=x, y=y, training_frame=train)

In [None]:
# Show every row of the AutoML leaderboard rather than only the default head.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

In [None]:
# Get model ids for all models in the AutoML Leaderboard
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
# Get the "All Models" Stacked Ensemble model
se = h2o.get_model([mid for mid in model_ids if "StackedEnsemble_BestOfFamily" in mid][0])
# Get the Stacked Ensemble metalearner model
metalearner = h2o.get_model(se.metalearner()['name'])
metalearner.coef()

In [None]:
# Plot the metalearner's standardized coefficients.
metalearner.std_coef_plot()

In [None]:
# Evaluate the AutoML leader model on the test frame.
aml.leader.model_performance(test_data=test)

In [None]:
# Render figures inline, then plot the leader's performance on the test frame.
%matplotlib inline
aml.leader.model_performance(test_data=test).plot()

In [None]:
# Generate predictions for the test frame through the AutoML object.
aml.predict(test)

## Regression Use Case

In [None]:
# Response column for the regression task.
y_reg = "ORIGINAL_INTEREST_RATE"

# Columns kept out of the regression predictors (the response itself is among them).
ignore_reg = ["ORIGINAL_INTEREST_RATE", "FIRST_PAYMENT_DATE", "MATURITY_DATE", "MORTGAGE_INSURANCE_PERCENTAGE", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "LOAN_SEQUENCE_NUMBER", "PREPAID", "DELINQUENT", "PRODUCT_TYPE"] 

# Regression predictors: every training column not listed in `ignore_reg`.
# Stored as `x_reg` (the name the following cells read) and built from
# `ignore_reg`, so the classification `x`/`ignore` are left untouched.
x_reg = list(set(train.names) - set(ignore_reg))

In [None]:
# Print the regression response name and the predictor list held in x_reg.
print("y:", y_reg, "\nx:", x_reg)

In [None]:
# Configure AutoML for the regression task: at most 10 models, fixed seed,
# RMSE as both the stopping metric and the leaderboard sort metric.
# Note: this rebinds `aml`, replacing the classification run's object.
aml = H2OAutoML(max_models=10, seed=42, project_name='regression', stopping_metric="RMSE", sort_metric="RMSE")
# %time reports how long the training call takes.
%time aml.train(x=x_reg, y=y_reg, training_frame=train)

In [None]:
# Show every row of the AutoML leaderboard rather than only the default head.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

In [None]:
# Get model ids for all models in the AutoML Leaderboard
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
# Get the first model whose id contains "GBM_3" (a GBM id pattern, not XGBoost).
# The [0] lookup raises IndexError if no leaderboard id matches.
gbm = h2o.get_model([mid for mid in model_ids if "GBM_3" in mid][0])

In [None]:
# Print a handful of the selected model's parameters, one per line.
# Each pair is (printed label, key into gbm.params).
for label, key in (
    ("ntrees = ", 'ntrees'),
    ("max depth = ", 'max_depth'),
    ("learn rate = ", 'learn_rate'),
    ("sample rate = ", 'sample_rate'),
):
    print(label, gbm.params[key])

In [None]:
# Display the selected model's full parameter mapping.
gbm.params

In [None]:
# Display the selected model's own representation.
gbm

In [None]:
# Render the model's default plot.
gbm.plot()

In [None]:
# Evaluate the selected GBM on the test frame.
gbm.model_performance(test_data=test)

In [None]:
# Evaluate the AutoML leader on the same test frame.
aml.leader.model_performance(test_data=test)

In [None]:
# Score the test frame with the selected GBM and attach the actual
# ORIGINAL_INTEREST_RATE column next to the predictions.
pred = gbm.predict(test).cbind(test['ORIGINAL_INTEREST_RATE'])
# Preview the first rows of the combined frame.
pred.head()

In [None]:
# Shut down the H2O cluster this session is connected to.
# NOTE(review): the `shutdown` script path and `aquarium` flag set during setup
# are not used in the visible cells — confirm whether the stop script should run here.
h2o.cluster().shutdown()