In [None]:
###: Loading H2O Library

In [None]:
import h2o

In [None]:
###: Get H2O Version

In [None]:
# Bare expression: the notebook shows the installed h2o package version as the cell output.
h2o.__version__

In [None]:
###: Initializing H2O cluster

In [None]:
# Initialise the H2O cluster with the library defaults (no arguments are passed).
h2o.init()

In [None]:
###: Importing both training and test dataset into H2O cluster memory

In [None]:
# Both CSV files are read from the public GitHub copy so the notebook does not
# depend on files stored on one particular machine. To work from local copies
# instead, pass their file paths to h2o.import_file().
train_df = h2o.import_file("https://raw.githubusercontent.com/Avkash/mldl/master/data/house_price_train.csv")
test_df = h2o.import_file("https://raw.githubusercontent.com/Avkash/mldl/master/data/house_price_test.csv")

In [None]:
###: Understanding Training dataset

In [None]:
# Show H2O's describe() summary for the training frame.
train_df.describe()

In [None]:
###: Understanding Test dataset

In [None]:
# Show H2O's describe() summary for the test frame, for comparison with the training frame.
test_df.describe()

In [None]:
###: Training and test dataset - columns and rows details

In [None]:
# Shape of the training frame first, then the test frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
###: Training and Test Dataframes - columns names

In [None]:
# Column names of the training frame first, then the test frame, one list per line.
print(train_df.col_names, test_df.col_names, sep="\n")

In [None]:
###: Setting the response (target) variable for supervised machine learning

In [None]:
# Name of the column the models will predict.
response = "medv"
# Start from every column of the training frame; at this point the list still
# contains the response column itself.
features = train_df.col_names
print(features)

In [None]:
###: Creating a list of all features we will use for machine learning

In [None]:
# Rebuild the predictor list from the training frame instead of mutating the
# existing list in place: list.remove() raises ValueError when this cell is run
# a second time, whereas this form gives the same result on every run.
features = [name for name in train_df.col_names if name != response]
print(features)

In [None]:
###: Understanding response variable values as a histogram in the training data

In [None]:
# Plot the distribution of the target column. The column is selected through
# `response` so this cell stays in sync if the target is changed.
train_df[response].hist()

In [None]:
###: Importing H2O H2OGeneralizedLinearEstimator to build GLM Model

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
###: Building GLM - Regression model with cross validation

In [None]:
# Baseline GLM with 5-fold cross validation and otherwise default settings.
# A fixed seed is supplied, as for the other estimators in this notebook, so
# the reported R2 is repeatable between runs.
glm_model_with_cv = H2OGeneralizedLinearEstimator(nfolds=5, seed=1)
glm_model_with_cv.train(x=features, y=response, training_frame=train_df)

In [None]:
###: Getting model performance

In [None]:
# R2 of the baseline model scored on the held-out test frame. Only `test_data`
# is passed: no validation frame was used in training, and the train/valid/xval
# flags are ignored whenever `test_data` is supplied.
glm_model_with_cv.model_performance(test_data=test_df).r2()

In [None]:
###: Building GLM - Regression model with cross validation and key GLM parameters configuration

In [None]:
# GLM configured explicitly: 5-fold cross validation with the per-fold
# predictions kept, lambda search switched on, alpha fixed at 0.1, and a fixed seed.
glm_model_cv_config = H2OGeneralizedLinearEstimator(
    nfolds=5,
    keep_cross_validation_predictions=True,
    lambda_search=True,
    alpha=0.1,
    seed=1,
)

In [None]:
###: Training GLM Model

In [None]:
# Fit the configured GLM on the training frame under an explicit model id.
glm_model_cv_config.train(
    x=features,
    y=response,
    training_frame=train_df,
    model_id="glm_model_with_training_and_validtion_python",
)

In [None]:
###: Getting GLM model performance on test data

In [None]:
# R2 of the configured model scored on the held-out test frame. Only
# `test_data` is passed: no validation frame was used in training, and the
# train/valid/xval flags are ignored whenever `test_data` is supplied.
glm_model_cv_config.model_performance(test_data=test_df).r2()

In [None]:
###: Importing H2O Grid Library

In [None]:
from h2o.grid import H2OGridSearch

In [None]:
###: Setting GLM grid parameters

In [None]:
# Candidate values for the GLM `alpha` and `lambda` hyper-parameters that the
# grid search will draw from.
glm_hyper_params = dict(
    [
        ("alpha", [0.01, 0.1, 0.3, 0.5, 0.7, 0.9]),
        ("lambda", [1e-1, 1e-3, 1e-5, 1e-7, 1e-9]),
    ]
)

In [None]:
###: Setting H2O Grid Search Criteria

In [None]:
# Search settings handed to the grid: the "RandomDiscrete" strategy with a
# fixed seed, plus the early-stopping metric, tolerance and number of rounds.
grid_search_criteria = dict(
    strategy="RandomDiscrete",
    seed=123,
    stopping_metric="AUTO",
    stopping_tolerance=0.01,
    stopping_rounds=5,
)

In [None]:
###: Finalizing the H2O Grid search settings

In [None]:
# Estimator settings shared by every model in the grid; the grid supplies the
# alpha and lambda values on top of these.
glm_grid_base_model = H2OGeneralizedLinearEstimator(
    seed=12345,
    nfolds=5,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

# Combine the base estimator, the hyper-parameter candidates and the search
# criteria into one grid with a fixed grid id.
house_price_glm_grid = H2OGridSearch(
    model=glm_grid_base_model,
    hyper_params=glm_hyper_params,
    search_criteria=grid_search_criteria,
    grid_id="house_price_glm_grid",
)

In [None]:
###: Finally training H2O Grid with data 

In [None]:
# Train the grid's models on the training frame with the same predictors and
# response as the single models above.
house_price_glm_grid.train(
    x=features,
    y=response,
    training_frame=train_df,
)

In [None]:
###: Finally getting total count of GLM models

In [None]:
# Number of models held by the trained grid.
len(house_price_glm_grid)

In [None]:
###: Defining a function to find the best model from the grid based on r2 or auc

In [None]:
def find_best_model_from_grid(h2o_grid, test_parameter):
    """Return the model in ``h2o_grid`` with the highest value of one metric.

    Parameters
    ----------
    h2o_grid : iterable of models
        Every item must expose a ``model_id`` attribute and a zero-argument
        method named after the chosen metric (``r2()`` or ``auc()``).
    test_parameter : str
        Metric to rank by: ``"r2"`` or ``"auc"``.

    Returns
    -------
    The first model with the highest score. A score that is missing, NaN
    (including the string ``"NaN"``) or not numeric is counted as ``0.0``.

    Raises
    ------
    ValueError
        If ``test_parameter`` is not a supported metric, or the grid holds
        no models.
    """
    import math  # function-scope import keeps this cell self-contained

    metric_labels = {"r2": "R2", "auc": "AUC"}
    # Compare the metric name by value. `is` on a string literal tests object
    # identity, which is only true by accident of string interning.
    if test_parameter not in metric_labels:
        raise ValueError(
            "Unsupported test_parameter " + repr(test_parameter)
            + "; expected one of " + str(sorted(metric_labels))
        )
    label = metric_labels[test_parameter]

    best_model = None
    best_score = None
    for grid_item in h2o_grid:
        raw_value = getattr(grid_item, test_parameter)()
        try:
            score = float(raw_value)
        except (TypeError, ValueError):
            score = 0.0
        # NaN never compares equal to anything, so it needs an explicit test;
        # left in place it would poison the max comparison below.
        if math.isnan(score):
            score = 0.0
        # Strict '>' keeps the earliest model when several share the top score.
        if best_score is None or score > best_score:
            best_model, best_score = grid_item, score

    if best_model is None:
        raise ValueError("The grid contains no models to compare")

    print("Model ID with best " + label + ": " + best_model.model_id)
    print("Best " + label + ": " + str(best_score))
    return best_model

In [None]:
###: Applying the function to get the best model from the grid

In [None]:
# Pick the grid model with the highest R2.
best_glm_model = find_best_model_from_grid(
    h2o_grid=house_price_glm_grid,
    test_parameter="r2",
)

In [None]:
###: Getting the best model performance on test data

In [None]:
# R2 of the best grid model scored on the held-out test frame. Only
# `test_data` is passed: no validation frame was used in training, and the
# train/valid/xval flags are ignored whenever `test_data` is supplied.
best_glm_model.model_performance(test_data=test_df).r2()

In [None]:
###: Performing predictions with the best model from the grid

In [None]:
# Score the test frame with the best grid model. The notebook-level variable
# is `best_glm_model`; `best_model` exists only inside
# find_best_model_from_grid and raises NameError here on a fresh kernel.
glm_predictions = best_glm_model.predict(test_df)

In [None]:
# Bare expression: the notebook displays the predictions frame as the cell output.
glm_predictions

In [None]:
###: Understanding/Validating predictions based on the prediction results histogram

In [None]:
# Histogram of the predicted values, for comparison with the histogram of the
# response column plotted earlier.
glm_predictions.hist()

In [None]:
###: Getting Scoring History

In [None]:
# Show the scoring history recorded for the best grid model.
best_glm_model.scoring_history()

In [None]:
###: Getting GLM model variable importance

In [None]:
# Variable importance reported for the best grid model.
# NOTE(review): for a GLM this is presumably derived from the model coefficients - confirm against the H2O docs.
best_glm_model.varimp()

In [None]:
###: Getting GLM model variable importance PLOT

In [None]:
# Plot the variable importance of the best grid model.
best_glm_model.varimp_plot()