# H2O Use Case - Predictive Maintenance

- Source: https://archive.ics.uci.edu/ml/datasets/SECOM
- H2O Advanced Usage: 
        - Using random grid search to fine-tune model hyper-parameters

In [None]:
# Load h2o library
suppressPackageStartupMessages(library(h2o))

In [None]:
# Start and connect to a local H2O cluster
# nthreads = -1 is the value the h2o.init documentation describes as
# "use all available CPUs" for the cluster's thread pool.
h2o.init(nthreads = -1)

In [None]:
# Read the SECOM data from a CSV file in the working directory into the
# H2O cluster; the frame is registered in the cluster under the key "h_secom".
h_secom <- h2o.importFile(
  path = "secom.csv",
  destination_frame = "h_secom"
)

In [None]:
# Print out column names
# (the bare expression is auto-printed as the cell's output)
colnames(h_secom)

In [None]:
# Summarise the raw "Classification" column before any type conversion
summary(h_secom[["Classification"]], exact_quantiles = TRUE)

In [None]:
# "Classification" is imported as a numeric column.
# Re-encode it as a categorical (factor) column so it can be used as a
# classification target rather than a regression target.
h_secom[["Classification"]] <- as.factor(h_secom[["Classification"]])

In [None]:
# Summarise "Classification" again, now that it is categorical
summary(h_secom[["Classification"]], exact_quantiles = TRUE)

In [None]:
# Response column (y)
target <- "Classification"

# Predictor columns (x): every column of the frame except the response
features <- setdiff(x = colnames(h_secom), y = target)
print(features)

In [None]:
# Partition the data into a training part and a test part.
# The seed makes the row assignment repeatable.
h_split <- h2o.splitFrame(
  data = h_secom,
  ratios = 0.7,
  seed = 1234
)
h_train <- h_split[[1]] # first part: the 0.7 share (training)
h_test <- h_split[[2]] # second part: the remainder (test)

In [None]:
# Look at the size
# dim() is evaluated on each partition so the realised train/test sizes
# can be compared against the requested 0.7 split ratio.
dim(h_train)
dim(h_test)

In [None]:
# Compare the "Classification" distribution in the two partitions
summary(h_train[["Classification"]], exact_quantiles = TRUE)
summary(h_test[["Classification"]], exact_quantiles = TRUE)

<br>
# Build GBM Models using Random Grid Search and Extract the Best Model

In [None]:
# Define the criteria for random grid search
#   strategy   - "RandomDiscrete": the strategy name handed to h2o.grid for a
#                random (rather than exhaustive) search
#   max_models - upper limit on the number of models the search may build
#   seed       - seed for the search itself
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_models = 10,
  seed = 1234
)

In [None]:
# Candidate values for each hyper-parameter explored by the grid search.
# Every element is a vector of the discrete values the search may pick from.
hyper_params <- list(
  sample_rate = c(0.6, 0.7, 0.8, 0.9),
  col_sample_rate = c(0.6, 0.7, 0.8, 0.9),
  max_depth = c(4, 5, 6)
)

In [None]:
# Run the random grid search over GBM models.
# Everything outside hyper_params is held fixed for every model in the grid.
rand_grid <- h2o.grid(
  # Grid search configuration
  algorithm = "gbm",
  grid_id = "rand_grid",
  hyper_params = hyper_params,
  search_criteria = search_criteria,

  # Data: predictors, response and training frame
  x = features,
  y = target,
  training_frame = h_train,

  # Fixed model parameters (seed supplied for reproducibility)
  ntrees = 500,
  learn_rate = 0.05,
  balance_classes = TRUE,
  seed = 1234,

  # Cross-validation: 5 folds with stratified fold assignment
  nfolds = 5,
  fold_assignment = "Stratified",

  # Early stopping on mean per-class error, scored after every tree
  stopping_metric = "mean_per_class_error",
  stopping_rounds = 15,
  score_tree_interval = 1
)

In [None]:
# Fetch the finished grid again, ordered by mean per-class error in
# ascending order (lowest error first), and display it
rand_grid <- h2o.getGrid(
  grid_id = "rand_grid",
  sort_by = "mean_per_class_error",
  decreasing = FALSE
)
print(rand_grid)

In [None]:
# Take the first model id stored in the grid object and fetch that model
# from the cluster
best_model_id <- rand_grid@model_ids[[1]]
best_model <- h2o.getModel(model_id = best_model_id)
print(best_model)

In [None]:
# Evaluate the selected model on the held-out test partition
h2o.performance(model = best_model, newdata = h_test)

# Making Predictions

In [None]:
# Score the test partition with the selected model
yhat_test <- h2o.predict(object = best_model, newdata = h_test)

In [None]:
# Preview the first 10 rows of the predictions
head(yhat_test, n = 10)