In [2]:
import os
import h2o
# Runs as a side effect of executing this cell: per the H2O docs, init()
# connects to a local H2O server or starts one if none is reachable.
# The H2OFrame conversions and AutoML calls in the class below are issued
# through the `h2o` module, so this presumably has to run first.
h2o.init()
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
class ModelBuilderTrainerTester:
    """Run an H2O AutoML experiment end to end on a labelled pandas dataset.

    Constructing an instance executes the whole pipeline immediately:
    stratified train/test split, conversion to H2OFrames, AutoML training,
    leaderboard export, saving of the top models and a prediction report
    for the held-out test set.

    Args:
        config: Settings object. This class reads
            ``config.random_state.seed`` and ``config.paths.model_results``
            (the directory all artefacts are written to).
        info_tracker: Object that receives ``h2o_leaderboard`` (the H2O
            leaderboard frame) and ``h2o_leaderboard_df`` (its pandas copy).
        data: pandas DataFrame with a ``labels`` column; every other column
            is used as a predictor.
        n_best_models: Number of top leaderboard models to save to disk.
    """

    # Name of the dependent-variable column expected in ``data``.
    TARGET_COLUMN = "labels"

    def __init__(self, config, info_tracker, data, n_best_models=3):
        self.config = config
        self.info_tracker = info_tracker
        self.data = data
        self.n_best_models = n_best_models

        # Before H2O format
        self.train_df: pd.DataFrame = None
        self.test_df: pd.DataFrame = None
        # After H2O format
        self.train_h2o_df: h2o.H2OFrame = None
        self.test_h2o_df: h2o.H2OFrame = None
        self.test_h2o_reindexed: h2o.H2OFrame = None
        self.predictors = None
        self.targets = None

        # Data preparation; each step relies on attributes set by the
        # previous one, so the order matters.
        self.__split_data_to_train_test_sets()
        self.__convert_pandas_df_to_h2o_frame()
        self.__define_train_n_targets()

        # Training and reporting.
        self.model = self.__build_model()
        self.trained_model = self.__train_model()
        self.__save_leaderboard()
        self.__save_n_best_models()
        self.__predict_with_bmodel()

    @staticmethod
    def _top_model_ids(leaderboard_df, n_models):
        """Return the ids of the first ``n_models`` leaderboard rows.

        The model id is read from the first column of ``leaderboard_df``.
        Fewer ids are returned when the leaderboard has fewer rows than
        requested; a non-positive ``n_models`` yields an empty list.
        """
        return leaderboard_df.iloc[:max(n_models, 0), 0].tolist()

    def __split_data_to_train_test_sets(self):
        """Split data into training and test sets, stratified on the labels.

        30% of the rows are kept as unseen test data.
        """
        self.train_df, self.test_df = train_test_split(
            self.data,
            stratify=self.data[self.TARGET_COLUMN],
            test_size=0.3,
            random_state=self.config.random_state.seed,
        )

    def __convert_pandas_df_to_h2o_frame(self):
        """Convert the pandas train/test DataFrames to H2OFrames."""
        self.train_h2o_df = h2o.H2OFrame(self.train_df)
        self.test_h2o_df = h2o.H2OFrame(self.test_df)
        # reset_index() turns the original pandas index into a regular
        # column, so it travels with the test rows into the H2OFrame.
        self.test_h2o_reindexed = h2o.H2OFrame(self.test_df.reset_index())

    def __define_train_n_targets(self):
        """Define predictors and the dependent variable.

        Every training column except the label column becomes a predictor,
        and the label column of the H2O training frame is converted to a
        factor.
        """
        target = self.TARGET_COLUMN
        self.predictors = [
            column for column in self.train_h2o_df.columns if column != target
        ]
        self.targets = target
        self.train_h2o_df[target] = self.train_h2o_df[target].asfactor()

    def __build_model(self):
        """Build the (untrained) H2O AutoML object."""
        model = H2OAutoML(
            balance_classes=True,
            max_models=3,
            max_runtime_secs=1800,
            nfolds=5,
            sort_metric='AUCPR',
            seed=self.config.random_state.seed,
        )
        print(model)
        return model

    def __train_model(self):
        """Train the AutoML object on the H2O training frame and return it."""
        model = self.model
        model.train(
            x=self.predictors,
            y=self.targets,
            training_frame=self.train_h2o_df,
        )
        print(model)
        return model

    def __save_leaderboard(self):
        """Write the leaderboard as HTML and store it on the info tracker."""
        leaderboard = self.trained_model.leaderboard
        # NOTE: the file name is spelled "h20" (digit zero); it is kept
        # unchanged so existing consumers of the report still find it.
        report_path = os.path.join(
            self.config.paths.model_results,
            "h20_report.html"
        )
        leaderboard.as_data_frame(use_pandas=True).to_html(report_path)
        self.info_tracker.h2o_leaderboard = leaderboard

    def __save_n_best_models(self):
        """Save the ``n_best_models`` top-ranked leaderboard models to disk.

        Also stores the pandas copy of the leaderboard on the info tracker
        as ``h2o_leaderboard_df``.
        """
        leaderboard_df = self.info_tracker.h2o_leaderboard.as_data_frame(
            use_pandas=True
        )
        self.info_tracker.h2o_leaderboard_df = leaderboard_df
        for model_id in self._top_model_ids(leaderboard_df, self.n_best_models):
            h2o.save_model(
                model=h2o.get_model(model_id),
                path=self.config.paths.model_results,
                force=True,
            )

    def __predict_with_bmodel(self):
        """Predict on the test set with the leader model and save a CSV report."""
        predict_df = self.trained_model.leader.predict(self.test_h2o_df)
        # Put the predictions next to the test rows (including the column
        # produced by reset_index()).
        total_df = predict_df.cbind(self.test_h2o_reindexed)
        total_df.as_data_frame(use_pandas=True).to_csv(
            os.path.join(
                self.config.paths.model_results,
                "prediction_and_test_report.csv"
            )
        )
        