In [None]:
from h2o.tree import H2OTree
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
import h2o

In [None]:

# Connect to an H2O cluster (h2o.init starts a local one when none is reachable).
h2o.init()

In [None]:
# Columns that are removed right after loading and never used further below.
unused_columns = [
    "Miasto",
    "URL",
    "Pietro",
    "formatted_address",
    "result_confidence",
    "suburb",
    "building_category",
    "result_type",
    "Dataset",
]

# Read the tab-separated dataset and drop the unused columns in one chained
# expression, so re-running the cell always starts from the file on disk.
df = pd.read_csv("merge_dataset.csv", sep="\t").drop(columns=unused_columns)

In [None]:
# --- float columns: missing values are replaced by the column mean ----------
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
numeric_pipeline = numeric_imputer

# --- int columns (handled as "binary" here): most frequent value ------------
binary_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
binary_pipeline = binary_imputer

# --- object columns: most frequent value, then one-hot encoding -------------
# drop="first" removes the first level of every categorical feature.
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
categorical_encoder = OneHotEncoder(drop="first")
categorical_pipeline = Pipeline([
    ("imputer", categorical_imputer),
    ("encoder", categorical_encoder),
])

# Columns are routed to a pipeline purely by dtype. The order of the entries
# below (num, cat, bin) is the order in which the transformed blocks are laid
# out, which the feature-name bookkeeping later in the notebook relies on.
preprocessing = ColumnTransformer([
    ("num", numeric_pipeline, make_column_selector(dtype_include=float)),
    ("cat", categorical_pipeline, make_column_selector(dtype_include=object)),
    ("bin", binary_pipeline, make_column_selector(dtype_include=int)),
])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessing on the training rows only, using every column except
# the first one.
preprocessor = preprocessing.fit(train.iloc[:, 1:])

# Collect the output column names block by block, in the same num -> cat -> bin
# order in which the ColumnTransformer was declared.
fitted_transformers = preprocessor.named_transformers_
num_features = list(fitted_transformers["num"].feature_names_in_)
cat_features = list(
    fitted_transformers["cat"].named_steps["encoder"].get_feature_names_out()
)
bin_features = list(fitted_transformers["bin"].feature_names_in_)
features = [*num_features, *cat_features, *bin_features]

In [None]:
# The arrays uploaded below were never created in any visible cell, so a fresh
# "Restart & Run All" stopped here with a NameError. Derive them explicitly
# from the current split and the fitted preprocessor.

# Feature matrices: the same column slice the preprocessor was fitted on.
train_preprocesed = preprocessor.transform(train.iloc[:, 1:])
test_preprocesed = preprocessor.transform(test.iloc[:, 1:])

# Target values for each split.
# NOTE(review): assumes the target is stored in a dataframe column named
# "Cena" (the name given to the H2O response column below) — confirm.
y_train = train["Cena"]
y_test = test["Cena"]

# Upload features and target to the H2O cluster as separate frames.
h2o_x_train = h2o.H2OFrame(train_preprocesed, column_names=features)
h2o_y_train = h2o.H2OFrame(y_train.to_numpy(), column_names=["Cena"])
h2o_x_test = h2o.H2OFrame(test_preprocesed, column_names=features)
h2o_y_test = h2o.H2OFrame(y_test.to_numpy(), column_names=["Cena"])

In [None]:
# Predictor column names, taken from the uploaded feature frame.
predictors = h2o_x_train.columns
# `y` in H2O's train() is a single column name (or index), so keep the name
# itself instead of the one-element list that `.columns` returns.
response = h2o_y_train.columns[0]

In [None]:
# Baseline gradient boosting model with H2O's default hyper-parameters.
#
# H2O looks up the response column by name inside the frame it is trained on,
# but the target was uploaded as a separate frame. Attach it to the feature
# frames here (skipped if a column of that name is already present).
target_column = h2o_y_train.columns[0]
train_frame = (
    h2o_x_train
    if target_column in h2o_x_train.columns
    else h2o_x_train.cbind(h2o_y_train)
)
valid_frame = (
    h2o_x_test
    if target_column in h2o_x_test.columns
    else h2o_x_test.cbind(h2o_y_test)
)

tree = H2OGradientBoostingEstimator()
tree.train(
    x=predictors,
    y=target_column,  # single column name, as train() expects
    training_frame=train_frame,
    validation_frame=valid_frame,
)

In [None]:
# Random search over GBM hyper-parameters.
hyper_params = {
    'max_depth' : [5,7,9,10,12,13,15,20],
    "ntrees": [20, 50, 100, 200, 500, 1000],
    # row / column sampling rates from 0.20 to 1.00 in steps of 0.01
    'sample_rate': [x/100. for x in range(20,101)],
    'col_sample_rate' : [x/100. for x in range(20,101)],
}
grid_id = 'max_depth_grid'
search_criteria = {
    "strategy":"RandomDiscrete",
    'max_runtime_secs': 900,  # stop after 15 minutes ...
    'max_models': 100,        # ... or after 100 models
    'seed' : 42               # seed for the random sampling of combinations
}

# H2O looks up the response column by name inside the frame it is trained on,
# but the target was uploaded as a separate frame. Attach it to the feature
# frames here (skipped if a column of that name is already present).
target_column = h2o_y_train.columns[0]
train_frame = (
    h2o_x_train
    if target_column in h2o_x_train.columns
    else h2o_x_train.cbind(h2o_y_train)
)
valid_frame = (
    h2o_x_test
    if target_column in h2o_x_test.columns
    else h2o_x_test.cbind(h2o_y_test)
)

# Untrained base estimator; the grid builds its own models from it.
tree = H2OGradientBoostingEstimator()

xgb_grid = H2OGridSearch(model = tree,
                         hyper_params = hyper_params,
                         grid_id = grid_id,
                         search_criteria = search_criteria)

# NOTE(review): the held-out test split doubles as the validation frame, so
# the validation metrics of the selected model are optimistic.
xgb_grid.train(
    x = predictors,
    y = target_column,
    training_frame = train_frame,
    validation_frame = valid_frame
)

In [None]:
# Show the grid search results (rendered as the cell output).
xgb_grid.get_grid()

In [None]:
# First model of the grid.
# NOTE(review): this relies on the grid's own model ordering to put the best
# model first — confirm, or sort explicitly with get_grid(sort_by=...).
best_tree = xgb_grid.models[0]

# Metrics on new data need the actual response values, so score against the
# test features with the target column attached (skipped if already present).
target_column = h2o_y_test.columns[0]
test_frame = (
    h2o_x_test
    if target_column in h2o_x_test.columns
    else h2o_x_test.cbind(h2o_y_test)
)

print(best_tree.model_performance(test_frame))
print(best_tree.r2())            # R^2 on the training data
print(best_tree.r2(valid=True))  # R^2 on the validation frame

In [None]:
# Plot variable importances of the model selected from the grid. `tree` was
# re-bound to the untrained base estimator handed to the grid search, so it
# has no importances to plot.
best_tree.varimp_plot()