In [None]:
from h2o.tree import H2OTree
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ODeepLearningEstimator
from h2o.automl import H2OAutoML
import h2o
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Initialise the H2O connection with default settings; every later cell that
# builds an H2OFrame or trains a model relies on this having run.
h2o.init()
# Alternative call kept for reference: explicit memory bounds and TRACE logging.
# h2o.init(min_mem_size="30G", max_mem_size="40G", log_level="TRACE", insecure=False, https=False)

In [None]:
# Read the tab-separated dataset and discard the columns that are not passed
# on to the models, as one chained expression.
df = pd.read_csv("merge_dataset.csv", sep="\t").drop(
    columns=[
        "Miasto",
        "URL",
        "Pietro",
        "formatted_address",
        "result_confidence",
        "suburb",
        "building_category",
        "result_type",
        "Dataset",
    ]
)

In [None]:
# Baseline preparation: remove two more columns, replace every missing value
# with 0, then hold out 20% of the rows with a fixed random state.
df1 = df.drop(columns=["Rynek", "Stan wykonczenia"]).fillna(0)
# Features are every column after the first one; the target is "Cena"
# (assumes "Cena" is column 0 of the file - TODO confirm).
x_train, x_test, y_train, y_test = train_test_split(
    df1.iloc[:, 1:],
    df1["Cena"],
    test_size=0.2,
    random_state=42,
)
features = x_train.columns.tolist()

In [None]:
# Feature-only H2O frames built from the pandas splits.
h2o_x_train = h2o.H2OFrame(x_train, column_names=features)
h2o_x_test = h2o.H2OFrame(x_test, column_names=features)

# Single-column H2O frames holding the target values.
h2o_y_train = h2o.H2OFrame(y_train.to_numpy(), column_names=["Cena"])
h2o_y_test = h2o.H2OFrame(y_test.to_numpy(), column_names=["Cena"])

# Modelling frames: a deep copy of the features with the "Cena" column attached,
# so the feature-only frames above stay free of the target.
h2o_train = h2o.deep_copy(h2o_x_train, "h2o_train")
h2o_train["Cena"] = h2o_y_train
h2o_test = h2o.deep_copy(h2o_x_test, "h2o_test")
h2o_test["Cena"] = h2o_y_test

In [None]:
# Column names handed to the estimators: all feature columns as predictors and
# the single target column as the response. The predictor list is displayed.
predictors, response = h2o_x_train.columns, h2o_y_train.columns[0]
predictors

# Baseline models

In [None]:
# Baseline random forest: library defaults with a fixed seed.
model = H2ORandomForestEstimator(seed=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Baseline gradient boosting machine: library defaults with a fixed seed.
model = H2OGradientBoostingEstimator(seed=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Baseline linear model: GLM with alpha=1 and the naive coordinate-descent solver.
model = H2OGeneralizedLinearEstimator(solver="COORDINATE_DESCENT_NAIVE", alpha=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Baseline deep learning model: library defaults, fixed seed, reproducible mode.
model = H2ODeepLearningEstimator(seed=1, reproducible=True)
# Train on h2o_train / h2o_test, the frames that carry the "Cena" response.
# The feature-only frames h2o_x_train / h2o_x_test do not contain that column,
# so they cannot be used with y=response.
model.train(
    x = predictors,
    y = response,
    training_frame = h2o_train,
    validation_frame = h2o_test
)
# Stored model metrics, metrics on h2o_test, then training / validation R^2.
print(model.model_performance())
print(model.model_performance(h2o_test))
print(model.r2())
print(model.r2(valid=True))

In [None]:
# Baseline AutoML run with a fixed seed and the library's default budget.
model = H2OAutoML(seed=1)
# The held-out frame is supplied as `leaderboard_frame` so that it is used to
# score and rank the models. H2O documents `validation_frame` as ignored unless
# nfolds == 0, so with AutoML's default cross-validation the held-out data
# would otherwise play no part in the run.
model.train(
    x=predictors,
    y=response,
    training_frame=h2o_train,
    leaderboard_frame=h2o_test
)

## Outlier detection and data scaling

In [None]:
# Reload the raw tab-separated dataset for this section and discard the
# columns that are not passed on to the models.
df = pd.read_csv("merge_dataset.csv", sep="\t").drop(
    columns=[
        "Miasto",
        "URL",
        "Pietro",
        "formatted_address",
        "result_confidence",
        "suburb",
        "building_category",
        "result_type",
        "Dataset",
    ]
)

In [None]:
# 80/20 split with a fixed random state, performed before any filtering.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Work on copies so the unfiltered splits stay available.
train_outlier = train.copy()
test_outlier = test.copy()

# Keep rows with 250000 < Cena < 4000000 and Powierzchnia < 300 (all strict).
# Rows where either value is missing fail the comparisons and are dropped too.
train_outlier = train_outlier.loc[
    train_outlier["Cena"].gt(250000)
    & train_outlier["Cena"].lt(4000000)
    & train_outlier["Powierzchnia"].lt(300)
]
test_outlier = test_outlier.loc[
    test_outlier["Cena"].gt(250000)
    & test_outlier["Cena"].lt(4000000)
    & test_outlier["Powierzchnia"].lt(300)
]

In [None]:
# Imputers: column mean for float columns, most frequent value for object
# columns and for integer columns (named "binary" in this notebook).
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
binary_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# One-hot encoding that drops the first level of every categorical column.
categorical_encoder = OneHotEncoder(drop="first")

# Object columns are imputed first and then one-hot encoded.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", categorical_imputer),
        ("encoder", categorical_encoder)
    ]
)
# Float and integer columns only need imputation, so the imputer is the pipeline.
binary_pipeline = binary_imputer
numeric_pipeline = numeric_imputer

# Columns are routed by dtype; the output column order follows the transformer
# order below (num, cat, bin). Columns of any other dtype are not selected.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, make_column_selector(dtype_include=float)),
        ("cat", categorical_pipeline, make_column_selector(dtype_include=object)),
        ("bin", binary_pipeline, make_column_selector(dtype_include=int))
    ],
    # Always return a dense array. With the default threshold the stacked
    # output can be a scipy sparse matrix when the one-hot block is sparse
    # enough, and the later pd.DataFrame(..., columns=features) call needs a
    # regular 2-D array.
    sparse_threshold=0
)

In [None]:
# Fit on the training features only (every column after the first one).
# fit() hands back the transformer itself, so `preprocessor` and
# `preprocessing` refer to the same fitted object.
preprocessor = preprocessing.fit(train_outlier.iloc[:, 1:])

# Output column names, collected per transformer.
num_features = list(preprocessor.named_transformers_["num"].feature_names_in_)
cat_features = list(
    preprocessor.named_transformers_["cat"].named_steps["encoder"].get_feature_names_out()
)
bin_features = list(preprocessor.named_transformers_["bin"].feature_names_in_)

# Same order as the transformers are declared: num, cat, bin.
features = [*num_features, *cat_features, *bin_features]

In [None]:
# Targets are taken from column 0 of the filtered splits
# (assumes that column is "Cena" - TODO confirm).
y_train = train_outlier.iloc[:, 0]
y_test = test_outlier.iloc[:, 0]

# Apply the fitted preprocessor to the remaining columns of both splits and
# label the result with the collected feature names.
train_preprocessed = pd.DataFrame(
    preprocessor.transform(train_outlier.iloc[:, 1:]),
    columns=features,
)
test_preprocessed = pd.DataFrame(
    preprocessor.transform(test_outlier.iloc[:, 1:]),
    columns=features,
)

In [None]:
# Feature-only H2O frames built from the preprocessed splits.
h2o_x_train = h2o.H2OFrame(train_preprocessed, column_names=features)
h2o_x_test = h2o.H2OFrame(test_preprocessed, column_names=features)

# Single-column H2O frames holding the target values.
h2o_y_train = h2o.H2OFrame(y_train.to_numpy(), column_names=["Cena"])
h2o_y_test = h2o.H2OFrame(y_test.to_numpy(), column_names=["Cena"])

# Modelling frames: a deep copy of the features with the "Cena" column attached.
h2o_train = h2o.deep_copy(h2o_x_train, "h2o_train")
h2o_train["Cena"] = h2o_y_train
h2o_test = h2o.deep_copy(h2o_x_test, "h2o_test")
h2o_test["Cena"] = h2o_y_test

In [None]:
# Refresh the predictor / response names for the preprocessed frames and
# display the predictor list.
predictors, response = h2o_x_train.columns, h2o_y_train.columns[0]
predictors

### Random Forest

In [None]:
# Default random forest (fixed seed) on the filtered, preprocessed data.
model = H2ORandomForestEstimator(seed=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Candidate values for the random forest random search.
hyper_params = {
    "ntrees": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    "max_depth": [int(x) for x in np.linspace(10, 200, num = 5)],
    "min_rows": [1, 2, 4, 8, 16],
    "sample_rate": [x/100. for x in range(20,101)],
    # nbins=1 was removed from the candidates: H2O's tree builders reject
    # nbins <= 1, so every sampled combination using it could only fail and
    # waste a slot of the model budget.
    "nbins": [2, 6, 10, 20, 50, 100],
    "seed": [1]
}

# Random sampling of the grid, bounded by wall-clock time and model count.
search_criteria = {
    "strategy": "RandomDiscrete",  # "Cartesian" would enumerate every combination
    "max_runtime_secs": 14400,     # 4 hours
    "max_models": 300,             # build no more than 300 models
    "seed": 1
}

rf = H2ORandomForestEstimator()
rf_grid = H2OGridSearch(
    model = rf,
    hyper_params = hyper_params,
    search_criteria = search_criteria
)

# NOTE(review): the held-out frame doubles as the validation frame here, so
# metrics of the selected model on h2o_test are optimistic - confirm intended.
rf_grid.train(
    x = predictors,
    y = response,
    training_frame = h2o_train,
    validation_frame = h2o_test
)

In [None]:
# Display the list of models built by the random forest grid search.
rf_grid.models

In [None]:
# Save the first model of the grid's model list under models/rf_best;
# force=True is passed so an existing file may be overwritten.
# NOTE(review): presumably index 0 is the best-scoring model - confirm the grid's ordering.
h2o.save_model(rf_grid.models[0], path="models/rf_best", force=True)

In [None]:
# Metrics for the first model in the grid's list, one per line: stored model
# metrics, metrics on h2o_test, then training and validation R^2.
print(
    rf_grid.models[0].model_performance(),
    rf_grid.models[0].model_performance(h2o_test),
    rf_grid.models[0].r2(),
    rf_grid.models[0].r2(valid=True),
    sep="\n",
)

### Gradient Boosting Regressor

In [None]:
# Default gradient boosting machine (fixed seed) on the filtered, preprocessed data.
model = H2OGradientBoostingEstimator(seed=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Candidate values for the gradient boosting random search.
hyper_params = {
    "max_depth": [int(x) for x in np.linspace(start = 1, stop = 200, num = 5)],
    "ntrees": [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)],
    # num=1 made linspace return only the start value, so min_rows was pinned
    # to 2 and the upper bound of 30 was never used. Five evenly spaced values
    # (2, 9, 16, 23, 30) actually cover the declared range.
    "min_rows": [int(x) for x in np.linspace(start = 2, stop = 30, num = 5)],
    "learn_rate": [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 1],
    "min_split_improvement": [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.0001, 0.001, 0.01, 0.1, 0.3],
    "sample_rate": [x/100. for x in range(20,101)],
    "col_sample_rate": [x/100. for x in range(20,101)],
    "seed": [1]
}

# Random sampling of the grid, bounded by wall-clock time and model count.
search_criteria = {
    "strategy": "RandomDiscrete",  # "Cartesian" would enumerate every combination
    "max_runtime_secs": 14400,     # 4 hours
    "max_models": 300,             # build no more than 300 models
    "seed": 1
}

# The grid is named xgb_grid, but the estimator is H2O's GBM, not XGBoost.
tree = H2OGradientBoostingEstimator()
xgb_grid = H2OGridSearch(
    model = tree,
    hyper_params = hyper_params,
    search_criteria = search_criteria
)

# NOTE(review): the held-out frame doubles as the validation frame here, so
# metrics of the selected model on h2o_test are optimistic - confirm intended.
xgb_grid.train(
    x = predictors,
    y = response,
    training_frame = h2o_train,
    validation_frame = h2o_test
)

In [None]:
# Display the list of models built by the gradient boosting grid search.
xgb_grid.models

In [None]:
# Save the first model of the grid's model list under models/xgb_grid;
# force=True is passed so an existing file may be overwritten.
# NOTE(review): presumably index 0 is the best-scoring model - confirm the grid's ordering.
h2o.save_model(xgb_grid.models[0], path="models/xgb_grid", force=True)

In [None]:
# Metrics for the first model in the grid's list, one per line: stored model
# metrics, metrics on h2o_test, then training and validation R^2.
print(
    xgb_grid.models[0].model_performance(),
    xgb_grid.models[0].model_performance(h2o_test),
    xgb_grid.models[0].r2(),
    xgb_grid.models[0].r2(valid=True),
    sep="\n",
)

### Deep learning

In [None]:
# Default deep learning model (fixed seed, reproducible mode) on the filtered,
# preprocessed data.
model = H2ODeepLearningEstimator(seed=1, reproducible=True)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

In [None]:
# Candidate values for the deep learning random search.
# NOTE(review): H2O documents rate / momentum_* as applying only when
# adaptive_rate is disabled, and rho / epsilon only when it is enabled;
# adaptive_rate is not set here - confirm these axes have the intended effect.
hyper_params = {
    "activation": [
        "Rectifier",
        "Maxout",
        "Tanh",
        "RectifierWithDropout",
        "MaxoutWithDropout",
        "TanhWithDropout",
    ],
    # One, two and three equally wide hidden layers for each width.
    "hidden": [
        [width] * depth
        for depth in (1, 2, 3)
        for width in (20, 50, 100, 250, 500, 1000, 2500)
    ],
    "epochs": [10, 25, 50, 100, 200, 350],
    "rho": [0.8, 0.9, 0.95, 0.99, 0.999],
    "rate": [0.5, 0.25, 0, 0.1, 0.005, 0.001],
    "momentum_start": [0, 0.25, 0.5, 1],
    "momentum_stable": [0, 0.25, 0.5, 1],
    "epsilon": [1e-10, 1e-8, 1e-6, 1e-4],
    "max_w2": [10, 100, 1000, 3.4028235e+38],
    "l1": [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25],
    "l2": [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25],
    "seed": [1],
}

# Random sampling of the grid, bounded by wall-clock time and model count.
search_criteria = {
    "strategy": "RandomDiscrete",  # "Cartesian" would enumerate every combination
    "max_runtime_secs": 14400,     # 4 hours
    "max_models": 300,             # build no more than 300 models
    "seed": 1,
}

mlp = H2ODeepLearningEstimator(seed=1)
mlp_grid = H2OGridSearch(model=mlp, hyper_params=hyper_params, search_criteria=search_criteria)

mlp_grid.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)

In [None]:
# Display the list of models built by the deep learning grid search.
mlp_grid.models

In [None]:
# Save the first model of the grid's model list under models/mlp_grid;
# force=True is passed so an existing file may be overwritten.
# NOTE(review): presumably index 0 is the best-scoring model - confirm the grid's ordering.
h2o.save_model(mlp_grid.models[0], path="models/mlp_grid", force=True)

In [None]:
# Metrics for the first model in the grid's list, one per line: stored model
# metrics, metrics on h2o_test, then training and validation R^2.
print(
    mlp_grid.models[0].model_performance(),
    mlp_grid.models[0].model_performance(h2o_test),
    mlp_grid.models[0].r2(),
    mlp_grid.models[0].r2(valid=True),
    sep="\n",
)

In [None]:
# Linear regression

In [None]:
# Linear model on the filtered, preprocessed data: GLM with alpha=1 and the
# naive coordinate-descent solver.
model = H2OGeneralizedLinearEstimator(solver="COORDINATE_DESCENT_NAIVE", alpha=1)
model.train(x=predictors, y=response, training_frame=h2o_train, validation_frame=h2o_test)
# Report, one per line: the model's stored metrics, metrics computed on
# h2o_test, then R^2 on the training and validation frames.
print(
    model.model_performance(),
    model.model_performance(h2o_test),
    model.r2(),
    model.r2(valid=True),
    sep="\n",
)

# AutoML

In [None]:
# Reload the raw tab-separated dataset for the AutoML section and discard the
# columns that are not passed on to the models.
df = pd.read_csv("merge_dataset.csv", sep="\t").drop(
    columns=[
        "Miasto",
        "URL",
        "Pietro",
        "formatted_address",
        "result_confidence",
        "suburb",
        "building_category",
        "result_type",
        "Dataset",
    ]
)

In [None]:
# 80/20 split with a fixed random state, performed before any filtering.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Work on copies so the unfiltered splits stay available.
train_outlier = train.copy()
test_outlier = test.copy()

# Keep rows with 250000 < Cena < 4000000 and Powierzchnia < 300 (all strict).
# Rows where either value is missing fail the comparisons and are dropped too.
train_outlier = train_outlier.loc[
    train_outlier["Cena"].gt(250000)
    & train_outlier["Cena"].lt(4000000)
    & train_outlier["Powierzchnia"].lt(300)
]
test_outlier = test_outlier.loc[
    test_outlier["Cena"].gt(250000)
    & test_outlier["Cena"].lt(4000000)
    & test_outlier["Powierzchnia"].lt(300)
]

In [None]:
# Feature names: every column of the filtered training split except the target.
features = train_outlier.drop(columns=["Cena"]).columns.tolist()

# Feature-only H2O frames; no imputation or encoding is applied in this section.
h2o_x_train = h2o.H2OFrame(train_outlier.drop(columns=["Cena"]), column_names=features)
h2o_x_test = h2o.H2OFrame(test_outlier.drop(columns=["Cena"]), column_names=features)

# Single-column H2O frames holding the target values.
h2o_y_train = h2o.H2OFrame(train_outlier["Cena"].to_numpy(), column_names=["Cena"])
h2o_y_test = h2o.H2OFrame(test_outlier["Cena"].to_numpy(), column_names=["Cena"])

# Modelling frames: a deep copy of the features with the "Cena" column attached.
h2o_train = h2o.deep_copy(h2o_x_train, "h2o_train")
h2o_train["Cena"] = h2o_y_train
h2o_test = h2o.deep_copy(h2o_x_test, "h2o_test")
h2o_test["Cena"] = h2o_y_test

In [None]:
# Refresh the predictor / response names for the AutoML frames and display
# the predictor list.
predictors, response = h2o_x_train.columns, h2o_y_train.columns[0]
predictors

In [None]:
# AutoML with the same budget as the grid searches: at most 300 models or
# 4 hours (14400 s), whichever limit is reached first, with a fixed seed.
aml = H2OAutoML(max_models=300, max_runtime_secs=14400, seed=1)
# The held-out frame is supplied as `leaderboard_frame` so that it is used to
# score and rank the models. H2O documents `validation_frame` as ignored unless
# nfolds == 0, so with AutoML's default cross-validation the held-out data
# would otherwise play no part in the run.
aml.train(x=predictors, y=response, training_frame=h2o_train, leaderboard_frame=h2o_test)