# Immoweb data analysis 

### Importing libraries

In [2]:
# Import H2O and Initialize
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O instance if one is already running; otherwise a local
# H2O server is started (see the startup log printed below this cell).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.2+13-LTS-58, mixed mode, sharing)
  Starting server from D:\Github\Projects\immo-eliza-ml\venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\daryc\AppData\Local\Temp\tmpg1h8lee9
  JVM stdout: C:\Users\daryc\AppData\Local\Temp\tmpg1h8lee9\h2o_daryc_started_from_python.out
  JVM stderr: C:\Users\daryc\AppData\Local\Temp\tmpg1h8lee9\h2o_daryc_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Brussels
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,5 days
H2O_cluster_name:,H2O_from_python_daryc_abpxkb
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.912 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [4]:

# Read the cleaned listings CSV into an H2OFrame.
data_path = 'data/data_20240313_cleaned.csv'
data = h2o.import_file(path=data_path)

# Carve out a held-out test frame (ratio 0.8 for training); the fixed seed
# keeps the split repeatable across runs.
train, test = data.split_frame(ratios=[0.8], seed=42)

# 'price' is the value to predict; every remaining column acts as a predictor.
target = 'price'
predictors = list(train.columns)
predictors.remove(target)

# AutoML run capped at 20 models and a 600-second time budget.
aml = H2OAutoML(max_models=20, max_runtime_secs=600, seed=42)
aml.train(training_frame=train, y=target, x=predictors)

# Display the complete leaderboard rather than the default first 10 rows.
lb = aml.leaderboard
full_leaderboard = lb.head(rows=lb.nrows)
print(full_leaderboard)

# Score the held-out test frame with the top-ranked model and preview the result.
predictions = aml.leader.predict(test)
print(predictions.head())





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
11:36:32.557: AutoML: XGBoost is not available; skipping it.
11:36:32.592: _train param, Dropping bad and constant columns: [fl_terrace, fl_floodzone, fl_double_glazing]

████████████████████████
11:40:09.305: XRT_1_AutoML_1_20240319_113632 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

█
11:40:22.59: _train param, Dropping bad and constant columns: [fl_terrace, fl_floodzone, fl_double_glazing]

██████████████████████████████████████| (done) 100%
model_id                                       rmse          mse       mae       rmsle    mean_residual_deviance
GBM_4_AutoML_1_20240319_113632               227555  5.17814e+10   84527.7  nan                      5.17814e+10
GBM_5_AutoML_1_20240319_113632               227786  5.18866e+10   85292.9  nan                      5.18866e+10
GBM_grid_1_AutoML_1_20240319_113632_model_5  229930  5.28679e+10   837

### Saving the AutoML Leader Model

In [5]:
# Persist the AutoML leader under "my_model_path"; force=True allows an
# existing saved model at that location to be overwritten.
model_path = h2o.save_model(
    model=aml.leader,
    path="my_model_path",
    force=True,
)
print(f"Model saved to: {model_path}")


Model saved to: D:\Github\Projects\immo-eliza-ml\my_model_path\GBM_4_AutoML_1_20240319_113632


### Saving Predictions

In [6]:
# Write the prediction frame to a CSV file, replacing any earlier export.
predictions_path = "predictions_GBM_4_AutoML.csv"
h2o.export_file(
    predictions,
    path=predictions_path,
    force=True,
)
print(f"Predictions saved to: {predictions_path}")


Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Predictions saved to: predictions_GBM_4_AutoML.csv


### Loading the Model and Making Predictions

In [18]:
# Restore the model from the path returned by h2o.save_model.
loaded_model = h2o.load_model(path=model_path)

# Score the held-out test frame again, this time with the reloaded model,
# and preview the first rows of the result.
predictions = loaded_model.predict(test)
prediction_preview = predictions.head()
print(prediction_preview)


gbm prediction progress: |

███████████████████████████████████████████████████████| (done) 100%
         predict
477247
131970
184354
280531
349789
     1.50248e+06
936761
311070
282211
519634
[10 rows x 1 column]



### Loading the Saved Predictions

In [19]:
import pandas as pd
# Read the exported predictions back into a pandas DataFrame.
predictions_df = pd.read_csv("predictions_GBM_4_AutoML.csv")

# Preview the first rows to confirm the file loaded as expected.
predictions_preview = predictions_df.head()
print(predictions_preview)

         predict
0  477246.864807
1  131970.079458
2  184353.697954
3  280531.422635
4  349789.256601


### Calculate Evaluation Metrics

In [22]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Ground-truth target values of the held-out frame as a flat 1-D array.
actuals = test[target].as_data_frame().values.flatten()

# Predictions read back from the exported CSV.
# NOTE(review): assumes one row per test row, in the same order — confirm.
predicted = predictions_df['predict']

mse = mean_squared_error(actuals, predicted)
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument is deprecated
# in scikit-learn 1.4 and removed in 1.6, so this form works on every version.
rmse = mse ** 0.5
mae = mean_absolute_error(actuals, predicted)
r2 = r2_score(actuals, predicted)

# Print the metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R2): {r2}')


Mean Squared Error (MSE): 46211391769.93783
Root Mean Squared Error (RMSE): 214968.3506238484
Mean Absolute Error (MAE): 81025.29719639733
R-squared (R2): 0.7655447925411516



# NOTE(review): `h2o_df` is not defined in any cell visible in this notebook, so
# this snippet would fail with a NameError if executed as-is. Presumably it is a
# placeholder for an existing H2OFrame (e.g. `test`) — confirm before running.
# The flags are passed to local_context for the duration of the `with` block;
# presumably they enable polars/datatable-backed conversion — verify against
# the H2O documentation.
with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()

