# Immoweb data analysis 

### Importing libraries

In [9]:
# Import H2O and Initialize
# `h2o` is the client module used by every later cell (import_file, save_model,
# export_file, load_model); `H2OAutoML` is the AutoML driver used for training.
import h2o
from h2o.automl import H2OAutoML

# Must run before any other h2o.* call in this notebook.
# NOTE(review): presumably starts or attaches to a local H2O cluster — confirm
# the target host/port if this is meant to run against a remote cluster.
h2o.init()

NameError: name 'h2o_df' is not defined

In [2]:

# --- Data ingestion ---------------------------------------------------------
# The CSV is parsed directly into an H2OFrame.
data_path = 'data/data_20240313_modified_minus1.csv'
data = h2o.import_file(path=data_path)

# --- Train / test split -----------------------------------------------------
# One ratio (0.8) yields two frames: the first ~80% share and the remainder.
# The fixed seed keeps the split reproducible.
train, test = data.split_frame(ratios=[0.8], seed=42)

# --- Target and predictors --------------------------------------------------
# 'price' is what we predict; every other column of the training frame is a
# candidate feature.
target = 'price'
predictors = train.columns
predictors.remove(target)

# --- AutoML run -------------------------------------------------------------
# The search is capped both by model count (20) and by wall-clock time (600 s).
aml = H2OAutoML(
    max_runtime_secs=600,
    max_models=20,
    seed=42,
)
aml.train(training_frame=train, y=target, x=predictors)

# --- Leaderboard ------------------------------------------------------------
# Request as many rows as the leaderboard holds so nothing is cut off at the
# default preview length.
lb = aml.leaderboard
full_leaderboard = lb.head(rows=lb.nrows)
print(full_leaderboard)

# --- Predictions on the held-out frame --------------------------------------
# Score `test` with the top-ranked model and preview the result.
best_model = aml.leader
predictions = best_model.predict(test)
print(predictions.head())





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
12:16:50.287: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
model_id                                                rmse          mse       mae       rmsle    mean_residual_deviance
GBM_5_AutoML_2_20240319_121650                        231402  5.35469e+10   85861.4  nan                      5.35469e+10
GBM_4_AutoML_2_20240319_121650                        231586  5.3632e+10    85380.8  nan                      5.3632e+10
GBM_3_AutoML_2_20240319_121650                        232204  5.39187e+10   86905    nan                      5.39187e+10
GBM_grid_1_AutoML_2_20240319_121650_model_5           232338  5.39811e+10   84837.1  nan                      5.39811e+10
GBM_2_AutoML_2_20240319_121650                        234036  5.47728e+10   89403.7  nan                      5.47728e+10
GBM_grid_1_AutoM

### Saving the AutoML Leader Model

In [3]:
# Write the top-ranked AutoML model under "my_model_path" and keep the returned
# location, which the later load_model cell reuses. force=True is passed so a
# previous save at the same location does not block the write.
model_path = h2o.save_model(
    model=aml.leader,
    path="my_model_path",
    force=True,
)
print(f"Model saved to: {model_path}")


Model saved to: D:\Github\Projects\immo-eliza-ml\my_model_path\GBM_5_AutoML_2_20240319_121650


### Saving Predictions

In [4]:
# Dump the prediction frame to a CSV in the working directory so it can be
# reloaded with pandas later. force=True is passed so an existing file with
# the same name does not block the export.
predictions_path = "predictions_GBM_5_AutoML.csv"
h2o.export_file(
    predictions,
    path=predictions_path,
    force=True,
)
print(f"Predictions saved to: {predictions_path}")


Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Predictions saved to: predictions_GBM_5_AutoML.csv


### Loading the Model and Making Predictions

In [5]:
# Restore the model from the location returned by h2o.save_model.
loaded_model = h2o.load_model(model_path)

# Score the held-out `test` frame with the restored model; this rebinds
# `predictions`, which previously held the output of the in-memory leader.
predictions = loaded_model.predict(test)
preview = predictions.head()
print(preview)


gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
         predict
537797
 95308.2
198457
279118
347105
     1.44344e+06
741898
304137
266878
513635
[10 rows x 1 column]



### Retrieve the Saved Predictions

In [7]:
import pandas as pd
# Reload the predictions that were exported to CSV, this time as a pandas
# DataFrame rather than an H2OFrame.
predictions_df = pd.read_csv("predictions_GBM_5_AutoML.csv")

# Preview the first five rows to confirm the file was read as expected.
print(predictions_df.head(n=5))

         predict
0  537797.461761
1   95308.191711
2  198456.974266
3  279118.275848
4  347104.769530


### Calculate Evaluation Metrics

In [11]:
import h2o
from h2o.frame import H2OFrame
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Ground-truth target values for the held-out rows, converted from the H2O
# frame to a flat NumPy array.
actuals = test[target].as_data_frame().values.flatten()

# Predictions reloaded from the exported CSV. The comparison relies on the CSV
# rows being in the same order as `test`, which holds because the file was
# exported from predictions made on `test`.
predicted = predictions_df['predict']

mse = mean_squared_error(actuals, predicted)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` parameter was deprecated
# in scikit-learn 1.4 and removed in 1.6, so that call fails on current releases.
rmse = mse ** 0.5
mae = mean_absolute_error(actuals, predicted)
r2 = r2_score(actuals, predicted)

# Print the metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R2): {r2}')

# NOTE: a trailing snippet that called `h2o_df.as_data_frame()` inside
# `h2o.utils.threading.local_context(...)` was removed. `h2o_df` is never
# defined in this notebook, so the cell raised
# "NameError: name 'h2o_df' is not defined", and its result was never used.



Mean Squared Error (MSE): 39594959733.56407
Root Mean Squared Error (RMSE): 198984.82287240922
Mean Absolute Error (MAE): 81754.61126182029
R-squared (R2): 0.7991135054993821


