In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

In [10]:
filename = r"C:/Users/Becode/immo-eliza-ML/immoEliza-ML/ml_ready_real_estate_data_soft_filled.csv"
df = pd.read_csv(filename)

features = ['bedroomCount','habitableSurface', 'province_encoded', 'epcScore_encoded', 'type_encoded',
            'bathroomCount', 'hasLift_encoded']

X = df[features] # X: features to the model
y = df['price'] # y: target variable (price)

df.dropna(inplace=True)
df.drop('postCode', axis=1)

Unnamed: 0,bedroomCount,bathroomCount,habitableSurface,toiletCount,terraceSurface,gardenSurface,province_encoded,type_encoded,subtype_encoded,epcScore_encoded,...,hasSwimmingPool_encoded,hasFireplace_encoded,hasBasement_encoded,hasDressingRoom_encoded,hasDiningRoom_encoded,hasLift_encoded,hasHeatPump_encoded,hasPhotovoltaicPanels_encoded,hasLivingRoom_encoded,price
0,2.0,1.0,100.0,1.0,7.0,0.0,1.0,1,1,5.0,...,0,0,1,0,0,1,0,0,1,399000.0
1,4.0,2.0,270.0,3.0,1.0,100.0,1.0,2,2,3.0,...,0,0,1,0,1,0,0,0,1,895000.0
2,2.0,1.0,87.0,1.0,2.0,0.0,1.0,1,1,2.0,...,0,0,0,0,0,1,0,0,1,465000.0
3,2.0,2.0,104.0,2.0,3.0,0.0,1.0,1,1,6.0,...,0,0,0,0,0,1,0,0,1,590000.0
4,1.0,1.0,71.0,1.0,0.0,0.0,1.0,1,3,3.0,...,0,0,0,0,0,0,0,0,0,289000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57965,2.0,1.0,75.0,1.0,0.0,0.0,1.0,1,1,3.0,...,0,0,1,0,0,1,0,0,0,240000.0
57970,1.0,1.0,55.0,1.0,0.0,0.0,1.0,1,1,1.0,...,0,0,0,0,0,1,0,0,1,198500.0
67778,3.0,1.0,252.0,3.0,36.0,390.0,3.0,2,13,6.0,...,1,0,0,0,1,0,1,0,1,849000.0
67779,3.0,1.0,94.0,1.0,10.0,0.0,1.0,1,1,4.0,...,0,0,1,0,0,1,0,0,1,279000.0


In [11]:
# convert categorical variables to numerical values
if "price" in df.columns:
        # Remove rows with missing prices (can't train without target)
        before_price = len(df)
        df = df.dropna(subset=["price"])
        after_price = len(df)
        print(f"Removed {before_price - after_price} rows with missing prices")
        
for column in df.columns:
    if df[column].dtype == 'object':
        df = pd.get_dummies(df, columns=[column], drop_first=True)

#Define features and target variable
features = df.columns[df.columns != 'price'].tolist() # all columns except 'price'
X = df[features]
y = df['price']

print(X)

Removed 0 rows with missing prices
       bedroomCount  bathroomCount  habitableSurface  toiletCount  \
0               2.0            1.0             100.0          1.0   
1               4.0            2.0             270.0          3.0   
2               2.0            1.0              87.0          1.0   
3               2.0            2.0             104.0          2.0   
4               1.0            1.0              71.0          1.0   
...             ...            ...               ...          ...   
57965           2.0            1.0              75.0          1.0   
57970           1.0            1.0              55.0          1.0   
67778           3.0            1.0             252.0          3.0   
67779           3.0            1.0              94.0          1.0   
67782           3.0            2.0             132.0          2.0   

       terraceSurface  postCode  gardenSurface  province_encoded  \
0                 7.0      1040            0.0               1.0   


In [12]:
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Apply one-hot encoding to categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cols = pd.DataFrame(encoder.fit_transform(X[categorical_cols]),
                            columns=encoder.get_feature_names_out(categorical_cols))

# reset index to align with original DataFrame
encoded_cols.index = X.index

# Drop original categorical columns and concatenate encoded columns
X = X.drop(categorical_cols, axis=1)
X = pd.concat([X, encoded_cols], axis=1)

# Convert DataFrame to NumPy arrays
X = X.values
y = y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.22)

In [None]:
#Prepare the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [300],          # Number of boosting stages
    'max_depth': [10],                 # Maximum depth of individual trees
    'min_samples_split': [5],         # Minimum samples required to split
    'min_samples_leaf': [2],
    'max_features': [0.5]           # Minimum samples required at a leaf
}

# Initialize GridSearchCV with GradientBoostingRegressor
grid = GridSearchCV(y(), param_grid, cv=5, n_jobs=-1, verbose=2)

#Fit the grid search to the training data
print("Fitting completed:", grid.fit(X_train, y_train))

#Get the best model from the grid search
best_model = grid.best_estimator_

#Predicting the target variable using the trained model
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)
print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("R2 score:", r2_score(y_test, best_model.predict(X_test)))
print("mean_squared_error:", mean_squared_error(y_test, best_model.predict(X_test)))
print("mean_absolute_error:", mean_absolute_error(y_test, best_model.predict(X_test)))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting completed: GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'max_depth': [10], 'max_features': [0.5],
                         'min_samples_leaf': [2], 'min_samples_split': [5],
                         'n_estimators': [300]},
             verbose=2)
Train Score: 0.9924923656218312
Test Score: 0.8298438734442208
R2 score: 0.8298438734442208
mean_squared_error: 29279529050.0244
mean_absolute_error: 79226.260538102
