In [None]:
# https://www.kaggle.com/datasets/yasserh/housing-prices-dataset?select=Housing.csv

import pandas as pd
import sklearn as sk
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

In [None]:
# Read the Kaggle housing-prices CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="Housing.csv")

# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Integer codes for the categorical columns: yes/no flags become 1/0, and
# furnishing status is ranked unfurnished (0) < semi-furnished (1) < furnished (2).
encodings = {
    c: {"yes": 1, "no": 0}
    for c in ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea"]
}
encodings["furnishingstatus"] = {"furnished": 2, "semi-furnished": 1, "unfurnished": 0}

for c, mapping in encodings.items():
    # Already-encoded columns are skipped so that re-running this cell does not
    # map the integer codes (which are not keys of `mapping`) to NaN.
    if pd.api.types.is_numeric_dtype(df[c]):
        continue

    encoded = df[c].map(mapping)

    # `Series.map` silently turns labels missing from `mapping` into NaN; fail
    # loudly instead. Values that were already missing in the input are left alone.
    unmapped = df.loc[encoded.isna() & df[c].notna(), c].unique()
    if len(unmapped) > 0:
        raise ValueError(f"unexpected values in column {c!r}: {list(unmapped)}")

    df[c] = encoded

# Display the column dtypes to confirm that every column is now numeric.
df.dtypes

price               int64
area                int64
bedrooms            int64
bathrooms           int64
stories             int64
mainroad            int64
guestroom           int64
basement            int64
hotwaterheating     int64
airconditioning     int64
parking             int64
prefarea            int64
furnishingstatus    int64
dtype: object

In [None]:
# Preview the first five rows of `df` as this cell's output.
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,2


## Ensembles

In [None]:
# Features are every column except the target; the target is the price column.
X = df.drop(columns="price")
y = df["price"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, train_size=0.8, random_state=42)

# Grid over split criterion and maximum tree depth for a 250-tree random forest.
# NOTE(review): every configuration is scored on the held-out test split, so
# choosing the best row from this table tunes hyperparameters on test data.
for criterion in ["squared_error", "absolute_error", "friedman_mse", "poisson"]:
    for max_depth in [5, 10, 15, 20, 25]:
        reg = sk.ensemble.RandomForestRegressor(n_estimators=250, criterion=criterion, max_depth=max_depth, n_jobs=-1, random_state=42)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)

        # Take the square root of the MSE explicitly: the `squared=False` flag was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = sk.metrics.mean_squared_error(y_test, y_pred) ** 0.5
        print(criterion, max_depth, "RMSE:", rmse)

# Unlabelled RMSE noted by the author (configuration not recorded):
# RMSE: 1331071.4167895108
# Lowest RMSE in the recorded grid output:
# poisson 15 RMSE: 1376085.324936133

squared_error 5 RMSE: 1447101.850469612
squared_error 10 RMSE: 1389267.525277643
squared_error 15 RMSE: 1380329.873035826
squared_error 20 RMSE: 1376988.558872015
squared_error 25 RMSE: 1376845.3778029946
absolute_error 5 RMSE: 1492213.3033813944
absolute_error 10 RMSE: 1392606.1430490145
absolute_error 15 RMSE: 1381715.931634255
absolute_error 20 RMSE: 1387339.0337558233
absolute_error 25 RMSE: 1388184.9241514695
friedman_mse 5 RMSE: 1447101.850469612
friedman_mse 10 RMSE: 1389267.525277643
friedman_mse 15 RMSE: 1380310.137081837
friedman_mse 20 RMSE: 1376968.6815911164
friedman_mse 25 RMSE: 1376825.4984549854
poisson 5 RMSE: 1435928.5456169506
poisson 10 RMSE: 1376564.025813099
poisson 15 RMSE: 1376085.324936133
poisson 20 RMSE: 1379987.6130763888
poisson 25 RMSE: 1379102.8307056492


In [None]:
# Features are every column except the target; the target is the price column.
X = df.drop(columns="price")
y = df["price"]

# Same 80/20 split and seed as the random-forest cell, so scores are comparable.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, train_size=0.8, random_state=42)

# Compare the split-quality criteria that GradientBoostingRegressor accepts.
# The loop variable must be passed to the estimator; otherwise every iteration
# fits the same default model and prints the same score under a different label.
for criterion in ["friedman_mse", "squared_error"]:
    reg = sk.ensemble.GradientBoostingRegressor(n_estimators=250, criterion=criterion, random_state=42)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Take the square root of the MSE explicitly: the `squared=False` flag was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = sk.metrics.mean_squared_error(y_test, y_pred) ** 0.5
    print(criterion, "RMSE:", rmse)
# RMSE recorded with the estimator's default criterion:
# 1331122.6446943558

squared_error RMSE: 1331122.6446943558
absolute_error RMSE: 1331122.6446943558
friedman_mse RMSE: 1331122.6446943558
poisson RMSE: 1331122.6446943558


In [None]:
# Base learners for the stack: a random forest and a gradient-boosting model.
# Both are seeded so the stacked score is reproducible from run to run.
estimators = [
    ('rf', sk.ensemble.RandomForestRegressor(n_estimators=250, criterion="poisson", max_depth=15, random_state=42)),
    ('gb', sk.ensemble.GradientBoostingRegressor(n_estimators=250, random_state=42)),
]

# No final estimator is passed, so StackingRegressor uses its default meta-learner.
reg = sk.ensemble.StackingRegressor(estimators, n_jobs=-1)

# Uses the train/test split created by the preceding model cells.
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = sk.metrics.mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE:", rmse)

RMSE: 1328022.8764406887
