# All the imports in one place

In [71]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures



path = "data/properties.csv"
df = pd.read_csv(path)


In [72]:
nan_counts = df.isna().sum()
nan_counts = nan_counts.sort_values(ascending=False)
nan_counts[nan_counts > 0]

cadastral_income                  44967
surface_land_sqm                  36256
construction_year                 33391
primary_energy_consumption_sqm    26567
nbr_frontages                     26346
latitude                          14098
longitude                         14098
terrace_sqm                       13140
total_area_sqm                     7615
garden_sqm                         2939
dtype: int64

In [73]:
df.columns.tolist()

['id',
 'price',
 'property_type',
 'subproperty_type',
 'region',
 'province',
 'locality',
 'zip_code',
 'latitude',
 'longitude',
 'construction_year',
 'total_area_sqm',
 'surface_land_sqm',
 'nbr_frontages',
 'nbr_bedrooms',
 'equipped_kitchen',
 'fl_furnished',
 'fl_open_fire',
 'fl_terrace',
 'terrace_sqm',
 'fl_garden',
 'garden_sqm',
 'fl_swimming_pool',
 'fl_floodzone',
 'state_building',
 'primary_energy_consumption_sqm',
 'epc',
 'heating_type',
 'fl_double_glazing',
 'cadastral_income']

# Class for Preprocessing

In [74]:
class Preprocessing:

    @staticmethod
    def fill_nan_with_median(df, column_name):
        """
        Fill missing values in the specified column(s) with the median value.

        Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        column_name (str or list): The name(s) of the column(s) to fill.

        Returns:
        None
        """
        if isinstance(column_name, list):
            for column in column_name:
                median_of_column = df[column].median()
                df.fillna({column: median_of_column}, inplace=True)
        else:
            median_of_column = df[column_name].median()
            df.fillna({column_name: median_of_column}, inplace=True)

        return None


    @staticmethod
    def fill_nan_with_0(df, column_name):
        """
        Fill NaN values in the specified column(s) with 0.

        Parameters:
        - df (pandas.DataFrame): The DataFrame containing the data.
        - column_name (str or list): The name(s) of the column(s) to fill NaN values in.

        Returns:
        None
        """
        if isinstance(column_name, list):
            for column in column_name:
                df.fillna({column: 0}, inplace=True)
        else:
            df.fillna({column_name: 0}, inplace=True)

        return None
    
    @staticmethod
    def iqr(dataframe, column_names):
        """
        Filter the given dataframe based on the interquartile range (IQR) of the specified column(s).

        Parameters:
        - dataframe: pandas DataFrame
            The input dataframe to filter.
        - column_names: str or list of str
            The name(s) of the column(s) to calculate the IQR and perform the filtering.

        Returns:
        - filtered_df: pandas DataFrame
            The filtered dataframe based on the IQR of the specified column(s).
        """
        filtered_df = dataframe.copy()

        if isinstance(column_names, list):
            for column_name in column_names:
                Q1 = dataframe[column_name].quantile(0.25)
                Q3 = dataframe[column_name].quantile(0.75)
                IQR = Q3 - Q1

                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                filtered_df = filtered_df[(filtered_df[column_name] >= lower_bound) & (filtered_df[column_name] <= upper_bound)]
        else:
            Q1 = dataframe[column_names].quantile(0.25)
            Q3 = dataframe[column_names].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            filtered_df = filtered_df[(filtered_df[column_names] >= lower_bound) & (filtered_df[column_names] <= upper_bound)]

        return filtered_df
    

    def iqr(dataframe, column_names, k=1.5):
        """
        Filter the given dataframe based on the interquartile range (IQR) of the specified column(s).

        Parameters:
        - dataframe: pandas DataFrame
            The input dataframe to filter.
        - column_names: str or list of str
            The name(s) of the column(s) to calculate the IQR and perform the filtering.
        - k: float
            The factor to multiply the IQR by when calculating the lower and upper bounds.

        Returns:
        - filtered_df: pandas DataFrame
            The filtered dataframe based on the IQR of the specified column(s).
        """
        filtered_df = dataframe.copy()

        if isinstance(column_names, list):
            for column_name in column_names:
                Q1 = dataframe[column_name].quantile(0.25)
                Q3 = dataframe[column_name].quantile(0.75)
                IQR = Q3 - Q1

                lower_bound = Q1 - k * IQR
                upper_bound = Q3 + k * IQR

                filtered_df = filtered_df[(filtered_df[column_name] >= lower_bound) & (filtered_df[column_name] <= upper_bound)]
        else:
            Q1 = dataframe[column_names].quantile(0.25)
            Q3 = dataframe[column_names].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            filtered_df = filtered_df[(filtered_df[column_names] >= lower_bound) & (filtered_df[column_names] <= upper_bound)]

        return filtered_df

prep = Preprocessing

# Model: Appartaments and Houses

### Preprocessing

In [75]:
# Logical fixing

In [76]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train.dropna(subset=['total_area_sqm'], inplace=True)
df_train.dropna(subset=['construction_year'], inplace=True)
df_train.dropna(subset=['nbr_frontages'], inplace=True)
df_train.dropna(subset=['garden_sqm'], inplace=True)
df_train.dropna(subset=['primary_energy_consumption_sqm'], inplace=True)

# Create train features here

df_train['total_energy_consumption'] = df_train.primary_energy_consumption_sqm / df_train.total_area_sqm

selected_features_train = df_train[[
 'price',
 'property_type',
 'subproperty_type',
 'region',
 'province',
 'locality',
 'zip_code',
 'latitude',
 'longitude',
 'construction_year',
 'total_area_sqm',
 'surface_land_sqm',
 'nbr_frontages',
 'nbr_bedrooms',
 'equipped_kitchen',
 'fl_furnished',
 'fl_open_fire',
 'fl_terrace',
 'terrace_sqm',
 'fl_garden',
 'garden_sqm',
 'fl_swimming_pool',
 'fl_floodzone',
 'state_building',
 'primary_energy_consumption_sqm',
 'epc',
 'heating_type',
 'fl_double_glazing',
 'cadastral_income'
]]

# Create test features here

df_train['total_energy_consumption'] = df_train.primary_energy_consumption_sqm / df_train.total_area_sqm

X_test = df_test.drop(['price'], axis=1)
y_test = df_test['price']

selected_features_train = prep.iqr(selected_features_train,
                                   ['price', 'nbr_bedrooms', ],
                                    k=1.5)

# Drop price_per_sqm from the features as it's derived from the target
X_train = selected_features_train.drop(['price'], axis=1)
y_train = selected_features_train['price']

numeric_sqm_features = []

categorical_columns = selected_features_train.select_dtypes(include=['object']).columns.tolist()

numeric_features = [col for col in X_train.columns if col not in categorical_columns and col not in numeric_sqm_features]


In [77]:
nan_counts = df_train.isna().sum()
nan_counts = nan_counts.sort_values(ascending=False)
nan_counts[nan_counts > 0]

surface_land_sqm    6185
cadastral_income    5577
terrace_sqm         3922
latitude            2991
longitude           2991
dtype: int64

In [78]:
df_train.shape[0]

15606

# Model Pipeline

In [79]:
from sklearn.ensemble import RandomForestRegressor

numeric_sqm_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('numeric_sqm', numeric_sqm_transformer, numeric_sqm_features),
        ('categorical', categorical_transformer, categorical_columns)])

model_pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', RandomForestRegressor(n_estimators=500, random_state=42))])

model_pipeline_rf.fit(X_train, y_train)

model_pipeline_rf

In [80]:
# Predicting on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

# Calculating MSE
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Calculating RMSE
rmse_rf = np.sqrt(mse_rf)

print("Random Forest Mean Squared Error:", mse_rf)
print("Random Forest Root Mean Squared Error: €" + str(round(rmse_rf)))
print(f'Random Forest Score: {round(model_pipeline_rf.score(X_test, y_test) * 100, 2)}%')

Random Forest Mean Squared Error: 127010154409.87518
Random Forest Root Mean Squared Error: €356385
Random Forest Score: 27.0%


In [82]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Create Random Forest model
rf = RandomForestRegressor(random_state=42)

# Create randomized search 5-fold cross-validator
rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit model
rf_cv.fit(X_train, y_train)

# Best parameters found
print("Best parameters found: ", rf_cv.best_params_)

# Evaluate the best model
best_model = rf_cv.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimat

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/generic.py", line 2150, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'APARTMENT'

--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/generic.py", line 2150, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'HOUSE'
