# All the imports in one place

In [779]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the properties dataset into a DataFrame; the path is relative to the
# directory the notebook is run from.
path = "data/properties.csv"
df = pd.read_csv(path)

In [780]:
# Show the available column names (used below to pick the model features).
df.columns

Index(['id', 'price', 'property_type', 'subproperty_type', 'region',
       'province', 'locality', 'zip_code', 'latitude', 'longitude',
       'construction_year', 'total_area_sqm', 'surface_land_sqm',
       'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished',
       'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm',
       'fl_swimming_pool', 'fl_floodzone', 'state_building',
       'primary_energy_consumption_sqm', 'epc', 'heating_type',
       'fl_double_glazing', 'cadastral_income'],
      dtype='object')

# Class for Preprocessing

In [781]:
class Preprocessing:
    """
    Stateless DataFrame-cleaning helpers: NaN imputation and IQR outlier filtering.

    Every method is a ``@staticmethod``, so the class is used directly
    (no instance is required).
    """

    @staticmethod
    def _as_columns(column_names):
        """
        Normalise a column selection to a list of column labels.

        Parameters:
        - column_names: a single label, or a list-like of labels.

        Returns:
        - list: strings, tuples and other scalars become a one-element list
          (pandas treats tuples as keys, so they stay a single label); lists
          and other list-likes (Index, ndarray, set, ...) are expanded.
        """
        if isinstance(column_names, tuple) or not pd.api.types.is_list_like(column_names):
            return [column_names]
        return list(column_names)

    @staticmethod
    def fill_nan_with_median(df, column_name):
        """
        Fill missing values in the specified column(s) with that column's median.

        The DataFrame is modified in place.

        Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        column_name (str or list-like): The name(s) of the column(s) to fill.

        Returns:
        None

        Raises:
        KeyError: If a requested column does not exist (raised before any
            column is modified).
        """
        # Compute every median first so a bad column name cannot leave the
        # frame half-filled.
        medians = {
            column: df[column].median()
            for column in Preprocessing._as_columns(column_name)
        }
        if medians:
            df.fillna(medians, inplace=True)

        return None


    @staticmethod
    def fill_nan_with_0(df, column_name):
        """
        Fill NaN values in the specified column(s) with 0.

        The DataFrame is modified in place.

        Parameters:
        - df (pandas.DataFrame): The DataFrame containing the data.
        - column_name (str or list-like): The name(s) of the column(s) to fill NaN values in.

        Returns:
        None
        """
        columns = Preprocessing._as_columns(column_name)
        if columns:
            df.fillna(dict.fromkeys(columns, 0), inplace=True)

        return None

    @staticmethod
    def iqr(dataframe, column_names):
        """
        Filter the given dataframe based on the interquartile range (IQR) of the specified column(s).

        A row is kept only if, for every listed column, its value lies within
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (bounds inclusive). The quartiles are
        always computed on the unfiltered input, so the order of the columns
        does not affect the result. Rows holding NaN in a listed column are
        dropped, because a NaN never falls inside the bounds.

        Parameters:
        - dataframe: pandas DataFrame
            The input dataframe to filter. It is not modified.
        - column_names: str or list-like of str
            The name(s) of the column(s) to calculate the IQR and perform the filtering.

        Returns:
        - filtered_df: pandas DataFrame
            The filtered dataframe based on the IQR of the specified column(s).
        """
        filtered_df = dataframe.copy()

        for column in Preprocessing._as_columns(column_names):
            q1 = dataframe[column].quantile(0.25)
            q3 = dataframe[column].quantile(0.75)
            spread = q3 - q1

            lower_bound = q1 - 1.5 * spread
            upper_bound = q3 + 1.5 * spread

            # `between` is inclusive on both ends, i.e. >= lower and <= upper.
            filtered_df = filtered_df[filtered_df[column].between(lower_bound, upper_bound)]

        return filtered_df

# Short alias for the class itself (not an instance): all of its helpers are
# static methods, so they can be called as prep.<method>(...).
prep = Preprocessing

def score(model, X=None, y=None):
    """
    Print a fitted model's RMSE (in euros) and its ``model.score`` value as a percentage.

    Parameters:
    - model: fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    - X: optional feature matrix to evaluate on. When omitted, the
      notebook-level ``X_test_scaled`` is used.
    - y: optional true target values matching ``X``. When omitted, the
      notebook-level ``y_test`` is used.

    Returns:
    None (the two metrics are printed).
    """
    # Fall back to the notebook globals so existing `score(model)` calls keep working.
    features = X_test_scaled if X is None else X
    targets = y_test if y is None else y

    model_score = model.score(features, targets)

    predictions = model.predict(features)
    rmse = np.sqrt(mean_squared_error(targets, predictions))

    print(f'Error: €{round(rmse)}')

    print(f'Model\'s score: {round(model_score * 100, 2)}%')

# Model: Apartments and Houses

### Importing data

In [782]:
# Re-read the CSV so this section starts from a fresh frame: the preprocessing
# below adds a column to `df` and fills NaN values in place.
path = "data/properties.csv"
df = pd.read_csv(path)

### Preprocessing

In [783]:
# Derived feature: price divided by total_area_sqm.
# NOTE(review): this column is computed from `price`, which is also the
# regression target below - confirm that using it as a feature is intended.
df['price_per_sqm'] = df['price'].div(df['total_area_sqm'])

# Surface columns: NaN -> 0 (in place).
prep.fill_nan_with_0(
    df,
    ['surface_land_sqm', 'terrace_sqm', 'garden_sqm'],
)

# Construction year: NaN -> column median (in place).
prep.fill_nan_with_median(df, 'construction_year')

# Target plus the candidate features.
selected_features = df.loc[:, [
    'price',
    'price_per_sqm',
    'nbr_bedrooms',
    'construction_year',
    'fl_double_glazing',
    'fl_furnished',
    'fl_open_fire',
    'fl_floodzone',
    'garden_sqm',
    'terrace_sqm',
    'surface_land_sqm',
]]

# Remove rows that fall outside the 1.5 * IQR fences of any listed column
# (this also removes rows holding NaN in one of them).
filtered_selected_features = prep.iqr(
    selected_features,
    [
        'price',
        'price_per_sqm',
        'nbr_bedrooms',
        'garden_sqm',
        'terrace_sqm',
        'surface_land_sqm',
    ],
)

# target
y = filtered_selected_features['price'].to_numpy()

# features: every remaining column except the target
X = filtered_selected_features.drop('price', axis=1).to_numpy()

# 80 / 20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model

In [784]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least-squares regression on the scaled features.
model = LinearRegression().fit(X_train_scaled, y_train)
model

### Result

In [785]:
# Print the RMSE and score of the model fitted above on the held-out test
# split (score() reads X_test_scaled / y_test from the notebook namespace).
score(model)

Error: €78025
Model's score: 60.18%


# Model: Houses

### Importing data

In [786]:
path = "data/properties.csv"
df = pd.read_csv(path)
# Keep only the houses. `.copy()` makes the result an independent frame, so
# the column assignment and in-place NaN fills in the preprocessing step act
# on this frame directly rather than on a slice of the full dataset (which
# makes pandas emit SettingWithCopyWarning).
df = df[df['property_type'] == 'HOUSE'].copy()
# Number of missing values per column.
df.isna().sum()

id                                    0
price                                 0
property_type                         0
subproperty_type                      0
region                                0
province                              0
locality                              0
zip_code                              0
latitude                           8204
longitude                          8204
construction_year                 18417
total_area_sqm                     5312
surface_land_sqm                      0
nbr_frontages                      7989
nbr_bedrooms                          0
equipped_kitchen                      0
fl_furnished                          0
fl_open_fire                          0
fl_terrace                            0
terrace_sqm                        9181
fl_garden                             0
garden_sqm                         2420
fl_swimming_pool                      0
fl_floodzone                          0
state_building                        0


In [787]:
# Count how many of the house rows have the terrace flag set (1) or not (0).
df.fl_terrace.value_counts()

fl_terrace
0    20098
1    19157
Name: count, dtype: int64

### Preprocessing

Note: filling the NaN values of 'nbr_frontages' with the median gave a lower model score, so that column is not used as a feature here.

In [788]:
# Derived feature: price divided by total_area_sqm.
# NOTE(review): this column is computed from `price`, which is also the
# regression target below - confirm that using it as a feature is intended.
df['price_per_sqm'] = df['price'].div(df['total_area_sqm'])

# Surface columns: NaN -> 0 (in place).
prep.fill_nan_with_0(
    df,
    ['surface_land_sqm', 'terrace_sqm', 'garden_sqm'],
)

# Construction year: NaN -> column median (in place).
prep.fill_nan_with_median(df, ['construction_year'])

# Target plus the candidate features (the house model also uses the pool flag).
selected_features = df.loc[:, [
    'price',
    'price_per_sqm',
    'nbr_bedrooms',
    'construction_year',
    'fl_double_glazing',
    'fl_furnished',
    'fl_open_fire',
    'fl_floodzone',
    'garden_sqm',
    'terrace_sqm',
    'surface_land_sqm',
    'fl_swimming_pool',
]]

# Remove rows that fall outside the 1.5 * IQR fences of any listed column
# (this also removes rows holding NaN in one of them).
filtered_selected_features = prep.iqr(
    selected_features,
    [
        'price',
        'price_per_sqm',
        'nbr_bedrooms',
        'garden_sqm',
        'terrace_sqm',
        'surface_land_sqm',
    ],
)

# target
y = filtered_selected_features['price'].to_numpy()

# features: every remaining column except the target
X = filtered_selected_features.drop('price', axis=1).to_numpy()

# 80 / 20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model

In [789]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least-squares regression on the scaled features.
model = LinearRegression().fit(X_train_scaled, y_train)
model

### Result

In [790]:
# Print the RMSE and score of the houses-only model on its held-out test
# split (score() reads X_test_scaled / y_test from the notebook namespace).
score(model)

Error: €96870
Model's score: 60.42%
