# All the imports in one place

In [188]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold


path = "data/properties.csv"
df = pd.read_csv(path)

In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75511 entries, 0 to 75510
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              75511 non-null  int64  
 1   price                           75511 non-null  float64
 2   property_type                   75511 non-null  object 
 3   subproperty_type                75511 non-null  object 
 4   region                          75511 non-null  object 
 5   province                        75511 non-null  object 
 6   locality                        75511 non-null  object 
 7   zip_code                        75511 non-null  int64  
 8   latitude                        61413 non-null  float64
 9   longitude                       61413 non-null  float64
 10  construction_year               42120 non-null  float64
 11  total_area_sqm                  67896 non-null  float64
 12  surface_land_sqm                

# Class for Preprocessing

In [190]:
class Preprocessing:

    @staticmethod
    def fill_nan_with_median(df, column_name):
        """
        Fill missing values in the specified column(s) with the median value.

        Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        column_name (str or list): The name(s) of the column(s) to fill.

        Returns:
        None
        """
        if isinstance(column_name, list):
            for column in column_name:
                median_of_column = df[column].median()
                df.fillna({column: median_of_column}, inplace=True)
        else:
            median_of_column = df[column_name].median()
            df.fillna({column_name: median_of_column}, inplace=True)

        return None


    @staticmethod
    def fill_nan_with_0(df, column_name):
        """
        Fill NaN values in the specified column(s) with 0.

        Parameters:
        - df (pandas.DataFrame): The DataFrame containing the data.
        - column_name (str or list): The name(s) of the column(s) to fill NaN values in.

        Returns:
        None
        """
        if isinstance(column_name, list):
            for column in column_name:
                df.fillna({column: 0}, inplace=True)
        else:
            df.fillna({column_name: 0}, inplace=True)

        return None
    
    @staticmethod
    def iqr(dataframe, column_names):
        """
        Filter the given dataframe based on the interquartile range (IQR) of the specified column(s).

        Parameters:
        - dataframe: pandas DataFrame
            The input dataframe to filter.
        - column_names: str or list of str
            The name(s) of the column(s) to calculate the IQR and perform the filtering.

        Returns:
        - filtered_df: pandas DataFrame
            The filtered dataframe based on the IQR of the specified column(s).
        """
        filtered_df = dataframe.copy()

        if isinstance(column_names, list):
            for column_name in column_names:
                Q1 = dataframe[column_name].quantile(0.25)
                Q3 = dataframe[column_name].quantile(0.75)
                IQR = Q3 - Q1

                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                filtered_df = filtered_df[(filtered_df[column_name] >= lower_bound) & (filtered_df[column_name] <= upper_bound)]
        else:
            Q1 = dataframe[column_names].quantile(0.25)
            Q3 = dataframe[column_names].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            filtered_df = filtered_df[(filtered_df[column_names] >= lower_bound) & (filtered_df[column_names] <= upper_bound)]

        return filtered_df

prep = Preprocessing

def score(model):
    score = model.score(X_test_scaled, y_test)

    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)

    rmse = np.sqrt(mse)

    print(f'Error: €{round(rmse)}')

    print(f'Model\'s score: {round(score * 100, 2)}%')

# Model: Appartaments and Houses

### Importing data

In [191]:
path = "data/properties.csv"
df = pd.read_csv(path)

print(df.columns.tolist())

['id', 'price', 'property_type', 'subproperty_type', 'region', 'province', 'locality', 'zip_code', 'latitude', 'longitude', 'construction_year', 'total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'state_building', 'primary_energy_consumption_sqm', 'epc', 'heating_type', 'fl_double_glazing', 'cadastral_income']


### Preprocessing

In [192]:
# store all categorical columns in one place
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# add zip code manually because it is int64
categorical_columns.append('zip_code')

numeric_features = [col for col in df if col not in categorical_columns and col != 'price']

In [193]:
path = "data/properties.csv"
df = pd.read_csv(path)

columns_to_drop = ['id', 'latitude', 'longitude', 'zip_code']

# store all categorical columns in one place
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

categorical_columns = [col for col in categorical_columns if col not in columns_to_drop]

# add zip code manually because it is int64
categorical_columns.append('zip_code')

numeric_features = [col for col in df if col not in categorical_columns and col != 'price' and col not in columns_to_drop]

prep.iqr(df, ['price', 'total_area_sqm'])

df['price_per_sqm'] = df['price'] / df['total_area_sqm']
df['age_of_building'] = 2024 - df['construction_year']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_columns)])

model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', LinearRegression())])

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_pipeline.fit(X_train, y_train)

model_pipeline

In [194]:
score = model_pipeline.score(X_test, y_test)
print(f'Model Score: {round(score * 100, 2)}%')

Model Score: 44.67%


In [195]:
# Predicting on the test set
y_pred = model_pipeline.predict(X_test)

# Calculating MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculating RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

model_pipeline.score(X_test, y_test)

Mean Squared Error: 96265803345.28053
Root Mean Squared Error: 310267.3095014693


0.44671396720362344