# IMMOO ELIZA ML

### import libraries

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### importing dataset

In [104]:
path = "../data/data_properties.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,id,region,province,district,locality,postalcode,latitude,longitude,price_main,type,...,condition,facadeCount,hasKitchenSetup,isFurnished,fireplaceExists,hasSwimmingPool,terraceSurface,floodZone,gardenSurface,isNewRealEstateProject
0,10858284,Wallonie,Luxembourg,Arlon,Halanzy,6792,49.582632,5.756293,220000.0,HOUSE,...,TO_RENOVATE,3.0,SEMI_EQUIPPED,0.0,0,0.0,42.0,,80.0,0
1,10930768,Flanders,Antwerp,Antwerp,Antwerp,2018,51.194594,4.407379,589500.0,APARTMENT,...,GOOD,4.0,INSTALLED,,0,,28.0,NON_FLOOD_ZONE,,0
2,11137665,Flanders,East Flanders,Gent,Ertvelde,9940,51.179353,3.740274,825000.0,HOUSE,...,,4.0,SEMI_EQUIPPED,,1,,,NON_FLOOD_ZONE,1635.0,0
3,10966577,Flanders,Antwerp,Antwerp,Kapellen,2950,51.34506,4.401032,575000.0,HOUSE,...,GOOD,4.0,,,0,,,NON_FLOOD_ZONE,,0
4,11110694,Flanders,Antwerp,Antwerp,Deurne,2100,51.234994,4.47087,179000.0,APARTMENT,...,AS_NEW,3.0,SEMI_EQUIPPED,,1,,3.0,NON_FLOOD_ZONE,,0


### Handeling outliers

In [105]:
Q1 = df['price_main'].quantile(0.25) #Calculate Q1
Q3 = df['price_main'].quantile(0.75) #Calculate Q3

IQR = Q3 - Q1 #Calculate IQ range

lower_bound = Q1 - 1.5 * IQR #Calculate the lower bound
upper_bound = Q3 + 1.5 * IQR #Calculate the upper bound

df = df[(df['price_main'] >= lower_bound) & (df['price_main'] <= upper_bound)] # Filter rows where 'price_main' < lb or > ub
df.shape

(54802, 29)

#### deleting columns where values don't have much impact on the price

In [106]:
columns_to_drop = ['cadastralIncome', 'id', 'primaryEnergyConsumption', 'isNewRealEstateProject', 'longitude', 'latitude', 'locality', 'postalcode']
df = df.drop(columns=columns_to_drop)

#### assuming isFurnished, hasSwimmingPool were not mentioned because it is not present

In [107]:
# Replace NaN values in 'isFurnished' and 'hasSwimmingPool' columns with 0
df['isFurnished'].fillna(0, inplace=True)
df['hasSwimmingPool'].fillna(0, inplace=True)
#df['floodZone'].fillna("NON_FLOOD_ZONE", inplace=True)
print(df.head(20))

      region         province    district  price_main       type  \
0   Wallonie       Luxembourg       Arlon    220000.0      HOUSE   
1   Flanders          Antwerp     Antwerp    589500.0  APARTMENT   
3   Flanders          Antwerp     Antwerp    575000.0      HOUSE   
4   Flanders          Antwerp     Antwerp    179000.0  APARTMENT   
5   Flanders    East Flanders        Gent    549000.0  APARTMENT   
6   Flanders  Flemish Brabant      Leuven    439000.0      HOUSE   
7   Wallonie            Namur      Dinant    130000.0  APARTMENT   
8   Wallonie          Hainaut   Charleroi    105000.0  APARTMENT   
10  Wallonie            Liège       Liège    169000.0      HOUSE   
11  Flanders    East Flanders        Gent    465000.0  APARTMENT   
12  Flanders    East Flanders  Oudenaarde    169000.0      HOUSE   
13  Flanders  Flemish Brabant      Leuven    399000.0  APARTMENT   
14  Flanders          Limburg    Tongeren    269938.0  APARTMENT   
15  Wallonie  Walloon Brabant    Nivelles    390

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['isFurnished'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['hasSwimmingPool'].fillna(0, inplace=True)


### Handeling missing data (SimpleImputer)
* delete everything without price_main + remove columns with 50%+ missing data

* columns with missing values:
    * heatingType
    * epcScores
    * primaryEnergyConsumption
    * bedrooms
    * surface
    * surfaceGood
    * condition
    * facadeCount
    * hasKitchenSetup
    * hasSwimmingPool
    * floodZone

In [108]:
missing_data_all = df.isna().sum()
percentage_missing_all = round(missing_data_all * 100 / len(df), 0)
print(f'For all proporties :\n {percentage_missing_all}')

For all proporties :
 region                               0.0
province                             0.0
district                             0.0
price_main                           0.0
type                                 0.0
subtype                              0.0
heatingType                         34.0
epcScores                           29.0
bedrooms                             0.0
surface                             13.0
surfaceGood                         41.0
hasGasWaterElectricityConnection    66.0
condition                           30.0
facadeCount                         30.0
hasKitchenSetup                     44.0
isFurnished                          0.0
fireplaceExists                      0.0
hasSwimmingPool                      0.0
terraceSurface                      66.0
floodZone                           44.0
gardenSurface                       81.0
dtype: float64


In [109]:
#drop columns with more than 50% missing values
def get_column_missing_values():
    columns_to_drop = []
    for column, missing_percentage in percentage_missing_all.items():
        if missing_percentage >= 50:
            columns_to_drop.append(column)
    return columns_to_drop
    
print(get_column_missing_values())
df.drop(columns=get_column_missing_values(), inplace=True)  # Modify DataFrame inplace
print(df.dtypes)

['hasGasWaterElectricityConnection', 'terraceSurface', 'gardenSurface']
region              object
province            object
district            object
price_main         float64
type                object
subtype             object
heatingType         object
epcScores           object
bedrooms           float64
surface            float64
surfaceGood        float64
condition           object
facadeCount        float64
hasKitchenSetup     object
isFurnished        float64
fireplaceExists      int64
hasSwimmingPool    float64
floodZone           object
dtype: object


In [110]:
#train all properties
X = df.drop(['price_main'], axis=1)
y = df.price_main

X_train, X_test, y_train, y_test = train_test_split( X,y, test_size=0.2, random_state=1)
print(type(X_train), type(X_test), type(y_train), type(y_test))


<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [111]:
def impute_missing_values(X_train, X_test):
    # Impute missing values for numerical features
    numerical_imputer = SimpleImputer(strategy='mean')
    numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    X_train[numerical_features] = numerical_imputer.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = numerical_imputer.transform(X_test[numerical_features])

    # Impute missing values for categorical features
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    categorical_features = X_train.select_dtypes(include=['object']).columns
    X_train[categorical_features] = categorical_imputer.fit_transform(X_train[categorical_features])
    X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])

    return X_train, X_test

# Impute missing values
X_train, X_test = impute_missing_values(X_train, X_test)

# Displaying imputed data
print("X_train after imputation:")
print(X_train)
print("\nX_test after imputation:")
print(X_test)
print("\ny_train:")
print(y_train)
print("\ny_test:")
print(y_test)

print(type(X_train), type(X_test), type(y_train), type(y_test))

X_train after imputation:
         region         province     district       type    subtype  \
50397  Flanders    East Flanders  Dendermonde      HOUSE      HOUSE   
26027  Flanders  Flemish Brabant       Leuven      HOUSE      HOUSE   
26855  Flanders          Limburg      Hasselt      HOUSE      HOUSE   
6618   Wallonie          Hainaut         Mons      HOUSE      VILLA   
7648   Flanders    East Flanders        Aalst      HOUSE      HOUSE   
...         ...              ...          ...        ...        ...   
56822  Flanders    East Flanders        Aalst      HOUSE      HOUSE   
36910  Flanders    East Flanders        Aalst  APARTMENT       LOFT   
5914   Wallonie            Namur       Dinant      HOUSE      HOUSE   
13873  Flanders          Antwerp      Antwerp  APARTMENT  APARTMENT   
37487  Flanders    West Flanders        Tielt  APARTMENT  APARTMENT   

      heatingType epcScores  bedrooms     surface  surfaceGood  \
50397         GAS         C       3.0  150.000000   436

In [112]:
print(y_train.shape)

(43841,)


In [113]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate over each column in X_train
for col in X_train.columns:    
    X_train[col]= label_encoder.fit_transform(X_train[col])

    # Transform X_test[col], handle unknown labels by replacing them with a constant (e.g., -1)
    X_test[col] = X_test[col].map(lambda s: label_encoder.transform([s])[0] if s in label_encoder.classes_ else -1)

print("X_train encoded:")
print(X_train)

X_train encoded:
       region  province  district  type  subtype  heatingType  epcScores  \
50397       1         2         8     1       10            3          4   
26027       1         3        18     1       10            1          6   
26855       1         5        14     1       10            3          3   
6618        2         4        23     1       21            3          4   
7648        1         2         0     1       10            4          6   
...       ...       ...       ...   ...      ...          ...        ...   
56822       1         2         0     1       10            3          5   
36910       1         2         0     0       12            3          3   
5914        2         8        10     1       10            2          7   
13873       1         0         1     0        0            3          3   
37487       1        10        35     0        0            1          3   

       bedrooms  surface  surfaceGood  condition  facadeCount  \
50397

In [114]:
# Assuming you have a list of column names (feature names) for X_train and X_test
column_names = X_train.columns.tolist()

# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform both training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled arrays back into DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=column_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=column_names)


In [115]:
# Initialize Linear Regression model
regressor = LinearRegression()

# Fit the model to the scaled training data
regressor.fit(X_train, y_train)

# Evaluate the model on the scaled test data
score = regressor.score(X_test, y_test)

print("R^2 Score:", score)

R^2 Score: 0.4632083262742467


In [116]:
# Get the coefficients
coefficients = regressor.coef_
X_train.head()

# Print the coefficients along with corresponding feature names if available
if hasattr(X_train, 'columns'):
    print("Feature Coefficients:")
    for feature, coef in zip(X_train.columns, coefficients):
        print(feature, ':', coef)
else:
    print("Feature Coefficients:", coefficients)

Feature Coefficients:
region : -79788.482597899
province : -170.12262109881556
district : -737.4965206913998
type : 19272.94532001431
subtype : 1701.7287690175704
heatingType : 2000.6842449363924
epcScores : -18372.87380626524
bedrooms : 22449.77508459753
surface : 615.4960358060321
surfaceGood : 55.2530186530945
condition : -23396.70319796947
facadeCount : 9339.386899817591
hasKitchenSetup : -1580.654976070529
isFurnished : -5147.929165222759
fireplaceExists : 13304.31192934324
hasSwimmingPool : 69701.50232841517
floodZone : -5945.012974656768
