In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('cleaned_dataset.csv')

In [3]:
print(df.shape)
print(df.info())
print(df.describe())

(94891, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathroomcount      94891 non-null  int64  
 1   bedroomcount       94891 non-null  int64  
 2   constructionyear   94891 non-null  int64  
 3   country            94891 non-null  object 
 4   district           94891 non-null  object 
 5   fireplace          94891 non-null  int64  
 6   floodingzone       94891 non-null  object 
 7   furnished          94891 non-null  int64  
 8   garden             94891 non-null  int64  
 9   kitchen            94891 non-null  int64  
 10  livingarea         94891 non-null  float64
 11  locality           94888 non-null  object 
 12  monthlycharges     94891 non-null  float64
 13  numberoffacades    94891 non-null  int64  
 14  peb                94891 non-null  object 
 15  postalcode         94891 non-null  int64  
 16  price     

In [4]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'country',
       'district', 'fireplace', 'floodingzone', 'furnished', 'garden',
       'kitchen', 'livingarea', 'locality', 'monthlycharges',
       'numberoffacades', 'peb', 'postalcode', 'price', 'propertyid',
       'province', 'region', 'roomcount', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')

In [5]:
df

Unnamed: 0,bathroomcount,bedroomcount,constructionyear,country,district,fireplace,floodingzone,furnished,garden,kitchen,...,roomcount,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,1969,Belgium,Brugge,0,NON_FLOOD_ZONE,0,0,1,...,1,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,1920,Belgium,Tournai,0,NON_FLOOD_ZONE,0,0,2,...,31,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,2008,Belgium,Brugge,0,NON_FLOOD_ZONE,1,0,1,...,3,0,4,house,0,0,0,2,1,residential_sale
3,1,4,1979,Belgium,Veurne,0,NON_FLOOD_ZONE,0,1,1,...,9,1,2,house,170,0,1,2,1,residential_sale
4,0,2,1972,Belgium,Hasselt,0,NON_FLOOD_ZONE,0,0,1,...,1,0,5,apartment,400,0,1,1,2,residential_sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94886,1,1,2017,Belgium,Tongeren,0,NON_FLOOD_ZONE,0,0,2,...,5,0,4,service_flat,286,0,1,1,2,residential_sale
94887,1,3,2024,Belgium,Gent,0,NON_FLOOD_ZONE,0,1,1,...,4,1,4,house,234,0,0,0,1,residential_sale
94888,4,4,2020,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,2,...,8,1,3,apartment_block,202,0,0,1,1,residential_sale
94889,1,2,2014,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,3,...,8,1,4,apartment,606,0,1,1,2,residential_sale


refining cleaned data

In [6]:
def clean_data(df):
    # Drop column: 'country'
    df = df.drop(columns=['country'])
    # Drop column: 'fireplace'
    df = df.drop(columns=['fireplace'])
    # Drop column: 'monthlycharges'
    df = df.drop(columns=['monthlycharges'])
    # Drop column: 'locality'
    df = df.drop(columns=['locality'])
    # Drop column: 'propertyid'
    df = df.drop(columns=['propertyid'])
    # Drop column: 'constructionyear'
    df = df.drop(columns=['constructionyear'])
   
    return df

df_clean = clean_data(df.copy())
df_clean.head()


Unnamed: 0,bathroomcount,bedroomcount,district,floodingzone,furnished,garden,kitchen,livingarea,numberoffacades,peb,...,roomcount,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,Brugge,NON_FLOOD_ZONE,0,0,1,29.0,2,B,...,1,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,Tournai,NON_FLOOD_ZONE,0,0,2,391.0,3,D,...,31,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,Brugge,NON_FLOOD_ZONE,1,0,1,111.0,2,B,...,3,0,4,house,0,0,0,2,1,residential_sale
3,1,4,Veurne,NON_FLOOD_ZONE,0,1,1,113.6,2,F,...,9,1,2,house,170,0,1,2,1,residential_sale
4,0,2,Hasselt,NON_FLOOD_ZONE,0,0,1,92.0,2,B,...,1,0,5,apartment,400,0,1,1,2,residential_sale


In [7]:
print(df_clean.columns)


Index(['bathroomcount', 'bedroomcount', 'district', 'floodingzone',
       'furnished', 'garden', 'kitchen', 'livingarea', 'numberoffacades',
       'peb', 'postalcode', 'price', 'province', 'region', 'roomcount',
       'showercount', 'stateofbuilding', 'subtypeofproperty', 'surfaceofplot',
       'swimmingpool', 'terrace', 'toiletcount', 'typeofproperty',
       'typeofsale'],
      dtype='object')


Categorize str values
constructionyear, district, floodingzone, subtypeofproperty, typeofsale

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = ['district', 'floodingzone', 'subtypeofproperty', 'typeofsale', 'peb','province', 'region']

data_to_encode = df_clean[columns_to_encode]

one = OneHotEncoder()

encoded_data = one.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=one.get_feature_names_out(columns_to_encode))

df_final = pd.concat([df_clean.drop(columns=columns_to_encode), encoded_df], axis=1)

print(df_final.shape)
print(df_final.info())
df_final.head()


(94891, 125)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Columns: 125 entries, bathroomcount to region_Wallonie
dtypes: float64(109), int64(16)
memory usage: 90.5 MB
None


Unnamed: 0,bathroomcount,bedroomcount,furnished,garden,kitchen,livingarea,numberoffacades,postalcode,price,roomcount,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1,1,0,0,1,29.0,2,8380,99000,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,6,13,0,0,2,391.0,3,7500,765000,31,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4,1,0,1,111.0,2,8370,399000,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,4,0,1,1,113.6,2,8660,230000,9,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0,2,0,0,1,92.0,2,3500,198000,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Machine learning process

In [23]:
correlations = df_final.drop(columns=['price']).corrwith(df_final['price'])

print(correlations)



bathroomcount               0.197276
bedroomcount                0.377297
furnished                  -0.048302
garden                      0.072696
kitchen                     0.299656
                              ...   
province_Walloon Brabant    0.072215
province_West Flanders      0.069096
region_Brussels             0.073956
region_Flanders             0.196733
region_Wallonie            -0.255417
Length: 124, dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
df_final

In [20]:
from sklearn.model_selection import train_test_split

df = df_final

y = np.array(df ['price'])
X = np.array(df.drop(columns=['price']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (75912, 124)
X_test shape: (18979, 124)
y_train shape: (75912,)
y_test shape: (18979,)


Cleaning X train, test


In [27]:
import pandas as pd

def clean_data(X_train_df):
    for col in X_train_df.columns:
        X_train_df[col] = X_train_df[col].fillna(X_train_df[col].median())
    return X_train_df

X_train_df_clean = clean_data(X_train_df)
print(X_train_df_clean.head())


   0    1    2    3    4      5    6       7    8    9    ...  114  115  116  \
0  1.0  3.0  0.0  0.0  2.0  191.0  3.0  2870.0  8.0  1.0  ...  0.0  0.0  0.0   
1  1.0  3.0  0.0  0.0  1.0  154.0  4.0  4053.0  6.0  0.0  ...  0.0  0.0  1.0   
2  1.0  1.0  0.0  0.0  1.0   78.0  2.0  1080.0  5.0  0.0  ...  0.0  0.0  0.0   
3  1.0  1.0  0.0  0.0  3.0   61.0  2.0  2500.0  5.0  1.0  ...  0.0  0.0  0.0   
4  1.0  3.0  0.0  0.0  2.0  118.0  3.0  5070.0  6.0  1.0  ...  0.0  0.0  0.0   

   117  118  119  120  121  122  123  
0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  0.0  0.0  1.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  1.0  0.0  0.0  0.0  0.0  1.0  

[5 rows x 124 columns]


In [30]:
import pandas as pd

def clean_data(X_test_df):
    
    for col in X_test_df.columns:
        if X_test_df[col].isnull().any():
            X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
    return X_test_df


X_test_df_clean = clean_data(X_test_df)




   0    1    2    3    4      5    6       7     8    9    ...  114  115  116  \
0  1.0  2.0  0.0  0.0  1.0   88.0  2.0  9700.0   7.0  1.0  ...  0.0  0.0  0.0   
1  1.0  1.0  0.0  0.0  1.0   60.0  2.0  9050.0   7.0  0.0  ...  0.0  0.0  0.0   
2  1.0  4.0  0.0  1.0  1.0  154.0  2.0  8370.0   5.0  1.0  ...  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  2.0  400.0  2.0  2800.0   6.0  0.0  ...  0.0  0.0  0.0   
4  1.0  1.0  0.0  1.0  3.0   95.0  2.0  9820.0  11.0  1.0  ...  0.0  0.0  0.0   

   117  118  119  120  121  122  123  
0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  1.0  0.0  1.0  0.0  
2  0.0  0.0  0.0  1.0  0.0  1.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  0.0  0.0  1.0  0.0  1.0  0.0  

[5 rows x 124 columns]


Cleaning Y train and Y test

In [31]:
y_train

import pandas as pd

def clean_data(y_train_df):
    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})
    return y_train_df



y_train_df_clean = clean_data(y_train_df)


In [32]:
y_test

def clean_data(y_test_df):
   
    y_test_df = y_test_df.fillna({0: 0})
    return y_test_df




y_test_df_clean = clean_data(y_test_df)
y_test_df_clean.shape

(18979, 1)

Utiliser des transformers pour choisir quelle variable est plus intérressante pour le modèle
https://www.youtube.com/watch?v=T4nZDuakYlU&list=PLO_fdPEVlfKoHQ3Ua2NtDL4nmynQC8YiS&index=9

Model training


In [15]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(random_state=42)
model.fit(X_train_df_clean, y_train_df_clean)
y_pred = model.predict(X_test_df_clean)

Learning rate set to 0.081146
0:	learn: 137482.9952351	total: 152ms	remaining: 2m 31s
1:	learn: 132742.7356303	total: 158ms	remaining: 1m 18s
2:	learn: 128332.8683354	total: 166ms	remaining: 55.1s
3:	learn: 124510.1677068	total: 172ms	remaining: 42.9s
4:	learn: 120979.7360593	total: 179ms	remaining: 35.7s
5:	learn: 117882.1045041	total: 188ms	remaining: 31.1s
6:	learn: 115029.5046961	total: 195ms	remaining: 27.7s
7:	learn: 112521.0602218	total: 205ms	remaining: 25.4s
8:	learn: 110247.9346776	total: 215ms	remaining: 23.6s
9:	learn: 108068.2860997	total: 221ms	remaining: 21.9s
10:	learn: 106089.0749787	total: 228ms	remaining: 20.5s
11:	learn: 104307.1699139	total: 235ms	remaining: 19.4s
12:	learn: 102747.6159676	total: 242ms	remaining: 18.4s
13:	learn: 101174.1523303	total: 250ms	remaining: 17.6s
14:	learn: 99908.2160677	total: 257ms	remaining: 16.9s
15:	learn: 98595.1166578	total: 264ms	remaining: 16.2s
16:	learn: 97465.9850348	total: 271ms	remaining: 15.7s
17:	learn: 96446.8500912	tota

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")


MAE: 47268.6309273482
MSE: 4267914974.6892767
RMSE: 65329.28114321538
R^2 Score: 0.7897073347323003


In [22]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor
import pandas as pd

# Définir le modèle de base
model = CatBoostRegressor(random_state=42)

# Définir la grille des hyperparamètres
param_grid = {
    'iterations': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

# Configurer la recherche en grille
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Exécuter la recherche en grille
grid_search.fit(X_train_df_clean, y_train_df_clean)

# Afficher les meilleurs hyperparamètres
print("Best parameters found: ", grid_search.best_params_)

# Évaluer les performances du modèle optimisé
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring='neg_mean_absolute_error')

# Afficher les scores de validation croisée
print("Cross-validated scores (MAE): ", -cv_scores)
print("Mean CV score (MAE): ", -cv_scores.mean())

# Faire des prédictions et évaluer le modèle sur les données de test
y_pred = best_model.predict(X_test_df_clean)

# Calculer les métriques de performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")


  _data = np.array(data, dtype=dtype, copy=copy,


0:	learn: 127675.3244705	total: 44.9ms	remaining: 22.4s
1:	learn: 116269.3272121	total: 83.5ms	remaining: 20.8s
2:	learn: 107947.9598588	total: 123ms	remaining: 20.4s
3:	learn: 101515.1230708	total: 167ms	remaining: 20.7s
4:	learn: 96354.9008195	total: 211ms	remaining: 20.9s
5:	learn: 92554.6386108	total: 323ms	remaining: 26.6s
6:	learn: 89332.8748035	total: 362ms	remaining: 25.5s
7:	learn: 87117.1043680	total: 401ms	remaining: 24.6s
8:	learn: 85182.8491841	total: 447ms	remaining: 24.4s
9:	learn: 83638.7491218	total: 491ms	remaining: 24.1s
10:	learn: 82352.9780937	total: 544ms	remaining: 24.2s
11:	learn: 81337.7546432	total: 603ms	remaining: 24.5s
12:	learn: 80373.1708137	total: 661ms	remaining: 24.7s
13:	learn: 79489.1246587	total: 717ms	remaining: 24.9s
14:	learn: 78780.9124980	total: 758ms	remaining: 24.5s
15:	learn: 78002.8099055	total: 797ms	remaining: 24.1s
16:	learn: 77481.9958590	total: 844ms	remaining: 24s
17:	learn: 77016.2829477	total: 895ms	remaining: 24s
18:	learn: 76507.4