In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import optuna


In [30]:
df = pd.read_csv('cleaned_dataset.csv')

In [31]:
print(df.shape)
print(df.info())
print(df.describe())

(94891, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathroomcount      94891 non-null  int64  
 1   bedroomcount       94891 non-null  int64  
 2   constructionyear   94891 non-null  int64  
 3   country            94891 non-null  object 
 4   district           94891 non-null  object 
 5   fireplace          94891 non-null  int64  
 6   floodingzone       94891 non-null  object 
 7   furnished          94891 non-null  int64  
 8   garden             94891 non-null  int64  
 9   kitchen            94891 non-null  int64  
 10  livingarea         94891 non-null  float64
 11  locality           94888 non-null  object 
 12  monthlycharges     94891 non-null  float64
 13  numberoffacades    94891 non-null  int64  
 14  peb                94891 non-null  object 
 15  postalcode         94891 non-null  int64  
 16  price     

In [32]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'country',
       'district', 'fireplace', 'floodingzone', 'furnished', 'garden',
       'kitchen', 'livingarea', 'locality', 'monthlycharges',
       'numberoffacades', 'peb', 'postalcode', 'price', 'propertyid',
       'province', 'region', 'roomcount', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')

In [33]:
df

Unnamed: 0,bathroomcount,bedroomcount,constructionyear,country,district,fireplace,floodingzone,furnished,garden,kitchen,...,roomcount,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,1969,Belgium,Brugge,0,NON_FLOOD_ZONE,0,0,1,...,1,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,1920,Belgium,Tournai,0,NON_FLOOD_ZONE,0,0,2,...,31,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,2008,Belgium,Brugge,0,NON_FLOOD_ZONE,1,0,1,...,3,0,4,house,0,0,0,2,1,residential_sale
3,1,4,1979,Belgium,Veurne,0,NON_FLOOD_ZONE,0,1,1,...,9,1,2,house,170,0,1,2,1,residential_sale
4,0,2,1972,Belgium,Hasselt,0,NON_FLOOD_ZONE,0,0,1,...,1,0,5,apartment,400,0,1,1,2,residential_sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94886,1,1,2017,Belgium,Tongeren,0,NON_FLOOD_ZONE,0,0,2,...,5,0,4,service_flat,286,0,1,1,2,residential_sale
94887,1,3,2024,Belgium,Gent,0,NON_FLOOD_ZONE,0,1,1,...,4,1,4,house,234,0,0,0,1,residential_sale
94888,4,4,2020,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,2,...,8,1,3,apartment_block,202,0,0,1,1,residential_sale
94889,1,2,2014,Belgium,Antwerp,0,NON_FLOOD_ZONE,0,0,3,...,8,1,4,apartment,606,0,1,1,2,residential_sale


refining cleaned data

In [34]:
def clean_data(df):
    # Drop column: 'country'
    df = df.drop(columns=['country'])
    # Drop column: 'fireplace'
    df = df.drop(columns=['fireplace'])
    # Drop column: 'monthlycharges'
    df = df.drop(columns=['monthlycharges'])
    # Drop column: 'locality'
    df = df.drop(columns=['locality'])
    # Drop column: 'propertyid'
    df = df.drop(columns=['propertyid'])
    # Drop column: 'constructionyear'
    df = df.drop(columns=['constructionyear'])
    df = df.drop(columns=['furnished'])
    df = df.drop(columns=['roomcount'])
   
    return df

df_clean = clean_data(df.copy())
df_clean.head()


Unnamed: 0,bathroomcount,bedroomcount,district,floodingzone,garden,kitchen,livingarea,numberoffacades,peb,postalcode,...,region,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,Brugge,NON_FLOOD_ZONE,0,1,29.0,2,B,8380,...,Flanders,0,4,flat_studio,203,0,1,1,2,residential_sale
1,6,13,Tournai,NON_FLOOD_ZONE,0,2,391.0,3,D,7500,...,Wallonie,1,4,apartment_block,130,0,0,5,1,residential_sale
2,2,4,Brugge,NON_FLOOD_ZONE,0,1,111.0,2,B,8370,...,Flanders,0,4,house,0,0,0,2,1,residential_sale
3,1,4,Veurne,NON_FLOOD_ZONE,1,1,113.6,2,F,8660,...,Flanders,1,2,house,170,0,1,2,1,residential_sale
4,0,2,Hasselt,NON_FLOOD_ZONE,0,1,92.0,2,B,3500,...,Flanders,0,5,apartment,400,0,1,1,2,residential_sale


In [35]:
print(df_clean.columns)


Index(['bathroomcount', 'bedroomcount', 'district', 'floodingzone', 'garden',
       'kitchen', 'livingarea', 'numberoffacades', 'peb', 'postalcode',
       'price', 'province', 'region', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')


Categorize str values
constructionyear, district, floodingzone, subtypeofproperty, typeofsale

In [36]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = ['district', 'floodingzone', 'subtypeofproperty', 'typeofsale', 'peb','province', 'region']

data_to_encode = df_clean[columns_to_encode]

one = OneHotEncoder()

encoded_data = one.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=one.get_feature_names_out(columns_to_encode))

df_final = pd.concat([df_clean.drop(columns=columns_to_encode), encoded_df], axis=1)

print(df_final.shape)
print(df_final.info())
df_final.head()


(94891, 123)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Columns: 123 entries, bathroomcount to region_Wallonie
dtypes: float64(109), int64(14)
memory usage: 89.0 MB
None


Unnamed: 0,bathroomcount,bedroomcount,garden,kitchen,livingarea,numberoffacades,postalcode,price,showercount,stateofbuilding,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1,1,0,1,29.0,2,8380,99000,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,6,13,0,2,391.0,3,7500,765000,1,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4,0,1,111.0,2,8370,399000,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,4,1,1,113.6,2,8660,230000,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0,2,0,1,92.0,2,3500,198000,0,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Machine learning process

In [37]:
correlations = df_final.drop(columns=['price']).corrwith(df_final['price'])

print(correlations)



bathroomcount               0.197276
bedroomcount                0.377297
garden                      0.072696
kitchen                     0.299656
livingarea                  0.014218
                              ...   
province_Walloon Brabant    0.072215
province_West Flanders      0.069096
region_Brussels             0.073956
region_Flanders             0.196733
region_Wallonie            -0.255417
Length: 122, dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


In [38]:
df_final

Unnamed: 0,bathroomcount,bedroomcount,garden,kitchen,livingarea,numberoffacades,postalcode,price,showercount,stateofbuilding,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1,1,0,1,29.0,2,8380,99000,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,6,13,0,2,391.0,3,7500,765000,1,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4,0,1,111.0,2,8370,399000,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,4,1,1,113.6,2,8660,230000,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0,2,0,1,92.0,2,3500,198000,0,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94886,1,1,0,2,83.0,2,3830,219000,0,4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
94887,1,3,1,1,129.0,2,9880,409000,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
94888,4,4,0,2,318.0,2,2020,599000,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
94889,1,2,0,3,85.0,4,2140,245000,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
from sklearn.model_selection import train_test_split

df = df_final

y = np.array(df ['price'])
X = np.array(df.drop(columns=['price']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (75912, 122)
X_test shape: (18979, 122)
y_train shape: (75912,)
y_test shape: (18979,)


Cleaning X train, test


In [40]:
def clean_data(X_train_df):
    for col in X_train_df.columns:
        X_train_df[col] = X_train_df[col].fillna(X_train_df[col].median())
    return X_train_df

X_train_df = pd.DataFrame(X_train.tolist() if len(X_train.shape) > 2 else X_train)
X_train_df_clean = clean_data(X_train_df.copy())
print(X_train_df_clean.head())

   0    1    2    3      4    5       6    7    8       9    ...  112  113  \
0  1.0  3.0  0.0  2.0  191.0  3.0  2870.0  1.0  4.0     0.0  ...  0.0  0.0   
1  1.0  3.0  0.0  1.0  154.0  4.0  4053.0  0.0  4.0  1335.0  ...  0.0  0.0   
2  1.0  1.0  0.0  1.0   78.0  2.0  1080.0  0.0  2.0   167.0  ...  0.0  0.0   
3  1.0  1.0  0.0  3.0   61.0  2.0  2500.0  1.0  4.0   978.0  ...  0.0  0.0   
4  1.0  3.0  0.0  2.0  118.0  3.0  5070.0  1.0  4.0   235.0  ...  0.0  0.0   

   114  115  116  117  118  119  120  121  
0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  

[5 rows x 122 columns]


In [41]:
def clean_data(X_test_df):
    for col in X_test_df.columns:
        if X_test_df[col].isnull().any():
            X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
    return X_test_df

X_test_df = pd.DataFrame(X_test.tolist() if len(X_test.shape) > 2 else X_test)
X_test_df_clean = clean_data(X_test_df.copy())
print(X_test_df_clean.head())




   0    1    2    3      4    5       6    7    8      9    ...  112  113  \
0  1.0  2.0  0.0  1.0   88.0  2.0  9700.0  1.0  3.0  547.0  ...  0.0  0.0   
1  1.0  1.0  0.0  1.0   60.0  2.0  9050.0  0.0  5.0  281.0  ...  0.0  0.0   
2  1.0  4.0  1.0  1.0  154.0  2.0  8370.0  1.0  2.0  304.0  ...  0.0  0.0   
3  0.0  0.0  0.0  2.0  400.0  2.0  2800.0  0.0  4.0    0.0  ...  0.0  0.0   
4  1.0  1.0  1.0  3.0   95.0  2.0  9820.0  1.0  4.0  892.0  ...  0.0  0.0   

   114  115  116  117  118  119  120  121  
0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  
2  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  
4  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  

[5 rows x 122 columns]


Cleaning Y train and Y test

In [42]:
def clean_data(y_train_df):
    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})
    return y_train_df

y_train_df = pd.DataFrame(y_train.tolist() if len(y_train.shape) > 2 else y_train)
y_train_df_clean = clean_data(y_train_df.copy())
y_train_df_clean.head()



Unnamed: 0,0
0,429000
1,325000
2,179000
3,269102
4,372500


In [43]:
def clean_data(y_test_df):
    y_test_df = y_test_df.fillna({0: 0})
    return y_test_df

y_test_df = pd.DataFrame(y_test.tolist() if len(y_test.shape) > 2 else y_test)
y_test_df_clean = clean_data(y_test_df.copy())
print(y_test_df_clean.shape)

(18979, 1)


Utiliser des transformers pour choisir quelle variable est plus intérressante pour le modèle
https://www.youtube.com/watch?v=T4nZDuakYlU&list=PLO_fdPEVlfKoHQ3Ua2NtDL4nmynQC8YiS&index=9

Model training


In [55]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(random_state=42)
model.fit(X_train_df_clean, y_train_df_clean)
y_pred = model.predict(X_test_df_clean)

Learning rate set to 0.081146
0:	learn: 137574.0049645	total: 7.56ms	remaining: 7.55s
1:	learn: 132845.2478805	total: 14.9ms	remaining: 7.45s
2:	learn: 128568.0673594	total: 22ms	remaining: 7.32s
3:	learn: 124793.7126120	total: 28.8ms	remaining: 7.18s
4:	learn: 121519.5279570	total: 35.5ms	remaining: 7.07s
5:	learn: 118364.0683444	total: 43.8ms	remaining: 7.25s
6:	learn: 115496.2995029	total: 50.2ms	remaining: 7.12s
7:	learn: 112912.2272497	total: 56.9ms	remaining: 7.05s
8:	learn: 110410.5549367	total: 63.6ms	remaining: 7.01s
9:	learn: 108291.5692003	total: 70.8ms	remaining: 7.01s
10:	learn: 106276.8643954	total: 78.5ms	remaining: 7.05s
11:	learn: 104527.1245459	total: 85.4ms	remaining: 7.03s
12:	learn: 102775.2123990	total: 91.8ms	remaining: 6.97s
13:	learn: 101296.1635271	total: 98.7ms	remaining: 6.95s
14:	learn: 99679.4508884	total: 105ms	remaining: 6.92s
15:	learn: 98508.0602372	total: 112ms	remaining: 6.9s
16:	learn: 97295.7246414	total: 119ms	remaining: 6.89s
17:	learn: 96229.562

In [54]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")


MAE: 47856.00886071866
MSE: 4370635560.1221075
RMSE: 66110.78248003201
R^2 Score: 0.7846459907700529


In [46]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor
import pandas as pd

# Définir le modèle de base
model = CatBoostRegressor(random_state=42)

param_grid = {
    'iterations': [100, 300, 500, 700],
    'learning_rate': [0.001, 0.01, 0.05],
    'depth': [3, 5, 7, 10, 12],
    'l2_leaf_reg': [0.1, 1, 3, 5, 10]
}


# Configurer la recherche en grille
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Exécuter la recherche en grille
grid_search.fit(X_train_df_clean, y_train_df_clean)

# Afficher les meilleurs hyperparamètres
print("Best parameters found: ", grid_search.best_params_)

# Évaluer les performances du modèle optimisé
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring='neg_mean_absolute_error')

# Afficher les scores de validation croisée
print("Cross-validated scores (MAE): ", -cv_scores)
print("Mean CV score (MAE): ", -cv_scores.mean())

# Faire des prédictions et évaluer le modèle sur les données de test
y_pred = best_model.predict(X_test_df_clean)

# Calculer les métriques de performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")


KeyboardInterrupt: 