In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle
import optuna


In [109]:
df = pd.read_csv('dataset.csv')

In [110]:
print(df.shape)
print(df.info())
print(df.describe())

(118714, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118714 entries, 0 to 118713
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   BathroomCount      109112 non-null  float64
 1   BedroomCount       118714 non-null  int64  
 2   ConstructionYear   68898 non-null   float64
 3   District           118707 non-null  object 
 4   Furnished          30325 non-null   float64
 5   Garden             22503 non-null   float64
 6   Kitchen            67134 non-null   object 
 7   LivingArea         104539 non-null  float64
 8   MonthlyCharges     13650 non-null   float64
 9   NumberOfFacades    76942 non-null   float64
 10  PEB                85074 non-null   object 
 11  Price              118714 non-null  int64  
 12  Province           118707 non-null  object 
 13  Region             118707 non-null  object 
 14  RoomCount          32916 non-null   float64
 15  ShowerCount        57127 non-null   fl

In [111]:
df.columns

Index(['BathroomCount', 'BedroomCount', 'ConstructionYear', 'District',
       'Furnished', 'Garden', 'Kitchen', 'LivingArea', 'MonthlyCharges',
       'NumberOfFacades', 'PEB', 'Price', 'Province', 'Region', 'RoomCount',
       'ShowerCount', 'StateOfBuilding', 'SubtypeOfProperty', 'SurfaceOfPlot',
       'SwimmingPool', 'Terrace', 'ToiletCount', 'TypeOfProperty',
       'TypeOfSale'],
      dtype='object')

In [112]:
df

Unnamed: 0,BathroomCount,BedroomCount,ConstructionYear,District,Furnished,Garden,Kitchen,LivingArea,MonthlyCharges,NumberOfFacades,...,RoomCount,ShowerCount,StateOfBuilding,SubtypeOfProperty,SurfaceOfPlot,SwimmingPool,Terrace,ToiletCount,TypeOfProperty,TypeOfSale
0,1.0,1,1969.0,Brugge,,,,29.0,,,...,1.0,0.0,GOOD,flat_studio,,,1.0,1.0,2,residential_sale
1,6.0,13,1920.0,Tournai,0.0,,,391.0,,3.0,...,31.0,,GOOD,apartment_block,130.0,,,5.0,1,residential_sale
2,2.0,4,2008.0,Brugge,1.0,,INSTALLED,111.0,,2.0,...,,0.0,GOOD,house,0.0,,,2.0,1,residential_sale
3,1.0,4,,Veurne,,1.0,,,,2.0,...,,,TO_BE_DONE_UP,house,170.0,0.0,1.0,2.0,1,residential_sale
4,0.0,2,1972.0,Hasselt,,,,92.0,,,...,1.0,0.0,AS_NEW,apartment,,,1.0,1.0,2,residential_sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118709,1.0,3,2024.0,Gent,,1.0,,129.0,,,...,4.0,,GOOD,house,234.0,,,0.0,1,residential_sale
118710,4.0,4,,Antwerp,0.0,,,318.0,,2.0,...,,,JUST_RENOVATED,apartment_block,202.0,,,,1,residential_sale
118711,1.0,2,,Antwerp,,,HYPER_EQUIPPED,85.0,,4.0,...,,,GOOD,apartment,,0.0,1.0,1.0,2,residential_sale
118712,1.0,2,,Brugge,,,,100.0,,,...,8.0,,,apartment,,,1.0,,2,residential_sale


refining cleaned data

In [113]:
df=df.drop(['TypeOfSale'],axis=1)





In [114]:
def clean_data(df):
    df_clean = df.copy()  # Define the variable 'df_clean' and assign it a copy of 'df'
    # Filter rows based on column: 'bathroomcount'
    df_clean = df_clean[df_clean['BathroomCount'] <= 8]
    # Filter rows based on column: 'bedroomcount'
    df_clean = df_clean[df_clean['BedroomCount'] <= 6]
    # Filter rows based on column: 'livingarea'
    df_clean = df_clean[df_clean['LivingArea'] < 350]
    # Filter rows based on column: 'numberoffacades'
    df_clean = df_clean[df_clean['NumberOfFacades'] <= 4]
    # Filter rows based on column: 'showercount'
    df_clean = df_clean[df_clean['ShowerCount'] <= 3]
    # Filter rows based on column: 'surfaceofplot'
    df_clean = df_clean[df_clean['SurfaceOfPlot'] <= 1000]
    # Filter rows based on column: 'toiletcount'
    df_clean = df_clean[df_clean['ToiletCount'] <= 4]
    # Filter rows based on column: 'bathroomcount'
    df_clean = df_clean[df_clean['BathroomCount'] <= 3]
    return df_clean

df_clean = clean_data(df)

In [115]:
print(df.columns)


Index(['BathroomCount', 'BedroomCount', 'ConstructionYear', 'District',
       'Furnished', 'Garden', 'Kitchen', 'LivingArea', 'MonthlyCharges',
       'NumberOfFacades', 'PEB', 'Price', 'Province', 'Region', 'RoomCount',
       'ShowerCount', 'StateOfBuilding', 'SubtypeOfProperty', 'SurfaceOfPlot',
       'SwimmingPool', 'Terrace', 'ToiletCount', 'TypeOfProperty'],
      dtype='object')


Categorize str values
constructionyear, district, floodingzone, subtypeofproperty, typeofsale

In [118]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle

columns_to_encode = ['District', 'Furnished', 'Garden', 'Kitchen', 'PEB', 'Province', 'Region', 'StateOfBuilding', 'SubtypeOfProperty', 'SwimmingPool','Terrace', 'TypeOfProperty']

data_to_encode = df_clean[columns_to_encode]

one = OneHotEncoder()

encoded_data = one.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=one.get_feature_names_out(columns_to_encode))

df_final = pd.concat([df_clean.drop(columns=columns_to_encode), encoded_df], axis=1)

print(df_final.shape)
print(df_final.info())
df_final.to_csv('final_dataset.csv', index=False)
df_final.head()

# Save the model to a pickle file
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(LinearRegression, file)


(28318, 124)
<class 'pandas.core.frame.DataFrame'>
Index: 28318 entries, 2 to 14968
Columns: 124 entries, BathroomCount to TypeOfProperty_1
dtypes: float64(124)
memory usage: 27.0 MB
None


Machine learning process

In [119]:
correlations = df_final.drop(columns=['Price']).corrwith(df_final['Price'])

print(correlations)



BathroomCount       0.248813
BedroomCount        0.277119
ConstructionYear    0.148299
LivingArea          0.390056
MonthlyCharges      0.690799
                      ...   
SwimmingPool_1.0    0.038767
SwimmingPool_nan    0.016408
Terrace_1.0        -0.005477
Terrace_nan         0.005477
TypeOfProperty_1         NaN
Length: 123, dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


In [85]:
'''#filling missing values with KNNImputer
imputer = KNNImputer(n_neighbors=5)
df_final_imputed = imputer.fit_transform(df_final)
df_final_imputed = pd.DataFrame(df_final_imputed, columns=df_final.columns)

#save the df_final_imputed to a csv file
df_final_imputed.to_csv('final_dataset_imputed.csv', index=False)'''

"#filling missing values with KNNImputer\nimputer = KNNImputer(n_neighbors=5)\ndf_final_imputed = imputer.fit_transform(df_final)\ndf_final_imputed = pd.DataFrame(df_final_imputed, columns=df_final.columns)\n\n#save the df_final_imputed to a csv file\ndf_final_imputed.to_csv('final_dataset_imputed.csv', index=False)"

In [120]:
from sklearn.model_selection import train_test_split

df = df_final

y = np.array(df ['Price'])
X = np.array(df.drop(columns=['Price']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (22654, 123)
X_test shape: (5664, 123)
y_train shape: (22654,)
y_test shape: (5664,)


Cleaning X train, test


In [87]:
'''def clean_data(X_train_df):
    for col in X_train_df.columns:
        X_train_df[col] = X_train_df[col].fillna(X_train_df[col].median())
    return X_train_df

X_train_df = pd.DataFrame(X_train.tolist() if len(X_train.shape) > 2 else X_train)
X_train_df_clean = clean_data(X_train_df.copy())
print(X_train_df_clean.head())'''

'def clean_data(X_train_df):\n    for col in X_train_df.columns:\n        X_train_df[col] = X_train_df[col].fillna(X_train_df[col].median())\n    return X_train_df\n\nX_train_df = pd.DataFrame(X_train.tolist() if len(X_train.shape) > 2 else X_train)\nX_train_df_clean = clean_data(X_train_df.copy())\nprint(X_train_df_clean.head())'

In [88]:
'''def clean_data(X_test_df):
    for col in X_test_df.columns:
        if X_test_df[col].isnull().any():
            X_test_df[col].fillna(X_test_df[col].median(), inplace=True)
    return X_test_df

X_test_df = pd.DataFrame(X_test.tolist() if len(X_test.shape) > 2 else X_test)
X_test_df_clean = clean_data(X_test_df.copy())
print(X_test_df_clean.head())
'''



'def clean_data(X_test_df):\n    for col in X_test_df.columns:\n        if X_test_df[col].isnull().any():\n            X_test_df[col].fillna(X_test_df[col].median(), inplace=True)\n    return X_test_df\n\nX_test_df = pd.DataFrame(X_test.tolist() if len(X_test.shape) > 2 else X_test)\nX_test_df_clean = clean_data(X_test_df.copy())\nprint(X_test_df_clean.head())\n'

Cleaning Y train and Y test

In [89]:
'''def clean_data(y_train_df):
    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})
    return y_train_df

y_train_df = pd.DataFrame(y_train.tolist() if len(y_train.shape) > 2 else y_train)
y_train_df_clean = clean_data(y_train_df.copy())'''




'def clean_data(y_train_df):\n    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})\n    return y_train_df\n\ny_train_df = pd.DataFrame(y_train.tolist() if len(y_train.shape) > 2 else y_train)\ny_train_df_clean = clean_data(y_train_df.copy())'

In [121]:
'''def clean_data(y_test_df):
    y_test_df = y_test_df.fillna({0: 0})
    return y_test_df

y_test_df = pd.DataFrame(y_test.tolist() if len(y_test.shape) > 2 else y_test)
y_test_df_clean = clean_data(y_test_df.copy())
print(y_test_df_clean.shape)'''

(5664, 1)


Utiliser des transformers pour choisir quelle variable est plus intérressante pour le modèle
https://www.youtube.com/watch?v=T4nZDuakYlU&list=PLO_fdPEVlfKoHQ3Ua2NtDL4nmynQC8YiS&index=9

Model training


In [128]:
import numpy as np
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Assuming df_final is your DataFrame
df = df_final

# Separating features and target variable
y = np.array(df['Price'])
X = np.array(df.drop(columns=['Price']))

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Define the hyperparameters to tune
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 1.0)
    }

    # Create the model with the hyperparameters
    model = CatBoostRegressor(random_state=42, **params, verbose=0)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test)

    # Calculate the evaluation metric (mean squared error)
    mse = mean_squared_error(y_test, y_pred)

    # Return the evaluation metric as the objective value to minimize
    return mse

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters and the best objective value
best_params = study.best_params
best_objective = study.best_value

# Create a new model with the best hyperparameters
best_model = CatBoostRegressor(random_state=42, **best_params)

# Fit the best model on the training data
best_model.fit(X_train, y_train)

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate both mean squared error and mean absolute error for the predictions
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Best MSE from Study: {best_objective}")
print(f"Test Set MSE with Best Model: {mse}")
print(f"Test Set MAE with Best Model: {mae}")


[I 2024-08-13 15:32:21,169] A new study created in memory with name: no-name-6c26e87f-b20d-4269-9e4b-adf4b2a7e726
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 1.0)
[W 2024-08-13 15:32:21,172] Trial 0 failed with parameters: {'learning_rate': 0.0011434598146283267, 'max_depth': 10, 'n_estimators': 622, 'subsample': 0.5123567052687279, 'colsample_bylevel': 0.7074419770840347, 'reg_alpha': 0.09978836800784607, 'reg_lambda': 0.0029926359995735255} because of the following error: TypeError("CatBoostRegressor.__init__() got an unexpected keyword argument 'reg_alpha'").
Traceback (most recent call last):
  File "C:\Users\pieta\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\Loc

TypeError: CatBoostRegressor.__init__() got an unexpected keyword argument 'reg_alpha'

In [92]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")

model.save_model('catboost_model.cbm')

MAE: 213641.72080697995
MSE: 84991103762.38252
RMSE: 291532.3374213957
R^2 Score: -0.10516959805880588


In [93]:
df_final.head(1)


Unnamed: 0,BathroomCount,BedroomCount,ConstructionYear,LivingArea,MonthlyCharges,NumberOfFacades,Price,RoomCount,ShowerCount,SurfaceOfPlot,...,SubtypeOfProperty_mixed_use_building,SubtypeOfProperty_other_property,SubtypeOfProperty_town_house,SubtypeOfProperty_villa,SwimmingPool_0.0,SwimmingPool_1.0,SwimmingPool_nan,Terrace_1.0,Terrace_nan,TypeOfProperty_1
2,2.0,4.0,2008.0,111.0,,2.0,399000.0,,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [94]:
'''import streamlit as st
import pandas as pd
from catboost import CatBoostRegressor

# Function to load the model
def load_model():
    model = CatBoostRegressor()
    model.load_model('catboost_model.cbm')
    return model

# Function to preprocess user input
def preprocess_input(user_input):
    categorical_features = ['furnished', 'district', 'subtypeofproperty', 'typeofsale', 'peb', 'province', 'region']
    user_input_df = pd.DataFrame([user_input])
    dummies = pd.get_dummies(user_input_df[categorical_features])
    user_input_df = pd.concat([user_input_df, dummies], axis=1)
    user_input_df = user_input_df.drop(categorical_features, axis=1)
    return user_input_df

# Main function to run the Streamlit app
def main():
    st.title("Real Estate Price Prediction")

    # Predefined options for each feature
    furnished_options = ["Yes", "No"]

    district_options = ["district_Aalst", "district_Antwerp","district_Arlon" , "district_Ath",
                        "district_Bastogne","district_Brugge","district_Brussels","district_Charleroi",
                        "district_Dendermonde","district_Diksmuide","district_Dinant","district_Eeklo",
                        "district_Gent","district_Halle-Vilvoorde","district_Hasselt","district_Huy",
                        "district_Ieper" ,"district_Kortrijk","district_Leuven","district_Liège",
                        "district_Maaseik","district_Marche-en-Famenne","district_Mechelen","district_Mons",
                        "district_Mouscron","district_Namur","district_Neufchâteau","district_Nivelles",
                        "district_Oostend","district_Oudenaarde","district_Philippeville","district_Sint-Niklaas",
                        "district_Roeselare","district_Soignies","district_Thuin","district_Tielt",
                        "district_Tongeren","district_Tournai","district_Turnhout","district_Verviers",
                        "district_Veurne","district_Virton","district_Waremme"]  

    subtypeofproperty_options = ["subtypeofproperty_apartment", "subtypeofproperty_apartement_block","subtypeofproperty_bungalow","subtypeofproperty_castle",
                                 "subtypeofproperty_chalet","subtypeofproperty_country_cottage","subtypeofproperty_duplex","subtypeofproperty_exeptional_property",
                                 "subtypeofproperty_farmhouse","subtypeofproperty_flat_studio","subtypeofproperty_ground_floor","subtypeofproperty_house",
                                 "subtypeofproperty_kot","subtypeofproperty_loft","subtypeofproperty_mansion","subtypeofproperty_manor_house",
                                 "subtypeofproperty_mixed_use_building","subtypeofproperty_other_property","subtypeofproperty_penthouse","subtypeofproperty_service_flat",
                                 "subtypeofproperty_pavilion","subtypeofproperty", "subtypeofproperty_town_house","subtypeofproperty_triplex","subtypeofproperty_villa",]  

    peb_options = ["peb_A", "peb_A+", "peb_A++", 
                   "peb_A_A+", "peb_B", "peb_B_A", 
                   "peb_C","peb_D", "peb_E","peb_E_D","peb_F","peb_F_C",
                   "peb_F_D","peb_F_E","peb_G"] 
    
    province_options = ["province_Antwerp", "province_Brussels", "province_East Flanders",
                        "province_Flemish Brabant","province_Hainaut", "province_Limburg",
                        "province_Liège","province_Luxembourg","province_Namur",
                        "province_Walloon Brabant", "province_West Flanders" ] 
    
    region_options = ["region_Brussels", "region_Flanders", "region_Wallonie"] 

    # User input fields
    furnished = st.selectbox("Furnished", furnished_options)
    district = st.selectbox("District", district_options)
    subtypeofproperty = st.selectbox("Subtype of Property", subtypeofproperty_options)
    peb = st.selectbox("PEB", peb_options)
    province = st.selectbox("Province", province_options)
    region = st.selectbox("Region", region_options)

    # Collect user input into a dictionary
    user_input = {
        'furnished': furnished,
        'district': district,
        'subtypeofproperty': subtypeofproperty,
        'peb': peb,
        'province': province,
        'region': region
    }

    # Preprocess the user input
    user_input_df = preprocess_input(user_input)

    # Load the model
    model = load_model()

    # Predict button
    if st.button("Predict"):
        prediction = model.predict(user_input_df)
        st.write(f"Predicted Price: {prediction[0]}")

if __name__ == '__main__':
    main()'''
    

'import streamlit as st\nimport pandas as pd\nfrom catboost import CatBoostRegressor\n\n# Function to load the model\ndef load_model():\n    model = CatBoostRegressor()\n    model.load_model(\'catboost_model.cbm\')\n    return model\n\n# Function to preprocess user input\ndef preprocess_input(user_input):\n    categorical_features = [\'furnished\', \'district\', \'subtypeofproperty\', \'typeofsale\', \'peb\', \'province\', \'region\']\n    user_input_df = pd.DataFrame([user_input])\n    dummies = pd.get_dummies(user_input_df[categorical_features])\n    user_input_df = pd.concat([user_input_df, dummies], axis=1)\n    user_input_df = user_input_df.drop(categorical_features, axis=1)\n    return user_input_df\n\n# Main function to run the Streamlit app\ndef main():\n    st.title("Real Estate Price Prediction")\n\n    # Predefined options for each feature\n    furnished_options = ["Yes", "No"]\n\n    district_options = ["district_Aalst", "district_Antwerp","district_Arlon" , "district_Ath",

In [95]:
'''from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor
import pandas as pd

# Définir le modèle de base
model = CatBoostRegressor(random_state=42)

param_grid = {
    'iterations': [100, 300, 500, 700],
    'learning_rate': [0.001, 0.01, 0.05],
    'depth': [3, 5, 7, 10, 12],
    'l2_leaf_reg': [0.1, 1, 3, 5, 10]
}


# Configurer la recherche en grille
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Exécuter la recherche en grille
grid_search.fit(X_train_df_clean, y_train_df_clean)

# Afficher les meilleurs hyperparamètres
print("Best parameters found: ", grid_search.best_params_)

# Évaluer les performances du modèle optimisé
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring='neg_mean_absolute_error')

# Afficher les scores de validation croisée
print("Cross-validated scores (MAE): ", -cv_scores)
print("Mean CV score (MAE): ", -cv_scores.mean())

# Faire des prédictions et évaluer le modèle sur les données de test
y_pred = best_model.predict(X_test_df_clean)

# Calculer les métriques de performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")'''




'from sklearn.model_selection import GridSearchCV, cross_val_score\nfrom catboost import CatBoostRegressor\nimport pandas as pd\n\n# Définir le modèle de base\nmodel = CatBoostRegressor(random_state=42)\n\nparam_grid = {\n    \'iterations\': [100, 300, 500, 700],\n    \'learning_rate\': [0.001, 0.01, 0.05],\n    \'depth\': [3, 5, 7, 10, 12],\n    \'l2_leaf_reg\': [0.1, 1, 3, 5, 10]\n}\n\n\n# Configurer la recherche en grille\ngrid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=\'neg_mean_absolute_error\', n_jobs=-1)\n\n# Exécuter la recherche en grille\ngrid_search.fit(X_train_df_clean, y_train_df_clean)\n\n# Afficher les meilleurs hyperparamètres\nprint("Best parameters found: ", grid_search.best_params_)\n\n# Évaluer les performances du modèle optimisé\nbest_model = grid_search.best_estimator_\ncv_scores = cross_val_score(best_model, X_train_df_clean, y_train_df_clean, cv=5, scoring=\'neg_mean_absolute_error\')\n\n# Afficher les scores de validation crois