In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Random forest

In [4]:
# Load the cleaned dataset from a CSV file located relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data_ml_clean.csv")

In [5]:
# Split the frame into the regression target (the 'Price' column) and the
# predictors (every other column).
y = df['Price']
X = df.drop(columns='Price')

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Feature columns, grouped by how they are preprocessed.
# NOTE(review): columns of X that are not listed here are not forwarded to the
# regressor (ColumnTransformer's default remainder is 'drop') — confirm that
# this is intended.
numerical_cols = ['Bedrooms', 'Salary', 'Fitness_Club', 'McDonalds', 'Water']
categorical_cols = ['Country', 'City_Centre']

# Numeric columns are passed through untouched; categorical columns are
# one-hot encoded, and categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Chain the preprocessing and a seeded random forest into a single estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Train on the training split, then predict the held-out rows.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report the test-set error metrics, one per line.
print(
    f"MAE: {mean_absolute_error(y_test, y_pred)}",
    f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}",
    f"R²: {r2_score(y_test, y_pred)}",
    sep="\n",
)

MAE: 233.9009118343195
RMSE: 361.24278724970554
R²: 0.7860316646132761


In [7]:
# Hyperparameter grid for the pipeline's 'regressor' step
# (3 * 4 * 3 * 3 = 108 candidate combinations).
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold cross-validated search over the grid, ranking candidates
# by negated mean squared error and using all available CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# The search only ever sees the training split. GridSearchCV.fit returns the
# fitted search object, whose best_estimator_ is the winning pipeline refit on
# the full training split (refit=True is the default).
best_model = grid_search.fit(X_train, y_train).best_estimator_

print(f"Meilleurs paramètres: {grid_search.best_params_}")

# Score the tuned pipeline on the held-out test split, one metric per line.
y_pred_best = best_model.predict(X_test)
print(
    f"MAE: {mean_absolute_error(y_test, y_pred_best)}",
    f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_best))}",
    f"R²: {r2_score(y_test, y_pred_best)}",
    sep="\n",
)


Meilleurs paramètres: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
MAE: 231.741321046785
RMSE: 356.5384263858566
R²: 0.791568273618608
