In [42]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [43]:
# Read the cleaned cars dataset from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/cars_cleaned.csv')

In [44]:
# Feature matrix: every predictor column used downstream, selected by label.
X = df.loc[:, [
    'type_carburant', 'aspiration', 'nombre_portes', 'type_carrosserie',
    'roues_motrices', 'emplacement_moteur', 'empattement',
    'longueur_voiture', 'largeur_voiture', 'hauteur_voiture', 'poids_vide',
    'type_moteur', 'nombre_cylindres', 'taille_moteur', 'systeme_carburant',
    'alesage', 'course', 'taux_compression', 'puissance', 'trmin_max',
    'consommation_ville', 'consommation_autoroute', 'marque',
    'modele',
]]

# Regression target.
y = df.loc[:, 'prix']

In [45]:
# Keep 80% of the rows for training and the rest for testing; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [46]:
# Columns handed to the numeric branch ('num') of the ColumnTransformer.
numeric_features = [
    'longueur_voiture',
    'hauteur_voiture',
    'largeur_voiture',
    'empattement',
    'taille_moteur',
    'poids_vide',
    'consommation_ville',
    'consommation_autoroute',
    'puissance',
    'trmin_max',
    'nombre_cylindres',
    'alesage',
    'course',
    'taux_compression',
]

# Columns handed to the one-hot-encoding branch ('cat') of the ColumnTransformer.
# Entries that are commented out are NOT part of this list; uncomment an entry
# to add that column to the encoded features. How columns absent from both
# lists are treated is decided by the ColumnTransformer's `remainder` setting.
categorial_features = [
    #'marque',
    #'modele',
    'type_carrosserie',
    'nombre_portes',
    'type_carburant',
    'emplacement_moteur',
    #'systeme_carburant',
    #'aspiration',
    'roues_motrices',
    'type_moteur',
]


In [47]:
# One single-step scaling pipeline per candidate numeric scaler.
numeric_transformer_minmax = Pipeline(steps=[('minmax', MinMaxScaler())])
numeric_transformer_std = Pipeline(steps=[('standard', StandardScaler())])
# The step label is 'standard' even though this step wraps a RobustScaler.
numeric_transformer_rbst = Pipeline(steps=[('standard', RobustScaler())])

# One-hot encode categorical columns into a sparse matrix; categories not seen
# during fit are ignored at transform time instead of raising an error.
categorial_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

In [48]:
# Robust scaling followed by univariate feature selection: only the 10 numeric
# columns with the highest f_regression scores are kept.
numeric_transformer_rbst = Pipeline(
    steps=[
        ('standard', RobustScaler()),
        ('selection', SelectKBest(k=10, score_func=f_regression)),
    ]
)

In [49]:
# Column-wise preprocessing: numeric columns go through robust scaling plus
# feature selection, categorical columns are one-hot encoded.
#
# remainder='drop' so that columns of X listed in neither feature list (for
# example the categorical entries that are commented out) are excluded from
# the model input. With 'passthrough' those columns would be forwarded raw and
# un-encoded to the estimator, which defeats the purpose of excluding them and
# breaks fitting whenever they hold non-numeric values.
preprocessor_rbst = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_rbst, numeric_features),
        ('cat', categorial_transformer, categorial_features)
    ],
    remainder='drop'
)

In [50]:
# Chain the preprocessor and a seeded random forest into a single estimator,
# so preprocessing is fitted together with the model.
rf = RandomForestRegressor(random_state=42)
pipe_rf = Pipeline(steps=[('prep', preprocessor_rbst), ('rf', rf)])

In [51]:
# Hyperparameter grid for the 'rf' pipeline step
# (3 * 4 * 3 * 3 = 108 combinations).
param_grid = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Exhaustive 5-fold cross-validated search scored by negative mean squared
# error, using all available cores.
grid_search = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Report the hyperparameter combination that scored best in cross-validation.
best_params = grid_search.best_params_
print("Meilleurs paramètres trouvés :", best_params, sep="\n")

In [None]:
# Score the best pipeline found by the grid search on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
# Pipeline.score delegates to the final regressor's score (R^2).
score = best_model.score(X_test, y_test)
print("Score R2 avec RandomForestRegressor :", score, sep="\n")

In [None]:
# Save the trained model.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises part-way through; a bare open() would leak the handle.
with open('data/best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)