In [20]:
from pathlib import Path

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [21]:
# Load the pricing dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "data/get_around_pricing_project.csv",
    index_col=0,
)
df.head()


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [22]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)


model_key                    0
mileage                      0
engine_power                 0
fuel                         0
paint_color                  0
car_type                     0
private_parking_available    0
has_gps                      0
has_air_conditioning         0
automatic_car                0
has_getaround_connect        0
has_speed_regulator          0
winter_tires                 0
rental_price_per_day         0
dtype: int64


In [23]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

model_key                    object
mileage                       int64
engine_power                  int64
fuel                         object
paint_color                  object
car_type                     object
private_parking_available      bool
has_gps                        bool
has_air_conditioning           bool
automatic_car                  bool
has_getaround_connect          bool
has_speed_regulator            bool
winter_tires                   bool
rental_price_per_day          int64
dtype: object

In [24]:
# Columns holding categorical (string) values that must be one-hot encoded.
cat_cols = ["model_key", "fuel", "paint_color", "car_type"]

# One-hot encode the categorical columns. With handle_unknown="ignore",
# categories not seen during fit are encoded as all zeros instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# Every column not listed in cat_cols is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_encoder, cat_cols)],
    remainder="passthrough",
)


# Choix des features


In [25]:
# Split the frame into the feature matrix X and the target vector y.
target_column = "rental_price_per_day"
X = df.drop(columns=[target_column])
y = df[target_column]

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Report the (rows, columns) shape of the train and test feature matrices.
print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (3874, 13), Test set: (969, 13)


In [28]:
# Show the column dtypes of the training features.
X_train.dtypes

model_key                    object
mileage                       int64
engine_power                  int64
fuel                         object
paint_color                  object
car_type                     object
private_parking_available      bool
has_gps                        bool
has_air_conditioning           bool
automatic_car                  bool
has_getaround_connect          bool
has_speed_regulator            bool
winter_tires                   bool
dtype: object

In [29]:
# Fit the preprocessor on the training features only, then apply the already
# fitted transformation to the test features (no refit on the test set).
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [30]:
# Hyperparameter grid explored when tuning the model: every combination of
# the values below is a candidate.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5, 10],
)

In [31]:
# Exhaustive search over param_grid with 3-fold cross-validation. Candidates
# are ranked by mean absolute error (scikit-learn negates it so that a higher
# score is better).
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_absolute_error",
    # Candidate/fold fits are independent of each other, so run them on all
    # available cores. The forest's fixed random_state keeps results identical.
    n_jobs=-1,
)
grid_search.fit(X_train_transformed, y_train)

# Best hyperparameters found by the search
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


In [32]:

# Persist the tuned model and the fitted preprocessor as two joblib files.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError after the whole grid search has already run.
Path("models").mkdir(parents=True, exist_ok=True)

best_model = grid_search.best_estimator_
joblib.dump(best_model, "models/rental_price_model.joblib")
joblib.dump(preprocessor, "models/preprocessor.joblib")

print("✅ Modèle et préprocesseur sauvegardés avec succès !")

✅ Modèle et préprocesseur sauvegardés avec succès !
