In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV

In [5]:
df= pd.read_csv('train_dataset.csv')
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Peugeot,162021,90,diesel,black,estate,False,False,False,False,False,False,True,62
1,Audi,153074,160,diesel,black,sedan,False,True,False,False,False,False,True,120
2,Peugeot,145497,235,petrol,black,hatchback,True,True,True,True,True,True,True,147
3,Peugeot,60772,100,diesel,grey,estate,True,True,False,False,False,False,True,115
4,Peugeot,148429,100,diesel,silver,estate,False,True,False,False,False,True,True,102


In [6]:
# Separate target variable Y from features X
target_name = 'rental_price_per_day'

print("Separating labels from features...")
Y = df.loc[:,target_name]
X = df.drop(target_name, axis = 1) # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()


Separating labels from features...
...Done.
0     62
1    120
2    147
3    115
4    102
Name: rental_price_per_day, dtype: int64

  model_key  mileage  engine_power    fuel paint_color   car_type  \
0   Peugeot   162021            90  diesel       black     estate   
1      Audi   153074           160  diesel       black      sedan   
2   Peugeot   145497           235  petrol       black  hatchback   
3   Peugeot    60772           100  diesel        grey     estate   
4   Peugeot   148429           100  diesel      silver     estate   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                      False    False                 False          False   
1                      False     True                 False          False   
2                       True     True                  True           True   
3                       True     True                 False          False   
4                      False     True                 False     

In [7]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [9]:
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [11]:
#Pipeline
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])
print()



Performing preprocessings on train set...
       model_key  mileage  engine_power    fuel paint_color    car_type  \
2592     Citroën   221500           155  petrol        grey       coupe   
3614  Mitsubishi   148480           155  diesel      silver         suv   
322   Mitsubishi    95657           190  diesel      silver         suv   
1560      Nissan   132448           135  diesel       black         suv   
1225         BMW   190627            85  diesel       black  subcompact   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
2592                       True     True                  True          False   
3614                       True     True                  True           True   
322                        True     True                 False           True   
1560                      False     True                 False          False   
1225                      False     True                 False          False   

      has_getaround_

...Done.
  (0, 0)	1.3377341433028123
  (0, 1)	0.6547007260248097
  (0, 4)	1.0
  (0, 31)	1.0
  (0, 36)	1.0
  (0, 41)	1.0
  (0, 48)	1.0
  (0, 49)	1.0
  (0, 50)	1.0
  (0, 52)	1.0
  (0, 53)	1.0
  (0, 54)	1.0
  (1, 0)	0.11285280139319316
  (1, 1)	0.6547007260248097
  (1, 16)	1.0
  (1, 39)	1.0
  (1, 46)	1.0
  (1, 48)	1.0
  (1, 49)	1.0
  (1, 50)	1.0
  (1, 51)	1.0
  (1, 52)	1.0
  (1, 54)	1.0
  (2, 0)	-0.7732319305116523
  (2, 1)	1.552678903165439
  (2, 16)	1.0
  (2, 39)	1.0
  (2, 46)	1.0
  (2, 48)	1.0
  (2, 49)	1.0
  (2, 51)	1.0
  (2, 53)	1.0
  (2, 54)	1.0
  (3, 0)	-0.15607759676477745
  (3, 1)	0.14157033908730707
  (3, 17)	1.0
  (3, 32)	1.0
  (3, 46)	1.0
  (3, 49)	1.0
  (3, 53)	1.0
  (3, 54)	1.0
  (4, 0)	0.8198518964009269
  (4, 1)	-1.1412556282564494
  (4, 3)	1.0
  (4, 32)	1.0
  (4, 45)	1.0
  (4, 49)	1.0
  (4, 54)	1.0

Performing preprocessings on test set...
     model_key  mileage  engine_power    fuel paint_color   car_type  \
3317   Peugeot   199074           100  diesel       black     

In [13]:
# Train model 
model = RandomForestRegressor()

print("Training model...")
model.fit(X_train, Y_train)
print("...Done.")

Training model...
...Done.


In [14]:
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

In [17]:
# Print R^2 scores, quand on a fait une prédiction
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9687632551271089
R2 score on test set :  -0.6055560729067533


In [20]:
mse = mean_squared_error(Y_test, Y_test_pred)
# Calculez la RMSE en prenant la racine carrée du MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 40.46012937328829


cross validation

In [24]:
model = RandomForestRegressor()

# cross val
scores = cross_val_score(model, X_train, Y_train, cv=3, scoring='r2')

print("Scores R2 pour chaque cv :", scores)

# Affichez la moyenne des scores
print("Moyenne des scores R2 :", scores.mean())

Scores R2 pour chaque cv : [0.78553928 0.75595335 0.75893568]
Moyenne des scores R2 : 0.7668094347487173


In [25]:
model = RandomForestRegressor()

# calcul rmse
rmse_scorer = make_scorer(lambda y, y_pred: mean_squared_error(y, y_pred, squared=False), greater_is_better=False)

# coss val
scores = cross_val_score(model, X_train, Y_train, cv=3, scoring=rmse_scorer)

abs_scores = abs(scores)

print("Scores RMSE pour chaque cv :", abs_scores)

# Affichez la moyenne des scores RMSE
print("Moyenne des scores RMSE :", abs_scores.mean())

Scores RMSE pour chaque cv : [14.61935117 15.5062888  15.82595711]
Moyenne des scores RMSE : 15.317199024326536


Gridsearch

In [27]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_depth': [None, 10, 20],
    'max_leaf_nodes': [2, 5 ,10]} 

# Instantiating Decision Tree classifier
forest = RandomForestRegressor()

# Instantiating RandomizedSearchCV object
forest_cv = GridSearchCV(forest,
                           param_grid=param_grid)

forest_cv.fit(X_train, Y_train)

# Print the tuned parameters and score
print("Tuned Random Forest Parameters: {}".format(forest_cv.best_params_))
print("Best score is {}".format(forest_cv.best_score_))

#Print Accuracy 
print("Accuracy on training set : ", forest_cv.score(X_train, Y_train))
print("Accuracy on test set : ", forest_cv.score(X_test, Y_test))

Tuned Random Forest Parameters: {'max_depth': 20, 'max_leaf_nodes': 10, 'n_estimators': 50}
Best score is 0.6269109054546671
Accuracy on training set :  0.6360572669284132
Accuracy on test set :  -0.7370189356137276


Le modèle random forest overfit.