In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor

import joblib

In [23]:
df= pd.read_csv('train_dataset.csv')
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Peugeot,162021,90,diesel,black,estate,False,False,False,False,False,False,True,62
1,Audi,153074,160,diesel,black,sedan,False,True,False,False,False,False,True,120
2,Peugeot,145497,235,petrol,black,hatchback,True,True,True,True,True,True,True,147
3,Peugeot,60772,100,diesel,grey,estate,True,True,False,False,False,False,True,115
4,Peugeot,148429,100,diesel,silver,estate,False,True,False,False,False,True,True,102


In [24]:
# Separate target variable Y from features X
target_name = 'rental_price_per_day'

print("Separating labels from features...")
Y = df.loc[:,target_name]
X = df.drop(target_name, axis = 1) # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()

Separating labels from features...
...Done.
0     62
1    120
2    147
3    115
4    102
Name: rental_price_per_day, dtype: int64

  model_key  mileage  engine_power    fuel paint_color   car_type  \
0   Peugeot   162021            90  diesel       black     estate   
1      Audi   153074           160  diesel       black      sedan   
2   Peugeot   145497           235  petrol       black  hatchback   
3   Peugeot    60772           100  diesel        grey     estate   
4   Peugeot   148429           100  diesel      silver     estate   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                      False    False                 False          False   
1                      False     True                 False          False   
2                       True     True                  True           True   
3                       True     True                 False          False   
4                      False     True                 False     

In [25]:
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [26]:
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [27]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(drop='first'))])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) 
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:]) 
print()

Performing preprocessings on train set...
       model_key  mileage  engine_power    fuel paint_color   car_type  \
691          BMW   101583            85  diesel       black  hatchback   
2183     Citroën   150355           120  diesel       black      sedan   
416      Citroën    90797           135  diesel        blue      sedan   
1226     Renault   163318            85  diesel        blue     estate   
217   Volkswagen   109779           100  diesel        grey  hatchback   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
691                       False    False                 False          False   
2183                      False     True                 False          False   
416                       False    False                 False          False   
1226                       True     True                 False          False   
217                        True     True                 False          False   

      has_getaround_connec

In [28]:
#joblib.dump(preprocessor, 'preprocessor.joblib')

['preprocessor.joblib']

In [29]:
print("Train model...")
model = XGBRegressor()
model.fit(X_train, Y_train)
print("...Done.")

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

Train model...


  if is_sparse(data):


...Done.


In [30]:
# Print R^2 scores, quand on a fait une prédiction
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9572249450857621
R2 score on test set :  0.7632108037253089


In [31]:
mse = mean_squared_error(Y_test, Y_test_pred)
# Calculez la RMSE en prenant la racine carrée du MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 15.901262808774769


Gridsearch

In [32]:
# Perform grid search
print("Grid search...")
model = XGBRegressor()

# Grid of values to be tested
params = {
    'max_depth': [2, 4, 6],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300] 
}
print(params)
gridsearch = GridSearchCV(model, param_grid = params, cv = 3)
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, Y_test))

Y_test_pred = gridsearch.predict(X_test)
r2 = r2_score(Y_test, Y_test_pred)
print("R2 score on test set : ", r2)

rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
print("RMSE on test set : ", rmse)

Grid search...
{'max_depth': [2, 4, 6], 'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 200, 300]}


  if is_sparse(data):


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

...Done.
Best hyperparameters :  {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Best validation accuracy :  0.7250354925152985

Accuracy on training set :  0.8883924883488149
Accuracy on test set :  0.7687543334373695
R2 score on test set :  0.7687543334373695
RMSE on test set :  15.7140262848919


In [33]:
print("Train model...")
model = XGBRegressor(learning_rate= 0.1, max_depth= 6, n_estimators= 100)
model.fit(X_train, Y_train)
print("...Done.")

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

Train model...


  if is_sparse(data):


...Done.


In [34]:
# Print R^2 scores, quand on a fait une prédiction
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.8883924883488149
R2 score on test set :  0.7687543334373695


In [35]:
mse = mean_squared_error(Y_test, Y_test_pred)
# Calculez la RMSE en prenant la racine carrée du MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 15.7140262848919


RandomizedSearchCV


In [36]:
param_dist = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300, 400],
}

# Créez un modèle (par exemple, XGBoost)
model = XGBRegressor()

# Créez un objet RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, scoring='r2', cv=5)

# Effectuez la recherche aléatoire
random_search.fit(X_train, Y_train)

# Obtenez les meilleurs hyperparamètres
print("...Done.")
print("Best hyperparameters : ", random_search.best_params_)
print("Best validation accuracy : ", random_search.best_score_)
print()
print("Accuracy on training set : ", random_search.score(X_train, Y_train))
print("Accuracy on test set : ", random_search.score(X_test, Y_test))

Y_test_pred = random_search.predict(X_test)
r2 = r2_score(Y_test, Y_test_pred)
print("R2 score on test set : ", r2)

rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
print("RMSE on test set : ", rmse)

  if is_sparse(data):


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

...Done.
Best hyperparameters :  {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.1}
Best validation accuracy :  0.7405378495192936

Accuracy on training set :  0.9309572057479581
Accuracy on test set :  0.7875471858512175
R2 score on test set :  0.7875471858512175
RMSE on test set :  15.06197484024119


In [37]:
print("Train model...")
model = XGBRegressor(learning_rate= 0.2, max_depth= 3, n_estimators= 200)
model.fit(X_train, Y_train)
print("...Done.")

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

Train model...


  if is_sparse(data):


...Done.


In [38]:
# Print R^2 scores, quand on a fait une prédiction
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.8390515885312032
R2 score on test set :  0.7716044913066795


In [39]:
mse = mean_squared_error(Y_test, Y_test_pred)
# Calculez la RMSE en prenant la racine carrée du MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 15.616886482384258


In [40]:
#joblib.dump(model, "xgboost.joblib")