In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder 
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

In [4]:
# Load the training data; the path is relative to the notebook's working
# directory. The last expression previews the first five rows.
df = pd.read_csv(filepath_or_buffer='train_dataset.csv')
df.head(n=5)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Peugeot,162021,90,diesel,black,estate,False,False,False,False,False,False,True,62
1,Audi,153074,160,diesel,black,sedan,False,True,False,False,False,False,True,120
2,Peugeot,145497,235,petrol,black,hatchback,True,True,True,True,True,True,True,147
3,Peugeot,60772,100,diesel,grey,estate,True,True,False,False,False,False,True,115
4,Peugeot,148429,100,diesel,silver,estate,False,True,False,False,False,True,True,102


In [7]:
# Keep only a few columns: five predictors plus the target.
kept_columns = [
    "model_key",
    "mileage",
    "fuel",
    'car_type',
    'has_gps',
    'rental_price_per_day',
]
data_sample = df.loc[:, kept_columns]
data_sample.head()

Unnamed: 0,model_key,mileage,fuel,car_type,has_gps,rental_price_per_day
0,Peugeot,162021,diesel,estate,False,62
1,Audi,153074,diesel,sedan,True,120
2,Peugeot,145497,petrol,hatchback,True,147
3,Peugeot,60772,diesel,estate,True,115
4,Peugeot,148429,diesel,estate,True,102


In [8]:
# Split the sample into the target series Y and the feature frame X.
target_name = 'rental_price_per_day'

print("Separating labels from features...")
Y = data_sample[target_name]
X = data_sample.drop(columns=[target_name])  # every column except the target
print("...Done.")

# Preview each piece, followed by a blank separator line.
for preview in (Y, X):
    print(preview.head())
    print()

Separating labels from features...
...Done.
0     62
1    120
2    147
3    115
4    102
Name: rental_price_per_day, dtype: int64

  model_key  mileage    fuel   car_type  has_gps
0   Peugeot   162021  diesel     estate    False
1      Audi   153074  diesel      sedan     True
2   Peugeot   145497  petrol  hatchback     True
3   Peugeot    60772  diesel     estate     True
4   Peugeot   148429  diesel     estate     True



In [9]:
# Sort the feature columns into numeric vs. categorical so each group can be
# routed to its own preprocessing pipeline.
#
# The dtype is inspected with the pandas type-checking helpers instead of
# looking for the substrings 'int' / 'float' in the dtype name: the substring
# test misses pandas' nullable dtypes (named "Int64" / "Float64", capitalised)
# and wrongly matches "interval[...]" dtypes.
# Boolean columns (e.g. has_gps) are deliberately kept on the categorical
# side, as before, so they are one-hot encoded rather than scaled.
numeric_features = []
categorical_features = []
for column_name, column_dtype in X.dtypes.items():
    is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
    is_boolean = pd.api.types.is_bool_dtype(column_dtype)
    if is_numeric and not is_boolean:
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage']
Found categorical features  ['model_key', 'fuel', 'car_type', 'has_gps']


In [10]:
print("Dividing into train and test sets...")
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from one run to the next.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = split_parts
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [12]:
# Numeric features: standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: one-hot encode, dropping the first level of each
# feature. handle_unknown='ignore' makes the encoder emit an all-zero row
# (with a warning) for a category that was not seen during fit, instead of
# raising at transform time. Without it, a rare model_key that ends up only in
# the test split would crash preprocessor.transform(X_test).
# Combining drop= with handle_unknown='ignore' requires scikit-learn >= 1.0.
categorical_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

# ColumnTransformer routes each column group to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# NOTE: X_train / X_test are rebound to the transformed matrices below (the
# later model cells rely on those names), so re-run the train/test split cell
# before re-running this one.

# Fit the preprocessor on the training set only, then transform it.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()

# Apply the already-fitted preprocessor to the test set (no re-fitting, so no
# information from the test set leaks into the scaling / encoding).
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
       model_key  mileage    fuel   car_type  has_gps
691          BMW   101583  diesel  hatchback    False
2183     Citroën   150355  diesel      sedan     True
416      Citroën    90797  diesel      sedan    False
1226     Renault   163318  diesel     estate     True
217   Volkswagen   109779  diesel  hatchback     True
...Done.
  (0, 0)	-0.6515125283677188
  (0, 2)	1.0
  (0, 33)	1.0
  (1, 0)	0.16665482817603175
  (1, 3)	1.0
  (1, 34)	1.0
  (1, 38)	1.0
  (2, 0)	-0.832451450447619
  (2, 3)	1.0
  (2, 34)	1.0
  (3, 0)	0.38411368659636796
  (3, 21)	1.0
  (3, 32)	1.0
  (3, 38)	1.0
  (4, 0)	-0.5140217620626139
  (4, 26)	1.0
  (4, 33)	1.0
  (4, 38)	1.0

Performing preprocessings on test set...
     model_key  mileage    fuel car_type  has_gps
3680       BMW   120395  diesel      suv     True
3746   Renault   300178  diesel   estate     True
2289    Nissan   145781  diesel      suv     True
2500      Audi   174311  diesel   estate     True
2240   Cit

## 1-Régression linéaire

In [13]:
# Baseline model: linear regression fitted on the preprocessed training set.
print("Train model...")
regressor = LinearRegression().fit(X_train, Y_train)
print("...Done.")

# Predictions on both splits; the evaluation cells below read these names.
Y_train_pred, Y_test_pred = regressor.predict(X_train), regressor.predict(X_test)

Train model...
...Done.


In [14]:
# Report the R^2 score on each split now that predictions are available.
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.5919597797425868
R2 score on test set :  0.5565783380388047


In [15]:
# Test-set RMSE: the square root of the mean squared error.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_test_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 21.760011795232522


## 2-XGBoost

In [16]:
# Gradient-boosted trees on the same preprocessed features.
print("Train model...")
xgb_params = {"learning_rate": 0.1, "max_depth": 6, "n_estimators": 100}
model = XGBRegressor(**xgb_params)
model.fit(X_train, Y_train)
print("...Done.")

# Overwrites the linear-regression predictions with the XGBoost ones.
Y_train_pred, Y_test_pred = model.predict(X_train), model.predict(X_test)

Train model...


  if is_sparse(data):


...Done.


In [17]:
# Report the R^2 score on each split for the predictions currently held in
# Y_train_pred / Y_test_pred.
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.7583849015577286
R2 score on test set :  0.6085900931055708


In [18]:
# Test-set RMSE for the current predictions: square root of the MSE.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_test_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)

RMSE: 20.44403334570687
