In [12]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from mlxtend.feature_selection import  SequentialFeatureSelector
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import json

In [2]:
# Read the training data from its CSV file and preview the first rows.
csv_path = 'train_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Peugeot,162021,90,diesel,black,estate,False,False,False,False,False,False,True,62
1,Audi,153074,160,diesel,black,sedan,False,True,False,False,False,False,True,120
2,Peugeot,145497,235,petrol,black,hatchback,True,True,True,True,True,True,True,147
3,Peugeot,60772,100,diesel,grey,estate,True,True,False,False,False,False,True,115
4,Peugeot,148429,100,diesel,silver,estate,False,True,False,False,False,True,True,102


In [6]:
# Split the dataset into the label series (Y) and the feature frame (X).
target_name = 'rental_price_per_day'

print("Separating labels from features...")
Y = df[target_name]
X = df.drop(columns=[target_name])  # every column except the target stays a feature
print("...Done.")

# Preview both pieces, each one followed by an empty line.
for preview in (Y.head(), X.head()):
    print(preview)
    print()

Separating labels from features...
...Done.
0     62
1    120
2    147
3    115
4    102
Name: rental_price_per_day, dtype: int64

  model_key  mileage  engine_power    fuel paint_color   car_type  \
0   Peugeot   162021            90  diesel       black     estate   
1      Audi   153074           160  diesel       black      sedan   
2   Peugeot   145497           235  petrol       black  hatchback   
3   Peugeot    60772           100  diesel        grey     estate   
4   Peugeot   148429           100  diesel      silver     estate   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                      False    False                 False          False   
1                      False     True                 False          False   
2                       True     True                  True           True   
3                       True     True                 False          False   
4                      False     True                 False     

In [4]:
# Sort every feature column into "numeric" or "categorical" based on its dtype.
# The dtype is checked with pandas' type predicates instead of searching the
# dtype's name for 'int' / 'float': that name search is case-sensitive (so
# nullable 'Int64' / 'Float64' columns are missed) and it also matches
# unrelated names such as 'interval[int64, right]'.
numeric_features = []
categorical_features = []
for column_name, dtype in X.dtypes.items():
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        numeric_features.append(column_name)
    else:
        # Anything that is neither integer nor float (strings, booleans, ...)
        # is treated as categorical.
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [8]:
# Numeric columns: pass through a StandardScaler.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: one-hot encode, dropping the first level of each feature.
# handle_unknown='ignore' makes transform() encode a category that was absent
# from the fitted (training) rows as all zeros instead of raising an error;
# without it, a rare level that only appears in the test split crashes the
# test-set transform below.
categorical_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

# Use ColumnTransformer to route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training rows only, then transform them.
# NOTE: X_train / X_test are overwritten with the transformed matrices, so
# re-running this cell on its own fails (the transformed X_train no longer
# has .head()); re-run the train/test split cell first.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()

# Apply the already-fitted preprocessor to the test rows (transform only,
# no refitting).
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
       model_key  mileage  engine_power    fuel paint_color   car_type  \
691          BMW   101583            85  diesel       black  hatchback   
2183     Citroën   150355           120  diesel       black      sedan   
416      Citroën    90797           135  diesel        blue      sedan   
1226     Renault   163318            85  diesel        blue     estate   
217   Volkswagen   109779           100  diesel        grey  hatchback   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
691                       False    False                 False          False   
2183                      False     True                 False          False   
416                       False    False                 False          False   
1226                       True     True                 False          False   
217                        True     True                 False          False   

      has_getaround_connec

In [29]:
# Fit a linear regression on the preprocessed training data.
print("Train model...")
regressor = LinearRegression().fit(X_train, Y_train)
print("...Done.")

# Predict on both splits so the scores below can be compared.
Y_train_pred, Y_test_pred = regressor.predict(X_train), regressor.predict(X_test)

# Report the R² score of each split under its own label.
for label, y_true, y_pred in (
    ("Précision avec les caractéristiques sélectionnées sur le train:", Y_train, Y_train_pred),
    ("Précision avec les caractéristiques sélectionnées sur le test:", Y_test, Y_test_pred),
):
    print(label, r2_score(y_true, y_pred))

# RMSE on the test split: the square root of the mean squared error.
mse = mean_squared_error(Y_test, Y_test_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)

Train model...
...Done.
Précision avec les caractéristiques sélectionnées sur le train: 0.7163708659988113
Précision avec les caractéristiques sélectionnées sur le test: 0.6847695196451195
RMSE: 18.346987946351522


In [30]:
# Show the fitted model's raw coefficient array (values only, no feature names).
print(regressor.coef_)

[-12.70490883  13.77140952   9.80256016   6.82026859   3.9756579
  19.55934154 -35.17593707 -17.66712441 -96.47701384   9.29321818
   0.62114667   9.09301542  31.57102763 -43.1031578   22.96904279
  -6.97894133  21.14734397  -0.51029616  24.2447601  -10.81202494
   8.07446264 -26.0700747   15.51871438  30.31765374  23.3385993
  36.01800246  41.31831861  23.72874936  45.73958542  58.02374864
  72.37871706 -16.62196591   1.91765336  -1.36791798   3.65078957
 -24.03701479   0.38932808  -1.83956348   4.94402122  -2.47394625
   5.35358178   2.80554538 -11.01901565  -9.30098277  -5.09345288
  -6.47238929   1.31804163 -35.2387364    1.42625968  12.36147155
   1.09554539   4.94790087   5.47214543   5.49884968  -6.63991752]


In [31]:
# Rebuild the list of output column names by walking the fitted transformers
# in the order stored in preprocessor.transformers_: the 'num' entry
# contributes its input column names unchanged, any other entry contributes
# the names generated by its 'encoder' step.
column_names = []
for step_name, fitted_step, input_columns in preprocessor.transformers_:
    if step_name != 'num':
        generated = fitted_step.named_steps['encoder'].get_feature_names_out()
        column_names += list(generated)
        continue
    column_names += list(input_columns)

print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['mileage', 'engine_power', 'model_key_Audi', 'model_key_BMW', 'model_key_Citroën', 'model_key_Ferrari', 'model_key_Fiat', 'model_key_Ford', 'model_key_Honda', 'model_key_KIA Motors', 'model_key_Lamborghini', 'model_key_Lexus', 'model_key_Maserati', 'model_key_Mazda', 'model_key_Mercedes', 'model_key_Mini', 'model_key_Mitsubishi', 'model_key_Nissan', 'model_key_Opel', 'model_key_PGO', 'model_key_Peugeot', 'model_key_Porsche', 'model_key_Renault', 'model_key_SEAT', 'model_key_Subaru', 'model_key_Suzuki', 'model_key_Toyota', 'model_key_Volkswagen', 'model_key_Yamaha', 'fuel_electro', 'fuel_hybrid_petrol', 'fuel_petrol', 'paint_color_black', 'paint_color_blue', 'paint_color_brown', 'paint_color_green', 'paint_color_grey', 'paint_color_orange', 'paint_color_red', 'paint_color_silver', 'paint_color_white', 'car_type_coupe', 'car_type_estate', 'car_type_hatchback', 'car_type_sedan', 'car_type_subcompact', 'car_type_suv', 'car_type_van', 'p

In [32]:
# One-column table pairing each reconstructed column name with its coefficient.
coefs = pd.DataFrame(
    regressor.coef_.T,
    index=column_names,
    columns=["coefficients"],
)
coefs

Unnamed: 0,coefficients
mileage,-12.704909
engine_power,13.77141
model_key_Audi,9.80256
model_key_BMW,6.820269
model_key_Citroën,3.975658
model_key_Ferrari,19.559342
model_key_Fiat,-35.175937
model_key_Ford,-17.667124
model_key_Honda,-96.477014
model_key_KIA Motors,9.293218
