In [61]:
import pandas as pd
import numpy as np
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap

import warnings
warnings.filterwarnings("ignore")

In [7]:
data = pd.read_csv('./src/get_around_pricing_project.csv')
data = data.iloc[:,1:]
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [9]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [27]:
numerical_features, categorical_features = [], []
for col in X.select_dtypes('int64'):
    numerical_features.append(col)
for el in X.columns.tolist():
    if el not in numerical_features:
        categorical_features.append(el)

In [29]:
numerical_features

['mileage', 'engine_power']

In [30]:
categorical_features

['model_key',
 'fuel',
 'paint_color',
 'car_type',
 'private_parking_available',
 'has_gps',
 'has_air_conditioning',
 'automatic_car',
 'has_getaround_connect',
 'has_speed_regulator',
 'winter_tires']

In [32]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### Linear Regression

#### Simple Linear Regression

In [70]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [72]:
train_pred_lr = lr.predict(X_train)
test_pred_lr = lr.predict(X_test)

print("r2-score sur le train set : ", r2_score(y_train, train_pred_lr))
print("r2-score sur le test set : ", r2_score(y_test, test_pred_lr))

r2-score sur le train set :  0.7146287522065222
r2-score sur le test set :  0.6993567963152809


#### RandomSearchCV avec RidgeRegression

In [62]:
space = dict()
space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
space['alpha'] = loguniform(1e-5, 100)
space['fit_intercept'] = [True, False]
space['normalize'] = [True, False]

In [76]:
ridge = Ridge()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [66]:
search = RandomizedSearchCV(ridge, space, n_iter=500, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)

In [67]:
result = search.fit(X_train, y_train)

In [68]:
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -12.086055979458338
Best Hyperparameters: {'alpha': 1.2586407337959669, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


In [69]:
train_pred_rs = result.predict(X_train)
test_pred_rs = result.predict(X_test)

print("r2-score sur le train set : ", r2_score(y_train, train_pred_rs))
print("r2-score sur le test set : ", r2_score(y_test, test_pred_rs))

r2-score sur le train set :  0.7133055863203008
r2-score sur le test set :  0.6986414063517947


#### GridSearchCV avec RidgeRegression

In [74]:
space = dict()
space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
space['alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
space['fit_intercept'] = [True, False]
space['normalize'] = [True, False]

In [77]:
search_cv = GridSearchCV(ridge, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)

In [78]:
result_cv = search_cv.fit(X_train, y_train)

In [80]:
print('Best Score: %s' % result_cv.best_score_)
print('Best Hyperparameters: %s' % result_cv.best_params_)

Best Score: -12.08630378236768
Best Hyperparameters: {'alpha': 1, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


In [81]:
train_pred_gs = result_cv.predict(X_train)
test_pred_gs = result_cv.predict(X_test)

print("r2-score sur le train set : ", r2_score(y_train, train_pred_gs))
print("r2-score sur le test set : ", r2_score(y_test, test_pred_gs))

r2-score sur le train set :  0.7135341776064981
r2-score sur le test set :  0.6986863364399198


#### Gradient Boosting Regressor

In [83]:
GBR = GradientBoostingRegressor()

In [85]:
parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
              'subsample'    : [0.9, 0.5, 0.2, 0.1],
              'n_estimators' : [100,500,1000, 1500],
              'max_depth'    : [4,6,8,10]
                 }

In [None]:
grid_GBR = GridSearchCV(estimator=GBR, param_grid=parameters, cv=2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

In [None]:
print('Best Estimator : %s' % grid_GBR.best_estimator_)
print('Best Score: %s' % grid_GBR.best_score_)
print('Best Hyperparameters: %s' % grid_GBR.best_params_)

In [None]:
col_names = list(preprocessor.transformers_[0][1].get_feature_names_out()) + list(preprocessor.transformers_[1][1].get_feature_names_out())
feat_importances = pd.Series(np.abs(linear.coef_), index=col_names).sort_values(ascending=False)[:20] # limiting to 20 features
plt.figure(figsize=(10,10))
feat_importances.sort_values().plot(kind='barh')
plt.show() 