__`NOTEBOOK START`__

In [102]:
print('>>>> 📚 Importing librairies...')

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

import plotly.graph_objs as go

import joblib
print('>>>> ✅ ...Done')

>>>> 📚 Importing librairies...
>>>> ✅ ...Done


---------
### __1. Data Preprocessing__
---------

In [103]:
print('>>>> 📊 Loading dataset...')
df_price = pd.read_pickle('price_analysis_df_clean.csv')
print('>>>> ✅ ...Done')
print()

>>>> 📊 Loading dataset...
>>>> ✅ ...Done



### Separating the target variable from features

In [104]:
# Separate target variable Y from features X
target_name = 'rental_price_per_day'

print('>>>> 💔 Separating labels from features...')
Y = df_price.loc[:,target_name]
X = df_price.drop(target_name, axis = 1) # All columns are kept, except the target
print('--------------------------------')
print ('Target Value (Y): rental_price_per_day')
display(Y.head())
print('--------------------------------')
print ('Features Values (X):')
display(X.head())
print('>>>> ✅ ...Done')


>>>> 💔 Separating labels from features...
--------------------------------
Target Value (Y): rental_price_per_day


0    106
2    101
3    158
4    183
5    131
Name: rental_price_per_day, dtype: int64

--------------------------------
Features Values (X):


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True


>>>> ✅ ...Done


### Preprocessing pipelines

In [105]:
# First : always divide dataset into train set & test set !!
print('>>>> 🟨 Dividing into train and test sets...')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# test_size indicates the proportion of rows from X and Y that will go into the test dataset while 
# maintaining the correspondance between the rows from X and Y 
# random_state is given a value the results will be the same everytime the cell runs

print('>>>> ✅ ...Done')

>>>> 🟨 Dividing into train and test sets...
>>>> ✅ ...Done


In [106]:
print('>>>> 🟨 Creating pipeline for numeric features...')

numeric_features = ['mileage', 'engine_power'] # Names of numeric columns in X_train/X_test

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

print('>>>> ✅ ...Done')

>>>> 🟨 Creating pipeline for numeric features...
>>>> ✅ ...Done


In [107]:
print('>>>> 🟨 Creating pipeline for categorical features...')

categorical_features = ['model_key', 'fuel', 'paint_color', 'car_type', 
                        'private_parking_available', 'has_gps', 'has_air_conditioning', 
                        'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires'] # Names of categorical columns in X_train/X_test

categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

print('>>>> ✅ ...Done')

>>>> 🟨 Creating pipeline for categorical features...
>>>> ✅ ...Done


In [108]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [109]:
# Preprocessings on train set
print()
print('>>>> 🟨 Performing preprocessings on train set...')
print()
display(X_train.head())
print()

print('Preprocessings on train set result: ')
X_train = preprocessor.fit_transform(X_train)
display(X_train[0:5]) # must use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
print('>>>> ✅ ...Done')


>>>> 🟨 Performing preprocessings on train set...



Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
362,Renault,123353,135,diesel,silver,estate,False,True,False,False,True,False,True
1427,Peugeot,230784,100,diesel,black,estate,False,False,False,False,True,False,True
442,Peugeot,126591,105,diesel,black,estate,False,True,False,False,False,False,True
4740,Mitsubishi,195769,173,diesel,black,suv,True,True,False,False,True,False,True
4377,BMW,113722,120,diesel,silver,suv,True,True,False,True,True,False,True



Preprocessings on train set result: 


<5x41 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>


>>>> ✅ ...Done


In [110]:
# Preprocessings on test set
print()
print('>>>> 🟨 Performing preprocessings on test set...')
print()
display(X_test.head())

print('Preprocessings on test set result: ')
X_test = preprocessor.transform(X_test) 
# We don't fit again. The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all our results.

display(X_test[0:5,:]) # must use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
print('>>>> ✅ ...Done')
print()


>>>> 🟨 Performing preprocessings on test set...



Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
3957,BMW,152889,140,diesel,white,suv,False,False,False,False,False,False,False
4091,Nissan,136760,120,diesel,grey,suv,False,True,False,False,False,False,True
2446,Mercedes,54239,100,diesel,brown,hatchback,True,True,False,False,False,True,True
1047,Peugeot,151787,105,diesel,grey,estate,False,False,False,False,False,False,True
3423,Peugeot,109236,100,diesel,black,sedan,False,True,False,False,False,False,True


Preprocessings on test set result: 


<5x41 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>


>>>> ✅ ...Done



--------
### __2. Baseline Model: Linear Regression__
--------

### Model trainning and predictions

In [111]:
# Train model
print('>>>> 🟨 Trainning model...')
regressor_1 = LinearRegression()
regressor_1.fit(X_train, Y_train)
print('>>>> ✅ ...Done')


>>>> 🟨 Trainning model...
>>>> ✅ ...Done


In [112]:
# Predictions on training set
print('>>>> 🟨 Predictions results on training set...')
print()
Y_train_pred = regressor_1.predict(X_train)
print(Y_train_pred)
print()
print('>>>> ✅ ...Done')

>>>> 🟨 Predictions results on training set...

[126.73462693  77.76373351 108.02237294 ...  94.11075948  92.99349711
 114.20115666]

>>>> ✅ ...Done


In [113]:
# Predictions on test set
print('>>>> 🟨 Predictions results on test set...')
print()
Y_test_pred = regressor_1.predict(X_test)
print(Y_test_pred)
print()
print('>>>> ✅ ...Done')

>>>> 🟨 Predictions results on test set...

[120.3387623  112.58845075 143.80004873  89.35555722 115.94159855
 120.5327197   56.63478653 108.59601357 130.53502769 112.11977559
 108.23287862  98.51255562 144.62129164 174.91774079  95.28509097
 106.13173344 122.80080079 115.91501037 146.94349364 104.18947823
 110.48264167 102.71182078 101.53114591 124.19432706 156.18863745
 148.99674627 154.61452841  98.81706332 127.20367943 125.69911203
 111.09097952 115.6241209  117.67476194 111.49719799  90.81999926
 110.50312073 125.39814898 127.64940487 123.91491229 118.58474374
 171.75539921 100.36898478 115.3791007  132.76753969 152.45274093
 119.04993242  85.59286238 158.25637747  75.80184616 124.92459926
 136.61524034 115.20932581  78.55692956 111.56375841 122.53654583
  95.28895879 120.17801746 103.55480085 149.78233483 190.89334762
 130.33376182 141.92047664 126.71798814 117.91567357  99.04295859
 102.58595294  98.90681238 103.55714896 116.69798175 123.62130855
  98.99453963 194.68237378 107.35

--------
### __3. Evaluating the model: Performance Metrics & Coefficients__
--------

In [114]:
# Print R^2 scores
print('R2 score on training set : ', r2_score(Y_train, Y_train_pred).round(5))
print('R2 score on test set : ', r2_score(Y_test, Y_test_pred).round(5))

R2 score on training set :  0.70505
R2 score on test set :  0.71318


In [115]:
# Calculate the mean squared error of the model on the train and test sets using scikit-learn's mean_squared_error function
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)

# Print the results
print(f'MSE on training set: {train_mse:.2f}')
print(f'MSE on test set: {test_mse:.2f}')

MSE on training set: 260.47
MSE on test set: 258.76


In [116]:
actual_vs_predicted_results = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_test_pred})
actual_vs_predicted_results.head()

Unnamed: 0,Actual,Predicted
3957,151,120.338762
4091,116,112.588451
2446,140,143.800049
1047,91,89.355557
3423,110,115.941599


In [117]:
# we get to compare the actual average of our target 'Weekly_Sales' with the average our model predicted. 
print(f'Actual average of our target Rental Price Per Day: {round(Y.mean(), 2)}')
print(f'Predicted average Rental Price Per Day: {round(Y_test_pred.mean(), 2)}')

Actual average of our target Rental Price Per Day: 120.25
Predicted average Rental Price Per Day: 120.35


In [118]:
print(f'The variance score is: {regressor_1.score(X_test,Y_test)}')

The variance score is: 0.7131808619558895


In [119]:
feature_importance = list(zip(X.columns.to_list(), regressor_1.coef_))

feature_importance = pd.DataFrame(feature_importance, columns=['Features','Coefficients'])

feature_importance['Coefficients'] = abs(feature_importance['Coefficients'])

feature_importance = feature_importance.sort_values(by = 'Coefficients', ascending = False)

feature_importance

Unnamed: 0,Features,Coefficients
10,has_getaround_connect,17.895859
9,automatic_car,15.242424
8,has_air_conditioning,14.066157
0,model_key,11.992298
1,mileage,11.727303
6,private_parking_available,10.973355
4,paint_color,8.987538
7,has_gps,8.661069
3,fuel,7.563708
2,engine_power,6.280387


In [120]:
df_features = pd.DataFrame(feature_importance, columns=['Features','Coefficients'])

fig = go.Figure(
    data=[
        go.Bar(
            x= df_features['Coefficients'],
            y= df_features['Features'],
            orientation='h'
            )])

fig.update_layout(
    title='Features Importance',
    xaxis_title='Coefficients', 
    yaxis_title='Features')

fig.show()

In [121]:
joblib.dump(preprocessor, './Preprocessor.joblib')
joblib.dump(regressor_1, './Linear_Regression_Model.joblib')

['./LinearRegressionModel.joblib']

--------

# <center>__TESTING REGULARIZED LINEAR REGRESSION MODEL: RIDGE__</center>

--------

#### __SECTION PLAN__ 🎯

1. CROSS-VALIDATED SCORE FOR A RIDGE MODEL (with default value of λλ)
2. GRID SEARCH: TUNNING λλ  AND NUMBER OF FOLDS
3. EVALUATING THE MODEL: PERFORMANCE METRICS

--------
### __1. CROSS-VALIDATED SCORE FOR A RIDGE MODEL (with default value of λλ)__
--------

In [122]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
print("3-fold cross-validation...")
regressor = Ridge()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)

print('The cross-validated R2-score is: ', scores.mean().round(5))
print('The standard deviation is: ', scores.std().round(5))


3-fold cross-validation...
The cross-validated R2-score is:  0.69449
The standard deviation is:  0.01911


--------
### __2. GRID SEARCH: TUNNING λλ  AND NUMBER OF FOLDS__
--------

In [123]:
# Perform grid search
print('>>>> 🟨 Grid search...')
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1, 0.5, 1] # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print('Best hyperparameters : ', gridsearch.best_params_)
print('Best R2 score: ', gridsearch.best_score_.round(5))
print('>>>> ✅ ...Done')


>>>> 🟨 Grid search...
Best hyperparameters :  {'alpha': 1}
Best R2 score:  0.69653
>>>> ✅ ...Done


--------
### __3. EVALUATING THE MODEL: PERFORMANCE METRICS__
--------

In [124]:
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train).round(5))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test).round(5))
print('-----------------------------------')
#Calculate MSE on training set
predictions_train = gridsearch.predict(X_train)
mse_train = mean_squared_error(Y_train, predictions_train)

# Calculate MSE on test set
predictions_test = gridsearch.predict(X_test)
mse_test = mean_squared_error(Y_test, predictions_test)

print("MSE on training set: ", mse_train)
print("MSE on test set: ", mse_test)

R2 score on training set :  0.70496
R2 score on test set :  0.71332
-----------------------------------
MSE on training set:  260.55257084429735
MSE on test set:  258.6377724017892


--------

# <center>__TESTING REGULARIZED LINEAR REGRESSION MODEL: LASSO__</center>
<center><i>Least Absolute Shrinkage and Selection Operator</i></center>

--------

#### __SECTION PLAN__ 🎯

1. CROSS-VALIDATED SCORE FOR A LASSO MODEL (with default value of λλ)
2. GRID SEARCH: TUNNING λλ  AND NUMBER OF FOLDS
3. EVALUATING THE MODEL: PERFORMANCE METRICS

--------
### __1. CROSS-VALIDATED SCORE FOR A LASSO MODEL (with default value of λλ)__
--------

In [125]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
print("3-fold cross-validation...")
regressor = Lasso()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)

print('The cross-validated R2-score is: ', scores.mean().round(5))
print('The standard deviation is: ', scores.std().round(5))


3-fold cross-validation...
The cross-validated R2-score is:  0.61288
The standard deviation is:  0.0155


--------
### __2. GRID SEARCH: TUNNING λλ AND NUMBER OF FOLDS__
--------

In [126]:
# Perform grid search
print('>>>> 🟨 Grid search...')
regressor = Lasso()

# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1, 0.5, 1] # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_.round(5))
print('--------------------------------------------------------------')
print('The cross-validated R2-score is: ', scores.mean().round(5))
print('The standard deviation is: ', scores.std().round(5))
print('>>>> ✅ ...Done')

>>>> 🟨 Grid search...



With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator


Objective did not converge. You might want to increase the number of iterations. Duality gap: 379423.5446138846, tolerance: 264.2322651535381


With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator


Objective did not converge. You might want to increase the number of iterations. Duality gap: 384312.3460010405, tolerance: 264.07728528037387


With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator


Objective did not converge. You might want to increase the number of iterations. Duality gap: 401322.8742332639, tolerance: 266.28192052736983


With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator


Objective did not converge. You might want to increase the number of iterations. Duality gap: 384874.9517824652, tolerance: 260.6

Best hyperparameters :  {'alpha': 0.0}
Best R2 score :  0.6964
--------------------------------------------------------------
The cross-validated R2-score is:  0.61288
The standard deviation is:  0.0155
>>>> ✅ ...Done



Objective did not converge. You might want to increase the number of iterations. Duality gap: 487724.8065426045, tolerance: 330.71937303070763



--------
### __3. EVALUATING THE MODEL: PERFORMANCE METRICS__
--------

In [127]:
print('-----------------------------------')
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train).round(5))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test).round(5))
print('-----------------------------------')
#Calculate MSE on training set
predictions_train = gridsearch.predict(X_train)
mse_train = mean_squared_error(Y_train, predictions_train)

# Calculate MSE on test set
predictions_test = gridsearch.predict(X_test)
mse_test = mean_squared_error(Y_test, predictions_test)

print("MSE on training set: ", mse_train)
print("MSE on test set: ", mse_test)

-----------------------------------
R2 score on training set :  0.70505
R2 score on test set :  0.71318
-----------------------------------
MSE on training set:  260.46718640459403
MSE on test set:  258.75798374314354


__`NOTEBOOK END`__