# Lasso Regression - Regularization:

In this notebook, a "Lasso" model was created, and GridSearchCV was used to tune alpha "Hyperparameter"

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

Different brands and models were concatenated to one dataframe

In [2]:
# Read and concatenate all CSV's

path = 'https://bucketmarialara.s3.us-east-2.amazonaws.com/' # use your s3 bucket path ( the common path where you uploaded all csv files
li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]
csv_names = ["audi", "bmw", "ford", "hyundi", "merc", "skoda","toyota", "vw"]
for i in range(0,8):
    filename = path + csv_names[i] + ".csv"
    print(filename)
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brands[i]
    li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)
frame


https://bucketmarialara.s3.us-east-2.amazonaws.com/audi.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/bmw.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/ford.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/hyundi.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/merc.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/skoda.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/toyota.csv
https://bucketmarialara.s3.us-east-2.amazonaws.com/vw.csv


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [3]:
frame_filtered = frame.loc[frame['year'] != 2060]
frame_filtered

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [4]:
# identified outliers
index_list = [9434,10109,7221,7845,17753,14988,14306,33361,22488,43661,44279,62386]
#outliers
Filter_df  = frame_filtered[frame_filtered.index.isin(index_list)]
Filter_df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
7221,A5,2020,59995,Semi-Auto,2000,Diesel,150,40.9,3.0,Audi
7845,A6,2018,59950,Automatic,22000,Petrol,145,29.4,4.0,Audi
9434,A8,2020,78990,Automatic,250,Diesel,145,39.2,3.0,Audi
10109,S3,2003,4990,Manual,106000,Petrol,325,39.8,1.8,Audi
14306,2 Series,2015,123456,Semi-Auto,33419,Diesel,20,68.9,2.0,BMW
14988,5 Series,2020,54845,Semi-Auto,450,Diesel,145,60.1,3.0,BMW
17753,3 Series,2020,71990,Semi-Auto,150,Diesel,150,47.1,3.0,BMW
22488,Focus,2017,38015,Manual,197,Diesel,145,74.3,1.5,Ford
33361,Focus,2018,54995,Manual,11000,Petrol,145,36.7,2.3,Ford
43661,I10,2017,92000,Automatic,35460,Petrol,150,47.9,1.2,Hyundi


In [5]:
#remove outliers
frame_filtered = frame_filtered.drop(index_list)

In [6]:
#remove outliers
frame_filtered

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


# Pipeline

Categorical and Numeric Features were preprocessed separately

In [7]:
# define numeric features

num_features = ['year','mileage',
                    'tax','mpg',
                    'engineSize']

# Pipeline steps to transform numeric features
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')),('poly',PolynomialFeatures(degree = 3)),
                                      ('scaler', StandardScaler())])

In [8]:
# define categorical features

cat_features = ['model','transmission','fuelType','make']

# Pipeline to transform (encode) categorical features

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [9]:
# Use "ColumnTransformer" to join both categorical and numeric pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

In [10]:
# define Lasso model
lasso_model = Lasso(max_iter=15000, tol=0.1, random_state = 42)

In [11]:
# Create a pipeline that execute "preprocessor" and the Lasso model
regression = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor',lasso_model )])

In [12]:
# Define X, y

X = frame_filtered.drop('price', axis = 1)
y = frame_filtered['price']

In [13]:
# Split X,y into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [14]:
# search hyperparameters

regression.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'regressor', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__imputer', 'preprocessor__num__poly', 'preprocessor__num__scaler', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__imputer__strategy', 'preprocessor__num__imputer__verbose', 'preprocessor__num__poly__degree', 'preprocessor__num__poly__include_bias', 'preprocessor__num__poly__interaction_only', 'preprocessor__num__poly__order', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'preprocessor__cat__m

In [15]:
# define parameters grid
param_grid = {'regressor__alpha': [0.001, 0.01,0.02,0.03,0.04, 0.05, 0.06,0.07, 0.08, 1, 2],
             'regressor__selection':['random','cyclic']}

In [16]:
# apply GridSearchCV for tuning
search = GridSearchCV(regression, param_grid, scoring = 'r2', verbose = 3)

In [17]:
# fit the model
search.fit(X_train, y_train)

Fitting 5 folds for each of 22 candidates, totalling 110 fits
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.892, total=   3.8s
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.873, total=   2.4s
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.1s remaining:    0.0s


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.904, total=   2.2s
[CV] regressor__alpha=0.001, regressor__selection=random .............
[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.905, total=   2.2s
[CV] regressor__alpha=0.001, regressor__selection=random .............
[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.894, total=   3.8s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.885, total=   2.8s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.890, total=   2.0s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.899, total=   2.2s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic,

[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.885, total=   2.5s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.890, total=   2.3s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.899, total=   2.5s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.901, total=   2.4s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.880, total=   2.2s
[CV] regressor__alpha=0.06, regressor__selection=random ..............
[CV]  regressor__alpha=0.06, regressor__selection=random, score=0.892, total=   2.9s
[CV] regressor__alpha=0.06, regressor__selection=random ..............
[CV]  regressor__alpha=0.06, regressor__selection=random, score=

[CV]  regressor__alpha=2, regressor__selection=cyclic, score=0.886, total=   2.7s
[CV] regressor__alpha=2, regressor__selection=cyclic .................
[CV]  regressor__alpha=2, regressor__selection=cyclic, score=0.874, total=   2.5s


[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:  5.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                 

In [18]:
# check test accuracy
print('Test Acc: %.3f' % search.score(X_test, y_test))

Test Acc: 0.895


In [19]:
# check best score
print('Test Acc: %.3f' % search.best_score_)

Test Acc: 0.894


In [20]:
# define model "best params"
search.best_params_

{'regressor__alpha': 0.001, 'regressor__selection': 'random'}

# Examining a Lasso Model with "Best Params"

In [21]:
model = Lasso(alpha = 0.05, selection = 'cyclic', max_iter=15000, tol=0.1)

In [22]:
regression_lasso = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', model)])

In [23]:
regression_lasso.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                           

In [24]:
# train accuracy
regression_lasso.score(X_train, y_train)

0.8966519832943262

In [25]:
# test accuracy
regression_lasso.score(X_test, y_test)

0.895228191159861

# Predictions

In [26]:
# predictions with test data
encoded_prediction= regression_lasso.predict(X_test[:5])
# Actual data
actual = list(y_test[:5])

print(f"Predicted classes: {encoded_prediction}")
print(f"Actual Labels: {actual}")

Predicted classes: [ 2166.23607925 -1756.09931463 10945.99003905 12549.73910209
 12707.8442677 ]
Actual Labels: [5395, 2250, 10490, 13495, 13990]
