# Lasso Regression - Regularization:

In this notebook, a Lasso regression model is built to predict car prices, and `GridSearchCV` is used to tune its `alpha` and `selection` hyperparameters.

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

The CSV files for the different brands are read and concatenated into a single DataFrame, with a `make` column recording the brand of each row.

In [2]:
# Read every brand CSV in the data folder and stack them into one DataFrame.

# NOTE(review): absolute, machine-specific location -- adjust to the local data folder.
path = r'D:\Data_Analytics\Project 3\Data'
all_files = glob.glob(os.path.join(path, "*.csv"))

# Brand labels are paired with files purely by position, so this list must be
# in the same order as the files returned by glob.
brands = ["Audi","Skoda","BMW","Volkswagen","Toyota","Mercedes Benz","Ford","Hyundi"]

# zip() silently stops at the shorter input, which would drop files (or leave
# brands unused) without any warning -- fail loudly instead.
if len(all_files) != len(brands):
    raise ValueError(
        f"Expected {len(brands)} CSV files in {path!r}, found {len(all_files)}"
    )

li = []
for filename, brand in zip(all_files, brands):
    df = pd.read_csv(filename, index_col=None, header=0)
    df["make"] = brand  # tag every row with the brand paired with its file
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
frame = pd.concat(li, axis=0, ignore_index=True)
frame


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,I30,2016,8680,Manual,25906,Diesel,0,78.4,1.6,Hyundi
85551,I40,2015,7830,Manual,59508,Diesel,30,65.7,1.7,Hyundi
85552,I10,2017,6830,Manual,13810,Petrol,20,60.1,1.0,Hyundi
85553,Tucson,2018,13994,Manual,23313,Petrol,145,44.8,1.6,Hyundi


# Pipeline

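Categorical and numeric features are preprocessed separately: numeric features are expanded into degree-3 polynomial terms and then standardized, while categorical features are one-hot encoded.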

In [3]:
# Numeric columns handled by the numeric branch of the preprocessor
num_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']

# Numeric branch: expand into degree-3 polynomial terms, then standardize them
num_transformer = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
])

In [4]:
# Categorical columns handled by the categorical branch of the preprocessor
cat_features = ['model', 'transmission', 'fuelType', 'make']

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fit
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [5]:
# Join the two branches: each transformer is applied only to its own column list
column_branches = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [6]:
# Base Lasso regressor; alpha and selection are left at their defaults here and
# are overridden by the grid search further down.
# NOTE(review): tol=0.1 is far looser than scikit-learn's default (1e-4), so the
# solver may stop well before max_iter -- confirm this is intentional.
lasso_model = Lasso(
    max_iter=15000,
    tol=0.1,
    random_state=42,
)

In [7]:
# Full estimator: run the preprocessor, then fit the Lasso model on its output
regression = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lasso_model),
])

In [8]:
# Target is the price column; the features are every remaining column
y = frame['price']
X = frame.drop(columns='price')

In [9]:
# Hold out a test set; 0.25 is the library default fraction, spelled out here,
# and the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# List every parameter name the pipeline exposes, nested ones included;
# these are the keys a parameter grid is allowed to use
regression.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'regressor', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__poly', 'preprocessor__num__scaler', 'preprocessor__num__poly__degree', 'preprocessor__num__poly__include_bias', 'preprocessor__num__poly__interaction_only', 'preprocessor__num__poly__order', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__onehot', 'preprocessor__cat__onehot__categories', 'preprocessor__cat__onehot__drop', 'preprocessor__cat__onehot__dtype', 'preprocessor__cat__onehot__handle_unknown', 'preprocessor__cat__onehot

In [11]:
# Candidate values for the grid search, keyed as "<pipeline step>__<parameter>"
param_grid = {
    'regressor__alpha': [
        0.001, 0.01, 0.02, 0.03, 0.04, 0.05,
        0.06, 0.07, 0.08, 1, 2,
    ],
    'regressor__selection': ['random', 'cyclic'],
}

In [12]:
# Grid search over param_grid, scoring each candidate by cross-validated R^2;
# verbose=3 prints one line per fold fit
search = GridSearchCV(
    estimator=regression,
    param_grid=param_grid,
    scoring='r2',
    verbose=3,
)

In [13]:
# Run the cross-validated search using the training split only
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 22 candidates, totalling 110 fits
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.891, total=   1.0s
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.901, total=   1.1s
[CV] regressor__alpha=0.001, regressor__selection=random .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.898, total=   1.1s
[CV] regressor__alpha=0.001, regressor__selection=random .............
[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.898, total=   1.1s
[CV] regressor__alpha=0.001, regressor__selection=random .............
[CV]  regressor__alpha=0.001, regressor__selection=random, score=0.894, total=   1.0s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.880, total=   0.8s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.889, total=   0.8s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic, score=0.877, total=   0.9s
[CV] regressor__alpha=0.001, regressor__selection=cyclic .............
[CV]  regressor__alpha=0.001, regressor__selection=cyclic,

[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.880, total=   0.9s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.889, total=   0.9s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.877, total=   0.9s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.878, total=   0.8s
[CV] regressor__alpha=0.05, regressor__selection=cyclic ..............
[CV]  regressor__alpha=0.05, regressor__selection=cyclic, score=0.882, total=   0.9s
[CV] regressor__alpha=0.06, regressor__selection=random ..............
[CV]  regressor__alpha=0.06, regressor__selection=random, score=0.891, total=   1.0s
[CV] regressor__alpha=0.06, regressor__selection=random ..............
[CV]  regressor__alpha=0.06, regressor__selection=random, score=

[CV]  regressor__alpha=2, regressor__selection=cyclic, score=0.872, total=   0.9s
[CV] regressor__alpha=2, regressor__selection=cyclic .................
[CV]  regressor__alpha=2, regressor__selection=cyclic, score=0.869, total=   0.8s


[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('poly',
                                                                                          PolynomialFeatures(degree=3,
                                                                                                             include_bias=True,
                             

In [14]:
# Score of the best estimator found by the search on the held-out test split.
# The search was built with scoring='r2', so this value is R^2 -- a regression
# metric, not a classification accuracy -- and is labelled accordingly.
print('Test R^2: %.3f' % search.score(X_test, y_test))

Test Acc: 0.890


In [15]:
# Mean cross-validated R^2 of the best parameter combination. It is computed on
# folds of the training data, so it is a validation score and must not be
# reported as a test-set score.
print('Best CV R^2: %.3f' % search.best_score_)

Test Acc: 0.896


In [16]:
# Show the hyperparameter combination that achieved the best cross-validated score
search.best_params_

{'regressor__alpha': 0.001, 'regressor__selection': 'random'}

# Examining a Lasso Model with "Best Params"

In [17]:
# Build the final Lasso from the hyperparameters selected by the grid search
# instead of retyping them by hand, so this cell cannot drift out of sync if the
# grid or the data changes. The remaining settings mirror the base model used
# during the search.
model = Lasso(
    alpha=search.best_params_['regressor__alpha'],
    selection=search.best_params_['regressor__selection'],
    random_state=42,
    max_iter=15000,
    tol=0.1,
)

In [18]:
# Same preprocessing as before, followed by the Lasso defined in the previous cell
regression_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model),
])

In [19]:
# Fit the preprocessing steps and the Lasso model on the training split
regression_lasso.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('poly',
                                                                   PolynomialFeatures(degree=3,
                                                                                      include_bias=True,
                                                                                      interaction_only=False,
                                                                                      order='C')),
                                                                  ('scaler',
                                                                   Sta

In [20]:
# R^2 on the training split (score() of a regressor reports R^2, not accuracy)
regression_lasso.score(X=X_train, y=y_train)

0.8961432606631403

In [21]:
# R^2 on the held-out test split (score() of a regressor reports R^2, not accuracy)
regression_lasso.score(X=X_test, y=y_test)

0.8897168731795976

# Predictions

In [22]:
# Predict prices for the first five rows of the test split.
# .iloc makes the "first five by position" intent explicit.
encoded_prediction = regression_lasso.predict(X_test.iloc[:5])
# True prices for the same five rows
actual = list(y_test.iloc[:5])

# This is a regression, so the outputs are prices rather than classes or labels
print(f"Predicted prices: {encoded_prediction}")
print(f"Actual prices: {actual}")

Predicted classes: [ 7075.02277963  7799.74861624 12659.3445443   9092.63463429
 20398.14577137]
Actual Labels: [7990, 9000, 13000, 8947, 16995]
