# Linear Regression:

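This is the first model tested for used car price prediction: a Lasso-regularised linear regression fitted on polynomial numeric features and one-hot encoded categorical features.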

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

The CSV files for the different brands are read and concatenated into a single DataFrame; a `make` column is added to record the brand each row came from.

In [2]:
# Read every brand CSV and concatenate them into a single DataFrame

path = r'D:\Data_Analytics\Project 3\Data'
all_files = glob.glob(os.path.join(path, "*.csv"))

# NOTE(review): brands are paired with files purely by position, and glob does
# not guarantee any particular ordering. Confirm that this list matches the
# order in which the files are returned on the machine running the notebook.
brands = ["Audi","Skoda","BMW","Volkswagen","Toyota","Mercedes Benz","Ford","Hyundi"]

# zip() stops at the shorter input, so a missing or extra CSV would silently
# drop data (or leave a brand unused). Fail loudly instead.
if len(all_files) != len(brands):
    raise ValueError(
        f"Expected {len(brands)} CSV files in {path!r} (one per brand), "
        f"found {len(all_files)}: {all_files}"
    )

li = []
for filename, brand in zip(all_files, brands):
    df = pd.read_csv(filename, index_col=None, header=0)
    df["make"] = brand  # tag every row with the brand of its source file
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
frame = pd.concat(li, axis=0, ignore_index=True)
frame


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,I30,2016,8680,Manual,25906,Diesel,0,78.4,1.6,Hyundi
85551,I40,2015,7830,Manual,59508,Diesel,30,65.7,1.7,Hyundi
85552,I10,2017,6830,Manual,13810,Petrol,20,60.1,1.0,Hyundi
85553,Tucson,2018,13994,Manual,23313,Petrol,145,44.8,1.6,Hyundi


In [3]:
# Numeric columns that are expanded into polynomial terms and then standardised.
num_features = ['year','mileage',
                    'tax','mpg',
                    'engineSize']
# include_bias=False: the constant column PolynomialFeatures would otherwise add
# has zero variance, so StandardScaler would turn it into an all-zero feature.
# The downstream Lasso fits its own intercept, so the column carries no
# information and only widens the design matrix.
num_transformer = Pipeline(steps=[('poly',PolynomialFeatures(degree = 3, include_bias=False)),
                                      ('scaler', StandardScaler())])

In [4]:
# Categorical columns that are one-hot encoded.
cat_features = ['model', 'transmission', 'fuelType', 'make']

# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
cat_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [5]:
# Send the numeric and categorical column groups through their own
# transformers and stack the results side by side.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [6]:
# Candidate values for the Lasso regularisation strength (alpha).
param_grid = dict(
    alpha=[
        # small penalties, finely spaced
        0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
        # larger penalties, coarsely spaced
        1, 2, 3, 5, 8, 10, 20, 50, 100,
    ],
)

In [8]:
# Full model: preprocessing followed by a grid search over the Lasso alpha,
# scored by R^2. The search object is the final step of the pipeline.
# NOTE(review): because the search sits inside the pipeline, the preprocessor
# appears to be fitted once on all training rows before the CV folds are
# split - confirm this is intended.
regression = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            GridSearchCV(
                estimator=Lasso(),
                param_grid=param_grid,
                scoring='r2',
                verbose=3,
            ),
        ),
    ]
)

In [9]:
# Target is the price column; the features are every other column.
y = frame['price']
X = frame.drop(columns='price')

In [10]:
# Hold out a test set; the fixed seed makes the split reproducible.
# test_size=0.25 is the library default, spelled out here for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit the preprocessing steps and run the alpha grid search on the training split.
regression.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  model = cd_fast.sparse_enet_coordinate_descent(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   48.3s remaining:    0.0s


[CV] ......................... alpha=0.001, score=0.909, total=  48.4s
[CV] alpha=0.001 .....................................................


  model = cd_fast.sparse_enet_coordinate_descent(
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV] ......................... alpha=0.001, score=0.915, total=  50.5s
[CV] alpha=0.001 .....................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] ......................... alpha=0.001, score=0.905, total=  47.5s
[CV] alpha=0.001 .....................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] ......................... alpha=0.001, score=0.909, total=  48.0s
[CV] alpha=0.001 .....................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] ......................... alpha=0.001, score=0.907, total=  48.9s
[CV] alpha=0.01 ......................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] .......................... alpha=0.01, score=0.909, total=  48.3s
[CV] alpha=0.01 ......................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] .......................... alpha=0.01, score=0.915, total=  47.7s
[CV] alpha=0.01 ......................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] .......................... alpha=0.01, score=0.905, total=  46.6s
[CV] alpha=0.01 ......................................................


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] .......................... alpha=0.01, score=0.910, total=  46.2s
[CV] alpha=0.01 ......................................................


In [None]:
# `regression` is a Pipeline, which has no best_params_ attribute of its own;
# the tuned alpha is stored on the GridSearchCV object in the 'regressor' step.
print(regression.named_steps['regressor'].best_params_)

In [None]:
# Score of the fitted pipeline on the training split.
regression.score(X=X_train, y=y_train)

In [None]:
# Score of the fitted pipeline on the held-out test split.
regression.score(X=X_test, y=y_test)

In [None]:
# Predicted prices for the held-out test rows.
regression.predict(X=X_test)

In [None]:
# Actual prices of the held-out test rows, displayed for comparison with the
# predictions from the previous cell.
y_test