In [1]:
# a list of column names
headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

# Step -1 Loading The Data

In [2]:
# importing all necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# loading the dataset
data = pd.read_csv(r"C:\Users\amit9\Downloads\archive\auto_car_338gqj.data", names = headers)
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [5]:
#data.to_csv(r"C:\Users\amit9\Downloads\archive\auto_car_338gqj.csv")

# Data Cleaning

In [58]:
# checking for null values
data.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [59]:
# replacing "?" with null value in the whole data
data.replace("?", np.nan, inplace = True)

In [60]:
# checking for null values after replacement
data.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [61]:
# checking for duplicate rows /  data
data.duplicated().sum()

0

In [62]:
# getting summary of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

# Treating Null Values

In [63]:
# calculating mean for the column "normalized-losses" 
# astype - to change the datatype
a = data['normalized-losses'].astype("float").mean()
a

122.0

In [64]:
# replacing null values with the respective column mean value
data['normalized-losses'].replace(np.nan, a, inplace = True)

In [65]:
# checking if 'normalized-losses' null values has been updated or not
data.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         2
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64

In [66]:
# Handling null values in the column "num-of-doors"
# calculating mode since the column has categorical data 
a = data['num-of-doors'].mode()[0]
a

'four'

In [67]:
# Replacing null values with mode of that repective column
data['num-of-doors'].replace(np.nan, a, inplace =True)

In [68]:
# Checking if the changes in "num-of-doors" columns is saved or not
data.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64

In [69]:
# replacing null values in the column "bore" with its own mean value 
a = data['bore'].astype("float").mean()
data['bore'].replace(np.nan, a, inplace = True)

In [70]:
# replacing null values in the column "stroke" with its own mean value 
a = data['stroke'].astype("float").mean()
data['stroke'].replace(np.nan, a, inplace = True)

In [71]:
# replacing null values in the column "horsepower" with its own mean value
a = data['horsepower'].astype("float").mean()
data['horsepower'].replace(np.nan, a, inplace = True)

In [72]:
# replacing null values in the column "peak-rpm" with its own mean value
a = data['peak-rpm'].astype("float").mean()
data['peak-rpm'].replace(np.nan, a, inplace = True)

In [73]:
# # replacing null values in the column "price" with its own mean value
a = data['price'].astype("float").mean()
data['price'].replace(np.nan, a, inplace = True)

In [74]:
# checking null values to confirm that none are left out
data.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

# Correcting Data-types of Columns

In [75]:
# to print data-types of all columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [76]:
# command to display the full data-set
pd.set_option('display.max_columns', None, 'display.max_rows', None)

# printing first five rows of data
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [77]:
# changing "normalized-losses" column data-type to float using "astype" function
data['normalized-losses'] = data['normalized-losses'].astype("float")

In [78]:
# checking if the above command worked or not... 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [79]:
# converting the column "bore" data-type to float
data['bore'] = data['bore'].astype("float")

In [80]:
# converting the column "peak-rpm" data-type to float
data['peak-rpm'] = data['peak-rpm'].astype("float")

In [81]:
# converting the column "stroke" data-type to float
data['stroke'] = data['stroke'].astype("float")

In [82]:
# converting the column "price" data-type to float
data['price'] = data['price'].astype("float")

In [83]:
# converting the column "horsepower" data-type to float
data['horsepower'] = data['horsepower'].astype("float")

In [84]:
# to check if the above respective columns data-type has changed..... 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [85]:
#data.to_csv(r"C:\Users\amit9\Downloads\archive\car_clean_data.csv")

# Data Preprocessing

In [86]:
# importing LabelEncoder function from scikit-learn
from sklearn.preprocessing import LabelEncoder

# creating an object of the LabelEncoder function
lb = LabelEncoder()

In [87]:
data.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [88]:
# A list of all columns having categorical data
obj_lst = [ 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location','engine-type',
       'num-of-cylinders', 'fuel-system', 'peak-rpm']

In [89]:
# dummy example to show how the Label encoder works 
# printing all values in 'make' column with their count
data['make'].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: make, dtype: int64

In [90]:
# converting all categorical values in the make column with numerical data
data['make'] = lb.fit_transform(data['make'])

In [91]:
# here it changed all the company names to numerical values, the count is still same
data['make'].value_counts()

19    32
12    18
8     17
11    13
5     13
20    12
18    12
13    11
21    11
4      9
9      8
2      8
1      7
14     7
17     6
15     5
6      4
7      3
3      3
0      3
16     2
10     1
Name: make, dtype: int64

In [92]:
# command to find number of unique values in each column
data.nunique()

symboling              6
normalized-losses     51
make                  22
fuel-type              2
aspiration             2
num-of-doors           2
body-style             5
drive-wheels           3
engine-location        2
wheel-base            53
length                75
width                 44
height                49
curb-weight          171
engine-type            7
num-of-cylinders       7
engine-size           44
fuel-system            8
bore                  39
stroke                37
compression-ratio     32
horsepower            60
peak-rpm              24
city-mpg              29
highway-mpg           30
price                187
dtype: int64

In [93]:
# a "for" loop to change all the columns with categorical data to numerical data
for i in obj_lst:
    print(i)
    # command to actually perform the LabelEncoder functionality
    data[i] = lb.fit_transform(data[i])

fuel-type
aspiration
num-of-doors
body-style
drive-wheels
engine-location
engine-type
num-of-cylinders
fuel-system
peak-rpm


In [94]:
data['peak-rpm'].unique()

array([10, 17, 20,  2, 16, 11,  8, 22,  7,  6,  1,  3,  5, 13,  0, 18, 21,
       19, 12, 14,  9,  4, 23, 15], dtype=int64)

In [95]:
# to check all columns now have numerical data...
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,0,1,0,1,0,2,0,88.6,168.8,64.1,48.8,2548,0,2,130,5,3.47,2.68,9.0,111.0,10,21,27,13495.0
1,3,122.0,0,1,0,1,0,2,0,88.6,168.8,64.1,48.8,2548,0,2,130,5,3.47,2.68,9.0,111.0,10,21,27,16500.0
2,1,122.0,0,1,0,1,2,2,0,94.5,171.2,65.5,52.4,2823,5,3,152,5,2.68,3.47,9.0,154.0,10,19,26,16500.0
3,2,164.0,1,1,0,0,3,1,0,99.8,176.6,66.2,54.3,2337,3,2,109,5,3.19,3.4,10.0,102.0,17,24,30,13950.0
4,2,164.0,1,1,0,0,3,0,0,99.4,176.6,66.4,54.3,2824,3,1,136,5,3.19,3.4,8.0,115.0,17,18,22,17450.0


# Train-Test_split

In [96]:
# differenciating the set of inputs and output in two different variables X and y respectively
X = data[['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg']]

y = data['price']

In [97]:
# importing train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# The function takes 3 parameters - set of inputs(X), output(y), test_size i.e. 10% = 0.1 in decimal
# remember the sequence (x_train, x_test, y_train, y_test)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 2023)

In [98]:
# printing training inputs
x_train.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
32,1,101.0,5,1,0,1,2,1,0,93.7,150.0,64.0,52.6,1837,3,2,79,0,2.91,3.07,10.1,60.0,17,38,42
61,1,129.0,8,1,0,1,2,1,0,98.8,177.8,66.5,53.7,2385,3,2,122,1,3.39,3.39,8.6,84.0,8,26,32
146,0,89.0,18,1,0,0,4,1,0,97.0,173.5,65.4,53.0,2290,4,2,108,1,3.62,2.64,9.0,82.0,8,28,32
23,1,118.0,4,1,1,1,2,1,0,93.7,157.3,63.8,50.8,2128,3,2,98,5,3.03,3.39,7.6,102.0,17,24,30
122,1,154.0,14,1,0,0,3,1,0,93.7,167.3,63.8,50.8,2191,3,2,98,1,2.97,3.23,9.4,68.0,17,31,38


In [99]:
# printing test inputs data 
x_test.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
13,0,188.0,2,1,0,0,3,2,0,101.2,176.8,64.8,54.3,2765,3,3,164,5,3.31,3.19,9.0,121.0,2,21,28
143,0,102.0,18,1,0,0,3,1,0,97.2,172.0,65.4,52.5,2340,4,2,108,5,3.62,2.64,9.0,94.0,13,26,32
99,0,106.0,12,1,0,0,2,1,0,97.2,173.4,65.2,54.7,2324,3,2,120,1,3.33,3.47,8.5,97.0,13,27,34
178,3,197.0,19,1,0,1,2,2,0,102.9,183.5,67.7,52.0,2976,0,3,171,5,3.27,3.35,9.3,161.0,13,20,24
202,-1,95.0,21,1,0,0,3,2,0,109.1,188.8,68.9,55.5,3012,5,3,173,5,3.58,2.87,8.8,134.0,17,18,23


## Performing Scaling on data

In [100]:
# importing MinMaxscaler function
from sklearn.preprocessing import MinMaxScaler

In [101]:
# creating object for the MinMaxScaler
mn = MinMaxScaler()

# scaling the training inputs data
x_train_scaled = mn.fit_transform(x_train)

In [102]:
# printing output for the above operation
x_train_scaled

array([[0.6       , 0.18848168, 0.23809524, ..., 0.73913043, 0.69444444,
        0.68421053],
       [0.6       , 0.33507853, 0.38095238, ..., 0.34782609, 0.36111111,
        0.42105263],
       [0.4       , 0.12565445, 0.85714286, ..., 0.34782609, 0.41666667,
        0.42105263],
       ...,
       [0.        , 0.19895288, 1.        , ..., 0.47826087, 0.11111111,
        0.15789474],
       [0.6       , 0.43455497, 0.19047619, ..., 0.73913043, 0.5       ,
        0.57894737],
       [0.6       , 0.31413613, 0.52380952, ..., 0.73913043, 0.27777778,
        0.36842105]])

In [103]:
# scaling the testing inputs data
x_test_scaled = mn.fit_transform(x_test)

In [104]:
# printing output for the above operation
x_test_scaled[0:5]

array([[0.25      , 0.93181818, 0.0952381 , 1.        , 0.        ,
        0.        , 0.75      , 1.        , 0.        , 0.55617978,
        0.53424658, 0.13157895, 0.74193548, 0.59109043, 0.6       ,
        1.        , 0.62184874, 1.        , 0.67021277, 0.58510638,
        0.08496732, 0.45138889, 0.        , 0.22727273, 0.2962963 ],
       [0.25      , 0.28030303, 0.85714286, 1.        , 0.        ,
        0.        , 0.75      , 0.5       , 0.        , 0.33146067,
        0.40273973, 0.21052632, 0.4516129 , 0.30851064, 0.8       ,
        0.5       , 0.1512605 , 1.        , 1.        , 0.        ,
        0.08496732, 0.26388889, 0.61111111, 0.45454545, 0.44444444],
       [0.25      , 0.31060606, 0.57142857, 1.        , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.33146067,
        0.44109589, 0.18421053, 0.80645161, 0.29787234, 0.6       ,
        0.5       , 0.25210084, 0.2       , 0.69148936, 0.88297872,
        0.05228758, 0.28472222, 0.61111111, 0.

# Machine Learning

## Linear Regression Algorithm

In [105]:
# importing algorithm 
from sklearn.linear_model import LinearRegression

In [106]:
# creating a model with name "clf"
clf = LinearRegression()

In [107]:
# training the model using "fit" with data i.e not scaled data(bcoz it is decreasing the accuracy)
clf.fit(x_train, y_train)

In [108]:
# printing the coeficients/weights of all input variables
clf.coef_

array([ 3.84600048e+02, -9.16047519e+00, -1.62719245e+02,  2.16289355e+03,
        2.39015462e+03, -1.28487353e+03, -1.23489533e+03,  1.88754008e+03,
        1.28108057e+04,  1.17529445e+02,  1.65885637e+01,  5.21422638e+02,
        2.91735585e+02,  1.53608917e+00,  2.80807796e+02,  6.74096225e+02,
        1.29756647e+02,  1.72111864e+02, -1.58228748e+03, -2.70524115e+03,
        1.26439072e+02, -3.85465040e+01,  1.82929362e+02,  5.14735840e+01,
       -3.34549766e+01])

In [109]:
# printing intercept or bias
clf.intercept_

-58748.940372956684

In [110]:
# performing test on the creating model using "predict" function
y_pred = clf.predict(x_test)

In [111]:
# printing first five actual results
y_test[0:5]

13     21105.0
143     9960.0
99      8949.0
178    16558.0
202    21485.0
Name: price, dtype: float64

In [112]:
# printing first five predicted results
y_pred[0:5]

array([18262.45313584,  9658.59422448, 10976.38574848, 18280.76261294,
       24031.27371044])

### Accuracy of model

In [113]:
# importing accuracy metrics i.e. mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_error, r2_score

In [114]:
# It tells us the amount of error between actual and predicted values
print(f"Mean Absolute Error = {mean_absolute_error(y_test, y_pred)}")

Mean Absolute Error = 2744.705267078024


In [115]:
# It tells us the model accuracy in decimal (to convert into percentage multiply by 100)
print(f"R2 Score for the model = {r2_score(y_test, y_pred)}")

R2 Score for the model = 0.7056532694219461


# Lasso and Ridge Regression

In [122]:
# importing lasso(l1 regualrization) and ridge(l2 regualarization) algorithm
from sklearn.linear_model import Lasso, Ridge

# creating a Lasso based regreesion model
# alpha = penalty on model for wrong prediction
# max_iter = number of iterations
model = Lasso(alpha = 5, max_iter = 500)

In [123]:
# Training the Lasso algorithm model using "fit" method/function
model.fit(x_train, y_train)

In [124]:
# testing the lasso model
y_pred_lasso = model.predict(x_test)

In [125]:
# first five predicted values 
y_pred_lasso[0:5]

array([18085.662759  ,  9753.41308816, 11115.15920433, 18272.12692091,
       24118.65078545])

In [126]:
# first five actual values
y_test[0:5]

13     21105.0
143     9960.0
99      8949.0
178    16558.0
202    21485.0
Name: price, dtype: float64

In [127]:
# Accuracy/ R2_score for the Lasso Model
print(f"R2 Score for the Lasso model = {r2_score(y_test, y_pred_lasso)}")

R2 Score for the Lasso model = 0.7079919272835016


### Ridge Regression Algorithm

In [136]:
# creating a model based on ridge regreesion algorithm
# alpha = penalty on model for wrong prediction
# max_iter = number of iterations

# here you can try different values to check its affect on accuracy
model2 = Ridge(alpha = 170, max_iter = 100)

In [137]:
# training the model
model2.fit(x_train, y_train)

In [138]:
# making predictions on test data and storing it in a variable...
y_pred_ridge = model2.predict(x_test)

In [139]:
# Accuracy of the model...
print(f"R2 Score for the Ridge model = {r2_score(y_test, y_pred_ridge)}")

R2 Score for the Ridge model = 0.7312745676639392


In [None]:
y_test[0:5]

In [None]:
y_pred_ridge[0:5]