# Import All Necessary Dependancies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from scipy.stats import mode
from IPython.display import display

# Data Gathering

In [3]:
df = pd.read_csv(r"D:\PYTHON_NOTES\CSV\autos_dataset.csv")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

# EDA

In [5]:
df=df.replace({"?":np.nan})

In [6]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [7]:
df['horsepower']=df['horsepower'].astype(float)
df['peak-rpm']=df['peak-rpm'].astype(float)
df['price']=df['price'].astype(float)
df['normalized-losses']=df['normalized-losses'].astype(float)
df['stroke']=df['stroke'].astype(float)
df['bore']=df['bore'].astype(float)

In [8]:
(df.isnull().sum()*100)/df.shape[0]

symboling             0.00000
normalized-losses    20.00000
make                  0.00000
fuel-type             0.00000
aspiration            0.00000
num-of-doors          0.97561
body-style            0.00000
drive-wheels          0.00000
engine-location       0.00000
wheel-base            0.00000
length                0.00000
width                 0.00000
height                0.00000
curb-weight           0.00000
engine-type           0.00000
num-of-cylinders      0.00000
engine-size           0.00000
fuel-system           0.00000
bore                  1.95122
stroke                1.95122
compression-ratio     0.00000
horsepower            0.97561
peak-rpm              0.97561
city-mpg              0.00000
highway-mpg           0.00000
price                 1.95122
dtype: float64

In [9]:
df.isnull().mean()*100

symboling             0.00000
normalized-losses    20.00000
make                  0.00000
fuel-type             0.00000
aspiration            0.00000
num-of-doors          0.97561
body-style            0.00000
drive-wheels          0.00000
engine-location       0.00000
wheel-base            0.00000
length                0.00000
width                 0.00000
height                0.00000
curb-weight           0.00000
engine-type           0.00000
num-of-cylinders      0.00000
engine-size           0.00000
fuel-system           0.00000
bore                  1.95122
stroke                1.95122
compression-ratio     0.00000
horsepower            0.97561
peak-rpm              0.97561
city-mpg              0.00000
highway-mpg           0.00000
price                 1.95122
dtype: float64

In [10]:
df['normalized-losses']=df['normalized-losses'].fillna(df['normalized-losses'].mean())

df['num-of-doors']=df['num-of-doors'].fillna(df['num-of-doors'].mode()[0])
df['bore']=df['bore'].fillna(df['bore'].mean())
df['stroke']=df['stroke'].fillna(df['stroke'].mean())
df['price']=df['price'].fillna(df['price'].mean())
df['horsepower']=df['horsepower'].fillna(df['horsepower'].mean())
df['peak-rpm']=df['peak-rpm'].fillna(df['peak-rpm'].mean())

In [11]:
for i in df.columns:
    df.rename(columns={i:i.replace("-","_")},inplace=True)
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


# Feature Engineering

### One hot Encoding

In [13]:
one_hot_encode =OneHotEncoder()
array = one_hot_encode.fit_transform(df[['make']]).toarray()
df1 = pd.DataFrame(array,columns=df['make'].unique())
df1


Unnamed: 0,alfa-romero,audi,bmw,chevrolet,dodge,honda,isuzu,jaguar,mazda,mercedes-benz,...,nissan,peugot,plymouth,porsche,renault,saab,subaru,toyota,volkswagen,volvo
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
df=pd.concat([df,df1],axis=1)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,nissan,peugot,plymouth,porsche,renault,saab,subaru,toyota,volkswagen,volvo
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df.drop('make',axis=1,inplace=True)  

In [16]:
one_hot = OneHotEncoder()
array= one_hot.fit_transform(df[['body_style']]).toarray()

In [17]:
df2 = pd.DataFrame(array,columns=df['body_style'].unique())
df2.head(1)

Unnamed: 0,convertible,hatchback,sedan,wagon,hardtop
0,1.0,0.0,0.0,0.0,0.0


In [18]:
df=df.join(df2)

In [19]:
df.drop('body_style',axis=1,inplace=True)

# Encoding by Replacing

In [20]:
df['fuel_type']=df['fuel_type'].replace({'gas':0,'diesel':1})

In [21]:
df['aspiration']=df['aspiration'].replace({'std':0,'turbo':1})

In [22]:
df['num_of_cylinders']=df['num_of_cylinders'].replace({'four':4,'six':6,'five':5,'three':3,'two':2,'eight':8,'twelve':12})

In [23]:
df['num_of_cylinders'].unique()

array([ 4,  6,  5,  3, 12,  2,  8], dtype=int64)

# Ordinal Encoding

In [24]:
ordinalencoder = OrdinalEncoder(categories=[['two','four']])
ordinalencoder.fit(df[['num_of_doors']])
df['num_of_doors']=ordinalencoder.transform(df[['num_of_doors']])

In [25]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   fuel_type          205 non-null    int64  
 3   aspiration         205 non-null    int64  
 4   num_of_doors       205 non-null    float64
 5   drive_wheels       205 non-null    object 
 6   engine_location    205 non-null    object 
 7   wheel_base         205 non-null    float64
 8   length             205 non-null    float64
 9   width              205 non-null    float64
 10  height             205 non-null    float64
 11  curb_weight        205 non-null    int64  
 12  engine_type        205 non-null    object 
 13  num_of_cylinders   205 non-null    int64  
 14  engine_size        205 non-null    int64  
 15  fuel_system        205 non-null    object 
 16  bore               205 non

In [27]:
ordinalencoder = OrdinalEncoder(categories=[['front','rear']])
ordinalencoder.fit(df[['engine_location']])
df['engine_location']=ordinalencoder.transform(df[['engine_location']])

# Using get_dummies

In [28]:
df3=pd.get_dummies(df,columns=['drive_wheels','engine_type','fuel_system'])

In [29]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 66 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   fuel_type          205 non-null    int64  
 3   aspiration         205 non-null    int64  
 4   num_of_doors       205 non-null    float64
 5   engine_location    205 non-null    float64
 6   wheel_base         205 non-null    float64
 7   length             205 non-null    float64
 8   width              205 non-null    float64
 9   height             205 non-null    float64
 10  curb_weight        205 non-null    int64  
 11  num_of_cylinders   205 non-null    int64  
 12  engine_size        205 non-null    int64  
 13  bore               205 non-null    float64
 14  stroke             205 non-null    float64
 15  compression_ratio  205 non-null    float64
 16  horsepower         205 non

In [33]:
x =df3.drop('price',axis=1)
y =df3['price']

# Train_Test_Split()

In [34]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40)