In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the automobile dataset from the working directory and preview it.
csv_path = 'Automobile.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
4,ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa


## Handle missing values

In [3]:
# Count the missing entries in every column (isnull is an alias of isna).
df.isnull().sum()

name            0
mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [4]:
# Peek at the first few horsepower values before imputing the gaps.
df['horsepower'].head()

0    130.0
1    165.0
2    150.0
3    150.0
4    140.0
Name: horsepower, dtype: float64

In [6]:
# Impute the missing horsepower readings with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the `df.horsepower` accessor: that form is a
# chained in-place operation, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
mean = df['horsepower'].mean()
df['horsepower'] = df['horsepower'].fillna(mean)

In [7]:
# Re-count missing entries per column to confirm the imputation worked.
df.isnull().sum()

name            0
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

## Convert categorical data into integer labels

In [21]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two text columns. Each column gets its own encoder:
# re-fitting one shared encoder on `name` would overwrite the classes learned
# from `origin`, so the origin codes could no longer be mapped back with
# inverse_transform.
origin_encoder = LabelEncoder()
name_encoder = LabelEncoder()
df['origin_label_encoded'] = origin_encoder.fit_transform(df['origin'])
df['name_label_encoded'] = name_encoder.fit_transform(df['name'])

# Backward-compatible alias: the original cell left `label_encoder` fitted on `name`.
label_encoder = name_encoder

In [22]:
# Show the first five rows, now including the two encoded columns.
df.head(n=5)

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,origin_label_encoded,name_label_encoded
0,chevrolet chevelle malibu,18.0,8,307,130,0.63087,12.0,70,usa,2,49
1,buick skylark 320,15.0,8,350,165,0.854333,11.5,70,usa,2,36
2,plymouth satellite,18.0,8,318,150,0.55047,11.0,70,usa,2,231
3,amc rebel sst,16.0,8,304,150,0.546923,12.0,70,usa,2,14
4,ford torino,17.0,8,302,140,0.565841,10.5,70,usa,2,161


In [8]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

name             object
mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [10]:
# List the distinct displacement values to check whether any are fractional.
df['displacement'].unique()

array([307. , 350. , 318. , 304. , 302. , 429. , 454. , 440. , 455. ,
       390. , 383. , 340. , 400. , 113. , 198. , 199. , 200. ,  97. ,
       110. , 107. , 104. , 121. , 360. , 140. ,  98. , 232. , 225. ,
       250. , 351. , 258. , 122. , 116. ,  79. ,  88. ,  71. ,  72. ,
        91. ,  97.5,  70. , 120. ,  96. , 108. , 155. ,  68. , 114. ,
       156. ,  76. ,  83. ,  90. , 231. , 262. , 134. , 119. , 171. ,
       115. , 101. , 305. ,  85. , 130. , 168. , 111. , 260. , 151. ,
       146. ,  80. ,  78. , 105. , 131. , 163. ,  89. , 267. ,  86. ,
       183. , 141. , 173. , 135. ,  81. , 100. , 145. , 112. , 181. ,
       144. ])

In [13]:
# List the distinct horsepower values (the imputed mean shows up as a fractional entry).
df['horsepower'].unique()

array([130.        , 165.        , 150.        , 140.        ,
       198.        , 220.        , 215.        , 225.        ,
       190.        , 170.        , 160.        ,  95.        ,
        97.        ,  85.        ,  88.        ,  46.        ,
        87.        ,  90.        , 113.        , 200.        ,
       210.        , 193.        , 104.46938776, 100.        ,
       105.        , 175.        , 153.        , 180.        ,
       110.        ,  72.        ,  86.        ,  70.        ,
        76.        ,  65.        ,  69.        ,  60.        ,
        80.        ,  54.        , 208.        , 155.        ,
       112.        ,  92.        , 145.        , 137.        ,
       158.        , 167.        ,  94.        , 107.        ,
       230.        ,  49.        ,  75.        ,  91.        ,
       122.        ,  67.        ,  83.        ,  78.        ,
        52.        ,  61.        ,  93.        , 148.        ,
       129.        ,  96.        ,  71.        ,  98.  

In [14]:
# Cast the two float columns to integers, one column at a time.
# Note: astype(int) discards the fractional part rather than rounding.
for column in ('horsepower', 'displacement'):
    df[column] = df[column].astype(int)
df.dtypes

name             object
mpg             float64
cylinders         int64
displacement      int32
horsepower        int32
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

## Normalize or Scale data 

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.424623,104.462312,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.271,38.19923,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,95.0,2803.5,15.5,76.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [18]:
from sklearn.preprocessing import StandardScaler

# Standardize `weight` (subtract the mean, divide by the standard deviation).
# The column is selected with double brackets so the scaler receives 2-D input.
scaler = StandardScaler()
weight_frame = df[['weight']]
scaler.fit(weight_frame)
df['weight'] = scaler.transform(weight_frame)
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.424623,104.462312,-1.606755e-16,15.56809,76.01005
std,7.815984,1.701004,104.271,38.19923,1.001259,2.757689,3.697627
min,9.0,3.0,68.0,46.0,-1.604943,8.0,70.0
25%,17.5,4.0,104.25,76.0,-0.8828266,13.825,73.0
50%,23.0,4.0,148.5,95.0,-0.1973624,15.5,76.0
75%,29.0,8.0,262.0,125.0,0.7538337,17.175,79.0
max,46.6,8.0,455.0,230.0,2.565185,24.8,82.0


## Check for duplicates

In [26]:
# Select the rows that exactly repeat an earlier row; an empty result means no duplicates.
df.loc[df.duplicated()]

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,origin_label_encoded,name_label_encoded


## Find mean, median, and mode

In [27]:
# Summary statistics of the numeric columns, including the two label-encoded ones.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_label_encoded,name_label_encoded
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.424623,104.462312,-1.606755e-16,15.56809,76.01005,1.449749,148.550251
std,7.815984,1.701004,104.271,38.19923,1.001259,2.757689,3.697627,0.775076,89.49588
min,9.0,3.0,68.0,46.0,-1.604943,8.0,70.0,0.0,0.0
25%,17.5,4.0,104.25,76.0,-0.8828266,13.825,73.0,1.0,65.25
50%,23.0,4.0,148.5,95.0,-0.1973624,15.5,76.0,2.0,150.0
75%,29.0,8.0,262.0,125.0,0.7538337,17.175,79.0,2.0,225.75
max,46.6,8.0,455.0,230.0,2.565185,24.8,82.0,2.0,304.0


In [30]:
# Remove the two text columns so only numeric columns remain, then compute
# the per-column mean.
# The frame is reassigned with errors='ignore' instead of dropped in place so
# the cell stays re-runnable: a second run no longer raises KeyError when
# 'name' and 'origin' are already gone.
df = df.drop(columns=['name', 'origin'], errors='ignore')
mean_values = df.mean()
print(mean_values)

mpg                     2.351457e+01
cylinders               5.454774e+00
displacement            1.934246e+02
horsepower              1.044623e+02
weight                 -1.606755e-16
acceleration            1.556809e+01
model_year              7.601005e+01
origin_label_encoded    1.449749e+00
name_label_encoded      1.485503e+02
dtype: float64


In [31]:
# Per-column median, computed down the rows (axis=0 is the default).
median_values = df.median(axis=0)
print(median_values)

mpg                      23.000000
cylinders                 4.000000
displacement            148.500000
horsepower               95.000000
weight                   -0.197362
acceleration             15.500000
model_year               76.000000
origin_label_encoded      2.000000
name_label_encoded      150.000000
dtype: float64


In [32]:
# Per-column mode, computed down the rows (axis=0 is the default).
# The result has one row per tied modal value; columns with fewer ties are
# padded with NaN.
mode_values = df.mode(axis=0)
print(mode_values)

    mpg  cylinders  displacement  horsepower    weight  acceleration  \
0  13.0        4.0          97.0       150.0 -1.165111          14.5   
1   NaN        NaN           NaN         NaN -0.993671           NaN   

   model_year  origin_label_encoded  name_label_encoded  
0        73.0                   2.0               156.0  
1         NaN                   NaN                 NaN  
