In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Jul_2023/EDA/data.csv')
data.shape

(11914, 16)

In [4]:
data.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [6]:
data.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,MSRP
count,11914.0,11845.0,11884.0,11908.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,249.38607,5.628829,3.436093,26.637485,19.733255,1554.911197,40594.74
std,7.57974,109.19187,1.780559,0.881315,8.863001,8.987798,1441.855347,60109.1
min,1990.0,55.0,0.0,2.0,12.0,7.0,2.0,2000.0
25%,2007.0,170.0,4.0,2.0,22.0,16.0,549.0,21000.0
50%,2015.0,227.0,6.0,4.0,26.0,18.0,1385.0,29995.0
75%,2016.0,300.0,6.0,4.0,30.0,22.0,2009.0,42231.25
max,2017.0,1001.0,16.0,4.0,354.0,137.0,5657.0,2065902.0


In [7]:
# columns to drop
data.drop(['Vehicle Style','Number of Doors','Vehicle Size','Popularity','Market Category'],axis=1,inplace =True)


In [8]:
# renaming the columns
data.rename(columns={"Engine HP": "HP", "Engine Cylinders": "Cylinders", "Transmission Type": "Transmission", "Driven_Wheels": "Drive Mode","highway MPG": "MPG-H", "city mpg": "MPG-C", "MSRP": "Price" },inplace=True)

NAN value imputation

In [9]:
data.isna().sum()

Make                 0
Model                0
Year                 0
Engine Fuel Type     3
HP                  69
Cylinders           30
Transmission         0
Drive Mode           0
MPG-H                0
MPG-C                0
Price                0
dtype: int64

In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer_object = SimpleImputer(strategy='constant', fill_value='Unknown')

data['Engine Fuel Type'] = imputer_object.fit_transform(data['Engine Fuel Type'].values.reshape(-1,1))
data['HP'] = imputer.fit_transform(data['HP'].values.reshape(-1,1))
data['Cylinders'] = imputer.fit_transform(data['Cylinders'].values.reshape(-1,1))

In [11]:
# dropping duplicate rows
print('duplicate rows: ',data[data.duplicated()].shape)
data.drop_duplicates(inplace=True)
data.shape

duplicate rows:  (918, 11)


(10996, 11)

In [12]:
categorical_cols, numerical_cols = list(), list()
[categorical_cols.append(x) if data[x].dtype == 'object' else numerical_cols.append(x) for x in data.columns]

[None, None, None, None, None, None, None, None, None, None, None]

In [13]:
# detecting outliers
for col in numerical_cols:
  fig = px.box(data,x='Price',title=col)
  fig.update_layout(width = 400,height = 200, margin=dict(l=20,r=20,t=30,b=10))
  fig.show()

In [14]:
# remove outliers
q1 = data.quantile(0.25,numeric_only=True)
q3 = data.quantile(0.75,numeric_only=True)
iqr = q3-q1
iqr

Year             9.0
HP             129.0
Cylinders        2.0
MPG-H            8.0
MPG-C            6.0
Price        21295.0
dtype: float64

In [15]:
df = data[~(data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr)).any(axis=1)]
df.shape

(10996, 11)

In [25]:
fig = px.bar(x = df.Make.value_counts().nlargest(30).index, y = df.Make.value_counts().nlargest(30).values,
             labels={"x":'Make',"y":"Number of cars"})
fig.update_layout()

In [41]:
# heat maps
fig = px.imshow(df.corr().round(2), text_auto=True)
fig.update_layout(width = 500,height = 500, margin=dict(l=30,r=30,t=30,b=10))
fig.show()

In [48]:
# relationship between price and horse power
fig = px.scatter(x=df.HP, y=df.Price, labels={"x":"Horse power","y":"Price"})
fig.update_layout(width = 500,height = 400, margin=dict(l=30,r=30,t=30,b=10))
fig.show()