In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries — confirm that is intended.
warnings.filterwarnings('ignore')

# read data

In [2]:
# Load the raw CSV into the working frame and preview its first rows.
raw_csv_path = "data/raw_data.csv"
dataset = pd.read_csv(raw_csv_path)
dataset.head()

Unnamed: 0,car_name,Year,fuel_type,transmission,owner,Selling Price
0,Jeep Compass 2.0 Longitude Option BSIV,2017,Diesel,Manual,1,10.03
1,Renault Duster RXZ Turbo CVT,2021,Petrol,Automatic,1,12.83
2,Toyota Camry 2.5 G,2016,Petrol,Automatic,1,16.4
3,Honda Jazz VX CVT,2018,Petrol,Automatic,1,7.77
4,Volkswagen Polo 1.2 MPI Highline,2016,Petrol,Manual,1,5.15


# null value

In [3]:
# Count missing entries per column (isna is the canonical name of isnull).
missing_mask = dataset.isna()
missing_mask.sum()

car_name         0
Year             0
fuel_type        0
transmission     0
owner            0
Selling Price    0
dtype: int64

# Duplicate rows

In [4]:
# Flag every row that repeats an earlier row in full, then count the flags.
repeated_rows = dataset.duplicated(keep='first')
repeated_rows.sum()

629

In [5]:
# Keep the first occurrence of each fully identical row and discard the repeats.
# The original index labels are retained (not renumbered).
is_repeat = dataset.duplicated(keep='first')
dataset = dataset[~is_repeat]

In [6]:
# Display the frame after duplicate removal (pandas truncates the rendered rows).
dataset

Unnamed: 0,car_name,Year,fuel_type,transmission,owner,Selling Price
0,Jeep Compass 2.0 Longitude Option BSIV,2017,Diesel,Manual,1,10.03
1,Renault Duster RXZ Turbo CVT,2021,Petrol,Automatic,1,12.83
2,Toyota Camry 2.5 G,2016,Petrol,Automatic,1,16.40
3,Honda Jazz VX CVT,2018,Petrol,Automatic,1,7.77
4,Volkswagen Polo 1.2 MPI Highline,2016,Petrol,Manual,1,5.15
...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,2018,Diesel,Automatic,1,28.90
5508,BMW M Series M4 Coupe,2015,Petrol,Automatic,2,64.90
5509,Jaguar XF 2.2 Litre Luxury,2013,Diesel,Automatic,2,13.75
5510,BMW 7 Series 730Ld,2015,Diesel,Automatic,3,29.90


# data type

In [7]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4883 entries, 0 to 5511
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   car_name       4883 non-null   object 
 1   Year           4883 non-null   int64  
 2   fuel_type      4883 non-null   object 
 3   transmission   4883 non-null   object 
 4   owner          4883 non-null   int64  
 5   Selling Price  4883 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 267.0+ KB


# Statistical Info

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the 'Selling Price' max of 99999.0 is far above the 75% quartile —
# looks like an outlier or sentinel value; verify before modelling.
dataset.describe()

Unnamed: 0,Year,owner,Selling Price
count,4883.0,4883.0,4883.0
mean,2015.238173,1.441122,1879.170385
std,3.964767,0.719857,12031.667108
min,1995.0,0.0,1.0
25%,2013.0,1.0,3.15
50%,2016.0,1.0,5.38
75%,2018.0,2.0,9.5
max,2022.0,5.0,99999.0


# Feature Engineering

In [9]:
# Show the distinct values of each text column. The label strings (including
# their padding) are kept exactly as they were so the printed output is unchanged.
column_labels = {
    'car_name': "Categories in 'car_name' variable:     ",
    'fuel_type': "Categories in 'fuel_type' variable:  ",
    'transmission': "Categories in'transmission' variable:",
}

for column, label in column_labels.items():
    # print's default separator supplies the single space between label and values
    print(label, dataset[column].unique())



Categories in 'car_name' variable:      ['Jeep Compass 2.0 Longitude Option BSIV' 'Renault Duster RXZ Turbo CVT'
 'Toyota Camry 2.5 G' ... 'Volvo XC 90 D5 Momentum BSIV'
 'Mercedes-Benz E-Class E250 Edition E' 'BMW M Series M4 Coupe']
Categories in 'fuel_type' variable:   ['Diesel' 'Petrol' 'Cng' 'Electric' 'Lpg']
Categories in'transmission' variable: ['Manual' 'Automatic']


In [10]:
# define numerical & categorical columns
# Let pandas classify the columns instead of comparing each dtype with 'O':
# the object-dtype test labels every non-object column (datetime, bool,
# category, ...) as numeric, and no longer matches text columns that are stored
# with a dedicated string dtype. Here "numeric" means a real number dtype and
# everything else is treated as categorical; column order is preserved.
numeric_features = dataset.select_dtypes(include='number').columns.tolist()
categorical_features = dataset.select_dtypes(exclude='number').columns.tolist()

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 3 numerical features : ['Year', 'owner', 'Selling Price']

We have 3 categorical features : ['car_name', 'fuel_type', 'transmission']
