In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category argument this silences every warning for the
# whole kernel session (deprecations and pandas/seaborn warnings included);
# consider narrowing it to the specific categories that are known to be noisy.
warnings.simplefilter("ignore")

1. LOADING AND INITIAL INSPECTION

1.1 Initial Data Loading

In [3]:
# Read cars24data.csv (path is relative to the working directory) into the
# `cars` DataFrame that every later cell operates on.
cars = pd.read_csv('cars24data.csv')

In [7]:
# Preview the first ten rows to check column names and value formats
cars.head(10)

Unnamed: 0,Model Name,Price,Manufacturing_year,Engine capacity,Spare key,Transmission,KM driven,Ownership,Fuel type,Imperfections,Repainted Parts
0,2017 Maruti Swift VXI,561000,2017,1197,No,Manual,25847,2,Petrol,6,2
1,2016 Maruti Baleno DELTA PETROL 1.2,498000,2016,1197,Yes,Manual,55511,2,Petrol,12,1
2,2020 Maruti Swift VXI,577000,2020,1197,No,Manual,47110,1,Petrol,4,2
3,2022 Maruti Ertiga VXI AT SHVS,1084000,2022,1462,Yes,Automatic,35378,1,Petrol,2,3
4,2019 Maruti Dzire VXI,603000,2019,1197,Yes,Manual,91856,1,Petrol,3,2
5,2014 Maruti Alto 800 LXI,233000,2014,796,No,Manual,43780,1,Petrol,10,2
6,2020 Maruti Swift VXI,593000,2020,1197,Yes,Manual,49583,1,Petrol,1,0
7,2018 Maruti Dzire VXI AMT,583000,2018,1197,No,Automatic,86837,2,Petrol,4,6
8,2016 Maruti Swift Dzire VXI,513000,2016,1197,Yes,Manual,58570,2,Petrol,10,3
9,2019 Maruti S PRESSO VXI,378000,2019,998,Yes,Manual,50645,1,Petrol,0,0


In [6]:
# Preview the last ten rows to confirm the file was read through to its end
cars.tail(10)

Unnamed: 0,Model Name,Price,Manufacturing_year,Engine capacity,Spare key,Transmission,KM driven,Ownership,Fuel type,Imperfections,Repainted Parts
1435,2018 Maruti Ciaz ZETA 1.4 AT PETROL,648000,2018,1373,Yes,Automatic,64686,1,Petrol,14,1
1436,2017 Maruti Baleno ALPHA PETROL 1.2,552000,2017,1197,Yes,Manual,75059,1,Petrol,5,6
1437,2020 Maruti Baleno ZETA PETROL 1.2,638000,2020,1197,No,Manual,58346,1,Petrol,1,0
1438,2016 Maruti Ciaz ZDI SHVS,512000,2016,1248,Yes,Manual,79630,1,Diesel,8,7
1439,2021 Maruti Swift VXI,621000,2021,1197,No,Manual,44299,1,Petrol,1,0
1440,2021 Maruti Ertiga VXI SHVS,862000,2021,1462,No,Manual,19901,1,Petrol,1,0
1441,2015 Maruti Ciaz ZXI,507000,2015,1373,No,Manual,50022,1,Petrol,5,2
1442,2019 Maruti Baleno DELTA PETROL 1.2,554000,2019,1197,Yes,Manual,58679,1,Petrol,24,4
1443,2017 Maruti Ciaz S 1.4 MT PETROL,557000,2017,1373,Yes,Manual,73948,2,Petrol,4,5
1444,2012 Maruti Wagon R 1.0 LXI,256000,2012,998,No,Manual,55994,1,Petrol,20,9


1.2 Initial Data Overview

In [10]:
# Dimensions of the DataFrame as a (rows, columns) tuple
cars.shape

(1445, 11)

In [13]:
# Per-column dtype and non-null count, plus overall memory usage
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1445 entries, 0 to 1444
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Model Name          1445 non-null   object
 1   Price               1445 non-null   int64 
 2   Manufacturing_year  1445 non-null   int64 
 3   Engine capacity     1445 non-null   int64 
 4   Spare key           1445 non-null   object
 5   Transmission        1445 non-null   object
 6   KM driven           1445 non-null   int64 
 7   Ownership           1445 non-null   int64 
 8   Fuel type           1445 non-null   object
 9   Imperfections       1445 non-null   int64 
 10  Repainted Parts     1445 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 124.3+ KB


2. DATA CLEANING AND PROCESSING

2.1 Missing Value Analysis

In [5]:
# Count the missing (null) values in each column
cars.isnull().sum()

Model Name            0
Price                 0
Manufacturing_year    0
Engine capacity       0
Spare key             0
Transmission          0
KM driven             0
Ownership             0
Fuel type             0
Imperfections         0
Repainted Parts       0
dtype: int64

2.2 Duplicate Detection

In [7]:
# Count duplicate rows, i.e. rows that repeat an earlier row in every column
cars.duplicated().sum()

0

2.3 Outlier Detection

In [12]:
# Function to detect outliers using the IQR method
def detect_outliers_iqr(cars, columns, factor=1.5):
    """Return the IQR-rule outliers of each requested column.

    A value counts as an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where Q1 and
    Q3 are the column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    cars : pandas.DataFrame
        Frame holding the columns to screen. It is only read, never modified.
    columns : iterable of str
        Labels of the columns to check. Each column must support ``quantile``
        and ordering comparisons, i.e. be numeric.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag fewer points.

    Returns
    -------
    dict
        Maps each column label to a Series containing only that column's
        outlying values, with their original index labels preserved.

    Raises
    ------
    ValueError
        If ``factor`` is negative, which would invert the fences.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    outliers = {}
    for col in columns:
        series = cars[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = factor * (q3 - q1)
        lower_bound = q1 - spread
        upper_bound = q3 + spread
        # Mask the column itself rather than filtering the whole frame first,
        # which would copy every other column just to discard it.
        outliers[col] = series[(series < lower_bound) | (series > upper_bound)]
    return outliers

# Columns holding numeric data, each screened for IQR outliers
numeric_columns = [
    "Price",
    "Manufacturing_year",
    "Engine capacity",
    "KM driven",
    "Ownership",
    "Imperfections",
    "Repainted Parts",
]

# Collect the outlying values of every numeric column
outliers = detect_outliers_iqr(cars, numeric_columns)

# Report how many outliers were found per column
for col, flagged in outliers.items():
    print(f"{col}: {len(flagged)} outliers")

Price: 46 outliers
Manufacturing_year: 0 outliers
Engine capacity: 0 outliers
KM driven: 0 outliers
Ownership: 0 outliers
Imperfections: 36 outliers
Repainted Parts: 23 outliers


3. DESCRIPTIVE STATISTICAL ANALYSIS

3.1 Numerical Variables Analysis

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
cars.describe()

Unnamed: 0,Price,Manufacturing_year,Engine capacity,KM driven,Ownership,Imperfections,Repainted Parts
count,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0
mean,526354.3,2017.817301,1142.104498,50588.903114,1.285121,9.597232,3.228374
std,196369.6,2.986554,169.020818,27339.562631,0.489877,8.398637,3.364578
min,139000.0,2010.0,796.0,1207.0,1.0,0.0,0.0
25%,390000.0,2016.0,998.0,28803.0,1.0,3.0,0.0
50%,501000.0,2018.0,1197.0,47849.0,1.0,8.0,2.0
75%,631000.0,2020.0,1197.0,70337.0,2.0,14.0,5.0
max,1599000.0,2023.0,1462.0,124716.0,3.0,43.0,27.0


3.2 Categorical Variable Analysis