## Cleaning the data

In [436]:
## importing required packages

import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import re

In [437]:
## reading data

car_details = pd.read_csv('car_details.csv', dtype={13: 'str'})

car_details

Unnamed: 0,ignition_type,fuel_type,body_type,kilometers_driven,transmission,ownerNo,owner,oem,model,modelYear,...,Compression Ratio,Super Charger,Front Tread,Rear Tread,Gross Weight,Turning Radius,Top Speed,Acceleration,BoreX Stroke,Ground Clearance Unladen
0,0,Petrol,SUV,20000,Automatic,1,1st Owner,Kia,Kia Sonet,2022,...,,,,,,,,,,
1,0,Petrol,Minivans,20687,Manual,1,1st Owner,Maruti,Maruti Eeco,2015,...,9.9:1,No,1280mm,1290mm,1540kg,4.5 metres,145 Kmph,15.7 Seconds,,
2,0,Petrol,SUV,30000,Manual,1,1st Owner,Nissan,Nissan Magnite,2021,...,,No,,,,5.0,,11.7,72.2 x 81.3,
3,0,Petrol,Hatchback,59247,Manual,1,1st Owner,Hyundai,Hyundai i10,2015,...,,No,1400mm,1385mm,,4.7 metres,165 Kmph,14.3 Seconds,,
4,0,Petrol,Hatchback,50000,Manual,1,1st Owner,Honda,Honda Jazz,2015,...,,No,,,,5.1 meters,172 Kmph,13.7 Seconds,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,0,Petrol,Hatchback,10000,Manual,1,1st Owner,Maruti,Maruti Celerio,2022,...,,,1430,1440,1260,,,,,
8365,0,Petrol,Hatchback,120000,Manual,1,1st Owner,Maruti,Maruti Alto 800,2014,...,,No,1295mm,1290mm,1185kg,4.6 metres,140 kmph,19 Seconds,,
8366,0,Petrol,Sedan,50000,Automatic,3,3rd Owner,Mercedes-Benz,Mercedes-Benz C-Class,2011,...,9.3:1,No,1549mm,1552mm,2020kg,5.42 metres,230km/hr,8.8 Seconds,,
8367,0,Petrol,Hatchback,40000,Manual,1,1st Owner,Maruti,Maruti Ritz,2012,...,,No,1470mm,1480mm,1430kg,4.7 metres,156 Kmph,15 Seconds,,


In [438]:
car_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8369 entries, 0 to 8368
Data columns (total 64 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ignition_type             8369 non-null   int64  
 1   fuel_type                 8369 non-null   object 
 2   body_type                 8365 non-null   object 
 3   kilometers_driven         8369 non-null   object 
 4   transmission              8369 non-null   object 
 5   ownerNo                   8369 non-null   int64  
 6   owner                     8369 non-null   object 
 7   oem                       8369 non-null   object 
 8   model                     8369 non-null   object 
 9   modelYear                 8369 non-null   int64  
 10  centralVariantId          8369 non-null   int64  
 11  variantName               8369 non-null   object 
 12  price                     8369 non-null   object 
 13  priceActual               1670 non-null   object 
 14  priceSav

In [439]:
## initializing encoders

label = LabelEncoder()
ohe = OneHotEncoder()

In [440]:
## fuel_type 

car_details['fuel_type'].nunique()

car_details['Fuel_type'] = label.fit_transform(car_details['fuel_type'])

car_details.drop('fuel_type', axis=1, inplace=True)

In [441]:
## body_type

## cleaning null 
mode_ = car_details['body_type'].mode()[0]
car_details['body_type'].fillna(mode_, inplace=True)

## encoding the data

car_details['Body_type'] = label.fit_transform(car_details['body_type'])

car_details.drop('body_type', axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_details['body_type'].fillna(mode_, inplace=True)


In [442]:
# kilometers_driven

car_details['kilometers_driven'] = car_details['kilometers_driven'].str.replace(",", "")
car_details['kilometers_driven'] = car_details['kilometers_driven'].astype(float)

In [443]:
## transmission

car_details['Transmission'] = label.fit_transform(car_details['transmission'])

car_details.drop('transmission', axis=1, inplace=True)

In [444]:
# oem

car_details['oem'].nunique()
car_details['Oem'] = label.fit_transform(car_details['oem'])

car_details.drop('oem', axis=1, inplace=True)

In [445]:
## price

def get_price(row):
    try:
        try:
            pattern = r'\d+\.\d+'#creatign a pattern to extract price 
            result = re.search(pattern, str(row))# searching for pattern
            return float(result.group()) #returning the result 
        except AttributeError:
            pattern = r'\d+'# if the above pattern didn't found searching for this pattern
            result =re.search(pattern, str(row))
            return float(result.group())
    except:
        return None

car_details['price'] = car_details['price'].apply(get_price)
car_details['price'] = car_details['price'].astype(float)

In [446]:
## location

car_details['Location'] = label.fit_transform(car_details['location'])

car_details.drop('location', axis=1, inplace=True)

In [447]:
## Engine Displacement

def clean_engine(row):
    try:
        if (row == "Manual" or row=="Automatic"):
            return 0
        else:
            pattern = r'\d+'
            result =re.search(pattern, str(row))
            return result.group()
    except:
        return 0
def clean_null(row):
    if int(row) == 0:
        result = row + round(car_details['Engine Displacement'].mean())
        return result
    else: 
        return row

car_details['Engine Displacement'] = car_details['Engine Displacement'].apply(clean_engine)
car_details['Engine Displacement'] = car_details['Engine Displacement'].astype(float)
car_details['Engine Displacement'] = car_details['Engine Displacement'].apply(clean_null)

In [448]:
car_details['Insurance Validity'].unique()

array(['Third Party insurance', 'Comprehensive', 'Third Party',
       'Zero Dep', 'Petrol', 'Diesel', 'Not Available', '2', '1',
       'Electric'], dtype=object)

In [449]:
# cleaning insurance validity
def clean_insurance(row):
    if (row.lower() == 'petrol' or row.lower() == 'diesel' or row.lower() == 'electric'):
        return None
    elif row == 'Third Party insurance':
        return 'Third Party'
    elif (row == 'Not Available' or str(row) == '1' or str(row) == '2'):
        return None
    else: 
        return row

car_details['Insurance Validity'] = car_details['Insurance Validity'].apply(clean_insurance)
car_details['Insurance Validity'] = label.fit_transform(car_details['Insurance Validity'])

In [450]:
car_details['Insurance Validity']

0       1
1       0
2       1
3       0
4       1
       ..
8364    1
8365    1
8366    1
8367    1
8368    1
Name: Insurance Validity, Length: 8369, dtype: int32

In [451]:
#Color

car_details['Color'].unique()

car_details['Color'] = car_details['Color'].fillna(car_details['Color'].mode()[0])

## label encoding 

car_details['Color'] = label.fit_transform(car_details['Color'])

In [452]:
# cleaning Engine Type

car_details['Engine Type'].unique()

## label encoding

car_details['Engine Type'] = car_details['Engine Type'].fillna(car_details['Engine Type'].mode()[0])
car_details['Engine Type'] = label.fit_transform(car_details['Engine Type'])


In [453]:
## No of Cylinder

car_details['No of Cylinder'].unique()

car_details['No of Cylinder'] = car_details['No of Cylinder'].fillna(car_details['No of Cylinder'].mode()[0])

In [454]:
# FTurbo Charger

car_details['Turbo Charger'].unique()

def clean_turbo(row):
    if (str(row) == 'no' or str(row) == 'NO'):
        return 'No'
    elif (str(row) == 'YES' or str(row) == 'yes' or str(row) == 'twin' or str(row) == 'Twin' or str(row) == 'Turbo'):
        return 'Yes'
    else:
        return row

car_details['Turbo Charger'] = car_details['Turbo Charger'].apply(clean_turbo)

car_details['Turbo Charger'] = car_details['Turbo Charger'].fillna(car_details['Turbo Charger'].mode()[0])

car_details['Turbo Charger'] = label.fit_transform(car_details['Turbo Charger'])



In [455]:
# length
car_details['Length'] = car_details['Length'].apply(get_price)
car_details['Length'] = car_details['Length'].replace(0, None)
car_details['Length'] = car_details['Length'].fillna(car_details['Length'].median())

# Width
car_details['Width'] = car_details['Width'].apply(get_price)
car_details['Width'] = car_details['Width'].replace(0, None)
car_details['Width'] = car_details['Width'].fillna(car_details['Width'].median())

# Height
car_details['Height'] = car_details['Height'].apply(get_price)
car_details['Height'] = car_details['Height'].replace(0, None)
car_details['Height'] = car_details['Height'].fillna(car_details['Height'].median())

# Wheel Base
car_details['Wheel Base'] = car_details['Wheel Base'].apply(get_price)
car_details['Wheel Base'] = car_details['Wheel Base'].replace(0, None)
car_details['Wheel Base'] = car_details['Wheel Base'].fillna(car_details['Wheel Base'].median())

# Kerb Weight
car_details['Kerb Weight'] = car_details['Kerb Weight'].apply(get_price)
car_details['Kerb Weight'] = car_details['Kerb Weight'].replace(0, None)
car_details['Kerb Weight'] = car_details['Kerb Weight'].fillna(car_details['Kerb Weight'].median())

# No Door Numbers
car_details['No Door Numbers'] = car_details['No Door Numbers'].apply(get_price)
car_details['No Door Numbers'] = car_details['No Door Numbers'].replace(0, None)
car_details['No Door Numbers'] = car_details['No Door Numbers'].fillna(car_details['No Door Numbers'].median())



In [456]:
## Gear Box

car_details['Gear Box'] = car_details['Gear Box'].apply(get_price)

car_details['Gear Box'] = car_details['Gear Box'].fillna(car_details['Gear Box'].mode()[0])

In [457]:
## Drive Type

car_details['Drive Type'] = car_details['Drive Type'].fillna(car_details['Drive Type'].mode()[0])

car_details['Drive Type'] = label.fit_transform(car_details['Drive Type'])

car_details['Drive Type'].unique()

array([12, 16,  9,  7,  1,  4, 14, 13,  0,  6,  5, 19, 18,  2, 11,  8, 10,
       17, 15,  3])

In [458]:
# cleaning datas in Steering Type
def steering_type(row):
    row = str(row)
    if (row.lower() == 'electric' or row.lower() == 'electrical' or row.lower() == "electronic" or row.lower() == 'epas'):
        return 'electric'
    elif (row.lower() == 'manual'):
        return 'manual'
    elif (row.lower() == 'power'):
        return 'power'
    else:
        return 'hydraulic'

car_details['Steering Type'] = car_details['Steering Type'].apply(steering_type)
car_details['Steering Type'] = label.fit_transform(car_details['Steering Type'])

In [459]:
## cargo volume

car_details['Cargo Volumn'] = car_details['Cargo Volumn'].apply(get_price)

car_details['Cargo Volumn'] = car_details['Cargo Volumn'].fillna(car_details['Cargo Volumn'].mean())

In [460]:
## Value Configuration

car_details['Value Configuration'] = car_details['Value Configuration'].fillna(car_details['Value Configuration'].mode()[0])

car_details['Value Configuration'] = label.fit_transform(car_details['Value Configuration'])

In [461]:
# Turning Radius

car_details['Turning Radius'] = car_details['Turning Radius'].apply(get_price)

car_details['Turning Radius'] = car_details['Turning Radius'].fillna(car_details['Turning Radius'].mode()[0])

In [462]:
car_details.drop(['owner', 'model', "variantName", 'index_number', 'priceActual', 'priceSaving', 'priceFixedText', 'trendingText', 'Registration Year', "Fuel Type", 'Seats', 'Kms Driven', 'RTO', 'Ownership', 'Transmission', 'Year of Manufacture', 'Displacement', 'Values per Cylinder', 'Fuel Suppy System', 'Seating Capacity', 'Front Brake Type', 'Rear Brake Type','Tyre Type', 'Alloy Wheel Size', 'Compression Ratio', 'Super Charger', 'Front Tread', 'Rear Tread','Max Power', 'Max Torque','Gross Weight', 'Top Speed', 'Acceleration', 'BoreX Stroke', 'Ground Clearance Unladen'], axis=1, inplace=True)

In [463]:
car_details

Unnamed: 0,ignition_type,kilometers_driven,ownerNo,modelYear,centralVariantId,price,Insurance Validity,Engine Displacement,Color,Engine Type,...,Drive Type,Steering Type,No Door Numbers,Cargo Volumn,Value Configuration,Turning Radius,Fuel_type,Body_type,Oem,Location
0,0,20000.0,1,2022,8654,11.50,1,998.0,8,369,...,12,0,5.0,392.0,3,5.30,4,7,13,1
1,0,20687.0,1,2015,4025,4.15,0,1196.0,57,278,...,16,2,5.0,540.0,3,4.50,4,5,20,1
2,0,30000.0,1,2021,8135,7.50,1,999.0,86,263,...,12,0,5.0,336.0,8,5.00,4,7,24,1
3,0,59247.0,1,2015,1579,3.98,0,1086.0,116,265,...,12,3,5.0,225.0,8,4.70,4,2,9,1
4,0,50000.0,1,2015,1341,5.50,1,1199.0,86,476,...,12,3,5.0,354.0,8,5.10,4,2,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8364,0,10000.0,1,2022,8682,5.10,1,1269.0,86,296,...,12,0,5.0,313.0,3,5.30,4,2,20,5
8365,0,120000.0,1,2014,3943,1.80,1,1269.0,86,251,...,12,3,5.0,177.0,3,4.60,4,2,20,5
8366,0,50000.0,3,2011,4672,5.50,1,1796.0,86,193,...,16,3,4.0,475.0,3,5.42,4,8,21,5
8367,0,40000.0,1,2012,4144,1.40,1,1269.0,86,289,...,12,3,5.0,236.0,3,4.70,4,2,20,5


In [464]:
car_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8369 entries, 0 to 8368
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ignition_type        8369 non-null   int64  
 1   kilometers_driven    8369 non-null   float64
 2   ownerNo              8369 non-null   int64  
 3   modelYear            8369 non-null   int64  
 4   centralVariantId     8369 non-null   int64  
 5   price                8369 non-null   float64
 6   Insurance Validity   8369 non-null   int32  
 7   Engine Displacement  8369 non-null   float64
 8   Color                8369 non-null   int32  
 9   Engine Type          8369 non-null   int32  
 10  No of Cylinder       8369 non-null   float64
 11  Turbo Charger        8369 non-null   int32  
 12  Length               8369 non-null   float64
 13  Width                8369 non-null   float64
 14  Height               8369 non-null   float64
 15  Wheel Base           8369 non-null   f

In [496]:
car_details.to_csv('cleaned_details.csv')