In [5]:
# Import basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('Cars_24.csv')

In [12]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [13]:
df.shape

(8128, 12)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   seller_type         8128 non-null   object 
 6   transmission        8128 non-null   object 
 7   owner               8128 non-null   object 
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   object 
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 762.1+ KB


In [15]:
df.isnull().sum()

name                    0
year                    0
selling_price           0
km_driven               0
fuel                    0
seller_type             0
transmission            0
owner                   0
mileage(km/ltr/kg)    221
engine                221
max_power             215
seats                 221
dtype: int64

In [26]:
df['max_power'].nunique()

320

In [31]:
df[df['max_power']== ' bhp']

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats


In [33]:
# 2. Data Type Corrections
# Convert 'max_power' to numeric (some values may be invalid)
df['max_power'] = pd.to_numeric(df['max_power'], errors='coerce')

In [34]:
df.dtypes

name                   object
year                    int64
selling_price           int64
km_driven               int64
fuel                   object
seller_type            object
transmission           object
owner                  object
mileage(km/ltr/kg)    float64
engine                float64
max_power             float64
seats                 float64
dtype: object

In [35]:
# 3. Missing Value Treatment
# Check again for missing
missing_cols = df.columns[df.isnull().any()].tolist()

In [36]:
missing_cols

['mileage(km/ltr/kg)', 'engine', 'max_power', 'seats']

In [37]:
# Fill numeric columns with median
numeric_cols = df.select_dtypes(include=['int64','float64']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

In [38]:
# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [40]:
df.isna().sum() 

name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
seller_type           0
transmission          0
owner                 0
mileage(km/ltr/kg)    0
engine                0
max_power             0
seats                 0
dtype: int64

In [43]:
for col in categorical_cols:
    print(f"{col}: {df[col].value_counts()} ")
    print('-'*50)

name: name
Maruti Swift Dzire VDI              129
Maruti Alto 800 LXI                  82
Maruti Alto LXi                      71
BMW X4 M Sport X xDrive20d           62
Maruti Swift VDI                     61
                                   ... 
Maruti 800 DX BSII                    1
Ford Figo Aspire Titanium Diesel      1
Hyundai Verna CRDi 1.6 SX             1
Maruti Baleno Alpha Diesel            1
Tata New Safari Dicor VX 4X2          1
Name: count, Length: 2058, dtype: int64 
--------------------------------------------------
fuel: fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64 
--------------------------------------------------
seller_type: seller_type
Individual          6766
Dealer              1126
Trustmark Dealer     236
Name: count, dtype: int64 
--------------------------------------------------
transmission: transmission
Manual       7078
Automatic    1050
Name: count, dtype: int64 
----------------------------------------

In [44]:
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()} unique values")

name: 2058 unique values
fuel: 4 unique values
seller_type: 3 unique values
transmission: 2 unique values
owner: 5 unique values


In [48]:
encode_dict = {
    'fuel_type': {'Petrol': 1, 'Diesel': 2, 'CNG': 3, 'LPG': 4},
    'transmission_type': {'Manual': 1, 'Automatic': 2},
    'seller_type': {'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3},
    'owner': {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 5}    
}

In [50]:
df.replace(encode_dict, inplace=True)

  df.replace(encode_dict, inplace=True)


In [51]:
df.head(3)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,1,Manual,1,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,1,Manual,2,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,1,Manual,3,17.7,1497.0,78.0,5.0


In [52]:
X = df.drop(columns = ['name','selling_price'])
y = df['selling_price']

In [58]:
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)