In [3]:
import pandas as pd
import numpy as np
import joblib

In [4]:
# Load the raw car listings from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cars_data.csv")

In [5]:
# Preview five random rows of the raw data. The fixed seed keeps the preview
# identical across "Restart Kernel -> Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
8071,Toyota Innova 2.5 V Diesel 7-seater,2010,425000,200000,Diesel,Individual,Manual,Second Owner,12.8 kmpl,2494 CC,102 bhp,20.4@ 1400-3400(kgm@ rpm),7.0
4246,Maruti Swift AMT ZXI,2018,600000,69779,Petrol,Dealer,Automatic,First Owner,22.0 kmpl,1197 CC,81.80 bhp,113Nm@ 4200rpm,5.0
5971,Mahindra Bolero 2011-2019 ZLX,2015,540000,110000,Diesel,Individual,Manual,Second Owner,15.96 kmpl,2523 CC,62.1 bhp,195Nm@ 1400-2200rpm,7.0
4827,Maruti Swift AMT ZXI,2018,600000,69779,Petrol,Dealer,Automatic,First Owner,22.0 kmpl,1197 CC,81.80 bhp,113Nm@ 4200rpm,5.0
3218,Maruti Alto 800 LXI,2020,350000,5000,Petrol,Individual,Manual,First Owner,22.05 kmpl,796 CC,47.3 bhp,69Nm@ 3500rpm,5.0


In [6]:
# Show the per-column missing-value counts before any rows are removed.
# The summary has to be printed explicitly: a bare expression is only rendered
# by the notebook when it is the last statement of the cell.
print(df.isna().sum())
# Discard every row that contains at least one missing value.
df.dropna(inplace=True)

In [7]:
def convert_mileage(val):
    """Convert a mileage string such as ``'23.4 kmpl'`` into a float.

    The input is split on whitespace and must yield exactly two parts, a
    number and a unit. Values whose unit is ``'km/kg'`` are multiplied by
    0.77 and rounded to two decimals; for any other unit the number is
    returned unchanged.

    Args:
        val: Mileage text in the form ``'<number> <unit>'``.

    Returns:
        The mileage as a float, or ``None`` when ``val`` cannot be parsed
        (not a string, wrong number of parts, or a non-numeric first part).
    """
    try:
        num, unit = val.split()
        num = float(num)
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are treated as "unparseable". Anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return None
    if unit == 'km/kg':
        # NOTE(review): 0.77 is presumably a km/kg -> kmpl-equivalent factor
        # for gas-fuelled cars; confirm where it comes from.
        return round(num * 0.77, 2)
    return num

# Replace the textual mileage column with its numeric form.
# The dtype guard makes this cell safe to re-run: once the column holds floats,
# applying convert_mileage again would turn every value into None, because
# floats have no .split() and the function maps unparseable input to None.
if not pd.api.types.is_numeric_dtype(df['mileage']):
    df['mileage'] = df['mileage'].apply(convert_mileage)


In [8]:
# Strip the unit suffix from each measurement column and store it as float.
# The suffix is matched as a literal substring, not as a regular expression.
for column, unit_suffix in (('engine', ' CC'), ('max_power', ' bhp')):
    without_unit = df[column].str.replace(unit_suffix, '', regex=False)
    df[column] = without_unit.astype(float)

In [9]:
# Preview five random rows after the numeric conversions. The fixed seed keeps
# the preview identical across "Restart Kernel -> Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
6543,Ford Figo Aspire Titanium,2015,500000,18000,Petrol,Dealer,Manual,Second Owner,18.5,1194.0,94.93,119Nm@ 4250rpm,5.0
5015,Maruti Ertiga SHVS ZDI Plus,2017,800000,55000,Diesel,Individual,Manual,First Owner,24.52,1248.0,88.5,200Nm@ 1750rpm,7.0
6224,Maruti Ertiga ZDI,2015,650000,80000,Diesel,Individual,Manual,First Owner,20.77,1248.0,88.8,200Nm@ 1750rpm,7.0
6385,Maruti Alto 800 LXI,2013,210000,50000,Petrol,Individual,Manual,Second Owner,22.74,796.0,47.3,69Nm@ 3500rpm,5.0
6113,Maruti Ciaz 1.3 Alpha,2017,720000,60000,Diesel,Individual,Manual,First Owner,28.09,1248.0,88.5,200Nm@ 1750rpm,5.0


In [10]:
# Remove the columns that are not carried into the processed dataset.
unused_columns = ['name', 'torque']
df.drop(labels=unused_columns, axis='columns', inplace=True)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its own LabelEncoder and keep
# the fitted encoders, keyed by column name, so they can be saved afterwards.
categorical_columns = ('fuel', 'seller_type', 'transmission', 'owner')
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

In [12]:
# Preview five random rows of the fully encoded frame. The fixed seed keeps
# the preview identical across "Restart Kernel -> Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
6199,2013,300000,75000,3,1,1,4,18.9,1197.0,82.0,5.0
4742,2013,1000000,75010,1,0,0,0,13.73,1995.0,148.0,5.0
8026,2008,100000,120000,1,1,1,0,19.09,1396.0,69.0,5.0
4367,2019,750000,6500,3,1,0,0,21.21,1197.0,81.8,5.0
8119,2017,360000,80000,3,1,1,0,20.51,998.0,67.04,5.0


In [13]:
# Write the processed frame to disk without the row index.
df.to_csv(path_or_buf='cars_processed.csv', index=False)

In [14]:
# Save the fitted encoders (dict keyed by column name) alongside the processed data.
joblib.dump(value=label_encoders, filename="label_encoders.joblib")

['label_encoders.joblib']