In [77]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [78]:
# Load the raw cars dataset; the cells below clean and encode this frame.
df = pd.read_csv(filepath_or_buffer='../Raw Data/cars_india.csv')

## Data Cleaning

In [79]:
# Keep only fully populated rows: a row is removed as soon as any one of its
# values is NaN/missing (axis=0 and how='any' are the pandas defaults, spelled out).
df.dropna(axis=0, how='any', inplace=True)

In [80]:
# Express car prices in USD instead of Rupees using a fixed conversion rate.
# NOTE: re-running this cell applies the conversion a second time.
RUPEE_TO_USD = 0.012
df['Price'] = df['Price'].mul(RUPEE_TO_USD)

In [81]:
# Express the odometer reading in miles instead of kilometers.
# The column keeps its original name 'Kilometer' even though it now holds miles.
KM_TO_MILES = 0.621
df['Kilometer'] = df['Kilometer'].mul(KM_TO_MILES)

In [82]:
# Turn the engine displacement text (number followed by 'cc') into liters.
# rstrip(' cc') treats its argument as a set of characters: it removes every
# trailing space and 'c', leaving only the numeric part for pd.to_numeric.
engine_cc = pd.to_numeric(df['Engine'].map(lambda raw: str(raw).rstrip(' cc')))
# 1 cubic centimeter = 0.001 liter
df['Engine'] = engine_cc * 0.001

In [83]:
# Split the 'Max Power' column on '@' into 2 columns: 'Horsepower' (the part
# before the '@') and 'HP RPM' (the part after it), then drop 'Max Power'.
# n=1 splits on the first '@' only, so the result never has more than two
# columns and a stray second '@' cannot break the two-column assignment.
df[['Horsepower', 'HP RPM']] = df['Max Power'].str.split('@', n=1, expand=True)
df.drop(columns='Max Power', inplace=True)

In [84]:
# Remove the unit text from the two new columns. rstrip() treats its argument
# as a set of characters, so every trailing ' ', 'b', 'h', 'p' (for Horsepower)
# or ' ', 'r', 'p', 'm' (for HP RPM) is removed. str() is applied first, so the
# values stay strings here and a missing value becomes the text 'None'/'nan'.
for unit_column, unit_chars in (('Horsepower', ' bhp'), ('HP RPM', ' rpm')):
    df[unit_column] = df[unit_column].map(
        lambda raw, chars=unit_chars: str(raw).rstrip(chars)
    )

In [85]:
# Split the 'Max Torque' column on '@' into 'Torque' (the part before the '@')
# and 'Torque RPM' (the part after it), then drop 'Max Torque'.
# n=1 splits on the first '@' only, so the result never has more than two
# columns and a stray second '@' cannot break the two-column assignment.
df[['Torque', 'Torque RPM']] = df['Max Torque'].str.split('@', n=1, expand=True)
df.drop(columns='Max Torque', inplace=True)

# Remove the unit text. rstrip() strips a set of characters, not a literal
# suffix: trailing ' ', 'N', 'm' for Torque and ' ', 'r', 'p', 'm' for Torque RPM.
df['Torque'] = df['Torque'].map(lambda raw: str(raw).rstrip(' Nm'))
df['Torque RPM'] = df['Torque RPM'].map(lambda raw: str(raw).rstrip(' rpm'))

In [86]:
# The 'Model' column is not carried into the cleaned dataset.
df = df.drop(labels='Model', axis='columns')

In [87]:
# Bucket the price into five ordered categories of width 10,000.
# Bins are right-closed and include_lowest=True also admits a price of exactly 0.
# The top bin is open-ended: with a finite upper edge, any price above that
# edge would fall outside every bin, become NaN and be dropped by a later cell.
labels = ['very low', 'low', 'medium', 'high', 'very high']
boundaries = [0, 10000, 20000, 30000, 40000, float('inf')]
df['Price'] = pd.cut(df['Price'], bins=boundaries, labels=labels, include_lowest=True)

## Label Encoding of non-numeric columns

In [88]:
# Text file that the encoding cells below write their label -> code mappings to.
# The handle stays open across those cells and is closed by a later cell.
# An explicit UTF-8 encoding keeps the output independent of the platform default.
label_encodings_file = open("../Clean Data/india_label_encodings.txt", "w", encoding="utf-8")

In [89]:
# Replace each 'Make' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Make'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
le_make_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Make'] = le.transform(df['Make'])

label_encodings_file.write("Make: \n")
label_encodings_file.write(str(le_make_mapping))

456

In [90]:
# Replace each 'Fuel Type' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Fuel Type'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_fuel_type_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Fuel Type'] = le.transform(df['Fuel Type'])

label_encodings_file.write("\n\nFuel Type: \n")
label_encodings_file.write(str(le_fuel_type_mapping))

94

In [91]:
# Replace each 'Location' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Location'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_location_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Location'] = le.transform(df['Location'])

label_encodings_file.write("\n\nLocation: \n")
label_encodings_file.write(str(le_location_mapping))

1160

In [92]:
# Replace each 'Color' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Color'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_color_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Color'] = le.transform(df['Color'])

label_encodings_file.write("\n\nColor: \n")
label_encodings_file.write(str(le_color_mapping))

200

In [93]:
# Replace each 'Owner' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Owner'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_owner_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Owner'] = le.transform(df['Owner'])

label_encodings_file.write("\n\nOwner: \n")
label_encodings_file.write(str(le_owner_mapping))

60

In [94]:
# Replace each 'Transmission' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Transmission'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_transmission_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Transmission'] = le.transform(df['Transmission'])

label_encodings_file.write("\n\nTransmission: \n")
label_encodings_file.write(str(le_transmission_mapping))

29

In [95]:
# Replace each 'Drivetrain' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Drivetrain'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_drivetrain_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Drivetrain'] = le.transform(df['Drivetrain'])

label_encodings_file.write("\n\nDrivetrain: \n")
label_encodings_file.write(str(le_drivetrain_mapping))

30

In [96]:
# Replace each 'Seller Type' label with its integer code and record the mapping.
le = LabelEncoder()
le.fit(df['Seller Type'])
# Cast codes to plain Python ints: le.transform returns NumPy integers, whose
# repr on NumPy >= 2 is 'np.int64(0)' and would clutter the text file.
# The mapping gets its own name so it no longer overwrites the 'Make' mapping.
le_seller_type_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
df['Seller Type'] = le.transform(df['Seller Type'])

label_encodings_file.write("\n\nSeller Type: \n")
label_encodings_file.write(str(le_seller_type_mapping))

63

In [98]:
# Map the price categories to the integers 0-4, in the order listed below,
# and record that mapping in the encodings file.
price_levels = ('very low', 'low', 'medium', 'high', 'very high')
price_map = {level: rank for rank, level in enumerate(price_levels)}
df['Price'] = df['Price'].map(price_map)

label_encodings_file.write("\n\nPrice: \n")
label_encodings_file.write(str(price_map))

65

In [99]:
# All mappings have been written; release the file handle.
label_encodings_file.close()
# Renumber the rows from 0 (discarding the old index), then remove every row
# that has a missing value. The index is reset before the drop, so it is not
# renumbered again after rows are removed.
df.reset_index(inplace=True, drop=True)
df.dropna(axis=0, how='any', inplace=True)

In [100]:
# Make every column numeric. With errors='coerce', any value that cannot be
# parsed as a number becomes NaN instead of raising.
numeric_columns = {
    name: pd.to_numeric(df[name], errors='coerce') for name in df.columns
}
for name, values in numeric_columns.items():
    df[name] = values

In [101]:
# Remove every row that still contains a NaN, including those produced by the
# numeric coercion in the previous cell.
df.dropna(axis=0, how='any', inplace=True)

In [102]:
# Show the cleaned frame as this cell's output (pandas truncates long frames).
df

Unnamed: 0,Make,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,...,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity,Horsepower,HP RPM,Torque,Torque RPM
0,7,0,2017,54120.150,5,1,56,7,0,1,...,1,3990.0,1680.0,1505.0,5.0,35.0,87.0,6000.0,109.0000,4500
1,18,0,2014,46575.000,2,1,40,14,1,2,...,1,3995.0,1695.0,1555.0,5.0,42.0,74.0,4000.0,190.0000,2000
2,8,0,2011,41607.000,5,1,39,8,0,2,...,1,3585.0,1595.0,1550.0,5.0,35.0,79.0,6000.0,112.7619,4000
3,29,0,2019,23287.500,5,1,41,12,0,2,...,1,3995.0,1745.0,1510.0,5.0,37.0,82.0,6000.0,113.0000,4200
4,29,2,2018,42849.000,2,1,45,7,0,2,...,2,4735.0,1830.0,1795.0,7.0,55.0,148.0,3400.0,343.0000,1400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,18,0,2014,49059.000,5,1,20,14,1,2,...,1,3775.0,1680.0,1620.0,5.0,43.0,85.0,6000.0,113.0000,4500
1870,17,1,2016,56076.300,2,1,66,14,0,2,...,1,4585.0,1890.0,1785.0,7.0,70.0,138.0,3750.0,330.0000,1600
1871,8,0,2014,51543.000,5,1,1,14,1,2,...,1,3495.0,1550.0,1500.0,5.0,32.0,55.0,5500.0,75.0000,4000
1872,6,0,2013,45333.000,5,1,67,13,0,2,...,1,3795.0,1680.0,1427.0,5.0,45.0,70.0,6250.0,102.0000,4000


In [103]:
# Persist the cleaned frame; index=False leaves the row index out of the CSV.
df.to_csv(path_or_buf='../Clean Data/cars_india_clean.csv', index=False)