# IMPORTING THE LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("car_price.csv", index_col='index')
df

# Removing the car name columns so the model does not consider car companies for prediction.

In [5]:
df.pop("car_name")
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Removing lakhs and crores prefix in car_prices column and converting prices to NRS from INR

In [6]:
crore = (df["car_prices_in_lakh"].str.contains("Crore"))
df["car_prices_in_lakh"] = df["car_prices_in_lakh"].str.removesuffix("Lakh").str.removesuffix("Crore").str.replace(',','').astype(float)
df["car_prices_in_lakh"] = (df["car_prices_in_lakh"] * 100).where(crore, other=df["car_prices_in_lakh"])
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Removing km suffix from km driven columns

In [7]:
df["kms_driven"] = df["kms_driven"].str.removesuffix("kms").str.replace(',','').astype(int)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,13248,Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,60343,Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,26696,Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,69414,Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,45000,Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,29000,Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,90000,Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,79000,Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Labeling fuel types

In [8]:
fuels = {"Petrol":1,
         "Diesel": 2,
         "Cng": 3,
         "Lpg": 4,
         "Electric": 5
         } 
df["fuel_type"] = df["fuel_type"].replace(fuels)  #df["fuel_type"].replace(fuels, inplace=True)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,13248,1,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,60343,1,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,26696,1,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,69414,1,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,29000,1,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,90000,2,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,79000,2,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Changing seats colums to int from str by removing suffix

In [9]:
df["Seats"] = df["Seats"].str.removesuffix("Seats").astype(int)

# Converting engine column in int as well as renaming column

In [10]:
df = df.rename({"engine": "engine_cc"}, axis="columns")
df["engine_cc"] = df["engine_cc"].str.removesuffix("cc").astype(int)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine_cc,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,Manual,1st Owner,2017,1956,5
1,12.83,13248,1,Automatic,1st Owner,2021,1330,5
2,16.40,60343,1,Automatic,1st Owner,2016,2494,5
3,7.77,26696,1,Automatic,1st Owner,2018,1199,5
4,5.15,69414,1,Manual,1st Owner,2016,1199,5
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,Automatic,1st Owner,2018,2995,7
5508,64.90,29000,1,Automatic,2nd Owner,2015,1968,5
5509,13.75,90000,2,Automatic,2nd Owner,2013,2755,5
5510,29.90,79000,2,Automatic,3rd Owner,2015,2967,6


# Giving tansmission a number for either manual or automatic

In [11]:
transmissions = {"Manual": 0, "Automatic": 1}
df['transmission'] = df['transmission'].replace(transmissions)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine_cc,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,0,1st Owner,2017,1956,5
1,12.83,13248,1,1,1st Owner,2021,1330,5
2,16.40,60343,1,1,1st Owner,2016,2494,5
3,7.77,26696,1,1,1st Owner,2018,1199,5
4,5.15,69414,1,0,1st Owner,2016,1199,5
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,1,1st Owner,2018,2995,7
5508,64.90,29000,1,1,2nd Owner,2015,1968,5
5509,13.75,90000,2,1,2nd Owner,2013,2755,5
5510,29.90,79000,2,1,3rd Owner,2015,2967,6


 # Changing owner columns to int as well

In [12]:

owners = {"1st":1, "2nd":2, "3rd":3, "4th":4, "5th":5, "0th": 0}
df["ownership"] = df["ownership"].str.removesuffix("Owner").str.strip().replace(owners)
df['ownership']

index
0       1
1       1
2       1
3       1
4       1
       ..
5507    1
5508    2
5509    2
5510    3
5511    2
Name: ownership, Length: 5512, dtype: int64

# Reordering prices at last cols

In [13]:
cols = list(df.columns.values)
price = cols.pop(0)
seat = cols.pop(6)
cols.insert(1, seat)
cols.insert(8, price)
df = df.reindex(columns = cols)
df

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,car_prices_in_lakh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,86226,5,2,0,1,2017,1956,10.03
1,13248,5,1,1,1,2021,1330,12.83
2,60343,5,1,1,1,2016,2494,16.40
3,26696,5,1,1,1,2018,1199,7.77
4,69414,5,1,0,1,2016,1199,5.15
...,...,...,...,...,...,...,...,...
5507,45000,7,2,1,1,2018,2995,28.90
5508,29000,5,1,1,2,2015,1968,64.90
5509,90000,5,2,1,2,2013,2755,13.75
5510,79000,6,2,1,3,2015,2967,29.90


In [14]:
df.loc[df["car_prices_in_lakh"] >= 10, "more_than_10"] = 'True'
df.loc[df["car_prices_in_lakh"] < 10, "more_than_10"] = 'False'
df["more_than_10"] = df["more_than_10"].replace({"True":1, "False":0})
df.pop("car_prices_in_lakh")
df

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,more_than_10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,86226,5,2,0,1,2017,1956,1
1,13248,5,1,1,1,2021,1330,1
2,60343,5,1,1,1,2016,2494,1
3,26696,5,1,1,1,2018,1199,0
4,69414,5,1,0,1,2016,1199,0
...,...,...,...,...,...,...,...,...
5507,45000,7,2,1,1,2018,2995,1
5508,29000,5,1,1,2,2015,1968,1
5509,90000,5,2,1,2,2013,2755,1
5510,79000,6,2,1,3,2015,2967,1


# Dividing dataset into train , valiadate and test data set

In [15]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6 * len(df)), int(0.8 * len(df))])
train

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,more_than_10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1719,75099,5,2,1,1,2014,1498,1
4509,50000,5,2,0,1,2020,1956,1
5262,38000,5,5,1,1,2020,2755,1
3508,11081,5,1,1,1,2021,1995,1
2006,74000,5,2,0,1,2012,1061,0
...,...,...,...,...,...,...,...,...
98,32000,5,2,0,1,2015,1498,0
4663,30000,5,2,0,1,2021,796,1
289,69828,5,1,1,3,2011,1598,0
2762,10400,5,2,1,1,2020,1493,1


# Scaling dataset to reduce too much skewness

In [16]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

In [17]:
def scale_dataset(df, oversample=False):
    x = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values

    lab = LabelEncoder()
    y = lab.fit_transform(y)
    
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    if oversample:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)
    
    data = np.hstack((x, np.reshape(y, (-1, 1))))
    
    return data, x, y

train, x_train, y_train = scale_dataset(train, oversample=True)
valid, x_valid, y_valid = scale_dataset(valid)
test, x_test, y_test = scale_dataset(test)

# K - Nearest Neighbours model

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

#Trainig the K-NN model with K as 20

In [21]:
knn_model = KNeighborsClassifier(n_neighbors=20)
knn_model.fit(x_train, y_train)

#Checking the accuracy of the model trained

In [22]:
y_pred = knn_model.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       737
           1       0.81      0.69      0.74       366

    accuracy                           0.84      1103
   macro avg       0.83      0.80      0.82      1103
weighted avg       0.84      0.84      0.84      1103



In [82]:
import pickle

pickle.dump(knn_model, open('knn_model.pkl', 'wb'))