# IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw listings; the CSV's own "index" column is used as the row index.
df = pd.read_csv(filepath_or_buffer="car_price.csv", index_col="index")
df

Unnamed: 0_level_0,car_name,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Removing the car_name column so the model does not consider the car make or model for prediction.

In [None]:
# Drop the free-text car name so it is not used as a feature.
# `errors="ignore"` keeps the cell re-runnable: `df.pop("car_name")` raised
# KeyError as soon as the cell was executed a second time on the same frame.
df = df.drop(columns=["car_name"], errors="ignore")
df

KeyError: 'car_name'

# Removing the Lakh and Crore suffixes from the car_prices_in_lakh column and converting Crore prices to lakh (no INR-to-NRS conversion is applied here)

In [None]:
# Remember which rows were quoted in crore before the unit text is stripped.
price_text = df["car_prices_in_lakh"]
crore = price_text.str.contains("Crore")

# Strip the unit suffix and thousands separators, then parse the number.
amount = price_text.str.removesuffix("Lakh")
amount = amount.str.removesuffix("Crore")
amount = amount.str.replace(',', '').astype(float)
df["car_prices_in_lakh"] = amount

# 1 crore = 100 lakh: rescale only the rows flagged above.
df["car_prices_in_lakh"] = (amount * 100).where(crore, other=amount)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Removing the kms suffix from the kms_driven column

In [None]:
# Strip the trailing unit, then the thousands separators, and parse as integer.
kms_text = df["kms_driven"].str.removesuffix("kms")
kms_text = kms_text.str.replace(',', '')
df["kms_driven"] = kms_text.astype(int)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,13248,Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,60343,Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,26696,Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,69414,Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,45000,Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,29000,Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,90000,Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,79000,Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Labeling fuel types

In [None]:
# Integer code for each fuel label, numbered from 1 in the order listed.
fuels = {
    label: code
    for code, label in enumerate(
        ["Petrol", "Diesel", "Cng", "Lpg", "Electric"], start=1
    )
}
# Labels that are not keys of `fuels` are left unchanged by `replace`.
df["fuel_type"] = df["fuel_type"].replace(to_replace=fuels)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,Manual,1st Owner,2017,1956 cc,5 Seats
1,12.83,13248,1,Automatic,1st Owner,2021,1330 cc,5 Seats
2,16.40,60343,1,Automatic,1st Owner,2016,2494 cc,5 Seats
3,7.77,26696,1,Automatic,1st Owner,2018,1199 cc,5 Seats
4,5.15,69414,1,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,64.90,29000,1,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,13.75,90000,2,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,29.90,79000,2,Automatic,3rd Owner,2015,2967 cc,6 Seats


# Changing the Seats column from str to int by removing the suffix

In [None]:
# Drop the trailing "Seats" unit; int() parsing tolerates the leftover space.
seat_text = df["Seats"].str.removesuffix("Seats")
df["Seats"] = seat_text.astype(int)

# Renaming the engine column to engine_cc and converting it to int

In [None]:
# Put the unit in the column name, then strip it from the values.
df = df.rename(columns={"engine": "engine_cc"})
engine_text = df["engine_cc"].str.removesuffix("cc")
df["engine_cc"] = engine_text.astype(int)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine_cc,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,Manual,1st Owner,2017,1956,5
1,12.83,13248,1,Automatic,1st Owner,2021,1330,5
2,16.40,60343,1,Automatic,1st Owner,2016,2494,5
3,7.77,26696,1,Automatic,1st Owner,2018,1199,5
4,5.15,69414,1,Manual,1st Owner,2016,1199,5
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,Automatic,1st Owner,2018,2995,7
5508,64.90,29000,1,Automatic,2nd Owner,2015,1968,5
5509,13.75,90000,2,Automatic,2nd Owner,2013,2755,5
5510,29.90,79000,2,Automatic,3rd Owner,2015,2967,6


# Encoding transmission as a number: 0 for manual, 1 for automatic

In [None]:
# Binary encoding of the gearbox type.
transmissions = dict(Manual=0, Automatic=1)
df["transmission"] = df["transmission"].replace(to_replace=transmissions)
df

Unnamed: 0_level_0,car_prices_in_lakh,kms_driven,fuel_type,transmission,ownership,manufacture,engine_cc,Seats
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.03,86226,2,0,1st Owner,2017,1956,5
1,12.83,13248,1,1,1st Owner,2021,1330,5
2,16.40,60343,1,1,1st Owner,2016,2494,5
3,7.77,26696,1,1,1st Owner,2018,1199,5
4,5.15,69414,1,0,1st Owner,2016,1199,5
...,...,...,...,...,...,...,...,...
5507,28.90,45000,2,1,1st Owner,2018,2995,7
5508,64.90,29000,1,1,2nd Owner,2015,1968,5
5509,13.75,90000,2,1,2nd Owner,2013,2755,5
5510,29.90,79000,2,1,3rd Owner,2015,2967,6


# Changing the ownership column to int as well

In [None]:

# Ordinal label -> number of previous owners.
owners = {
    "1st": 1,
    "2nd": 2,
    "3rd": 3,
    "4th": 4,
    "5th": 5,
    "0th": 0,
}
# "1st Owner" -> "1st " -> "1st" -> 1
ordinal_text = df["ownership"].str.removesuffix("Owner")
ordinal_text = ordinal_text.str.strip()
df["ownership"] = ordinal_text.replace(owners)
df['ownership']

index
0       1
1       1
2       1
3       1
4       1
       ..
5507    1
5508    2
5509    2
5510    3
5511    2
Name: ownership, Length: 5512, dtype: int64

# Reordering the columns so that the price is the last column

In [None]:
# Reorder columns so "Seats" is second and the price is last.
#
# Columns are selected by name rather than by position: the previous
# pop(0)/pop(6)/insert(8) indices only worked for one exact column layout,
# and re-running the cell moved the wrong columns.
target_col = "car_prices_in_lakh"
cols = [name for name in df.columns if name not in (target_col, "Seats")]
cols.insert(1, "Seats")
cols.append(target_col)
# Plain indexing raises KeyError if an expected column is missing, instead of
# silently creating an all-NaN column as `reindex` would.
df = df[cols]
df

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,car_prices_in_lakh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,86226,5,2,0,1,2017,1956,10.03
1,13248,5,1,1,1,2021,1330,12.83
2,60343,5,1,1,1,2016,2494,16.40
3,26696,5,1,1,1,2018,1199,7.77
4,69414,5,1,0,1,2016,1199,5.15
...,...,...,...,...,...,...,...,...
5507,45000,7,2,1,1,2018,2995,28.90
5508,29000,5,1,1,2,2015,1968,64.90
5509,90000,5,2,1,2,2013,2755,13.75
5510,79000,6,2,1,3,2015,2967,29.90


In [None]:
# Build the binary classification target: 1 if the car costs at least
# 10 lakh, else 0. A direct comparison replaces the earlier round-trip through
# the strings 'True'/'False', which depended on `replace` downcasting an
# object column back to integers.
# Note: a missing price compares as False and is therefore labelled 0.
df["more_than_10"] = (df["car_prices_in_lakh"] >= 10).astype(int)

# The raw price must not remain as a feature, otherwise it leaks the label.
df = df.drop(columns=["car_prices_in_lakh"])
df

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,more_than_10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,86226,5,2,0,1,2017,1956,1
1,13248,5,1,1,1,2021,1330,1
2,60343,5,1,1,1,2016,2494,1
3,26696,5,1,1,1,2018,1199,0
4,69414,5,1,0,1,2016,1199,0
...,...,...,...,...,...,...,...,...
5507,45000,7,2,1,1,2018,2995,1
5508,29000,5,1,1,2,2015,1968,1
5509,90000,5,2,1,2,2013,2755,1
5510,79000,6,2,1,3,2015,2967,1


# Dividing the dataset into train, validation and test sets (60% / 20% / 20%)

In [None]:
# Shuffle once with a fixed seed so the split (and every metric computed from
# it) is reproducible across kernel restarts, then cut 60% / 20% / 20%.
# Positional slicing with `iloc` avoids calling `np.split` on a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]
train

Unnamed: 0_level_0,kms_driven,Seats,fuel_type,transmission,ownership,manufacture,engine_cc,more_than_10
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4477,80000,5,3,0,2,2019,1497,0
1583,70000,4,2,0,1,2017,1991,0
5133,48616,5,1,0,1,2010,1373,0
1224,12000,5,1,1,1,2021,1497,1
746,100527,5,1,0,1,2012,0,0
...,...,...,...,...,...,...,...,...
1099,39356,5,2,1,1,2015,2143,1
704,82145,5,1,0,3,2012,2184,0
3401,60000,5,2,0,1,2015,1968,0
7,14470,5,2,0,1,2021,1498,1


# Standardizing the features (zero mean, unit variance) and oversampling the minority class of the training set

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

In [None]:
def scale_dataset(df, oversample=False, scaler=None):
    """Split a frame into standardized features and encoded labels.

    The last column of ``df`` is taken as the label; every other column is a
    feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the target.
    oversample : bool, default False
        If True, resample with ``RandomOverSampler`` so the classes are
        balanced. Intended for the training split only.
    scaler : fitted StandardScaler or None, default None
        If given, the features are transformed with this already-fitted
        scaler. If None, a new scaler is fitted on ``df`` itself (the
        original behaviour).

    Returns
    -------
    data : numpy.ndarray
        Features and labels stacked column-wise, label in the last column.
    x : numpy.ndarray
        Standardized feature matrix.
    y : numpy.ndarray
        Encoded label vector.
    """
    x = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values

    lab = LabelEncoder()
    y = lab.fit_transform(y)

    if scaler is None:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
    else:
        # Reuse statistics learned elsewhere (normally on the training split)
        # so that every split is scaled with the same mean and variance.
        x = scaler.transform(x)

    if oversample:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x, y)

    data = np.hstack((x, np.reshape(y, (-1, 1))))

    return data, x, y


# Fit the scaler on the training features only, then apply it to all three
# splits. Fitting a separate scaler per split puts validation/test features on
# a different scale from the one the model was trained on.
feature_scaler = StandardScaler().fit(train[train.columns[:-1]].values)

train, x_train, y_train = scale_dataset(train, oversample=True, scaler=feature_scaler)
valid, x_valid, y_valid = scale_dataset(valid, scaler=feature_scaler)
test, x_test, y_test = scale_dataset(test, scaler=feature_scaler)

# K - Nearest Neighbours model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Training the K-NN model with K = 20

In [None]:
# Build the 20-neighbour classifier and fit it on the training split;
# `fit` returns the estimator itself, so the two steps can be chained.
knn_model = KNeighborsClassifier(n_neighbors=20).fit(x_train, y_train)
knn_model

# Checking the accuracy of the trained model on the test set

In [None]:
# Evaluate the K-NN classifier on the held-out test split.
y_pred = knn_model.predict(x_test)
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       740
           1       0.82      0.67      0.74       363

    accuracy                           0.84      1103
   macro avg       0.84      0.80      0.81      1103
weighted avg       0.84      0.84      0.84      1103



In [None]:
# Sanity check: (number of test rows, number of feature columns).
x_test.shape

(1103, 7)

In [None]:
import pickle

# Persist the fitted K-NN model. The context manager guarantees the file is
# flushed and closed even if pickling fails; the bare `open(...)` left the
# handle open.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('knn_model.pkl', 'wb') as model_file:
    pickle.dump(knn_model, model_file)

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Small fully connected binary classifier, assembled layer by layer.
nn_model = keras.Sequential()
# NOTE(review): this first layer has no activation argument, so it looks like
# a purely linear projection; confirm that is intended.
nn_model.add(keras.layers.Dense(7))
nn_model.add(keras.layers.Dense(1024, activation='relu'))
# Single sigmoid unit: outputs the probability of the positive class.
nn_model.add(keras.layers.Dense(1, activation='sigmoid'))

nn_model.compile(loss='binary_crossentropy', optimizer='adam')

nn_model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27fd06265c0>

In [None]:
# Evaluate the network on the validation split and display the result.
nn_model.evaluate(x_valid, y_valid)



0.6933554410934448

In [None]:
# Predicted probabilities for the validation split.
pred = nn_model.predict(x_valid)
# Only the last expression of a cell is rendered, so the bare `pred` line was
# a silent no-op. Display predictions and true labels together explicitly.
display(pred, y_valid)



array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the network.
nn_model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_23 (Dense)            (None, 7)                 56        
                                                                 
 dense_24 (Dense)            (None, 1024)              8192      
                                                                 
 dense_25 (Dense)            (None, 2)                 2050      
                                                                 
Total params: 10,298
Trainable params: 10,298
Non-trainable params: 0
_________________________________________________________________
