# Predict the Quality of a Used Car

# K-Nearest-Neighbors

In [1]:
# Data
# http://archive.ics.uci.edu/ml/datasets/Car+Evaluation

In [2]:
# [Car Evaluation Data Set](http://archive.ics.uci.edu/ml/datasets/Car+Evaluation)

# Python Implementation with SciKit-Learn

In [3]:
# 1.
# Import necessary libraries
# Import the necessary modules from specific libraries.

import os
import numpy as np
import pandas as pd
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# 2.
# Load the data set
# Use the pandas module to read the car evaluation data from the file system.
# Check the first few records of the dataset.

# Column names are supplied explicitly through `names=`; the last column,
# 'class', is the label to be predicted.
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data = pd.read_csv('./data/car/car.data', names=column_names)

# Preview the first rows to confirm the file was parsed as expected.
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
# 3. 
# Check information about the data set

In [6]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [7]:
# 4.
# Identify the target variable

In [8]:
# Encode the target labels as integer codes. `class_names` keeps the original
# string labels, indexed by the code that replaced them.
class_codes, class_names = pd.factorize(data['class'])
data['class'] = class_codes

In [9]:
# Let’s check the encoded values now.

In [10]:
# Show the original class labels, then the distinct integer codes that now
# stand in for them in the 'class' column.
encoded_classes = data['class'].unique()
print(class_names)
print(encoded_classes)

Index(['unacc', 'acc', 'vgood', 'good'], dtype='object')
[0 1 2 3]


In [11]:
# 5. 
# Identify the predictor variables, and 
# encode any string variables to equivalent integer codes

In [12]:
# Replace every categorical predictor with integer codes. pd.factorize numbers
# the categories in order of first appearance within each column; the unique
# labels it also returns are not needed here and are discarded into `_`.
predictor_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for column in predictor_columns:
    data[column], _ = pd.factorize(data[column])

# Preview the encoded frame.
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0


In [13]:
# Re-inspect the column dtypes now that the string columns have been encoded.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null int64
maint       1728 non-null int64
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null int64
safety      1728 non-null int64
class       1728 non-null int64
dtypes: int64(7)
memory usage: 94.6 KB


In [14]:
# All columns are now encoded as integers.

In [15]:
# 6
# Split the frame into predictors and target: the target is the last column
# ('class'); every column before it is a predictor.
target_position = data.shape[1] - 1
X = data.iloc[:, :target_position]
y = data.iloc[:, target_position]

In [16]:
# Dimensions of the feature matrix: (rows, predictor columns).
X.shape

(1728, 6)

In [17]:
# Length of the target vector; it should equal the number of rows in X.
y.shape

(1728,)

In [18]:
# 7.
# Train/test split: hold out 30% of the rows for testing and train on the
# remaining 70%. A fixed random_state makes the shuffle reproducible.
from sklearn.model_selection import train_test_split

test_fraction = 0.3
split_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=split_seed
)

In [19]:
# Number of rows in the training features (the 70% partition).
len(X_train)

1209

In [20]:
# Number of rows in the test features (the 30% partition).
len(X_test)

519

In [21]:
# Number of training labels; should equal len(X_train).
len(y_train)

1209

In [22]:
# Number of test labels; should equal len(X_test).
len(y_test)

519

In [23]:
# 8. Training/model fitting

## Build a K-Nearest-Neighbors classifier that considers the 5 nearest
## training samples for each prediction.
n_neighbors = 5
model = KNeighborsClassifier(n_neighbors=n_neighbors)

## Fit on the training partition. This is the cell's last expression, so the
## fitted estimator's repr is displayed as the cell output.
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [24]:
## 9. Model evaluation on the held-out test data
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Predict a class code for every row of the test features.
y_pred = model.predict(X_test)

# How did the model perform? Count the test rows whose predicted code differs
# from the true code.
is_misclassified = y_test != y_pred
count_misclassified = is_misclassified.sum()
print('Misclassified samples: {}'.format(count_misclassified))

# Fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

# Confusion matrix C: C[i, j] is the number of observations known to be in
# class i and predicted to be in class j. Entries on the diagonal (i == j)
# are correct predictions; every off-diagonal entry is a misclassification.
# There are four classes here, so C is 4x4 rather than the 2x2 layout
# (true/false positives and negatives) of binary classification.
the_confusion_matrix = confusion_matrix(y_test, y_pred)
print("")
print("confusion matrix:")
print(the_confusion_matrix)

Misclassified samples: 32
Accuracy: 0.94

confusion matrix:
[[360   3   0   0]
 [ 14  99   0   2]
 [  0   3  13   0]
 [  7   1   2  15]]


# How to decide the value of n-neighbors

In [25]:
# Choosing a large value of K increases execution time and can lead to
# underfitting, while choosing a small value of K can lead to overfitting.
# There is no guaranteed way to find the best value of K, so a common
# approach is to try a range of values and compare the resulting accuracy.

In [26]:
from sklearn.metrics import accuracy_score

# Refit the classifier for each neighbor count from 1 to 25 and report the
# accuracy on the test data, as a percentage, for every setting.
for K in range(25):
    K_value = K + 1
    knn = KNeighborsClassifier(n_neighbors=K_value).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy_percent = accuracy_score(y_test, y_pred) * 100
    print("Accuracy is ", accuracy_percent, "% for K-Value:", K_value)


Accuracy is  83.62235067437379 % for K-Value: 1
Accuracy is  80.15414258188824 % for K-Value: 2
Accuracy is  89.21001926782274 % for K-Value: 3
Accuracy is  88.82466281310212 % for K-Value: 4
Accuracy is  93.83429672447014 % for K-Value: 5
Accuracy is  92.8709055876686 % for K-Value: 6
Accuracy is  92.8709055876686 % for K-Value: 7
Accuracy is  89.78805394990366 % for K-Value: 8
Accuracy is  90.94412331406551 % for K-Value: 9
Accuracy is  88.82466281310212 % for K-Value: 10
Accuracy is  89.40269749518305 % for K-Value: 11
Accuracy is  88.6319845857418 % for K-Value: 12
Accuracy is  88.82466281310212 % for K-Value: 13
Accuracy is  89.01734104046243 % for K-Value: 14
Accuracy is  89.78805394990366 % for K-Value: 15
Accuracy is  88.6319845857418 % for K-Value: 16
Accuracy is  88.82466281310212 % for K-Value: 17
Accuracy is  88.4393063583815 % for K-Value: 18
Accuracy is  88.6319845857418 % for K-Value: 19
Accuracy is  88.6319845857418 % for K-Value: 20
Accuracy is  88.2466281310212 % for 

In [27]:
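# The results show an accuracy of 93.83% at K = 5, the highest in the output above.
# Hence, we use K = 5 for this tutorial.
# Note: K was chosen by comparing accuracy on the test set, so this figure is
# an optimistic estimate of how the model would perform on unseen data.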