# Predicting Car Class using Classification

In [7]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model, preprocessing

### Get and Prepare The Data

In [3]:
#GET the data
data = pd.read_csv("car.data")
data

Unnamed: 0,buying,maint,door,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [6]:
#Change non-integer values to integers in data using preprocessor

le = preprocessing.LabelEncoder()

#creates the column into a list with integer values
buying = le.fit_transform(list(data["buying"]))
maint = le.fit_transform(list(data["maint"]))
door = le.fit_transform(list(data["door"]))
persons = le.fit_transform(list(data["persons"]))
lug_boot = le.fit_transform(list(data["lug_boot"]))
safety = le.fit_transform(list(data["safety"]))
cls = le.fit_transform(list(data["class"]))

###  Create Training and Testing Datasets

In [9]:
#create X and y datasets

#combine all the lists for the input dataset
X = list(zip(buying, maint, door, persons, lug_boot, safety))

#y is a list of values in the cls list, i.e class, expected outputs
y = list(cls)

In [14]:
#create the test and train datasets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y, test_size = 0.1)

### Training A Model

In [19]:
#specify number of neighbours in parenthesis
model = KNeighborsClassifier(n_neighbors = 9)

#fit model to data, and print accuracy
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print(accuracy)

0.953757225433526


In [21]:
#Check predictions list vs actual

predicted = model.predict(X_test)

#create names array that corresponds to numerical value of data
names = ["unacc","acc", "good", "vgood"]

#loop and print prediction vs actual
for x in range(len(predicted)):
    print("Data: ", X_test[x], " Predicted: ", names[predicted[x]], " Actual: ", names[y_test[x]])

Data:  (2, 0, 3, 1, 0, 1)  Predicted:  good  Actual:  good
Data:  (0, 2, 2, 2, 0, 1)  Predicted:  good  Actual:  good
Data:  (0, 0, 2, 2, 1, 1)  Predicted:  good  Actual:  good
Data:  (0, 2, 0, 1, 1, 2)  Predicted:  good  Actual:  good
Data:  (1, 2, 3, 1, 0, 1)  Predicted:  good  Actual:  good
Data:  (3, 2, 3, 1, 2, 1)  Predicted:  good  Actual:  good
Data:  (2, 0, 0, 0, 2, 0)  Predicted:  good  Actual:  good
Data:  (1, 0, 2, 1, 1, 2)  Predicted:  unacc  Actual:  unacc
Data:  (3, 0, 1, 0, 1, 0)  Predicted:  good  Actual:  good
Data:  (0, 2, 2, 0, 1, 1)  Predicted:  good  Actual:  good
Data:  (0, 3, 3, 1, 0, 0)  Predicted:  good  Actual:  good
Data:  (1, 3, 0, 2, 1, 2)  Predicted:  good  Actual:  good
Data:  (2, 2, 0, 0, 1, 1)  Predicted:  good  Actual:  good
Data:  (0, 0, 1, 2, 2, 1)  Predicted:  good  Actual:  good
Data:  (0, 0, 0, 0, 0, 1)  Predicted:  good  Actual:  good
Data:  (2, 0, 1, 1, 1, 2)  Predicted:  good  Actual:  good
Data:  (2, 1, 3, 2, 1, 1)  Predicted:  good  Actual:  

### Extra Functionality

In [26]:
# get the neighbors of the point ... finds 3 neigbors for instance in X_test3
neighbors = model.kneighbors([X_test[3]], 3 , True)
print(neighbors)

(array([[1., 1., 1.]]), array([[ 716, 1315,  829]]))
