In [1]:
# Name: Ante Zovko
# Date: September 22nd, 2021
# Car Evaluation Model

In [2]:
# Following libraries should be imported to run the experiment........
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [3]:
df = pd.read_csv("./car_evaluation.csv")
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [4]:
# Assign column names as dataset doesn't have proper column names.
df.columns = ['Buying', 'Maint', 'Doors','Persons','LugBoot','Safety','Class']
df

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [5]:
# Check for missing values
df.isnull().sum()

Buying     0
Maint      0
Doors      0
Persons    0
LugBoot    0
Safety     0
Class      0
dtype: int64

In [6]:
X = df.drop(['Class'], axis=1)
X

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,small,high
2,vhigh,vhigh,2,2,med,low
3,vhigh,vhigh,2,2,med,med
4,vhigh,vhigh,2,2,med,high
...,...,...,...,...,...,...
1722,low,low,5more,more,med,med
1723,low,low,5more,more,med,high
1724,low,low,5more,more,big,low
1725,low,low,5more,more,big,med


In [7]:
y = df['Class']
y

0       unacc
1       unacc
2       unacc
3       unacc
4       unacc
        ...  
1722     good
1723    vgood
1724    unacc
1725     good
1726    vgood
Name: Class, Length: 1727, dtype: object

In [8]:
#Using pandas dummies function to encode the data into categorical data
a = pd.get_dummies(X, prefix_sep='_', drop_first=True)
a

Unnamed: 0,Buying_low,Buying_med,Buying_vhigh,Maint_low,Maint_med,Maint_vhigh,Doors_3,Doors_4,Doors_5more,Persons_4,Persons_more,LugBoot_med,LugBoot_small,Safety_low,Safety_med
0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
3,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1
4,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1
1723,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0
1724,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0
1725,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1


In [9]:
x = StandardScaler().fit_transform(a)
Y = pd.DataFrame(x, columns=a.columns)
Y

Unnamed: 0,Buying_low,Buying_med,Buying_vhigh,Maint_low,Maint_med,Maint_vhigh,Doors_3,Doors_4,Doors_5more,Persons_4,Persons_more,LugBoot_med,LugBoot_small,Safety_low,Safety_med
0,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,-0.577573,-0.707414,-0.707414,-0.707414,1.415443,-0.706493,1.413600
1,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,-0.577573,-0.707414,-0.707414,-0.707414,1.415443,-0.706493,-0.707414
2,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,-0.577573,-0.707414,-0.707414,1.413600,-0.706493,1.415443,-0.707414
3,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,-0.577573,-0.707414,-0.707414,1.413600,-0.706493,-0.706493,1.413600
4,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,1.734059,-0.577573,-0.577573,-0.577573,-0.707414,-0.707414,1.413600,-0.706493,-0.706493,-0.707414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,1.731382,-0.577573,-0.576682,1.731382,-0.577573,-0.576682,-0.577573,-0.577573,1.731382,-0.707414,1.413600,1.413600,-0.706493,-0.706493,1.413600
1723,1.731382,-0.577573,-0.576682,1.731382,-0.577573,-0.576682,-0.577573,-0.577573,1.731382,-0.707414,1.413600,1.413600,-0.706493,-0.706493,-0.707414
1724,1.731382,-0.577573,-0.576682,1.731382,-0.577573,-0.576682,-0.577573,-0.577573,1.731382,-0.707414,1.413600,-0.707414,-0.706493,1.415443,-0.707414
1725,1.731382,-0.577573,-0.576682,1.731382,-0.577573,-0.576682,-0.577573,-0.577573,1.731382,-0.707414,1.413600,-0.707414,-0.706493,-0.706493,1.413600


In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 42)

In [11]:
# check the shape of X_train and X_test
X_train.shape, X_test.shape

((1157, 15), (570, 15))

In [12]:
# Using Logistic regression
clf = LogisticRegression(random_state = 0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_LR=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(X_train, y_train))
print("Testing Accuracy: ", clf.score(X_test, y_test))
cm1 = confusion_matrix(y_test, y_pred)
print(cm1)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.9308556611927399
Testing Accuracy:  0.9052631578947369
[[104   6  15   2]
 [  6   6   0   6]
 [ 16   0 383   0]
 [  3   0   0  23]]
              precision    recall  f1-score   support

         acc       0.81      0.82      0.81       127
        good       0.50      0.33      0.40        18
       unacc       0.96      0.96      0.96       399
       vgood       0.74      0.88      0.81        26

    accuracy                           0.91       570
   macro avg       0.75      0.75      0.75       570
weighted avg       0.90      0.91      0.90       570



In [13]:
# Using K-Nearest Neighbors
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_KNN=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(X_train, y_train))
print("Testing Accuracy: ", clf.score(X_test, y_test))
cm2 = confusion_matrix(y_test, y_pred)
print(cm2)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.8660328435609335
Testing Accuracy:  0.8087719298245614
[[ 76   8  43   0]
 [ 14   0   2   2]
 [ 15   1 383   0]
 [ 10  10   4   2]]
              precision    recall  f1-score   support

         acc       0.66      0.60      0.63       127
        good       0.00      0.00      0.00        18
       unacc       0.89      0.96      0.92       399
       vgood       0.50      0.08      0.13        26

    accuracy                           0.81       570
   macro avg       0.51      0.41      0.42       570
weighted avg       0.79      0.81      0.79       570

