## Step A: DATA PREPROCESSING

#### Step 1: Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Step 2: Import the dataset

In [2]:
dataset = pd.read_csv("vehicle.data")

In [3]:
dataset.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


#### Step 3: Checking for missing data?

In [4]:
dataset.isnull().sum()

vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64

In [5]:
dataset.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


#### Step 4: S Splitting the data as Feature Matrix (X) and output column vector (Y)

In [6]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [7]:
X

array([['vhigh', 'vhigh', '2', '2', 'small', 'med'],
       ['vhigh', 'vhigh', '2', '2', 'small', 'high'],
       ['vhigh', 'vhigh', '2', '2', 'med', 'low'],
       ...,
       ['low', 'low', '5more', 'more', 'big', 'low'],
       ['low', 'low', '5more', 'more', 'big', 'med'],
       ['low', 'low', '5more', 'more', 'big', 'high']],
      shape=(1727, 6), dtype=object)

In [8]:
Y

array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'good', 'vgood'],
      shape=(1727,), dtype=object)

from sklearn.preprocessing import OrdinalEncoder  # Ordinal Relationship (Order Matters)
from sklearn.preprocessing import OneHotEncoder  # Non-Ordinal Relationship (Order Doesn't Matter)

from sklearn.preprocessing import LabelEncoder  # Y Categorical Data

### Convertion from categorical data into numerical form so that machine learning algorithm can work with it

#### A] Using class OrdinaryEncoder: converts categorical data in the feature matrix, which has ordinal relationship, into integers

In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
data = [["low"], ["medium"], ["high"]]

In [11]:
OE = OrdinalEncoder()
OE.fit_transform(data)

array([[1.],
       [2.],
       [0.]])

#### B] Using class OneHotEncoder: Converts categorical data which has no meaningful order, into binary vectors

In [12]:
data = [["red"], ["blue"], ["orange"]]

In [13]:
from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder()
OHE.fit_transform(data)

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [14]:
OHE.fit_transform(data).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

#### C] Use of class LabelEncoder: Converts the target categorical data(output) into integers

In [15]:
data = ["yes", "no"]

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
LE.fit_transform(data)

array([1, 0])

In [16]:
data = ["party", "put", "tv", "study"]

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
LE.fit_transform(data)

array([0, 1, 3, 2])

#### Step 5: Convert categorical data in X to integers

In [17]:
X

array([['vhigh', 'vhigh', '2', '2', 'small', 'med'],
       ['vhigh', 'vhigh', '2', '2', 'small', 'high'],
       ['vhigh', 'vhigh', '2', '2', 'med', 'low'],
       ...,
       ['low', 'low', '5more', 'more', 'big', 'low'],
       ['low', 'low', '5more', 'more', 'big', 'med'],
       ['low', 'low', '5more', 'more', 'big', 'high']],
      shape=(1727, 6), dtype=object)

In [18]:
Y

array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'good', 'vgood'],
      shape=(1727,), dtype=object)

In [19]:
from sklearn.preprocessing import OrdinalEncoder

OE = OrdinalEncoder()
X = np.array(OE.fit_transform(X))
X

array([[3., 3., 0., 0., 2., 2.],
       [3., 3., 0., 0., 2., 0.],
       [3., 3., 0., 0., 1., 1.],
       ...,
       [1., 1., 3., 2., 0., 1.],
       [1., 1., 3., 2., 0., 2.],
       [1., 1., 3., 2., 0., 0.]], shape=(1727, 6))

#### Step 6: Convert categorical data in Y into integers

In [20]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
Y = np.array(LE.fit_transform(Y))
Y

array([2, 2, 2, ..., 2, 1, 3], shape=(1727,))

### Step B: Building multiclassifier using KNN, NB, DT, RF algorithm

#### Step 1: Split the data into training and testing dataset

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

#### Step 2: Build the model

#### i) KNN Model

In [22]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(8)
KNN.fit(x_train, y_train)
y_pred_KNN = KNN.predict(x_test)

#### ii) Naive Bayes Model

In [23]:
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(x_train, y_train)
y_pred_NB = NB.predict(x_test)

#### iii) Decision Tree Model

In [24]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier(criterion="entropy", max_depth=4)
DT.fit(x_train, y_train)
y_pred_DT = DT.predict(x_test)

#### iv) Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(1000, criterion="entropy")
RF.fit(x_train, y_train)
y_pred_RF = RF.predict(x_test)

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [27]:
print("**************" * 2)
print("KNN Classifier")
print("**************" * 2)
print(f"1. Confusion Matrix:\n{confusion_matrix(y_test, y_pred_KNN)}")
print(f"2. Accuracy Matrix: {accuracy_score(y_test, y_pred_KNN)}")
print(f"3. Precision Score: {precision_score(y_test, y_pred_KNN, average=None)}")
print(f"4. Recall Score: {recall_score(y_test, y_pred_KNN, average=None)}")
print(f"5. F1 Score: {f1_score(y_test, y_pred_KNN, average=None)}")
print("**************" * 2)
print("NB Classifier")
print("**************" * 2)
print(f"1. Confusion Matrix:\n{confusion_matrix(y_test, y_pred_NB)}")
print(f"2. Accuracy Matrix: {accuracy_score(y_test, y_pred_NB)}")
print(f"3. Precision Score: {precision_score(y_test, y_pred_NB, average=None)}")
print(f"4. Recall Score: {recall_score(y_test, y_pred_NB, average=None)}")
print(f"5. F1 Score: {f1_score(y_test, y_pred_NB, average=None)}")
print("**************" * 2)
print("DT Classifier")
print("**************" * 2)
print(f"1. Confusion Matrix:\n{confusion_matrix(y_test, y_pred_DT)}")
print(f"2. Accuracy Matrix: {accuracy_score(y_test, y_pred_DT)}")
print(f"3. Precision Score: {precision_score(y_test, y_pred_DT, average=None)}")
print(f"4. Recall Score: {recall_score(y_test, y_pred_DT, average=None)}")
print(f"5. F1 Score: {f1_score(y_test, y_pred_DT, average=None)}")
print("**************" * 2)
print("RF Classifier")
print("**************" * 2)
print(f"1. Confusion Matrix:\n{confusion_matrix(y_test, y_pred_RF)}")
print(f"2. Accuracy Matrix: {accuracy_score(y_test, y_pred_RF)}")
print(f"3. Precision Score: {precision_score(y_test, y_pred_RF, average=None)}")
print(f"4. Recall Score: {recall_score(y_test, y_pred_RF, average=None)}")
print(f"5. F1 Score: {f1_score(y_test, y_pred_RF, average=None)}")

****************************
KNN Classifier
****************************
1. Confusion Matrix:
[[ 58   0  22   0]
 [  7   5   0   0]
 [ 10   0 233   0]
 [  0   0   1  10]]
2. Accuracy Matrix: 0.884393063583815
3. Precision Score: [0.77333333 1.         0.91015625 1.        ]
4. Recall Score: [0.725      0.41666667 0.95884774 0.90909091]
5. F1 Score: [0.7483871  0.58823529 0.93386774 0.95238095]
****************************
NB Classifier
****************************
1. Confusion Matrix:
[[  7   0  31  42]
 [  6   0   4   2]
 [  4   0 186  53]
 [  0   0   0  11]]
2. Accuracy Matrix: 0.5895953757225434
3. Precision Score: [0.41176471 0.         0.84162896 0.10185185]
4. Recall Score: [0.0875    0.        0.7654321 1.       ]
5. F1 Score: [0.1443299  0.         0.80172414 0.18487395]
****************************
DT Classifier
****************************
1. Confusion Matrix:
[[ 69   0  11   0]
 [ 12   0   0   0]
 [ 36   0 207   0]
 [ 11   0   0   0]]
2. Accuracy Matrix: 0.7976878612716763
3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
