# Step A: Data Pre-Processing

# Step 1- Importing Libraries

In [1]:
!pip install category_encoders

Collecting category_encoders
  Using cached category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
Collecting statsmodels>=0.9.0
  Downloading statsmodels-0.13.2-cp38-cp38-win_amd64.whl (9.1 MB)
Collecting patsy>=0.5.1
  Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
Collecting packaging>=21.3
  Using cached packaging-21.3-py3-none-any.whl (40 kB)
Installing collected packages: patsy, packaging, statsmodels, category-encoders
  Attempting uninstall: packaging
    Found existing installation: packaging 21.0
    Uninstalling packaging-21.0:
      Successfully uninstalled packaging-21.0
Successfully installed category-encoders-2.5.0 packaging-21.3 patsy-0.5.2 statsmodels-0.13.2


In [2]:
import numpy as np
import pandas as pd

## Step 2: importing dataset

In [3]:
dataset=pd.read_csv("vehicle.data")

In [4]:
dataset

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [5]:
dataset.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


## Step 3: Feature matrix and Dependent variable vector

In [6]:
x=dataset.iloc[:, :-1].values
y=dataset.iloc[:, -1].values

In [7]:
x

array([['vhigh', 'vhigh', '2', '2', 'small', 'med'],
       ['vhigh', 'vhigh', '2', '2', 'small', 'high'],
       ['vhigh', 'vhigh', '2', '2', 'med', 'low'],
       ...,
       ['low', 'low', '5more', 'more', 'big', 'low'],
       ['low', 'low', '5more', 'more', 'big', 'med'],
       ['low', 'low', '5more', 'more', 'big', 'high']], dtype=object)

In [8]:
y

array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'good', 'vgood'],
      dtype=object)

## Step 4- Replace missing values(not required)

## Step 5- Encoding

### Feature matrix

In [9]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [10]:
enc=OrdinalEncoder()
x=enc.fit_transform(x)

In [11]:
x

array([[3., 3., 0., 0., 2., 2.],
       [3., 3., 0., 0., 2., 0.],
       [3., 3., 0., 0., 1., 1.],
       ...,
       [1., 1., 3., 2., 0., 1.],
       [1., 1., 3., 2., 0., 2.],
       [1., 1., 3., 2., 0., 0.]])

### Dependent variable vector

In [12]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=np.array(le.fit_transform(y))
y

array([2, 2, 2, ..., 2, 1, 3])

## Step 6- Splitting into training and testing dataset

In [13]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=0)

In [14]:
xtrain

array([[1., 1., 1., 0., 2., 0.],
       [1., 0., 2., 0., 0., 0.],
       [3., 1., 1., 1., 2., 0.],
       ...,
       [1., 1., 1., 0., 0., 2.],
       [0., 0., 0., 2., 2., 0.],
       [0., 2., 1., 1., 2., 2.]])

In [15]:
ytrain

array([2, 2, 0, ..., 2, 2, 2])

## Step 7- Feature Scaling (not required because no huge variations among data)

# Step B: Build Classification model

## Logistic Regression model

In [16]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression(random_state=0)             
LR.fit(xtrain,ytrain)

In [17]:
yestimatedLR=LR.predict(xtest)
print(np.concatenate((yestimatedLR.reshape(len(yestimatedLR),1),ytest.reshape(len(ytest),1)),1))

[[2 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [0 1]
 [2 2]
 [2 2]
 [2 1]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 0]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [0 2]
 [2 2]
 [2 1]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [0 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 1]
 [0 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [2 0]
 [2 0]
 [2 2]
 [0 0]
 [2 0]
 [2 0]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [0 2]
 [2 2]
 [2 2]
 [2 0]
 [0 2]
 [0 2]
 [2 0]
 [2 2]
 [0 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [0 2]
 [2 2]
 [0 2]
 [2 2]
 [0 1]
 [2 3]
 [2 2]
 [0 0]
 [2 2]
 [2 2]
 [3 0]
 [2 0]
 [2 0]
 [2 2]
 [0 0]
 [2 0]
 [2 2]
 [2 3]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [0 2]
 [2 2]
 [2 2]
 [2 3]
 [0 0]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [0 2]
 [2 2]
 [2 2]
 [2 3]
 [3 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 3]
 [2 2]
 [2 1]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [0 0]
 [0 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 1]
 [2 2]
 [2 2]
 [2 2]
 [2 3]
 [2 2]
 [3 2]
 [2 2]
 [2 2]
 [2 0]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]

# Step C: Calculating metrics

In [18]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score
cm=confusion_matrix(ytest, yestimatedLR) #confusion matrix
print('confusion matrix')
print(cm)
print('accuracy=', accuracy_score(ytest,yestimatedLR)) 
print('precision=', precision_score(ytest,yestimatedLR, average=None,sample_weight=None)) 

confusion matrix
[[ 18   0  55   4]
 [  2   0  12   0]
 [ 16   0 221   2]
 [  4   0  12   0]]
accuracy= 0.6907514450867052
precision= [0.45       0.         0.73666667 0.        ]


  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn.metrics import classification_report
print(classification_report(ytest, yestimatedLR))

              precision    recall  f1-score   support

           0       0.45      0.23      0.31        77
           1       0.00      0.00      0.00        14
           2       0.74      0.92      0.82       239
           3       0.00      0.00      0.00        16

    accuracy                           0.69       346
   macro avg       0.30      0.29      0.28       346
weighted avg       0.61      0.69      0.63       346



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print("training data set score=",LR.score(xtrain,ytrain))
print("testing data set score=",LR.score(xtest,ytest))

training data set score= 0.6915278783490224
testing data set score= 0.6907514450867052


### Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier

In [22]:
DT=DecisionTreeClassifier(criterion='entropy', random_state=0)
DT.fit(xtrain,ytrain)

In [23]:
yestimatedDTC=DT.predict(xtest)

In [24]:
cm1=confusion_matrix(ytest, yestimatedDTC) #confusion matrix
print('confusion matrix')
print(cm1)
print('accuracy=', accuracy_score(ytest,yestimatedDTC)) 
print('precision=', precision_score(ytest,yestimatedDTC, average=None,sample_weight=None)) 

confusion matrix
[[ 69   3   5   0]
 [  1  13   0   0]
 [  0   0 239   0]
 [  0   0   0  16]]
accuracy= 0.9739884393063584
precision= [0.98571429 0.8125     0.9795082  1.        ]


In [25]:
print(classification_report(ytest, yestimatedDTC))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94        77
           1       0.81      0.93      0.87        14
           2       0.98      1.00      0.99       239
           3       1.00      1.00      1.00        16

    accuracy                           0.97       346
   macro avg       0.94      0.96      0.95       346
weighted avg       0.98      0.97      0.97       346



### Naive Bayes classifier

In [26]:
from sklearn.naive_bayes import GaussianNB
NB=GaussianNB()             
NB.fit(xtrain,ytrain)

In [27]:
yestimatedNB=NB.predict(xtest)
print(np.concatenate((yestimatedNB.reshape(len(yestimatedNB),1),ytest.reshape(len(ytest),1)),1))

[[3 0]
 [3 2]
 [2 2]
 [2 2]
 [2 2]
 [3 1]
 [2 2]
 [2 2]
 [3 1]
 [3 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 0]
 [3 0]
 [2 2]
 [3 2]
 [3 0]
 [3 2]
 [2 2]
 [2 2]
 [3 1]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 1]
 [0 0]
 [3 2]
 [2 2]
 [2 2]
 [2 2]
 [3 0]
 [3 2]
 [3 0]
 [2 2]
 [2 2]
 [2 2]
 [3 0]
 [3 0]
 [3 2]
 [0 0]
 [2 0]
 [3 0]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [0 2]
 [2 2]
 [2 2]
 [3 0]
 [2 2]
 [3 2]
 [2 0]
 [2 2]
 [3 0]
 [2 2]
 [3 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [3 1]
 [3 3]
 [2 2]
 [3 0]
 [2 2]
 [2 2]
 [3 0]
 [3 0]
 [3 0]
 [2 2]
 [3 0]
 [2 0]
 [2 2]
 [3 3]
 [2 2]
 [2 2]
 [3 2]
 [2 2]
 [3 2]
 [2 2]
 [3 2]
 [3 3]
 [0 0]
 [3 0]
 [2 2]
 [3 2]
 [2 2]
 [3 2]
 [2 2]
 [2 2]
 [3 3]
 [3 0]
 [2 2]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [3 2]
 [3 3]
 [2 2]
 [3 1]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [2 2]
 [0 0]
 [0 2]
 [2 2]
 [2 2]
 [2 2]
 [3 2]
 [2 2]
 [2 1]
 [2 2]
 [2 2]
 [2 2]
 [3 3]
 [2 2]
 [3 2]
 [3 2]
 [2 2]
 [2 0]
 [2 0]
 [2 2]
 [2 2]
 [2 2]
 [3 2]

In [28]:
print('accuracy for NB classifier=', accuracy_score(ytest,yestimatedNB)) 

accuracy for NB classifier= 0.6358381502890174


### Random Forest Classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier
RFC=RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=0)

In [30]:
RFC.fit(xtrain,ytrain)

In [31]:
yestimatedRFC=RFC.predict(xtest)

In [32]:
print('accuracy for Random Forest classifier=', accuracy_score(ytest,yestimatedRFC))

accuracy for Random Forest classifier= 0.976878612716763


In [33]:
cm=confusion_matrix(ytest, yestimatedRFC) #confusion matrix
print(cm)

[[ 73   0   4   0]
 [  1  10   0   3]
 [  0   0 239   0]
 [  0   0   0  16]]


In [34]:
##Random forest classifier has an accuracy of 97% while the accuracy of naive bayes classifier, logistic regression, decision tree are 63%, 69%, 96%. 
# We conclude that random forest gives higher accuracy among the above mentioned classifier