# Building a multi-class classifier model with open-source zoo dataset

### Zoo dataset is extracted from UCI database (see PDF file)

### 1. Import Python libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

### 2. Import dataset, assigning x-features and y-target, followed by data pre-processing

In [9]:
dataset = pd.read_csv('datasets/zoo_data.data', names=['animal_name','hair','feathers','eggs','milk',
                                                       'airbone','aquatic','predator','toothed','backbone',
                                                       'breathes','venomous','fins','legs','tail','domestic',
                                                       'catsize','class',])
dataset.drop(columns = ['animal_name'],axis = 1,inplace=True)
dataset

x_feature = dataset.iloc[:,:-1]
y_target = dataset.iloc[:,-1]
print(x_feature)
print('')
print(y_target)
print('')

x_feature = np.array(x_feature).reshape(len(x_feature),-1)
y_target = np.array(y_target).reshape(len(y_target),-1)

     hair  feathers  eggs  milk  airbone  aquatic  predator  toothed  \
0       1         0     0     1        0        0         1        1   
1       1         0     0     1        0        0         0        1   
2       0         0     1     0        0        1         1        1   
3       1         0     0     1        0        0         1        1   
4       1         0     0     1        0        0         1        1   
5       1         0     0     1        0        0         0        1   
6       1         0     0     1        0        0         0        1   
7       0         0     1     0        0        1         0        1   
8       0         0     1     0        0        1         1        1   
9       1         0     0     1        0        0         0        1   
10      1         0     0     1        0        0         1        1   
11      0         1     1     0        1        0         0        0   
12      0         0     1     0        0        1         1     

### 3. Splitting dataset into training and testing

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_feature,
                                                    y_target,
                                                    test_size = 0.20,
                                                    random_state = 42)
print(x_train)
print('')
print(x_test)
print('')
print(y_train)
print('')
print(y_test)
print('')

[[0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 1 0 1]
 [1 0 1 ... 0 0 0]]

[[1 0 0 1 0 0 0 1 1 1 0 0 2 1 0 0]
 [1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1]
 [0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1]
 [1 0 1 0 1 0 0 0 0 1 1 0 6 0 1 0]
 [1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1]
 [0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 1]
 [0 0 1 0 1 0 0 0 0 1 0 0 6 0 0 0]
 [1 0 1 0 1 0 0 0 0 1 1 0 6 0 0 0]
 [0 1 1 0 1 1 1 0 1 1 0 0 2 1 0 0]
 [0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0]
 [1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1]
 [1 0 0 1 1 0 0 1 1 1 0 0 2 1 0 0]
 [0 1 1 0 1 1 1 0 1 1 0 0 2 1 0 0]
 [0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 0]
 [1 0 0 1 0 0 0 1 1 1 0 0 4 1 1 1]
 [0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0]]

[[5]
 [5]
 [6]
 [1]
 [7]
 [6]
 [7]
 [1]
 [1]
 [2]
 [3]
 [1]
 [1]
 [2]
 [1]
 [1]
 [7]
 [1]
 [2]
 [7]
 [4]
 [4]
 [2]
 [4]
 [1]
 [1]
 [1]
 [5]
 [

### 4. Building binary classifier for classes 1 to 7

In [12]:
# training Softmax Regression model
softmax_model = LogisticRegression(multi_class = 'multinomial', 
                                    solver = 'lbfgs', 
                                    max_iter=500)
# (‘lbfgs’, ‘newton-cg’, ‘liblinear’, ‘sag’, ‘saga’ ----> different types of solver)
softmax_model.fit(x_train,y_train)
pred_results_softmax_model = softmax_model.predict(x_test)

visualise_table = pd.DataFrame(columns=['Actual Class (Test dataset)',
                                        'Predicted Class - softmax model'])
visualise_table['Actual Class (Test dataset)'] = [y_test[i][0] for i in range(len(y_test))]
visualise_table['Predicted Class - softmax model'] = pred_results_softmax_model
visualise_table

  y = column_or_1d(y, warn=True)


Unnamed: 0,Actual Class (Test dataset),Predicted Class - softmax model
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,6,6
6,1,1
7,1,1
8,1,1
9,1,1


### 5. Performance evaluation

#### a. Cross-Validation

In [13]:
number_of_folds = 20
score_1 = cross_val_score(softmax_model,
                         x_train,
                         y_train,
                         cv = number_of_folds,
                         scoring = 'accuracy') # cv = 5 means splitting the training dataset into 5 folds, then
# making predictions and evaluating them on each of the 5 folds using the model 
# which is trained on the remaining folds, i.e. 4 folds

score_1 = [round(i,3) *100 for i in (score_1)]

CV_table = pd.DataFrame(columns=['Fold number','Accuracy (%) for softmax model'])
CV_table['Fold number'] = [i+1 for i in range (number_of_folds)]
CV_table['Accuracy (%) for softmax model'] = score_1
CV_table

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Fold number,Accuracy (%) for softmax model
0,1,87.5
1,2,87.5
2,3,100.0
3,4,87.5
4,5,100.0
5,6,100.0
6,7,100.0
7,8,100.0
8,9,100.0
9,10,100.0


#### b. Confusion Matrix

In [16]:
y_pred_softmax_model = cross_val_predict(softmax_model,
                         x_train,
                         y_train,
                         cv = number_of_folds)
score_2 = confusion_matrix(y_train,y_pred_softmax_model)
print(score_2)
print('')

# first row represents negative class, i.e. non-8s images (different from powerpoint slides)
# second row represents positive class, i.e. 8s images (different from powerpoint slides)

TN_SOFTMAX = score_2[0][0]
FP_SOFTMAX = score_2[0][1]
FN_SOFTMAX = score_2[1][0]
TP_SOFTMAX = score_2[1][1]
print('The estimated TN, FP, FN and TP values for the predictions from the Softmax Regression model are: '  
      + str(TN_SOFTMAX) + ', '+ str(FP_SOFTMAX) + ', '+ str(FN_SOFTMAX) + ' and '+ str(TP_SOFTMAX) + ' respectively.')
print('')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[29  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0]
 [ 0  1  2  0  1  0  0]
 [ 0  0  0 11  0  0  0]
 [ 0  0  0  0  4  0  0]
 [ 0  0  0  0  0  5  0]
 [ 0  0  0  0  0  1  8]]

The estimated TN, FP, FN and TP values for the predictions from the Softmax Regression model are: 29, 0, 0 and 18 respectively.



  y = column_or_1d(y, warn=True)


#### c. Precision, Recall, F1 score

In [15]:
precision = TP_SOFTMAX/(TP_SOFTMAX+FP_SOFTMAX)
recall = TP_SOFTMAX/(TP_SOFTMAX+FN_SOFTMAX)
F1_SCORE = 2* ((precision*recall)/(precision+recall))
print('Precision value for Softmax Regression model is '+ str(round(precision,3))+'.')
print('')
print('Recall value for Softmax Regression model is '+str(round(recall,3))+'.')
print('')
print('F1 score for Softmax Regression model is '+str(round(F1_SCORE,3))+'.')
print('')

Precision value for Softmax Regression model is 1.0.

Recall value for Softmax Regression model is 1.0.

F1 score for Softmax Regression model is 1.0.

