# Supervised Machine Learning

Writing some basic ML models to classify data.

In the cell below is the loading of the data set.


In [2]:
import pandas as pd
fruit = pd.read_table("C:/Users/Christopher/Desktop/fruit_data_with_colours.txt");
fruit.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


# k Nearest Neighbours

An implementation of the K-nearest neighbours algorithm for classifying data.


In [155]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

x = fruit[['mass', 'width', 'height', 'color_score']]
y = fruit['fruit_name']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 4321)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
for each in range (1,9):
    classify = KNeighborsClassifier(each)
    classify.fit(x_train,y_train)
    y_pred = classify.predict(x_test)
    print(confusion_matrix(y_test, y_pred))
    print("\kNN%")
    print(accuracy_score(y_test, y_pred) * 100)
    

[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
\kNN%
100.0
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
\kNN%
100.0
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
\kNN%
100.0
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
\kNN%
100.0
[[4 0 0 3]
 [0 2 0 0]
 [0 3 0 0]
 [0 0 0 6]]
\kNN%
66.66666666666666
[[4 0 0 3]
 [0 2 0 0]
 [0 3 0 0]
 [0 1 0 5]]
\kNN%
61.111111111111114
[[2 0 0 5]
 [0 2 0 0]
 [0 2 0 1]
 [0 0 0 6]]
\kNN%
55.55555555555556
[[2 0 0 5]
 [0 2 0 0]
 [0 2 0 1]
 [0 2 0 4]]
\kNN%
44.44444444444444


# Use of sklearn's Gaussian Naïve Bayes to classify data

Same as above, but instead uses the GNB classifier from sklearn

In [153]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

x = fruit[['mass', 'width', 'height', 'color_score']]
y = fruit['fruit_name']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
classify = GaussianNB()
classify.fit(x_train,y_train)
y_pred = classify.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print("\kNN%")
print(accuracy_score(y_test, y_pred) * 100)

[[6 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [5 1 0 1]]
\kNN%
66.66666666666666


# Handcoded version of Gaussian Naïve Bayes classifier

Below is code for a Gaussian Naïve Bayes classifier, which is designed to be able to replace the Gaussian Naïve Bayes from sklearn in Python.

Uses a MAP (Maximum A Posteriori) approach. 

In [150]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import math

class myGaussian:
    
    def fit(self, x, y):
        self.classNames = dict()
        self.classes = dict()
        self.massMean = dict()
        self.widthMean = dict()
        self.heightMean = dict()
        self.colorMean = dict()
        self.massVariance = dict()
        self.widthVariance = dict()
        self.heightVariance = dict()
        self.colorVariance = dict()
        
        #adding classnames
        tempCounter = 0
        for each in y:
            if each not in self.classNames:
                self.classNames[tempCounter] = each
            tempCounter+=1
        
        #counting number of elements of each class
        for each in y:
            if each not in self.classes:
                self.classes[each] = 1
            else:
                self.classes[each] +=1
        
        #probability of a random data item being in each class (prior)
        self.pc = dict()
        for each in self.classes:
            self.pc[each] = self.classes[each] / (len(x))
        
        
        counter = 0
        
        #calculating mean of each column(attribute) for each class
        for each in y:
            if(each in self.massMean):
                self.massMean[each] += x[counter][0]
            else:
                self.massMean[each] = x[counter][0]
            
            if each in self.widthMean:
                self.widthMean[each] += x[counter][1]
            else:
                self.widthMean[each] = x[counter][1]
                
            if each in self.heightMean:
                self.heightMean[each] += x[counter][2]
            else:
                self.heightMean[each] = x[counter][2]
                
            if each in self.colorMean:
                self.colorMean[each] += x[counter][3]
            else:
                self.colorMean[each] = x[counter][3]
            
            counter+=1
            
            
        for each in self.classes:
            self.massMean[each] /= self.classes[each]
            self.widthMean[each] /= self.classes[each]
            self.heightMean[each] /= self.classes[each]
            self.colorMean[each] /= self.classes[each]
            
        #calculating variance for each column(attribute) for each class
        counter = 0
        
        for each in y:
            if(each in self.massVariance):
                self.massVariance[each] += pow((self.massMean[each] - x[counter][0]), 2)
            else:
                self.massVariance[each] = pow((self.massMean[each] - x[counter][0]), 2)
                
            if(each in self.widthVariance):
                self.widthVariance[each] += pow((self.widthMean[each] - x[counter][1]), 2)
            else:
                self.widthVariance[each] = pow((self.widthMean[each] - x[counter][1]), 2)
                
            if(each in self.heightVariance):
                self.heightVariance[each] += pow((self.heightMean[each] - x[counter][2]), 2)
            else:
                self.heightVariance[each] = pow((self.heightMean[each] - x[counter][2]), 2)
                
            if(each in self.colorVariance):
                self.colorVariance[each] += pow((self.colorMean[each] - x[counter][3]), 2)
            else:
                self.colorVariance[each] = pow((self.colorMean[each] - x[counter][3]), 2)
            counter+=1
    
        for each in self.classes:
            self.massVariance[each] /= self.classes[each]
            self.widthVariance[each] /= self.classes[each]
            self.heightVariance[each] /= self.classes[each]
            self.colorVariance[each] /= self.classes[each]

        
                    
    def pdf(self, vector, c):
        pMass = (1/(math.sqrt(2 * math.pi * self.massVariance[c]))) * math.exp(-pow(vector[0] - self.massMean[c], 2)/(2*self.massVariance[c]))
        pWidth = (1/(math.sqrt(2 * math.pi * self.widthVariance[c]))) * math.exp(-pow(vector[1] - self.widthMean[c], 2)/(2*self.widthVariance[c]))
        pHeight = (1/(math.sqrt(2 * math.pi * self.heightVariance[c]))) * math.exp(-pow(vector[2] - self.heightMean[c], 2)/(2*self.heightVariance[c]))
        pColor = (1/(math.sqrt(2 * math.pi * self.colorVariance[c]))) * math.exp(-pow(vector[3] - self.colorMean[c], 2)/(2*self.colorVariance[c]))
        
        return pMass * pWidth * pHeight * pColor
    
    def pred(self, x):
        #probability (p(d|c) for each class)
        dataTypes = []

        #calculating probability p for each class for each data point
        for each in x:
            p = dict()
            #calculating p(d|c)
            for classes in self.classes:
                #calculation p(c) * p(d|c)
                p[classes] = self.pc[classes] * self.pdf(each, classes)

            #getting largest probability
            pValues = list(p.values())
            pMax = max(pValues)
            keyCounter = 0
            for n in p:
                if p[n] == pMax:
                    dataTypes.append(self.classNames[keyCounter])
                    break
                keyCounter+=1
        return dataTypes
    
    
        
        
x = fruit[['mass', 'width', 'height', 'color_score']]
y = fruit['fruit_name']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
classifier = myGaussian()
classifier.fit(x_train,y_train)
y_pred = classifier.pred(x_test)
print(confusion_matrix(y_test, y_pred))
print("\kNN%")
print(accuracy_score(y_test, y_pred) * 100)


[[7 0 0]
 [0 4 0]
 [5 0 2]]
\kNN%
72.22222222222221
