# Comparing Naive Bayes and Logistic Regression for classification of images as T-Shirts or Trousers

### All algorithms are written from scratch and no built in libraries are used for them.

In [1]:
import numpy as np # used to manipulate mattrices and calculate mean and standard deviation
import math # used to calculate log
import scipy.io # used to load dataset

### Mean and Standard Deviation are used as the features for the images

In [2]:
def getMeanFeature(x): # returns the mean value of all the pixels in each image in the sample
    
    x_mean = np.zeros((0, 1))
    
    for i in range(x.shape[0]): # sample is iterated through every image
        x_mean = np.append(x_mean, np.average(x[i]).reshape(1,1), axis = 0) 
        # mean of all pixels is taken and value is appended to x_mean array
        
    return x_mean

In [3]:
def getStandardDeviationFeature(x): # returns the standard deviation of all the pixels in each image in the sample
    
    x_std = np.zeros((0, 1))
    
    for i in range(x.shape[0]): # sample is iterated through every image
        x_std = np.append(x_std, np.std(x[i]).reshape(1,1), axis = 0)
        # standard deviation of all pixels is taken and value is appended to x_std array
        
    return x_std

In [4]:
def separateClasses(x, y): # separates the samples class wise using the labels
    
    x_class0 = np.zeros((0,1)) # contains samples of class 0(T-shirts)
    x_class1 = np.zeros((0,1)) # contains samples of class 1(Trousers)
    
    for i in range(x.shape[0]): # sample is iterated through every image

        # if label at ith position is 0 then sample at ith position belongs to class 0
        if(y[0][i] == 0): 
            x_class0 = np.append(x_class0, x[i].reshape(1, 1), axis = 0)
        else:
            x_class1 = np.append(x_class1, x[i].reshape(1, 1), axis = 0)
        
    return x_class0, x_class1

In [5]:
def getParameters(x): # gets the parameters for a given feature
    
    mu = np.average(x) 
    # mu or mean is the mean of all the values of the feature in the given array
    
    sigma = np.std(x) 
    # sigma or standard deviation is the standard deviation of all the values of the feature in the given array
    
    return mu, sigma

In [6]:
def gaussian(mu, sigma, x): 
    # finds the log likelihood of given feature x belonging to a distribution with mu and sigma as parameters
    
    likelihood = -((math.log(2 * math.pi * math.pow(sigma, 2)) / 2) + (math.pow((x - mu), 2) / (2 * math.pow(sigma, 2))))
    # uses the formula of 1-D gaussian distribution to find the likelihood of x belonging to the distribution
    
    return likelihood

In [7]:
def getAccuracy(y_pred, y_actual):
    #finds the accuracy of predicted labels with respect to actual labels
    
    accuracy = (y_pred == y_actual).all(axis = 0).mean() * 100 
    # compares all the elements of both the arrays along axis 0 or you can say column wise
    # returns 1 if exact match is found and then takes the mean of all the ones 
    # which ultimately gives us the accuracy of the model
    
    return accuracy

### Naive Bayes Classifier

In [8]:
def naive_bayes(data): # naive bayes classifier
    
    mean_feature = getMeanFeature(data['trX']) 
    # gets the mean feature vector for training samples
    
    std_feature = getStandardDeviationFeature(data['trX']) 
    # gets the standard deviation feature vector for training samples
    
    class0_mean_feature, class1_mean_feature = separateClasses(mean_feature, data['trY'])
    # the mean feature vector is separated based on its label as feature has different parameters for different class

    class0_std_feature, class1_std_feature = separateClasses(std_feature, data['trY'])
    # the standard deviation feature vector is separated based on its label as feature has different parameters for different class
    
    class0_mean_feature_mu, class0_mean_feature_sigma = getParameters(class0_mean_feature)
    #finds the parameter mu and sigma for mean feature vector of class 0
    
    class0_std_feature_mu, class0_std_feature_sigma = getParameters(class0_std_feature)
    #finds the parameter mu and sigma for standard deviation feature vector of class 0

    class1_mean_feature_mu, class1_mean_feature_sigma = getParameters(class1_mean_feature)
    #finds the parameter mu and sigma for mean feature vector of class 1

    class1_std_feature_mu, class1_std_feature_sigma = getParameters(class1_std_feature)
    #finds the parameter mu and sigma for standard deviation feature vector of class 1

    # at this point we have the trained model and we will check its performance using the test data
    
    py_1 = np.average(data['trY']) # finding probability of label 1 which is the mean of training label vector
    py_0 = 1 - py_1 # finding probability of label 0 which is (1 - probability of label 1)
    
    predicted_labels = np.zeros((1,0)) # initializing array to store predicted labels

    for i in range(data['tsX'].shape[0]): # prediction loop

        test_mean_feature = np.average(data['tsX'][i]) # finding the mean feature of test sample
        test_std_feature = np.std(data['tsX'][i]) # finding the standard deviation feature of test sample
        
        p_label0 = gaussian(class0_mean_feature_mu, class0_mean_feature_sigma, test_mean_feature) + gaussian(class0_std_feature_mu, class0_std_feature_sigma, test_std_feature) + math.log(py_0)
        #finding the probability of label 0 for the given test sample

        p_label1 = gaussian(class1_mean_feature_mu, class1_mean_feature_sigma, test_mean_feature) + gaussian(class1_std_feature_mu, class1_std_feature_sigma, test_std_feature) + math.log(py_1)
        #finding the probability of label 1 for the given test sample

        # if probability of label 0 is greater than label 1, the predicted label is 1 and vice versa
        if(p_label0 > p_label1): 
            predicted_labels = np.append(predicted_labels, [[0]], axis = 1)
        else:
            predicted_labels = np.append(predicted_labels, [[1]], axis = 1)
        
    
    actual_labels = data['tsY'] # actual labels of test data
    
    accuracy = getAccuracy(predicted_labels, actual_labels) # getting accuracy of the model.
        
    print("Accuracy of Naive-Bayes Classifier:", accuracy)
    print("\n")

### Logistic Regression Classifier

In [9]:
def logistic_regression(data): # logistic regression classifier
    
    x0 = np.ones((12000, 1)) # vector of ones for w0
    mean_feature = getMeanFeature(data['trX']) # vector of mean feature of training samples
    std_feature = getStandardDeviationFeature(data['trX']) # vector of standard deviation feature of training samples

    x = np.concatenate((x0, mean_feature, std_feature), axis = 1) # concatenating the three vectors
    y = data['trY'] # training labels
    w = np.array([[0.001, 0.001, 0.001]]) # initial random weights
    lr = .001 # learning rate
    epochs = 1000 # number of iterations
    
    current_epoch = 0
    
    # training of the model
    while current_epoch != epochs: 
        z = np.divide(np.exp(np.matmul(w, x.transpose())), (np.exp(np.matmul(w, x.transpose())) + 1)) 
        # sigmoid vector or predicted labels using training samples and current weights 
        w = w + (lr * np.matmul((y - z), x)) # updating weights using gradient ascent approach   
        current_epoch += 1
        
    # after this loop we have the trained model and we will check its performance using the test data
    
    # creating the test set
    test_x0 = np.ones((2000, 1))  # vector of ones for w0
    test_mean_feature = getMeanFeature(data['tsX']) # vector of mean feature of test samples
    test_std_feature = getStandardDeviationFeature(data['tsX']) # vector of standard deviation feature of test samples

    test_x = np.concatenate((test_x0, test_mean_feature, test_std_feature), axis = 1) # concatenating the three vectors
    
    predicted_labels = np.matmul(w, test_x.transpose()) 
    # predicting the labels for test data using the trained weights w

    # the values we have are not actually 0 and 1, they are values in between 0 and 1
    # so we use a threshold to convert those values into labels. In this case the threshold is 0.5
    for i in range(predicted_labels.shape[1]):
        if predicted_labels[0][i] < 0.5:
            predicted_labels[0][i] = 0
        else:
            predicted_labels[0][i] = 1
            
    actual_labels = data['tsY'] # actual labels of test data
    
    accuracy = getAccuracy(predicted_labels, actual_labels) # getting accuracy of the model.
    
    print("Accuracy of Logistic Regression Classifier:", accuracy)

In [10]:
# main function

data = scipy.io.loadmat('Datasets/Tshirts_Trousers.mat') # loading the data set
naive_bayes(data) # calling naive bayes classifier for the data
logistic_regression(data) # calling logistic regression classifier for the data

Accuracy of Naive-Bayes Classifier: 83.15


Accuracy of Logistic Regression Classifier: 92.2
