In [0]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os


## Preprocessing Input

42000 training images

Image pixels converted to binary (0 - for pixel value 0, 1 - for non-zero pixel value)

Image labels are converted to binary (0 - if label is not integer 'i', 1 - if label is integer 'i')

The negative class (0) is randomly down-sampled to the size of the positive class (1), so the training data contains an equal number of samples from both classes.

In [0]:
from collections import Counter
def preprocess_input(i, path='train.csv', random_state=None):
    """Build a balanced, binarised one-vs-rest training set for digit ``i``.

    Reads the CSV at ``path`` (a ``label`` column plus pixel columns), marks
    each row 1 when its label equals ``i`` and 0 otherwise, maps every pixel
    to 0 (zero) or 1 (non-zero), and down-samples so that both classes
    contribute the same number of rows.

    Parameters
    ----------
    i : int
        Label treated as the positive class.
    path : str, optional
        CSV file to read. Defaults to ``'train.csv'``.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so the down-sampling is
        repeatable. ``None`` draws a different sample on every call.

    Returns
    -------
    tuple
        ``(train_X, train_labels)``: a 2-D array of 0/1 pixels with the
        negative rows first and the positive rows last, and a Series holding
        the matching 0/1 labels.
    """
    raw = pd.read_csv(path)
    pixel_cols = [col for col in raw.columns if col != 'label']

    # Compare against the untouched labels directly; this needs no sentinel
    # values and is a single pass over the column.
    binary = raw[pixel_cols].astype(bool).astype(int)
    binary['label'] = (raw['label'] == i).astype(int)

    in_class = binary[binary['label'] == 1]
    non_class = binary[binary['label'] == 0]

    # Balance to the smaller class so sampling can never ask for more rows
    # than exist.
    n_per_class = min(len(in_class.index), len(non_class.index))
    non_class = non_class.sample(n=n_per_class, random_state=random_state)
    if len(in_class.index) > n_per_class:
        in_class = in_class.sample(n=n_per_class, random_state=random_state)

    final_data = pd.concat([non_class, in_class], sort=False)
    train_labels = final_data['label']
    train_X = np.array(final_data[pixel_cols])
    return (train_X, train_labels)



## Logistic Regression

Loss - Binary Cross Entropy

Intercept added to input X for bias.

Prediction based on threshold value (default - 0.5)

In [0]:
class LogisticRegression:
    """Binary logistic regression fitted by full-batch gradient descent.

    The model minimises the mean binary cross-entropy. When ``fit_intercept``
    is true a column of ones is prepended to ``X`` so that ``theta[0]`` acts
    as the bias term.

    Parameters
    ----------
    lr : float
        Step size of each gradient-descent update.
    num_iter : int
        Number of gradient-descent updates performed by ``fit``.
    fit_intercept : bool
        Whether to prepend the constant column of ones to the inputs.
    verbose : bool
        When true, ``fit`` prints the loss every 10000 iterations.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Logistic function evaluated without overflowing ``np.exp``."""
        z = np.asarray(z, dtype=float)
        # exp is only ever taken of a non-positive number, so it stays in
        # (0, 1]; the two branches are algebraically the same function.
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    def __loss(self, h, y):
        """Mean binary cross-entropy of probabilities ``h`` against labels ``y``."""
        # Keep probabilities away from exactly 0 and 1 so log() stays finite.
        h = np.clip(h, 1e-15, 1 - 1e-15)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        """Learn ``self.theta`` from features ``X`` and 0/1 labels ``y``.

        Runs ``num_iter`` gradient-descent steps starting from all-zero
        weights. The result is stored on ``self.theta``; nothing is returned.
        """
        # Work on plain float arrays so list, ndarray and pandas inputs all
        # behave the same way.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.theta = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            h = self.__sigmoid(np.dot(X, self.theta))
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

            if self.verbose and i % 10000 == 0:
                # Report the loss of the weights *after* this update.
                h = self.__sigmoid(np.dot(X, self.theta))
                print(f'loss: {self.__loss(h, y)} \t')

    def predict_prob(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X, threshold=0.5):
        """Return a boolean array: True where the probability is >= ``threshold``."""
        return self.predict_prob(X) >= threshold

## Training

Trained 10 classifiers for 10 distinct digits (0-9).

Learning rate = 0.1

Iterations = 50000

In [0]:
def train_classifiers(lr=0.1, num_iter=50000, output_dir='results'):
    """Train one one-vs-rest logistic-regression classifier per digit 0-9.

    For each digit the balanced training set is built with
    ``preprocess_input``, a ``LogisticRegression`` model is fitted, and its
    weight vector is written as text to ``<output_dir>/theta_<digit>``.

    Parameters
    ----------
    lr : float, optional
        Learning rate passed to every classifier.
    num_iter : int, optional
        Number of gradient-descent iterations per classifier.
    output_dir : str, optional
        Directory receiving the ``theta_<digit>`` files. It defaults to
        ``'results'``, the directory ``check_i`` loads weights from, and is
        created if it does not exist.

    Returns
    -------
    list
        The ten fitted models, indexed by digit.
    """
    os.makedirs(output_dir, exist_ok=True)

    classifiers = []
    for i in range(10):
        print('Training for ',i)
        train_X, train_labels = preprocess_input(i)
        model = LogisticRegression(lr=lr, num_iter=num_iter)
        model.fit(train_X, train_labels)
        classifiers.append(model)
        filename = os.path.join(output_dir, 'theta_' + str(i))
        np.savetxt(filename, model.theta, delimiter=',')
    return classifiers

# classifiers = train_classifiers()
    



## Testing

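Preprocess the evaluation inputs (pixel values and class labels are converted to binary). Note that the evaluation below runs on the full, unbalanced `train.csv`, not on a separate held-out test set.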

Given an integer, load the weights of the corresponding classifier (e.g. for integer 1, load the weights from the file 'results/theta_1'). Compute H(X*THETA), where H is the sigmoid function. Compare the result with the threshold to decide the final class (0 or 1).

In [0]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
def check_i(i, threshold=0.5, weight_dir='results', data_path='train.csv'):
    """Evaluate the saved one-vs-rest classifier for digit ``i`` and print metrics.

    Loads the weight vector from ``<weight_dir>/theta_<i>``, binarises the
    pixels and labels of the CSV at ``data_path``, predicts class 1 wherever
    the sigmoid of the linear score is at least ``threshold``, and prints the
    accuracy, the confusion matrix and the weighted F1 score.

    Parameters
    ----------
    i : int
        Digit whose classifier is evaluated (the positive class).
    threshold : float, optional
        Probability cut-off for predicting class 1.
    weight_dir : str, optional
        Directory containing the ``theta_<i>`` weight files.
    data_path : str, optional
        CSV file with a ``label`` column plus pixel columns.
    """
    weightfile = os.path.join(weight_dir, 'theta_' + str(i))
    weights = np.loadtxt(weightfile, delimiter=',')

    data = pd.read_csv(data_path)
    pixel_cols = [col for col in data.columns if col != 'label']

    # 1 where the row is digit i, 0 otherwise; compared against the raw labels
    # so no sentinel values are needed.
    labels = (data['label'] == i).astype(int)
    pixels = np.array(data[pixel_cols].astype(bool).astype(int))

    # The weights were fitted with a leading intercept column of ones.
    intercept = np.ones((pixels.shape[0], 1))
    design = np.concatenate((intercept, pixels), axis=1)

    # Sigmoid written so that exp() only sees non-positive arguments and
    # therefore cannot overflow.
    z = np.dot(design, weights)
    e = np.exp(-np.abs(z))
    prob = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    result = prob >= threshold

    print(accuracy_score(labels, result))
    print('Confusion Matrix')
    print(confusion_matrix(labels, result))
    print('F1 Score')
    print(f1_score(labels, result, average='weighted') )

# Evaluate each of the ten one-vs-rest classifiers at a fixed threshold of 0.5.
# check_i prints the accuracy, confusion matrix and F1 score for digit i.
for i in range(10):
    print('Accuracy for i = ',i,' with threshold = 0.5 :')
    check_i(i,0.5)

Accuracy for i =  0  with threshold = 0.5 :
0.9783333333333334
Confusion Matrix
[[36983   885]
 [   25  4107]]
F1 Score
0.9792300218596299
Accuracy for i =  1  with threshold = 0.5 :
0.980452380952381
Confusion Matrix
[[36525   791]
 [   30  4654]]
F1 Score
0.9810860079370226
Accuracy for i =  2  with threshold = 0.5 :
0.9468571428571428
Confusion Matrix
[[35787  2036]
 [  196  3981]]
F1 Score
0.9509908097767376
Accuracy for i =  3  with threshold = 0.5 :
0.9301190476190476
Confusion Matrix
[[34945  2704]
 [  231  4120]]
F1 Score
0.9366647761760928
Accuracy for i =  4  with threshold = 0.5 :
0.9591190476190476
Confusion Matrix
[[36288  1640]
 [   77  3995]]
F1 Score
0.9619802952753425
Accuracy for i =  5  with threshold = 0.5 :
0.9140952380952381
Confusion Matrix
[[34851  3354]
 [  254  3541]]
F1 Score
0.9247347345749259
Accuracy for i =  6  with threshold = 0.5 :
0.9700952380952381
Confusion Matrix
[[36659  1204]
 [   52  4085]]
F1 Score
0.971691665029103
Accuracy for i =  7  with thr