# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mnist


class RidgeReg:
    """L2-regularised binary logistic regression trained with mini-batch SGD.

    Despite the class name, the objective implemented by ``jValue`` is the
    mean logistic loss ``log(1 + exp(-y * (x.w + b)))`` plus the penalty
    ``lambda_ * ||w||^2``.  The loss and gradient formulas treat the labels
    as signed values; the script that uses this class encodes them as -1/+1.
    """

    # Number of SGD updates performed by ``sgd``.
    maxIter = 200

    def load_dataset(self, path="data/"):
        """Load the MNIST train/test splits through the ``mnist`` package.

        Args:
            path: directory handed to ``mnist.MNIST``; defaults to ``"data/"``.

        Returns:
            ``(X_train, Y_train, X_test, Y_test)`` as NumPy arrays, with both
            feature arrays divided by 255.0 and the labels left untouched.
        """
        mndata = mnist.MNIST(path)
        X_train_, labels_train = map(np.array, mndata.load_training())
        X_test_, labels_test = map(np.array, mndata.load_testing())
        return X_train_ / 255.0, labels_train, X_test_ / 255.0, labels_test

    def _margins(self, X, Y, w, b):
        """Return the signed margins ``Y * (X @ w + b)``, one per row of X."""
        return Y * (b + np.matmul(X, w))

    def jValue(self, X, Y, w, b, lambda_):
        """Return the regularised objective J(w, b) evaluated on (X, Y).

        The per-sample loss ``log(1 + exp(-m))`` is evaluated with
        ``np.logaddexp(0, -m)`` so that a badly misclassified sample (large
        negative margin ``m``) does not overflow ``exp`` and turn the whole
        mean into ``inf``.
        """
        margins = self._margins(X, Y, w, b)
        data_term = np.mean(np.logaddexp(0, -margins))
        return data_term + lambda_ * np.sum(np.square(w))

    def mu(self, X, Y, w, b):
        """Return ``sigmoid(Y * (X @ w + b))`` for every sample.

        Written as ``exp(-log(1 + exp(-m)))``, which is algebraically equal to
        ``1 / (1 + exp(-m))`` but does not overflow for large negative ``m``.
        """
        return np.exp(-np.logaddexp(0, -self._margins(X, Y, w, b)))

    def J_w(self, X, Y, w, b, lambda_, size):
        """Return the gradient of J with respect to ``w``.

        ``size`` is the divisor applied to the summed per-sample gradients
        (the number of rows in ``X`` when called from ``sgd``).
        """
        residual = -Y * (1 - self.mu(X, Y, w, b))
        return np.dot(residual, X) / size + 2 * lambda_ * w

    def J_b(self, X, Y, w, b):
        """Return the gradient of J with respect to the offset ``b``."""
        return np.mean(-Y * (1 - self.mu(X, Y, w, b)))

    # get the proportion of wrong answers
    def error(self, y1, y2):
        """Return the fraction of positions at which ``y1`` and ``y2`` differ."""
        matches = np.count_nonzero(np.equal(y1, y2))
        return 1 - (matches / len(y1))

    def toSign(self, X, w, b):
        """Return ``sign(X @ w + b)`` as the predicted label of each row.

        A score of exactly zero yields 0, which matches neither -1 nor +1.
        """
        return np.sign(np.matmul(X, w) + b)

    # stochastic gradient descent
    def sgd(self, X, X_test_, Y, Y_test_, batchSize, step_, lambda_):
        """Run ``maxIter`` mini-batch SGD updates starting from w = 0, b = 0.

        Every iteration draws ``batchSize`` distinct row indices (sampling
        without replacement within the batch), takes one gradient step of
        size ``step_`` and then records the objective and misclassification
        rate on the full training and test sets.

        Args:
            X, Y: training features and labels.
            X_test_, Y_test_: test features and labels.
            batchSize: rows per update; clamped to ``len(Y)`` when larger.
            step_: learning rate.
            lambda_: L2 penalty weight.

        Returns:
            ``(jtrain_list, jtest_list, iter_list, error_train_list,
            error_test_list)``, each an array of length ``maxIter``;
            ``iter_list`` holds 1..maxIter.

        Raises:
            ValueError: if the effective batch size is smaller than 1 (a zero
                batch would divide by zero and fill ``w`` with NaN).
        """
        n_iter = self.maxIter
        n = len(Y)  # = np.shape(X)[0]

        # safeguard for when batch size is more than actual data size
        # usually redundant - but present as a guard
        batchSize = min(batchSize, n)
        if batchSize < 1:
            raise ValueError(
                "batchSize must be at least 1 and the training set non-empty"
            )

        w = np.zeros(np.shape(X)[1])
        b = 0

        jtrain_list = np.empty(n_iter)
        jtest_list = np.empty(n_iter)
        error_train_list = np.empty(n_iter)
        error_test_list = np.empty(n_iter)
        iter_list = np.empty(n_iter)

        for k in range(n_iter):
            batchIndex = np.random.choice(n, batchSize, replace=False)
            # Slice the mini-batch once; both gradients use the same rows.
            X_batch = X[batchIndex, :]
            Y_batch = Y[batchIndex]

            # Both gradients are evaluated at the current (w, b) before
            # either parameter is updated.
            w_step = step_ * self.J_w(X_batch, Y_batch, w, b, lambda_, batchSize)
            b_step = step_ * self.J_b(X_batch, Y_batch, w, b)
            w = w - w_step
            b = b - b_step

            jtrain_list[k] = self.jValue(X, Y, w, b, lambda_)
            jtest_list[k] = self.jValue(X_test_, Y_test_, w, b, lambda_)
            error_train_list[k] = self.error(self.toSign(X, w, b), Y)
            error_test_list[k] = self.error(self.toSign(X_test_, w, b), Y_test_)
            iter_list[k] = k + 1

        return jtrain_list, jtest_list, iter_list, error_train_list, error_test_list

    
def _keep_2_vs_7(features, labels):
    """Keep only the digits 2 and 7 and relabel them as -1 and +1."""
    keep = (labels == 2) | (labels == 7)
    # the default data type was unsigned int, so a signed type is needed
    # before -1 can be stored
    signed = np.where(labels[keep] == 7, 1, -1).astype(np.int8)
    return features[keep], signed


def _plot_curves(iterations, train_curve, test_curve, ylabel, legend, title):
    """Plot a training curve and a test curve against the iteration number."""
    plt.plot(iterations, train_curve)
    plt.plot(iterations, test_curve)
    plt.xlabel('iteration')
    plt.ylabel(ylabel)
    plt.legend(legend)
    plt.title(title)
    plt.show()


R = RidgeReg()
X_train, Y_train, X_test, Y_test = R.load_dataset()

# Reduce both splits to the binary 2-vs-7 problem (7 -> +1, 2 -> -1).
X_train, Y_train = _keep_2_vs_7(X_train, Y_train)
# Now the test set
X_test, Y_test = _keep_2_vs_7(X_test, Y_test)

_lambda = 0.1
step = 0.05

# Stochastic Gradient Descent - Batch Size = 1, then Batch Size = 100.
# ``part`` is the sub-question letter that appears in the plot titles.
for batchSize_, part in ((1, 'c'), (100, 'd')):
    jtrain, jtest, iters, errTrain, errTest = R.sgd(
        X_train, X_test, Y_train, Y_test, batchSize_, step, _lambda)

    _plot_curves(
        iters, jtrain, jtest,
        'J(w,b)',
        ['Training set', 'Test set'],
        '(SGD - Batch Size = {}) Value of J(w,b) by iteration (A6.{}.I)'.format(
            batchSize_, part),
    )
    _plot_curves(
        iters, errTrain, errTest,
        'Proportion of wrong predictions',
        ['Training set Error', 'Test set Error'],
        '(SGD - Batch Size = {}) Misclassification Error (A6.{}.II)'.format(
            batchSize_, part),
    )