In [None]:
from __future__ import print_function
import os, sys
module_path = os.path.abspath(os.path.join('../..'))
sys.path.append(module_path)

import numpy as np
import math
import copy
import pandas as pd
from keras.utils import np_utils
from keras.datasets import fashion_mnist
from sklearn.utils import shuffle
import scipy.stats as ss
import time
import pickle
from joblib import Parallel, delayed

In [None]:
# Prepare the Fashion-MNIST dataset (train / validation / test splits)

# NOTE: `mnist` is imported but not used below; the data comes from `fashion_mnist`.
from keras.datasets import mnist
from keras.utils import np_utils

# Load Fashion-MNIST through Keras
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Fixed seed so the train/validation split is reproducible
x_train, y_train = shuffle(x_train, y_train, random_state=0)

# First 50000 shuffled samples are used for training, the remainder for validation
x_train, x_val = x_train[:50000], x_train[50000:]
y_train, y_val = y_train[:50000], y_train[50000:]

# Split the training set into equally sized batches (a single batch by default)
numBatches = 1
imagesPerBatch = len(x_train)//numBatches

# Pixel centring: the images are documented by Keras as uint8, so subtracting
# 128 wraps modulo 256; reinterpreting those bytes as int8 then yields
# `pixel - 128` in [-128, 127]. `.view(np.int8)` performs the same
# reinterpretation as assigning to `.dtype`, which NumPy discourages.
# Targets are one-hot rows scaled to 16.
trainBatches = []
for i in range(numBatches):
    idx = imagesPerBatch*i
    batchImages = np.subtract(x_train[idx:idx+imagesPerBatch], 128).view(np.int8)
    batchTargets = np_utils.to_categorical(y_train[idx:idx+imagesPerBatch]).astype(int)*16
    trainBatches.append([batchImages, batchTargets])

x_val = np.subtract(x_val, 128).view(np.int8)

x_test = np.subtract(x_test, 128).view(np.int8)

## Net Architecture

In [None]:
# Bounds of a signed 16-bit integer: 2**15 - 1 and -2**15.
SHRT_MAX = 32767
SHRT_MIN = (-SHRT_MAX - 1 )

def isqrt(n):
    """Integer square root by Newton iteration using only floor division.

    For a non-negative integer ``n`` the result is the largest integer whose
    square does not exceed ``n``. The loop is never entered for ``n <= 1``
    (including negative integers), in which case ``n`` itself is returned.
    """
    estimate = n
    candidate = (estimate + 1) // 2
    # Keep refining while the next Newton step is still strictly smaller.
    while candidate < estimate:
        estimate, candidate = candidate, (candidate + n // candidate) // 2
    return estimate

In [None]:
def pocketTanh(matIn, bits, inDims, outDims):
    """Integer piecewise-linear tanh-like activation plus its gradient divisors.

    Each input value is first rescaled with floor division by
    ``(1 << bits) * inDims`` and then mapped through a seven-segment
    piecewise-linear curve that saturates at 128 (for ``x >= 128``) and at
    -127 (for ``x < -127``).

    Parameters
    ----------
    matIn : numpy.ndarray
        Pre-activations with the batch on axis 0. All remaining axes are
        flattened into columns, so a single-column input is handled as well.
    bits : int
        Power-of-two part of the rescaling divisor.
    inDims : int
        Second factor of the rescaling divisor.
    outDims : int
        Number of columns of both returned matrices. Columns beyond the
        number of input columns keep the saturated defaults.

    Returns
    -------
    tuple of numpy.ndarray
        ``(matOut, matActvGradInv)``: the activations (as ``int``) and, for
        every activation, the matching entry of ``slopesInv``, which callers
        use as a divisor for the backward deltas.
    """
    yMax = 128
    yMin = -127
    joints = [128, 75, 32, -31, -74, -127]
    divisor = (1 << bits) * inDims
    slopesInv = [yMax, 8, 2, 1, 2, 8, yMax]

    # One mapping per joint: segments[k] applies to values below joints[k].
    segments = (
        lambda v: v // 4 + 88,
        lambda v: v + 32,
        lambda v: v * 2,
        lambda v: v - 32,
        lambda v: v // 4 - 88,
        lambda v: yMin,
    )

    rows = matIn.shape[0]
    cols = matIn.size // rows if rows else 0
    x = matIn.reshape(rows, cols) // divisor

    # Defaults cover the top saturation region (x >= joints[0]).
    matOut = np.full((rows, outDims), yMax)
    matActvGradInv = np.full((rows, outDims), slopesInv[0])

    # Views restricted to the columns that actually have an input value;
    # writing through them updates the full matrices.
    outView = matOut[:, :cols]
    gradView = matActvGradInv[:, :cols]

    # Joints are visited from the highest to the lowest threshold, so every
    # lower segment overwrites the result of the segments above it.
    for joint, segment, slopeInv in zip(joints, segments, slopesInv[1:]):
        mask = x < joint
        outView[mask] = segment(x[mask])
        gradView[mask] = slopeInv
    return matOut.astype(int), matActvGradInv

In [None]:
def scalarL2Loss(y, yHat):
    """Return the squared difference between ``yHat`` and ``y``, floor-divided by 2."""
    residual = yHat - y
    return residual * residual // 2

def scalarL2LossDelta(y, yHat):
    """Return the signed error ``yHat - y`` (prediction minus target)."""
    residual = yHat - y
    return residual

def batchL2Loss(yMat, yHatMat):
    """Sum ``scalarL2Loss`` over every element of a batch.

    ``yMat`` holds one target row per input and ``yHatMat`` the matching
    predictions. Each row is summed on its own first, then the row totals
    are added up into a single value for the whole batch.
    """
    return sum(
        sum(
            scalarL2Loss(target, yHatMat[row][col])
            for col, target in enumerate(targets)
        )
        for row, targets in enumerate(yMat)
    )

def batchL2LossDelta(yMat, yHatMat):
    """Element-wise loss deltas for a batch together with their overall sum.

    ``yMat`` is a 2-D array of targets (one row per input). Each row of
    ``yHatMat`` is squeezed before indexing, so predictions may carry extra
    singleton axes.

    Returns ``(lossDeltaMat, accumLossDelta)``: an ``int`` matrix shaped like
    ``yMat`` holding ``scalarL2LossDelta`` of every element, and the sum of
    all of those deltas (a sum, not an average).
    """
    numRows, numCols = yMat.shape[0], yMat.shape[1]
    lossDeltaMat = np.zeros((numRows, numCols))
    accumLossDelta = 0
    for row, targets in enumerate(yMat):
        rowDeltaSum = 0
        for col, target in enumerate(targets):
            delta = scalarL2LossDelta(target, yHatMat[row].squeeze()[col])
            lossDeltaMat[row, col] = delta
            rowDeltaSum += delta
        accumLossDelta += rowDeltaSum

    return lossDeltaMat.astype(int), accumLossDelta

In [None]:
class FCLayer:
    """Fully connected layer with integer parameters and a pocketTanh activation.

    Weights and biases start as all-zero ``int`` arrays. ``mDfaWeight`` is the
    matrix that non-output layers multiply the output-layer deltas with; it
    starts as a (1, 1) zero placeholder and is assigned from outside.
    """

    def __init__(self, input_size, output_size, outLayer=False, debug=False):
        """Create the layer.

        input_size  -- number of input neurons
        output_size -- number of output neurons
        outLayer    -- True when this is the network's last layer
        debug       -- stored on the instance; not read by this class
        """
        self.input_size = input_size
        self.output_size = output_size
        self.outLayer = outLayer
        self.debug = debug
        self.weights = np.zeros((input_size, output_size), dtype=int)
        self.bias = np.zeros((1, output_size), dtype=int)
        self.mDfaWeight = np.zeros((1, 1), dtype=int)

    def forward(self, input_data):
        """Return the activations for ``input_data``.

        The input and the activation's gradient divisors are cached on the
        instance because ``backward`` needs both.
        """
        self.input = input_data
        preActivation = self.input @ self.weights
        preActivation += self.bias
        activated, self.matActvGradInv = pocketTanh(
            preActivation, 8, self.input_size, self.output_size
        )
        return activated

    def backward(self, lastLayerDeltasMat, lrInv):
        """Update weights and bias in place from the output-layer deltas.

        Both updates are floor-divided by ``lrInv`` before being subtracted.
        The incoming delta matrix is returned unchanged.
        """
        deltas = self.computeDeltas(lastLayerDeltasMat, lrInv)

        weightStep = ((self.input.T @ deltas) // lrInv).astype(int)
        self.weights -= weightStep

        # Multiplying by a column of ones sums the deltas over the batch.
        batchOnes = np.ones((len(deltas), 1), dtype=int)
        biasStep = ((deltas.T @ batchOnes).T // lrInv).astype(int)
        self.bias -= biasStep

        return lastLayerDeltasMat

    def computeDeltas(self, lastLayerDeltasMat, lrInv):
        """Return this layer's deltas; ``lrInv`` is accepted but not used here.

        The output layer uses the loss deltas directly, any other layer first
        multiplies them with ``mDfaWeight``. In both cases the result is
        floor-divided element-wise by the cached gradient divisors.
        """
        if self.outLayer:
            upstream = lastLayerDeltasMat
        else:
            upstream = lastLayerDeltasMat @ self.mDfaWeight
        return np.floor_divide(upstream, self.matActvGradInv)

In [None]:
class FlattenLayer:
    """Parameter-free layer that flattens each image of a batch into one row."""

    def __init__(self):
        """Nothing to initialise."""

    def forward(self, image):
        """Reshape a (batch, height, width) array to (batch, height * width)."""
        batchSize = image.shape[0]
        pixelsPerImage = image.shape[1] * image.shape[2]
        return image.reshape(batchSize, pixelsPerImage)

    def backward(self, lastLayerDeltasMat, lrInv):
        """Return the incoming deltas unchanged; there is nothing to update."""
        return lastLayerDeltasMat

In [None]:
class Network:
    """Sequential stack of layers trained by feeding the output error to every layer.

    Layers are applied in insertion order. During training each layer's
    ``backward`` receives the same output-layer loss-delta matrix; the value a
    layer returns from ``backward`` is not passed on to the next one.
    """

    def __init__(self):
        self.layers = []
        # Placeholders; nothing in this class assigns or reads them afterwards.
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def test(self, x_test, y_test):
        """Return the classification accuracy in percent.

        ``x_test`` has the sample dimension first and ``y_test`` holds the
        class index of every sample (it is compared with the argmax of the
        network output). An empty test set yields 0.0 instead of a division
        by zero.
        """
        samples = len(x_test)
        if samples == 0:
            return 0.0
        corr = 0
        for j in range(samples):
            if self.predict(x_test[j]) == y_test[j]:
                corr += 1
        return corr / samples * 100

    def predict(self, input_data):
        """Return the argmax of the network output for one single sample."""
        # Layers expect a leading batch axis.
        output = np.expand_dims(input_data, axis=0)
        for layer in self.layers:
            output = layer.forward(output)

        return output.argmax()

    def _snapshotParameters(self):
        """Copy weights and bias of every layer that has both, in layer order."""
        snapshot = []
        for layer in self.layers:
            if hasattr(layer, "weights") and hasattr(layer, "bias"):
                snapshot.append(np.copy(layer.weights))
                snapshot.append(np.copy(layer.bias))
        return snapshot

    def fit(self, x_train, y_train, epochs, miniBatchSize, lrInv, valImages=None, valLabels=None):
        """Train the network and track the best validation accuracy.

        Parameters
        ----------
        x_train, y_train : numpy.ndarray
            Training inputs and per-class target rows, sample dimension
            first. The argmax of a target row is taken as the true class.
        epochs : int
            Number of passes over the training data.
        miniBatchSize : int
            Samples per update step. A trailing partial mini-batch is not
            used.
        lrInv : int
            Passed to every layer's ``backward``.
        valImages, valLabels : optional
            Validation inputs and class indices. When omitted, the
            module-level ``x_val`` / ``y_val`` are used.

        Returns
        -------
        tuple
            ``(train_accs, val_accs, weights)``: per-epoch training and
            validation accuracies in percent, and copies of
            ``[weights, bias, weights, bias, ...]`` of all parameterised
            layers taken at the epoch with the best validation accuracy.
        """
        samples = len(x_train)
        numIter = samples // miniBatchSize
        # Accuracy is measured over the samples that are actually processed.
        samplesSeen = numIter * miniBatchSize
        train_accs, val_accs = [], []
        maxVal = 0
        weights = []

        for epoch in range(epochs):
            epochNumCorrect = 0

            for j in range(numIter):
                idxStart = j * miniBatchSize
                idxEnd = idxStart + miniBatchSize

                miniBatchImages = x_train[idxStart:idxEnd]
                miniBatchTargets = y_train[idxStart:idxEnd]

                # Forward propagation
                output = miniBatchImages
                for layer in self.layers:
                    output = layer.forward(output)

                lossDeltaMat, _deltaSum = batchL2LossDelta(miniBatchTargets, output)

                epochNumCorrect += int(
                    np.sum(miniBatchTargets.argmax(axis=1) == output.argmax(axis=1))
                )

                # Every layer is updated from the same output-layer deltas.
                for layer in reversed(self.layers):
                    layer.backward(lossDeltaMat, lrInv)

            # Training accuracy
            trainAcc = epochNumCorrect / samplesSeen * 100 if samplesSeen else 0.0
            train_accs.append(trainAcc)

            # Validation accuracy
            valAcc = self.test(
                x_val if valImages is None else valImages,
                y_val if valLabels is None else valLabels,
            )
            val_accs.append(valAcc)
            # Keep the parameters of the best epoch so far (always the first one).
            if len(val_accs) == 1 or valAcc > maxVal:
                weights = self._snapshotParameters()
                maxVal = valAcc
        return train_accs, val_accs, weights

## Experiments

In [None]:
# Params
# Number of feedback matrices to try: train() is run for every j in
# range(samples) and uses DFAWeights1[j].
samples = 100
# Epochs per training run.
epochs = 20
# Values passed to Network.fit as lrInv; one full sweep is run per value.
lrs = [2048, 4096, 8192]
# Mini-batch size.
bs = 50

In [None]:
# Pre-generated feedback matrices; train() picks one per run via DFAWeights1[j].
# NOTE(review): each entry presumably has shape (10, 200) so that
# (batch, 10) output deltas map onto the 200 hidden units - confirm against the file.
DFAWeights1 = np.load("DFAWeights1.npy")

In [None]:
def train(i, j, lr):
    """Train one freshly built network and report its validation results.

    i  -- index into trainBatches selecting the training data
    j  -- index into DFAWeights1 selecting the hidden layer's feedback matrix
    lr -- value handed to Network.fit as lrInv

    Returns a one-entry dict keyed by "<lr>-<j>" whose value is
    [per-epoch validation accuracies, best-epoch weights].
    """
    # Net structure: flatten -> 784x200 hidden layer -> 200x10 output layer.
    # (A second hidden layer would also need its own feedback matrices.)
    hiddenLayer = FCLayer(28*28, 200)
    hiddenLayer.mDfaWeight = DFAWeights1[j]

    net = Network()
    for layer in (FlattenLayer(), hiddenLayer, FCLayer(200, 10, outLayer=True)):
        net.add(layer)

    # Train
    images = trainBatches[i][0]
    targets = trainBatches[i][1]
    _, val_acc, weights = net.fit(images, targets, epochs=epochs, miniBatchSize=bs, lrInv=lr)

    return {'{}-{}'.format(lr, j): [val_acc, weights]}

In [None]:
# Run every (lr, j, i) combination across 48 worker processes/threads.
# The generator iterates lr outermost, then j, then i, and the aggregation
# cell below relies on that order (it indexes `res` with samples*k + j,
# which matches only while numBatches == 1).
start_time = time.time()
res = Parallel(n_jobs=48)(delayed(train)(i, j, lr) for lr in lrs for j in range(samples) for i in range(numBatches))
end_time = time.time()
print("Total time: " + str(end_time-start_time) + " s")

In [None]:
# Collect, per learning-rate value, the mean and best validation accuracy
# (as fractions, not percent) and the best-epoch weights of every run.
# Resulting layout: accsLRSingle[lr] = [[means, maxima], weights].
accsLRSingle = {}
for k, lrInv in enumerate(lrs):
    accsMean = []
    accsMax = []
    W = []
    for j in range(samples):
        # Position of run (lr index k, sample j) in `res`; this matches the
        # order in which the runs were submitted only while numBatches == 1.
        index, dictKey = samples*k+j, str(lrInv)+'-'+str(j)
        valAccs, weights = res[index][dictKey]
        # Local names deliberately avoid rebinding the built-in `max`.
        accsMean.append(np.mean(valAccs)/100)
        accsMax.append(np.max(valAccs)/100)
        W.append(weights)
    accsLRSingle[lrInv] = [[accsMean, accsMax], W]

In [None]:
# Persist the aggregated results; the file is overwritten on every run.
with open("accsSingle.pkl", "wb") as f:
    pickle.dump(accsLRSingle, f)