In [None]:
from __future__ import print_function

import copy
import math
import os
import pickle
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from keras.datasets import fashion_mnist
from keras.utils import np_utils
from sklearn.utils import shuffle

## Dataset

In [2]:
# Prepare FashionMNIST Dataset
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, y_train = shuffle(x_train, y_train, random_state=0)

# Train and Validation sets
x_train, x_val = x_train[:50000], x_train[50000:]
y_train, y_val = y_train[:50000], y_train[50000:]


def center_pixels(images):
    """Return `images` shifted by -128 as a new int8 array.

    The subtraction is done in int16 so it cannot wrap, and the result is then
    narrowed to int8. For pixel values in 0..255 this yields -128..127, the same
    values the previous "subtract in uint8, then reinterpret the buffer as int8"
    trick produced, without depending on unsigned wraparound or on assigning to
    the array's `dtype` attribute.
    """
    return (np.asarray(images).astype(np.int16) - 128).astype(np.int8)


# Split the train dataset into different batches
num_batches = 1
images_per_batch = len(x_train) // num_batches

# Each entry is [centered int8 images, one-hot integer targets scaled by 16]
train_batches = []
for i in range(num_batches):
    idx = images_per_batch * i
    batch_images = center_pixels(x_train[idx:idx + images_per_batch])
    batch_targets = np_utils.to_categorical(y_train[idx:idx + images_per_batch]).astype(int) * 16
    train_batches.append([batch_images, batch_targets])

# Validation set
x_val = center_pixels(x_val)

# Test set
x_test = center_pixels(x_test)

## Net Architecture

In [3]:
# Limits of a signed 16-bit integer
SHRT_MAX = 32767
SHRT_MIN = -SHRT_MAX - 1

def isqrt(n):
    """Integer square root of `n` by Newton iteration with floor division.

    Starts from `n` itself and keeps replacing the estimate with
    (estimate + n // estimate) // 2 for as long as that makes it smaller.
    Only `+` and `//` are used, so a float `n` produces a float result.
    """
    estimate = n
    candidate = (estimate + 1) // 2
    while candidate < estimate:
        # the right-hand side is evaluated with the old `candidate`, which is the new estimate
        estimate, candidate = candidate, (candidate + n // candidate) // 2
    return estimate

In [None]:
# DFA WEIGHTS
def DFA_weights_uniform(in_dim, out_dim):
    """Draw a random integer feedback matrix of shape (in_dim, out_dim).

    Entries are sampled with np.random.randint from [-bound, bound), where
    bound = isqrt(12 * SHRT_MAX // (in_dim + out_dim)). The upper end is
    exclusive, as in np.random.randint.

    Integer division keeps the whole computation in integers, so randint
    receives integer bounds rather than the float the true division produced.
    """
    # `bound` rather than `range`, which would shadow the builtin inside this function
    bound = int(isqrt((12 * SHRT_MAX) // (in_dim + out_dim)))
    return np.random.randint(-bound, bound, (in_dim, out_dim))

In [None]:
# PLA tanh Activation function
def PLA_tanh(act_in, in_dim, out_dim):
    """Piecewise-linear integer approximation of tanh, plus its inverse slope.

    Each pre-activation is first rescaled with floor division:
        val = act_in // (256 * in_dim)
    and then mapped segment by segment:

        val >= 128          -> 128            (inverse slope 128)
        75  <= val < 128    -> val // 4 + 88  (inverse slope 8)
        32  <= val < 75     -> val + 32       (inverse slope 2)
        -31 <= val < 32     -> val * 2        (inverse slope 1)
        -74 <= val < -31    -> val - 32       (inverse slope 2)
        -127 <= val < -74   -> val // 4 - 88  (inverse slope 8)
        val < -127          -> -127           (inverse slope 128)

    Parameters
    ----------
    act_in : ndarray whose first axis is the batch and which holds `out_dim`
        values per sample (extra singleton axes are accepted).
    in_dim : fan-in of the layer, used only for the rescaling above.
    out_dim : number of units per sample.

    Returns
    -------
    (act_out, act_grad_inv) : two integer arrays of shape (batch, out_dim).

    The whole array is processed at once instead of element by element, which
    also makes the function work when out_dim == 1.
    """
    y_max, y_min = 128, -127
    intervals = [128, 75, 32, -31, -74, -127]
    slopes_inv = [y_max, 8, 2, 1, 2, 8, y_max]

    val = np.reshape(act_in, (act_in.shape[0], out_dim)) // ((1 << 8) * in_dim)

    # Output of each segment, aligned with `intervals` (exclusive upper bounds)
    segment_out = [val // 4 + 88, val + 32, val * 2, val - 32, val // 4 - 88, y_min]

    # Start saturated at the top, then walk the bounds downwards: every bound
    # that still lies above `val` overrides the previous choice, so the lowest
    # matching segment wins.
    act_out = np.full(val.shape, y_max)
    act_grad_inv = np.full(val.shape, slopes_inv[0])
    for k, upper in enumerate(intervals):
        below = val < upper
        act_out = np.where(below, segment_out[k], act_out)
        act_grad_inv = np.where(below, slopes_inv[k + 1], act_grad_inv)
    return act_out.astype(int), act_grad_inv.astype(int)

In [None]:
# L2 Loss Function
def L2(y_true, net_out):
    """Element-wise error signal `net_out - y_true`, returned as integers.

    Parameters
    ----------
    y_true : 2-D array of targets, shape (batch, outputs).
    net_out : network outputs holding the same number of values per sample;
        extra singleton axes are accepted and folded to `y_true`'s shape.

    Returns
    -------
    Integer array with the shape of `y_true`.

    The difference is taken on whole arrays, which also covers the
    single-output-column case.
    """
    y_true = np.asarray(y_true)
    error = np.reshape(net_out, y_true.shape) - y_true
    return error.astype(int)

In [None]:
# Flatten Layer
class FlattenLayer:
    """Parameter-free layer that flattens each sample into a single row."""

    def __init__(self):
        pass

    def forward(self, flatten_in):
        """Reshape (batch, d1, d2, ...) to (batch, d1*d2*...).

        The feature count is the product of all trailing dimensions, so any
        input rank is accepted. For (batch, height, width) input this is the
        same (batch, height*width) reshape as before.
        """
        n_samples = flatten_in.shape[0]
        # computed explicitly rather than with -1, which cannot be inferred for an empty batch
        n_features = int(np.prod(flatten_in.shape[1:]))
        return flatten_in.reshape(n_samples, n_features)

    def backward(self, loss, lr_inv):
        """Nothing to update; hand `loss` back unchanged (`lr_inv` is unused)."""
        return loss

In [None]:
# FC Layer
class FCLayer:
    """Fully connected integer layer trained with Direct Feedback Alignment (DFA).

    Attributes
    ----------
    weights : (in_dim, out_dim) integer array, initialised to zeros.
    bias : (1, out_dim) integer array, initialised to zeros.
    DFA_weights : feedback matrix used by non-final layers. It starts as a
        (1, 1) placeholder and must be replaced with an array of shape
        (loss width, out_dim) before `backward` is called on a hidden layer.
    last_layer : when True the loss is used directly, without a feedback matrix.
    """

    def __init__(self, in_dim, out_dim, last_layer = False):
        self.in_dim, self.out_dim = in_dim, out_dim
        self.last_layer = last_layer
        self.weights = np.zeros((in_dim, out_dim)).astype(int)
        self.bias = np.zeros((1, out_dim)).astype(int)
        self.DFA_weights = np.zeros((1, 1)).astype(int)

    def forward(self, fc_in):
        """Affine map followed by PLA_tanh.

        Stores the input and the activation's inverse slopes for `backward`.
        """
        self.input = fc_in
        dot = (self.input @ self.weights) + self.bias
        output, self.act_grad_inv = PLA_tanh(dot, self.in_dim, self.out_dim)
        return output

    def backward(self, loss, lr_inv):
        """Update weights and bias in place from `loss`; return `loss` unchanged.

        Both updates are floor-divided by `lr_inv` (the inverse learning rate)
        before being subtracted from the parameters.
        """
        d_DFA = self.compute_dDFA(loss, lr_inv)
        weights_update = ((self.input.T @ d_DFA) // lr_inv).astype(int)
        self.weights -= weights_update
        # column sums of d_DFA: the same values as d_DFA.T @ ones, transposed to a row
        bias_update = (d_DFA.sum(axis=0, keepdims=True) // lr_inv).astype(int)
        self.bias -= bias_update
        return loss

    def compute_dDFA(self, loss, lr_inv):
        """Return the layer's error signal, floor-divided by the stored inverse slopes.

        The last layer uses `loss` directly; other layers first project it
        through `DFA_weights`. `lr_inv` is accepted but not used here.

        Raises
        ------
        ValueError
            If this is not the last layer and `DFA_weights` does not have shape
            (loss.shape[1], out_dim). A mismatch in either dimension is rejected:
            a matrix with the right row count but a single column would
            otherwise broadcast silently and give a wrong update.
        """
        if self.last_layer:
            return np.floor_divide(loss, self.act_grad_inv)

        expected_shape = (loss.shape[1], self.weights.shape[1])  # (rows, cols)
        if self.DFA_weights.shape != expected_shape:
            raise ValueError(
                "DFA not initialized! Expected DFA_weights of shape {}, got {}".format(
                    expected_shape, self.DFA_weights.shape))
        dot = loss @ self.DFA_weights
        return np.floor_divide(dot, self.act_grad_inv)

In [8]:
# Network
class Network:
    """Ordered stack of layers with per-sample prediction and mini-batch training.

    Every layer must provide `forward(x)` and `backward(loss, lr_inv)`.
    """

    def __init__(self):
        self.layers = []

    # add layer to network
    def add(self, layer):
        """Append `layer` to the end of the stack."""
        self.layers.append(layer)

    # Test
    def test(self, x_test, y_test):
        """Return the accuracy in percent, predicting the samples one at a time.

        `y_test` holds class indices, which are compared with `predict`'s output.
        """
        corr = 0
        for j in range(len(x_test)):
            if self.predict(x_test[j]) == y_test[j]:
                corr += 1
        return corr / len(x_test) * 100

    # Predict output
    def predict(self, input_data):
        """Run one sample through all layers and return the argmax of the output."""
        output = np.expand_dims(input_data, axis=0)  # add the batch axis
        for layer in self.layers:
            output = layer.forward(output)
        return output.argmax()

    def _snapshot_parameters(self):
        """Copy weights and bias of every layer that has both, in layer order."""
        snapshot = []
        for layer in self.layers:
            if hasattr(layer, "weights") and hasattr(layer, "bias"):
                snapshot.append(np.copy(layer.weights))
                snapshot.append(np.copy(layer.bias))
        return snapshot

    # train the network
    def fit(self, x_train, y_train, epochs, mini_batch_size, lr_inv,
            x_validation=None, y_validation=None):
        """Train for `epochs` epochs and keep the parameters of the best epoch.

        Parameters
        ----------
        x_train, y_train : training inputs and one-hot style targets.
        epochs : number of passes over the training data.
        mini_batch_size : samples per update. A trailing partial batch is skipped.
        lr_inv : inverse learning rate, passed to every layer's `backward`.
        x_validation, y_validation : validation inputs and class-index labels.
            When omitted, the notebook-level `x_val` / `y_val` are used.

        Returns
        -------
        (train_accs, val_accs, weights)
            Per-epoch training and validation accuracy in percent, and copies
            of [weights, bias, ...] for all parameterised layers, taken at the
            epoch with the highest validation accuracy.

        Training accuracy is computed over the samples actually processed, so
        it is not biased low when `len(x_train)` is not a multiple of
        `mini_batch_size`.
        """
        # Fall back to the notebook globals so existing five-argument calls keep working
        if x_validation is None:
            x_validation = x_val
        if y_validation is None:
            y_validation = y_val

        train_accs, val_accs, weights = [], [], []
        max_val_acc = 0
        n_batches = len(x_train) // mini_batch_size
        n_seen = n_batches * mini_batch_size

        for _ in range(epochs):
            epoch_corr = 0
            for j in range(n_batches):
                idx_start = j * mini_batch_size
                idx_end = idx_start + mini_batch_size
                batch_target = y_train[idx_start:idx_end]

                # Forward propagation
                fwd_out = x_train[idx_start:idx_end]
                for layer in self.layers:
                    fwd_out = layer.forward(fwd_out)

                # Loss
                loss = L2(batch_target, fwd_out)

                # Correct predictions in this batch (row-wise argmax comparison)
                n_rows = len(batch_target)
                target_cls = np.reshape(batch_target, (n_rows, -1)).argmax(axis=1)
                output_cls = np.reshape(fwd_out, (n_rows, -1)).argmax(axis=1)
                epoch_corr += int(np.sum(target_cls == output_cls))

                # Backward propagation: every layer receives the same output loss (DFA)
                for layer in reversed(self.layers):
                    layer.backward(loss, lr_inv)

            acc = epoch_corr / n_seen * 100 if n_seen else 0.0
            train_accs.append(acc)

            # Validation accuracy
            val_acc = self.test(x_validation, y_validation)
            val_accs.append(val_acc)

            # Save weights of the best model
            if len(val_accs) == 1 or val_acc > max_val_acc:
                weights = self._snapshot_parameters()
                max_val_acc = val_acc
        return train_accs, val_accs, weights

## Experiments

In [9]:
# Hyperparameters Configuration H
# bs: mini-batch size, epochs: passes over the training data
bs, epochs = 50, 20
# Inverse learning rates to sweep: 2**11, 2**12, 2**13
lrs_inv = [1 << exponent for exponent in (11, 12, 13)]

# Number of runs
runs = 50

In [10]:
## LOAD DFA WEIGHTS
# Feedback matrices read from a NumPy .npy file, path relative to the working
# directory. `train` indexes this array by run number (DFA_weights1[run]) and
# assigns the result as the hidden layer's DFA_weights.
# NOTE(review): presumably one (10, 200) matrix per run, first axis >= runs; confirm
# against the script that generated the file.
DFA_weights1 = np.load("res/dfa/DFA_weights1.npy")

### Training single model

In [None]:
def train(batch, run, lr):
    """Build a fresh Flatten -> FC(784, 200) -> FC(200, 10) network and train it.

    Parameters
    ----------
    batch : index into `train_batches` selecting the (images, targets) pair.
    run : run number; selects the hidden layer's feedback matrix `DFA_weights1[run]`.
    lr : inverse learning rate, forwarded to `Network.fit` as `lr_inv`.

    Returns
    -------
    dict with the single key "<lr>-<run>" mapping to
    [per-epoch validation accuracies, best-epoch weights].
    """
    # Net structure
    net = Network()
    for layer in (FlattenLayer(), FCLayer(28*28, 200), FCLayer(200, 10, last_layer=True)):
        net.add(layer)

    # Only the hidden layer needs a feedback matrix; the last layer uses the loss directly
    hidden_layer = net.layers[1]
    hidden_layer.DFA_weights = DFA_weights1[run]

    # Train
    images, targets = train_batches[batch]
    _, val_acc, weights = net.fit(images, targets, epochs=epochs, mini_batch_size=bs, lr_inv=lr)

    result_key = str(lr) + '-' + str(run)
    return {result_key: [val_acc, weights]}

In [12]:
# Parallelization of the training procedure using joblib
# One job per (learning rate, run, batch); results come back in this same order,
# learning rate varying slowest and batch fastest.
start_time = time.time()
single_res = Parallel(n_jobs=-1)(
    delayed(train)(batch, run, lr)
    for lr in lrs_inv
    for run in range(runs)
    for batch in range(num_batches)
)
end_time = time.time()
print("Total time: " + str(end_time-start_time) + " s")

Total time: 7304.724001407623 s


In [13]:
# Reorder results of single networks trained on the entire dataset
# Every job returned a one-entry dict keyed "<lr_inv>-<run>". Merging them lets
# each result be looked up by key instead of by a computed position in single_res.
# (If num_batches > 1, several jobs share a key and the last one wins.)
results_by_key = {}
for job_result in single_res:
    results_by_key.update(job_result)

# single_nets[lr_inv] = [[mean val. accuracy per run, max val. accuracy per run], weights per run]
single_nets = {}
for lr_inv in lrs_inv:
    accs_mean, accs_max, W = [], [], []
    # `run` is a separate loop variable so the global `runs` count is never overwritten
    for run in range(runs):
        val_accs, net_weights = results_by_key[str(lr_inv) + '-' + str(run)]
        # accuracies are stored in percent; convert to fractions
        accs_mean.append(np.mean(val_accs) / 100)
        accs_max.append(np.max(val_accs) / 100)
        W.append(net_weights)
    single_nets[lr_inv] = [[accs_mean, accs_max], W]

### Serialization

In [14]:
# Save single nets
out_path = "out/single/accs_single.pkl"
# Create the target directory first so the results of a long training run are
# not lost to a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "wb") as f:
    pickle.dump(single_nets, f)