# FeedForward NN Training

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pickle
import numpy as np
from sklearn.utils import shuffle
from joblib import Parallel, delayed
from keras.utils import to_categorical
from keras.datasets import mnist, fashion_mnist

## Hyperparameter Definition

In [41]:
# Number of worker nodes (one network replica and one data shard per worker)
n_workers = 8

# Hyperparameters H
# Number of consecutive samples of a shard that are trained on together
buffer_len = 50
# Samples per training step; a buffer holds buffer_len // mini_batch_size mini batches
mini_batch_size = 25
# Number of passes over each buffer
epochs = 5
# Inverse learning rates to evaluate (weight updates are floor-divided by these)
lrs_inv = [2048, 4096, 8192]

# Number of runs per learning rate (run i uses the feedback matrix DFA_weights[i])
n_runs = 100

## Datasets

In [29]:
def prepare_dataset(x_train, y_train, x_test):
    """Shuffle, shard and rescale a dataset for the worker nodes.

    The training set is shuffled with a fixed seed and split into
    ``n_workers`` equally sized, contiguous shards; samples that do not fill
    a whole shard are dropped. Pixel values are shifted by -128 and stored as
    int8; labels are one-hot encoded with the hot entry set to 16.

    Args:
        x_train: Training images; expected to hold values in [0, 255]
            (e.g. uint8 images) so that the shifted values fit in int8.
        y_train: Integer class labels of the training images.
        x_test: Test images, same value range as ``x_train``.

    Returns:
        A tuple ``(train_batches, x_test)`` where ``train_batches[i]`` is the
        ``[images, one_hot_labels]`` pair of worker ``i`` and ``x_test`` is
        the rescaled test set.
    """
    x_train, y_train = shuffle(x_train, y_train, random_state=0)

    def to_int8(images):
        # Widen before subtracting and cast explicitly. Reinterpreting the
        # buffer by assigning ``.dtype`` only works for 1-byte inputs and
        # silently corrupts the shape/values of any other dtype.
        return (np.asarray(images).astype(np.int16) - 128).astype(np.int8)

    # Fix the one-hot width from the whole training set so that every shard
    # has the same number of columns, even if a shard misses the last class.
    n_classes = int(np.max(y_train)) + 1

    # Split the train dataset into different batches
    images_per_batch = len(x_train) // n_workers
    train_batches = []
    for worker in range(n_workers):
        start = images_per_batch * worker
        stop = start + images_per_batch

        # Scale the images to [-128, 127]
        images = to_int8(x_train[start:stop])

        # One-hot encode the labels (hot value 16 instead of 1)
        labels = to_categorical(y_train[start:stop], num_classes=n_classes).astype(int) * 16

        train_batches.append([images, labels])

    # Test set (scaled to [-128, 127])
    return train_batches, to_int8(x_test)

## Net Architecture

In [5]:
# Largest value of a signed 16-bit integer
SHRT_MAX = 32767
# Smallest value of a signed 16-bit integer (-32768)
SHRT_MIN = (-SHRT_MAX - 1 )

def isqrt(n):
    """Return the floor of the square root of ``n`` using Newton's iteration.

    Only floor division is used, so integer inputs stay integers. The
    estimate starts at ``n`` and is refined while it keeps decreasing.
    """
    estimate = n
    candidate = (estimate + 1) // 2
    while candidate < estimate:
        # Accept the smaller candidate and derive the next one from it
        estimate, candidate = candidate, (candidate + n // candidate) // 2
    return estimate

In [6]:
# DFA WEIGHTS
def DFA_uniform(in_dim, out_dim):
    """Draw a random integer feedback matrix for DFA.

    Entries are sampled uniformly from the half-open interval
    ``[-bound, bound)`` with
    ``bound = isqrt(12 * SHRT_MAX // (in_dim + out_dim))``.

    Args:
        in_dim: Number of rows of the matrix.
        out_dim: Number of columns of the matrix.

    Returns:
        Integer array of shape ``(in_dim, out_dim)``.
    """
    # Floor division keeps the whole computation in integers (the integer
    # square root and randint both expect integer arguments); the local is
    # named ``bound`` so the builtin ``range`` is not shadowed.
    bound = isqrt((12 * SHRT_MAX) // (in_dim + out_dim))
    return np.random.randint(-bound, bound, (in_dim, out_dim))

In [7]:
# PLA tanh Activation function
def PLA_tanh(act_in, in_dim, out_dim):
    """Integer piecewise-linear approximation of tanh and its inverse slope.

    The pre-activations are first rescaled with a floor division by
    ``256 * in_dim`` and then mapped segment by segment:

        val >= 128          ->  128             (inverse slope 128)
        75  <= val < 128    ->  val // 4 + 88   (inverse slope 8)
        32  <= val < 75     ->  val + 32        (inverse slope 2)
        -31 <= val < 32     ->  val * 2         (inverse slope 1)
        -74 <= val < -31    ->  val - 32        (inverse slope 2)
        -127 <= val < -74   ->  val // 4 - 88   (inverse slope 8)
        val < -127          ->  -127            (inverse slope 128)

    Args:
        act_in: Pre-activations with ``act_in.shape[0]`` samples and
            ``out_dim`` values per sample.
        in_dim: Input dimension of the layer, used for the rescaling.
        out_dim: Number of units of the layer.

    Returns:
        A tuple ``(act_out, act_grad_inv)`` of integer arrays of shape
        ``(act_in.shape[0], out_dim)``: the activations and the inverse
        slopes of the segment each activation fell into.
    """
    y_max, y_min = 128, -127
    intervals = [128, 75, 32, -31, -74, -127]
    slopes_inv = [y_max, 8, 2, 1, 2, 8, y_max]

    n_samples = act_in.shape[0]
    # Whole-array rescaling replaces the per-element Python loop
    val = np.asarray(act_in).reshape(n_samples, out_dim) // ((1 << 8) * in_dim)

    # np.select returns the choice of the FIRST matching condition, so the
    # segments are listed from the most negative threshold upwards.
    conditions = [
        val < intervals[5],
        val < intervals[4],
        val < intervals[3],
        val < intervals[2],
        val < intervals[1],
        val < intervals[0],
    ]
    outputs = [
        y_min,
        val // 4 - 88,
        val - 32,
        val * 2,
        val + 32,
        val // 4 + 88,
    ]
    inverse_slopes = [
        slopes_inv[6],
        slopes_inv[5],
        slopes_inv[4],
        slopes_inv[3],
        slopes_inv[2],
        slopes_inv[1],
    ]

    # Values at or above the highest threshold saturate at y_max
    act_out = np.select(conditions, outputs, default=y_max)
    act_grad_inv = np.select(conditions, inverse_slopes, default=slopes_inv[0])
    return act_out.astype(int), act_grad_inv.astype(int)

In [8]:
# L2 Loss Function Gradient
def L2_gradient(y_true, net_out):
    """Return the element-wise error ``net_out - y_true`` as integers.

    Args:
        y_true: 2-D array of targets, one row per sample.
        net_out: Network outputs for the same samples; singleton axes after
            the first one are ignored.

    Returns:
        Integer array with the shape of ``y_true``.
    """
    # Nothing to subtract (and reshape(-1) is ambiguous on empty arrays)
    if y_true.size == 0:
        return np.zeros(y_true.shape).astype(int)

    n_samples, n_outputs = y_true.shape[0], y_true.shape[1]
    # One row per sample, restricted to the columns that have a target
    outputs = np.asarray(net_out)[:n_samples].reshape(n_samples, -1)[:, :n_outputs]
    # Single array subtraction instead of a per-element Python loop
    return (outputs - y_true).astype(int)

In [9]:
# Flatten Layer
class FlattenLayer:
    """Layer that flattens every sample of a batch into a vector."""

    def __init__(self):
        pass

    def forward(self, image):
        """Reshape ``image`` from ``(batch, ...)`` to ``(batch, features)``.

        Args:
            image: Array whose first axis indexes the samples.

        Returns:
            The same data with all axes after the first merged into one.
        """
        n_samples = image.shape[0]
        # Explicit per-sample size instead of a bare ``except`` fallback:
        # works for any rank and keeps empty batches reshapeable (unlike -1).
        n_features = int(np.prod(image.shape[1:], dtype=int))
        return image.reshape(n_samples, n_features)

    def backward(self, loss, lr_inv):
        """Nothing to train: hand the loss back unchanged."""
        return loss

In [10]:
# FC Layer
class FCLayer:
    """Fully connected integer layer with PLA-tanh activation, trained with DFA.

    The last layer uses the loss directly; any other layer first projects the
    loss through its fixed feedback matrix ``DFA_weights``.
    """

    def __init__(self, in_dim, out_dim, desc = "", last_layer = False):
        self.in_dim, self.out_dim = in_dim, out_dim
        self.desc = desc
        self.last_layer = last_layer
        self.weights = np.zeros((in_dim, out_dim)).astype(int)
        self.bias = np.zeros((1, out_dim)).astype(int)
        # Placeholder: a non-last layer needs a (loss columns, out_dim)
        # matrix assigned to DFA_weights before backward() is called.
        self.DFA_weights = np.zeros((1, 1)).astype(int)

    def forward(self, fc_in):
        """Compute the activations for ``fc_in`` and cache what backward needs."""
        self.input = fc_in
        dot = (self.input @ self.weights) + self.bias
        output, self.act_grad_inv = PLA_tanh(dot, self.in_dim, self.out_dim)
        return output

    def backward(self, loss, lr_inv):
        """Update weights and bias from ``loss`` and return ``loss`` unchanged.

        Args:
            loss: Output error of the network, one row per sample.
            lr_inv: Inverse learning rate; updates are floor-divided by it.

        Returns:
            ``loss`` itself, so every layer receives the same output error.
        """
        d_DFA = self.compute_dDFA(loss, lr_inv)

        weights_update = ((self.input.T @ d_DFA) // lr_inv).astype(int)
        # Rebind instead of ``-=``: the array may be shared with other
        # replicas (e.g. after weight averaging) and must not be mutated.
        self.weights = self.weights - weights_update

        # Column sums of d_DFA, kept as a (1, out_dim) row like the bias
        bias_update = (d_DFA.sum(axis=0, keepdims=True) // lr_inv).astype(int)
        self.bias = self.bias - bias_update
        return loss

    def compute_dDFA(self, loss, lr_inv):
        """Return the local error signal of this layer.

        Args:
            loss: Output error of the network, one row per sample.
            lr_inv: Unused; kept for interface compatibility.

        Returns:
            The (projected) loss floor-divided by the inverse activation
            slopes cached by the last forward pass.

        Raises:
            ValueError: If this is not the last layer and ``DFA_weights``
                does not have shape ``(loss.shape[1], out_dim)``.
        """
        if self.last_layer:
            return np.floor_divide(loss, self.act_grad_inv)

        expected_shape = (loss.shape[1], self.weights.shape[1])
        # Either dimension being wrong makes the projection invalid; a
        # single-column matrix would otherwise be broadcast silently.
        if tuple(self.DFA_weights.shape) != expected_shape:
            raise ValueError(
                "DFA not initialized! "
                f"Expected DFA_weights of shape {expected_shape}, "
                f"got {tuple(self.DFA_weights.shape)}."
            )
        dot = loss @ self.DFA_weights
        return np.floor_divide(dot, self.act_grad_inv)

In [11]:
# Network
class Network:
    """Ordered stack of layers with prediction and federated training steps."""

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append ``layer`` at the end of the network."""
        self.layers.append(layer)

    def test(self, x_test, y_test):
        """Return the percentage of samples whose prediction equals the label."""
        hits = 0
        for position, sample in enumerate(x_test):
            if self.predict(sample) == y_test[position]:
                hits += 1
        return hits / len(x_test) * 100

    def predict(self, input_data):
        """Return the index of the largest output for a single sample."""
        # The layers work on batches, so wrap the sample in a batch of one
        activation = np.expand_dims(input_data, axis=0)
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation.argmax()

    def federated_fit(self, data, target, lr_inv):
        """Run one training step on every layer.

        Returns:
            A list with one ``[weights, bias]`` copy per layer that has a
            ``weights`` attribute, in network order.
        """
        # Forward propagation
        fwd_out = data
        for layer in self.layers:
            fwd_out = layer.forward(fwd_out)

        # Loss gradient
        loss = L2_gradient(target, fwd_out)

        # Backward propagation, last layer first; all layers get the same loss
        for layer in self.layers[::-1]:
            layer.backward(loss, lr_inv)

        # Snapshot of the trainable layers
        return [
            [np.copy(layer.weights), np.copy(layer.bias)]
            for layer in self.layers
            if hasattr(layer, "weights")
        ]

    def federated_fit_single_layers(self, data, target, train_layer, lr_inv):
        """Run one training step that updates only ``self.layers[train_layer]``.

        Returns:
            ``[weights, bias]`` copies of the trained layer.
        """
        # Forward propagation through the whole network
        fwd_out = data
        for layer in self.layers:
            fwd_out = layer.forward(fwd_out)

        # Loss gradient
        loss = L2_gradient(target, fwd_out)

        # Backward propagation only for the selected layer
        trained = self.layers[train_layer]
        trained.backward(loss, lr_inv)

        # Snapshot of the trained layer
        return [np.copy(trained.weights), np.copy(trained.bias)]

## Training Algorithms

### Load DFA weights

In [15]:
# Load the DFA weights if they exist, otherwise generate and store them
try:
    DFA_weights = np.load("res/dfa/DFA_weights.npy")
    print("DFA weights loaded!")
except FileNotFoundError:
    print("DFA weights not found!")
    # One (10, 200) feedback matrix per run, stacked into a single array so
    # that both branches leave DFA_weights with the same type.
    DFA_weights = np.array([DFA_uniform(10, 200) for run in range(n_runs)])
    # Save DFA weights (create the folder first, np.save does not do it)
    os.makedirs("res/dfa", exist_ok=True)
    np.save("res/dfa/DFA_weights.npy", DFA_weights)
    print("DFA weights generated!")

DFA weights loaded!


### Full-Network Training

In [42]:
def fn_train(run, lr_inv, train_batches, x_test, y_test):
    """Federated training where every worker trains the full network.

    Each of the ``n_workers`` workers owns an identical
    Flatten -> FC(784, 200) -> FC(200, 10) network and trains all its layers
    on its own shard. A shard is consumed in buffers of ``buffer_len``
    samples; every buffer is visited ``epochs`` times and, at the end of each
    epoch, the weights and biases of all workers are averaged and written
    back to every replica.

    Args:
        run: Index of the run; selects the feedback matrix ``DFA_weights[run]``.
        lr_inv: Inverse learning rate forwarded to the layers.
        train_batches: One ``[images, one_hot_labels]`` pair per worker.
        x_test: Test images.
        y_test: Test labels, compared against the predicted class index.

    Returns:
        A single-entry dict ``{"<lr_inv>-<run>": [test_acc, average_weights]}``
        where ``test_acc`` is the accuracy in percent of the first replica
        and ``average_weights`` holds one ``[weights, bias]`` pair per
        trainable layer from the last averaging step (the initial weights if
        no training step was executed).
    """
    # Net structure: one identical replica per worker
    nets = []
    for _ in range(n_workers):
        net = Network()
        net.add(FlattenLayer())
        net.add(FCLayer(28*28, 200, desc="First"))
        net.add(FCLayer(200, 10, desc="Second", last_layer=True))

        net.layers[1].DFA_weights = DFA_weights[run]

        nets.append(net)

    # Positions of the trainable layers inside a network
    trainable = [
        idx for idx, layer in enumerate(nets[0].layers) if hasattr(layer, "weights")
    ]

    # Defined up front so the return value exists even without any training
    average_weights = [
        [np.copy(nets[0].layers[idx].weights), np.copy(nets[0].layers[idx].bias)]
        for idx in trainable
    ]

    n_buffers = len(train_batches[0][0]) // buffer_len
    n_mini_batches = buffer_len // mini_batch_size

    # Full-Network Training
    # Loop over the buffers until all the dataset is used
    for buffer in range(n_buffers):
        # Repeat for the number of epochs
        for _ in range(epochs):
            weights = None

            # Loop over the mini batches in the buffer
            for mini_batch in range(n_mini_batches):
                # Compute the starting and ending index of the mini batch
                idx_start_batch = buffer * buffer_len + mini_batch * mini_batch_size
                idx_end_batch = idx_start_batch + mini_batch_size

                # Train every replica on its own mini batch; only the
                # snapshot taken after the last mini batch is averaged.
                weights = [
                    nets[n].federated_fit(
                        train_batches[n][0][idx_start_batch:idx_end_batch],
                        train_batches[n][1][idx_start_batch:idx_end_batch],
                        lr_inv,
                    )
                    for n in range(n_workers)
                ]

            # No mini batch fits in a buffer: nothing to average
            if weights is None:
                continue

            # Average the weights of the layers
            average_weights = [
                [
                    np.mean([weights[n][l][0] for n in range(n_workers)], axis=0).astype(int),
                    np.mean([weights[n][l][1] for n in range(n_workers)], axis=0).astype(int),
                ]
                for l in range(len(trainable))
            ]

            # Set the computed average weights to the layers of the networks.
            # Every replica gets its own copy: sharing one array would let the
            # workers overwrite each other's parameters during local training.
            for net in nets:
                for layer_idx, (w_mean, b_mean) in zip(trainable, average_weights):
                    net.layers[layer_idx].weights = w_mean.copy()
                    net.layers[layer_idx].bias = b_mean.copy()

    # Compute the final test accuracy
    test_acc = nets[0].test(x_test, y_test)
    return {f"{lr_inv}-{run}": [test_acc, average_weights]}

### Single-Layer Training

In [43]:
def sl_train(run, lr_inv, train_batches, x_test, y_test):
    """Federated training where every worker trains a single layer.

    Each of the ``n_workers`` workers owns an identical
    Flatten -> FC(784, 200) -> FC(200, 10) network. The workers are split
    into consecutive groups, one per trainable layer, and a worker only
    updates the layer of its group. A shard is consumed in buffers of
    ``buffer_len`` samples; every buffer is visited ``epochs`` times and, at
    the end of each epoch, every layer is replaced in all replicas by the
    average over the workers that trained it.

    Args:
        run: Index of the run; selects the feedback matrix ``DFA_weights[run]``.
        lr_inv: Inverse learning rate forwarded to the layers.
        train_batches: One ``[images, one_hot_labels]`` pair per worker.
        x_test: Test images.
        y_test: Test labels, compared against the predicted class index.

    Returns:
        A single-entry dict ``{"<lr_inv>-<run>": [test_acc, average_weights]}``
        where ``test_acc`` is the accuracy in percent of the first replica
        and ``average_weights`` holds one ``[weights, bias]`` pair per
        trainable layer from the last averaging step (the initial weights if
        no training step was executed).
    """
    # Net structure: one identical replica per worker
    nets = []
    for _ in range(n_workers):
        net = Network()
        net.add(FlattenLayer())
        net.add(FCLayer(28*28, 200, desc="First"))
        net.add(FCLayer(200, 10, desc="Second", last_layer=True))

        net.layers[1].DFA_weights = DFA_weights[run]

        nets.append(net)

    # Positions of the trainable layers inside a network
    trainable = [
        idx for idx, layer in enumerate(nets[0].layers) if hasattr(layer, "weights")
    ]
    n_layers = len(trainable)

    # Defined up front so the return value exists even without any training
    average_weights = [
        [np.copy(nets[0].layers[idx].weights), np.copy(nets[0].layers[idx].bias)]
        for idx in trainable
    ]

    # Workers are assigned to layers in consecutive groups of this size.
    # Leftover workers (n_workers not a multiple of n_layers) join the last
    # group instead of indexing past the last layer.
    workers_per_layer = max(n_workers // n_layers, 1)

    n_buffers = len(train_batches[0][0]) // buffer_len
    n_mini_batches = buffer_len // mini_batch_size

    # Single-layers Training
    # Loop over the buffers until all the dataset is used
    for buffer in range(n_buffers):
        # Repeat for the number of epochs
        for _ in range(epochs):
            weights = None

            # Loop over the mini batches in the buffer
            for mini_batch in range(n_mini_batches):
                # Compute the starting and ending index of the mini batch
                idx_start_batch = buffer * buffer_len + mini_batch * mini_batch_size
                idx_end_batch = idx_start_batch + mini_batch_size

                # Snapshots of this mini batch grouped by trained layer; only
                # those taken after the last mini batch are averaged.
                weights = [[] for _ in range(n_layers)]

                for n in range(n_workers):
                    slot = min(n // workers_per_layer, n_layers - 1)
                    # Extract the correct mini batch data and target from the training dataset
                    data = train_batches[n][0][idx_start_batch:idx_end_batch]
                    target = train_batches[n][1][idx_start_batch:idx_end_batch]
                    train_res = nets[n].federated_fit_single_layers(
                        data, target, trainable[slot], lr_inv
                    )
                    weights[slot].append(train_res)

            # No mini batch fits in a buffer: nothing to average
            if weights is None:
                continue

            # Average the weights of the layers over all workers that trained them
            for slot, results in enumerate(weights):
                # Fewer workers than layers: this layer keeps its current value
                if not results:
                    continue
                w_mean = np.mean([res[0] for res in results], axis=0).astype(int)
                b_mean = np.mean([res[1] for res in results], axis=0).astype(int)
                average_weights[slot] = [w_mean, b_mean]

            # Set the computed average weights to the layers of the networks.
            # Every replica gets its own copy: sharing one array would let the
            # workers overwrite each other's parameters during local training.
            for net in nets:
                for layer_idx, (w_mean, b_mean) in zip(trainable, average_weights):
                    net.layers[layer_idx].weights = w_mean.copy()
                    net.layers[layer_idx].bias = b_mean.copy()

    # Compute the final test accuracy
    test_acc = nets[0].test(x_test, y_test)
    return {f"{lr_inv}-{run}": [test_acc, average_weights]}

## Experiments

### MNIST

In [44]:
# Load MNIST and build the per-worker training shards.
# x_test is replaced by its rescaled version; y_test is left untouched.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
train_batches, x_test = prepare_dataset(x_train, y_train, x_test)

#### TIFeD Full-Network

In [45]:
# Run every (learning rate, run) combination of the full-network training in
# parallel with joblib; results come back in the order the jobs are generated
# (all runs of the first learning rate, then the second, ...).
fn_train_res = Parallel(n_jobs=8)(
    delayed(fn_train)(run, lr_inv, train_batches, x_test, y_test)
    for lr_inv in lrs_inv
    for run in range(n_runs)
)

In [46]:
# Reorder and save the results: one [accuracies, weights] pair per learning rate
fn_train_nets = {}
for lr_idx in range(len(lrs_inv)):
    accs, W = [], []
    for run in range(n_runs):
        # Extract the results: jobs were submitted learning rate by learning
        # rate, and each one returned a single-entry dict keyed "<lr_inv>-<run>"
        index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
        # Accuracy is stored as a fraction instead of a percentage
        accs.append(fn_train_res[index][dict_key][0]/100)
        W.append(fn_train_res[index][dict_key][1])
    fn_train_nets[lrs_inv[lr_idx]] = [accs, W]

# Create the output folder first so the results of the training are not lost
os.makedirs("out/full_network", exist_ok=True)
with open(f"out/full_network/mnist_{n_workers}.pkl", "wb") as f:
    pickle.dump(fn_train_nets, f)

#### TIFeD Single-Layers

In [28]:
# Run every (learning rate, run) combination of the single-layer training in
# parallel with joblib; results come back in the order the jobs are generated
# (all runs of the first learning rate, then the second, ...).
sl_train_res = Parallel(n_jobs=8)(
    delayed(sl_train)(run, lr_inv, train_batches, x_test, y_test)
    for lr_inv in lrs_inv
    for run in range(n_runs)
)

In [31]:
# Reorder and save the results: one [accuracies, weights] pair per learning rate
sl_train_nets = {}
for lr_idx in range(len(lrs_inv)):
    accs, W = [], []
    for run in range(n_runs):
        # Extract the results: jobs were submitted learning rate by learning
        # rate, and each one returned a single-entry dict keyed "<lr_inv>-<run>"
        index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
        # Accuracy is stored as a fraction instead of a percentage
        accs.append(sl_train_res[index][dict_key][0]/100)
        W.append(sl_train_res[index][dict_key][1])
    sl_train_nets[lrs_inv[lr_idx]] = [accs, W]

# Create the output folder first so the results of the training are not lost
os.makedirs("out/single_layers", exist_ok=True)
with open(f"out/single_layers/mnist_{n_workers}.pkl", "wb") as f:
    pickle.dump(sl_train_nets, f)

### FashionMNIST

In [34]:
# Load FashionMNIST and build the per-worker training shards.
# x_test is replaced by its rescaled version; y_test is left untouched.
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
train_batches, x_test = prepare_dataset(x_train, y_train, x_test)

#### TIFeD Full-Network

In [35]:
# Run every (learning rate, run) combination of the full-network training in
# parallel with joblib; results come back in the order the jobs are generated
# (all runs of the first learning rate, then the second, ...).
fn_train_res = Parallel(n_jobs=8)(
    delayed(fn_train)(run, lr_inv, train_batches, x_test, y_test)
    for lr_inv in lrs_inv
    for run in range(n_runs)
)

In [37]:
# Reorder and save the results: one [accuracies, weights] pair per learning rate
fn_train_nets = {}
for lr_idx in range(len(lrs_inv)):
    accs, W = [], []
    for run in range(n_runs):
        # Extract the results: jobs were submitted learning rate by learning
        # rate, and each one returned a single-entry dict keyed "<lr_inv>-<run>"
        index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
        # Accuracy is stored as a fraction instead of a percentage
        accs.append(fn_train_res[index][dict_key][0]/100)
        W.append(fn_train_res[index][dict_key][1])
    fn_train_nets[lrs_inv[lr_idx]] = [accs, W]

# Create the output folder first so the results of the training are not lost
os.makedirs("out/full_network", exist_ok=True)
with open(f"out/full_network/fmnist_{n_workers}.pkl", "wb") as f:
    pickle.dump(fn_train_nets, f)

#### TIFeD Single-Layers

In [38]:
# Run every (learning rate, run) combination of the single-layer training in
# parallel with joblib; results come back in the order the jobs are generated
# (all runs of the first learning rate, then the second, ...).
sl_train_res = Parallel(n_jobs=8)(
    delayed(sl_train)(run, lr_inv, train_batches, x_test, y_test)
    for lr_inv in lrs_inv
    for run in range(n_runs)
)

In [39]:
# Reorder and save the results: one [accuracies, weights] pair per learning rate
sl_train_nets = {}
for lr_idx in range(len(lrs_inv)):
    accs, W = [], []
    for run in range(n_runs):
        # Extract the results: jobs were submitted learning rate by learning
        # rate, and each one returned a single-entry dict keyed "<lr_inv>-<run>"
        index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
        # Accuracy is stored as a fraction instead of a percentage
        accs.append(sl_train_res[index][dict_key][0]/100)
        W.append(sl_train_res[index][dict_key][1])
    sl_train_nets[lrs_inv[lr_idx]] = [accs, W]

# Create the output folder first so the results of the training are not lost
os.makedirs("out/single_layers", exist_ok=True)
with open(f"out/single_layers/fmnist_{n_workers}.pkl", "wb") as f:
    pickle.dump(sl_train_nets, f)