# CNN with Transfer Learning Training

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pickle
import numpy as np
from joblib import Parallel, delayed
from keras.datasets import fashion_mnist

## Hyperparameter Definition

In [None]:
# Numbers of worker nodes to simulate; the experiment loops run once per entry
n_workers_list = [4, 8, 16, 32, 64, 128]

# Hyperparameters H
# buffer_len: consecutive samples of each worker's shard that form one buffer;
#             the workers' weights are averaged once per buffer
buffer_len = 20
# mini_batch_size: samples per local training step inside a buffer
mini_batch_size = 10
# epochs: number of passes over the mini batches of a buffer before averaging
epochs = 10
# lrs_inv: inverse learning rates; weight/bias updates are floor-divided by these values
lrs_inv = [2048, 4096, 8192]

# Number of images per worker (length of each worker's slice of the training set)
size = 100

# Number of runs per learning rate (also the number of DFA matrices generated)
n_runs = 100

## Datasets

In [None]:
# Min-Max quantization function
def quantize_tensor(x, num_bits, min_val=None, max_val=None):
    """Min-max quantize ``x`` onto ``2**num_bits`` integer levels.

    The values are rescaled linearly so that ``min_val`` maps to ``-qmax`` and
    ``max_val`` maps to ``-qmin`` (e.g. [-127, 128] for 8 bits), then rounded.

    Parameters
    ----------
    x : numpy.ndarray
        Tensor to quantize. It is not modified.
    num_bits : int
        Bit width of the quantized representation.
    min_val, max_val : number, optional
        Bounds of the input range. A bound that is ``None`` is taken from the
        data. A zero-width range (``min_val == max_val``) falls back to the
        data range.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``x``.
    """
    # Compare against None explicitly: 0 is a legitimate bound and must not be
    # mistaken for "not provided"; each missing bound is filled in on its own.
    if min_val is None:
        min_val = x.min()
    if max_val is None:
        max_val = x.max()
    if max_val == min_val:
        # Degenerate range: use the range of the data instead
        min_val, max_val = x.min(), x.max()

    qmin = -2.**(num_bits-1)
    qmax = 2.**(num_bits-1) - 1.

    span = max_val - min_val
    shifted = x.astype(float) - min_val
    if span == 0:
        # Constant tensor: there is nothing to scale, avoid the 0/0 division
        scaled = np.zeros_like(shifted)
    else:
        scaled = shifted / span
    scaled = scaled * (qmax - qmin)
    # NOTE: subtracting qmax (not adding qmin) yields the range [-qmax, -qmin]
    scaled = scaled - qmax

    return scaled.round().astype(int)

In [None]:
# Load the dataset; only the labels are kept, the raw images are discarded
(_, train_labels), (_, test_labels) = fashion_mnist.load_data()

# Load the ConvNet features that were precomputed and stored on disk
# (presumably row-aligned with the labels above — TODO confirm against the extraction script)
train_features = np.load("res/features/train_features.npy")
test_features = np.load("res/features/test_features.npy")

## Net Architecture

In [None]:
# Limits of a signed 16-bit integer; SHRT_MAX sets the scale of the DFA weight range
SHRT_MAX = 32767
SHRT_MIN = (-SHRT_MAX - 1 )

def isqrt(n):
    """Return the floor of the square root of ``n`` via Newton's iteration.

    Only floor division is used, so an integer input gives an integer result;
    a float input gives an integer-valued float. ``n`` is expected to be
    non-negative.
    """
    root = n
    candidate = (root + 1) // 2
    while True:
        # The estimates decrease monotonically; stop once they no longer do
        if candidate >= root:
            return root
        root = candidate
        candidate = (root + n // root) // 2

In [None]:
# DFA WEIGHTS
def DFA_uniform(in_dim, out_dim):
    """Draw a random integer feedback matrix for Direct Feedback Alignment.

    The entries are sampled uniformly from ``[-bound, bound)`` with
    ``bound = isqrt(12 * SHRT_MAX // (in_dim + out_dim))``; the upper bound is
    exclusive, as in ``np.random.randint``.

    Parameters
    ----------
    in_dim, out_dim : int
        Number of rows and columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(in_dim, out_dim)``.
    """
    # Floor division keeps the bound an exact integer (true division handed a
    # float to isqrt and randint); floor(sqrt(.)) is unaffected by the flooring.
    # The local is not called `range`, so the builtin is no longer shadowed.
    bound = isqrt((12 * SHRT_MAX) // (in_dim + out_dim))
    return np.random.randint(-bound, bound, (in_dim, out_dim))

In [None]:
# PLA tanh Activation function
def PLA_tanh(act_in, in_dim, out_dim):
    """Integer piecewise-linear approximation of tanh and its inverse slope.

    Every pre-activation is first rescaled with a floor division by
    ``256 * in_dim``; the rescaled value ``v`` is then mapped as follows:

    ==================  ================  ==============
    range of ``v``      output            inverse slope
    ==================  ================  ==============
    ``v >= 128``        ``128``           ``128``
    ``75 <= v < 128``   ``v // 4 + 88``   ``8``
    ``32 <= v < 75``    ``v + 32``        ``2``
    ``-31 <= v < 32``   ``v * 2``         ``1``
    ``-74 <= v < -31``  ``v - 32``        ``2``
    ``-127 <= v < -74`` ``v // 4 - 88``   ``8``
    ``v < -127``        ``-127``          ``128``
    ==================  ================  ==============

    Parameters
    ----------
    act_in : numpy.ndarray
        Pre-activations holding ``out_dim`` values per sample along the first
        axis, e.g. shape ``(batch, out_dim)`` or ``(batch, 1, out_dim)``.
    in_dim : int
        Input dimension of the layer, used only for the rescaling.
    out_dim : int
        Number of units of the layer.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Activations and inverse slopes, both integer arrays of shape
        ``(batch, out_dim)``.
    """
    y_max, y_min = 128, -127
    intervals = [128, 75, 32, -31, -74, -127]
    slopes_inv = [y_max, 8, 2, 1, 2, 8, y_max]

    act_in = np.asarray(act_in)
    # One vectorised rescale instead of a Python loop over every element
    val = act_in.reshape(act_in.shape[0], out_dim) // ((1 << 8) * in_dim)

    # np.select keeps the first matching condition, so the thresholds are
    # listed from the most negative segment upwards; values matching none of
    # them (v >= 128) receive the saturated default.
    conditions = [
        val < intervals[5],
        val < intervals[4],
        val < intervals[3],
        val < intervals[2],
        val < intervals[1],
        val < intervals[0],
    ]
    segment_out = [
        y_min,
        val // 4 - 88,
        val - 32,
        val * 2,
        val + 32,
        val // 4 + 88,
    ]
    segment_slope_inv = [
        slopes_inv[6],
        slopes_inv[5],
        slopes_inv[4],
        slopes_inv[3],
        slopes_inv[2],
        slopes_inv[1],
    ]

    act_out = np.select(conditions, segment_out, default=y_max)
    act_grad_inv = np.select(conditions, segment_slope_inv, default=slopes_inv[0])
    return act_out.astype(int), act_grad_inv

In [None]:
# L2 Loss Function Gradient
def L2_gradient(y_true, net_out):
    """Return the gradient of the L2 loss, i.e. ``net_out - y_true``.

    Parameters
    ----------
    y_true : numpy.ndarray
        Targets of shape ``(batch, n_outputs)``.
    net_out : numpy.ndarray
        Network outputs holding the same number of elements as ``y_true``,
        e.g. shape ``(batch, n_outputs)`` or ``(batch, 1, n_outputs)``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(batch, n_outputs)``.
    """
    y_true = np.asarray(y_true)
    # Bring the outputs to the target layout once, then subtract element-wise
    # in a single vectorised operation instead of a Python double loop.
    net_out = np.asarray(net_out).reshape(y_true.shape)
    return (net_out - y_true).astype(int)

In [None]:
# Flatten Layer
class FlattenLayer:
    """Layer without parameters that flattens every sample to a vector."""

    def __init__(self):
        pass

    def forward(self, image):
        """Reshape ``image`` from ``(batch, d1, d2, ...)`` to ``(batch, d1*d2*...)``.

        All axes after the first are merged, whatever their number. This
        replaces a bare ``except`` that was used to distinguish 4-D from 3-D
        inputs and that would also have hidden unrelated errors.
        """
        dimension = image.shape
        # Explicit product (not -1) so that an empty batch still reshapes
        flat_size = int(np.prod(dimension[1:]))
        return image.reshape(dimension[0], flat_size)

    def backward(self, loss, lr_inv):
        """Return ``loss`` unchanged; the layer has nothing to train."""
        return loss

In [None]:
# FC Layer
class FCLayer:
    """Integer fully connected layer with PLA-tanh activation trained with DFA.

    Parameters
    ----------
    in_dim, out_dim : int
        Input and output dimensions of the layer.
    desc : str, optional
        Free-text label; it is only stored.
    last_layer : bool, optional
        If True the layer uses the loss directly as its error signal;
        otherwise the loss is first projected through ``DFA_weights``.
    """

    def __init__(self, in_dim, out_dim, desc = "", last_layer = False):
        self.in_dim, self.out_dim = in_dim, out_dim
        self.desc = desc
        self.last_layer = last_layer
        self.weights = np.zeros((in_dim, out_dim)).astype(int)
        self.bias = np.zeros((1, out_dim)).astype(int)
        # Placeholder; a hidden layer needs a feedback matrix of shape
        # (loss width, out_dim) assigned to this attribute before training
        self.DFA_weights = np.zeros((1, 1)).astype(int)

    def forward(self, fc_in):
        """Compute the activations for ``fc_in``.

        The input and the inverse activation slopes are cached on the
        instance because ``backward`` needs them.
        """
        self.input = fc_in
        dot = (self.input @ self.weights) + self.bias
        output, self.act_grad_inv = PLA_tanh(dot, self.in_dim, self.out_dim)
        return output

    def backward(self, loss, lr_inv):
        """Apply one update of weights and bias and return ``loss`` unchanged.

        ``lr_inv`` is the inverse learning rate: the accumulated updates are
        floor-divided by it. ``forward`` must have been called before.
        """
        d_DFA = self.compute_dDFA(loss, lr_inv)
        weights_update = ((self.input.T @ d_DFA) // lr_inv).astype(int)
        # Summing the error signal over the batch gives the (1, out_dim) bias update
        bias_update = (d_DFA.sum(axis=0, keepdims=True) // lr_inv).astype(int)
        # Rebind instead of updating in place: the parameter arrays may have
        # been assigned from outside and be shared with other layers, and an
        # in-place update would silently modify all of them.
        self.weights = self.weights - weights_update
        self.bias = self.bias - bias_update
        return loss

    def compute_dDFA(self, loss, lr_inv):
        """Return the error signal of the layer scaled by the activation slope.

        ``lr_inv`` is accepted for interface symmetry and is not used.

        Raises
        ------
        ValueError
            If the layer is not the last one and ``DFA_weights`` does not have
            the shape ``(loss.shape[1], out_dim)``.
        """
        if self.last_layer:
            return np.floor_divide(loss, self.act_grad_inv)

        expected_shape = (loss.shape[1], self.weights.shape[1])
        # Either mismatching dimension makes the feedback matrix unusable. The
        # previous check joined the two tests with `and` and only printed, so
        # a matrix with one wrong dimension could be broadcast silently.
        if self.DFA_weights.shape != expected_shape:
            raise ValueError(
                f"DFA not initialized! Expected DFA_weights of shape {expected_shape}, "
                f"got {self.DFA_weights.shape}"
            )
        dot = loss @ self.DFA_weights
        return np.floor_divide(dot, self.act_grad_inv)

In [None]:
# Network
class Network:
    """Ordered stack of layers with federated transfer-learning helpers."""

    def __init__(self):
        self.layers = []

    # Add layer to network
    def add(self, layer):
        """Append ``layer`` at the end of the stack."""
        self.layers.append(layer)

    # Test
    def tl_test(self, x_test, y_test):
        """Return the accuracy in percent, classifying one sample at a time."""
        hits = sum(
            1
            for idx, sample in enumerate(x_test)
            if self.tl_predict(sample) == y_test[idx]
        )
        return hits / len(x_test) * 100

    # Predict output
    def tl_predict(self, input_data):
        """Run the forward pass and return the index of the largest output."""
        signal = input_data
        for layer in self.layers:
            signal = layer.forward(signal)
        return signal.argmax()

    # Forward pass shared by both training methods
    def _forward_loss(self, data, target):
        """Forward ``data`` through all layers and return the L2 loss gradient.

        The targets are one-hot vectors over 10 classes scaled by 16.
        """
        expected = np.eye(10, dtype='uint8')[target]*16
        activations = data
        for layer in self.layers:
            activations = layer.forward(activations)
        return L2_gradient(expected, activations)

    # Federated transfer learning training
    def tl_federated_fit(self, data, target, lr_inv):
        """Train every layer on one mini batch.

        Each layer receives the same output loss. Returns a list with one
        ``[weights, bias]`` pair of copies per layer that has weights.
        """
        loss = self._forward_loss(data, target)

        # Backward propagation, last layer first
        for layer in self.layers[::-1]:
            layer.backward(loss, lr_inv)

        # Snapshot the parameters of the trainable layers
        return [
            [np.copy(layer.weights), np.copy(layer.bias)]
            for layer in self.layers
            if hasattr(layer, "weights")
        ]

    # Federated transfer learning Single layers training
    def tl_federated_fit_single_layers(self, data, target, train_layer, lr_inv):
        """Train only the layer at index ``train_layer`` on one mini batch.

        Returns ``[weights, bias]`` copies of that layer.
        """
        loss = self._forward_loss(data, target)

        # Backward propagation only for one layer
        trained = self.layers[train_layer]
        trained.backward(loss, lr_inv)

        return [np.copy(trained.weights), np.copy(trained.bias)]

## Training Algorithms

### Load DFA weights

In [None]:
# Load the DFA weights if they exist, otherwise generate and store them
try:
    DFA_weights = np.load("res/dfa/DFA_weights.npy")
    print("DFA weights loaded!")
except FileNotFoundError:
    print("DFA weights not found!")
    # One feedback matrix per run, stacked into a single array so that the
    # variable has the same type (ndarray indexed by run) as when it is loaded
    DFA_weights = np.array([DFA_uniform(10, 50) for run in range(n_runs)])
    # Save DFA weights; create the folder first, since the file being absent
    # may simply mean that the folder has never been created
    os.makedirs("res/dfa", exist_ok=True)
    np.save("res/dfa/DFA_weights.npy", DFA_weights)
    print("DFA weights generated!")

### Full-Network Training

In [None]:
def fn_train(run, lr_inv, features_batches, n_workers):
    """Simulate federated full-network training for one run and learning rate.

    Every worker owns a two-layer network (200 -> 50 -> 10) whose first layer
    uses ``DFA_weights[run]`` as feedback matrix. The workers' data is consumed
    in buffers of ``buffer_len`` samples: for each buffer every worker trains
    all its layers for ``epochs`` passes over the buffer's mini batches, then
    the workers' parameters are averaged (truncated to integers), pushed back
    to every worker, and the test accuracy of worker 0 is recorded.

    Reads the module-level ``DFA_weights``, ``buffer_len``, ``mini_batch_size``,
    ``epochs``, ``test_features`` and ``test_labels``.

    Parameters
    ----------
    run : int
        Index of the run; selects the DFA matrix and is part of the result key.
    lr_inv : int
        Inverse learning rate passed to the layers.
    features_batches : list
        One ``[features, labels]`` pair per worker.
    n_workers : int
        Number of simulated workers.

    Returns
    -------
    dict
        ``{f"{lr_inv}-{run}": [test_accs, average_weights]}`` where
        ``test_accs`` holds one accuracy (in percent) per buffer and
        ``average_weights`` the ``[weights, bias]`` of each layer after the
        last averaging step.
    """
    # Net structure: one identical network per worker
    nets = []
    for _ in range(n_workers):
        net = Network()
        net.add(FCLayer(200, 50, desc="First"))
        net.add(FCLayer(50, 10, desc="Second", last_layer=True))

        net.layers[0].DFA_weights = DFA_weights[run]

        nets.append(net)

    # Count number of trainable layers
    n_layers = sum(1 for layer in net.layers if hasattr(layer, "weights"))

    n_buffers = len(features_batches[0][0])//buffer_len
    n_mini_batches = buffer_len//mini_batch_size

    # Full-Network Training
    # Loop over the buffers until all the dataset is used
    test_accs = []
    # Defined up front so that the return value exists even without any buffer
    average_weights = [[] for _ in range(n_layers)]
    for buffer in range(n_buffers):
        # Repeat for the number of epochs
        for _ in range(epochs):
            # Loop over the mini batches in the buffer
            for mini_batch in range(n_mini_batches):
                # Compute the starting and ending index of the mini batch
                idx_start_batch = buffer * buffer_len + mini_batch * mini_batch_size
                idx_end_batch = idx_start_batch + mini_batch_size

                # Train every worker on its own slice and keep its latest weights
                weights = [[] for _ in range(n_workers)]
                for n in range(n_workers):
                    # Extract the correct mini batch data and target from the training dataset
                    data = features_batches[n][0][idx_start_batch:idx_end_batch]
                    target = features_batches[n][1][idx_start_batch:idx_end_batch]
                    weights[n] = nets[n].tl_federated_fit(data, target, lr_inv)

        # Average the weights of the layers over the workers
        average_weights = []
        for layer_idx in range(n_layers):
            w_mean = np.mean([weights[n][layer_idx][0] for n in range(n_workers)], axis=0).astype(int)
            b_mean = np.mean([weights[n][layer_idx][1] for n in range(n_workers)], axis=0).astype(int)
            average_weights.append([w_mean, b_mean])

        # Set the computed average weights to the layers of the networks.
        # Every worker gets its own copy: handing the same array object to all
        # workers would make their in-place updates act on one shared model
        # instead of on independent local models.
        for worker_net in nets:
            for layer_idx in range(n_layers):
                worker_net.layers[layer_idx].weights = average_weights[layer_idx][0].copy()
                worker_net.layers[layer_idx].bias = average_weights[layer_idx][1].copy()

        # Compute the test accuracy reached after this buffer
        test_accs.append(nets[0].tl_test(test_features, test_labels))
    return {f"{lr_inv}-{run}": [test_accs, average_weights]}

### Single-Layer Training

In [None]:
def sl_train(run, lr_inv, features_batches, n_workers):
    """Simulate federated single-layer training for one run and learning rate.

    Every worker owns a two-layer network (200 -> 50 -> 10) whose first layer
    uses ``DFA_weights[run]`` as feedback matrix. The workers are split into
    equally sized consecutive groups, one per layer, and each worker trains
    only the layer of its group. The data is consumed in buffers of
    ``buffer_len`` samples: for each buffer the workers train for ``epochs``
    passes over the buffer's mini batches, then each layer is averaged over
    the workers that trained it (truncated to integers), the result is pushed
    to every worker, and the test accuracy of worker 0 is recorded.

    Reads the module-level ``DFA_weights``, ``buffer_len``, ``mini_batch_size``,
    ``epochs``, ``test_features`` and ``test_labels``.

    Parameters
    ----------
    run : int
        Index of the run; selects the DFA matrix and is part of the result key.
    lr_inv : int
        Inverse learning rate passed to the layers.
    features_batches : list
        One ``[features, labels]`` pair per worker.
    n_workers : int
        Number of simulated workers; must be a positive multiple of the
        number of layers.

    Returns
    -------
    dict
        ``{f"{lr_inv}-{run}": [test_accs, average_weights]}`` where
        ``test_accs`` holds one accuracy (in percent) per buffer and
        ``average_weights`` the ``[weights, bias]`` of each layer after the
        last averaging step.

    Raises
    ------
    ValueError
        If the workers cannot be split evenly among the layers.
    """
    # Net structure: one identical network per worker
    nets = []
    for _ in range(n_workers):
        net = Network()
        net.add(FCLayer(200, 50, desc="First"))
        net.add(FCLayer(50, 10, desc="Second", last_layer=True))

        net.layers[0].DFA_weights = DFA_weights[run]

        nets.append(net)

    # Count number of trainable layers
    n_layers = sum(1 for layer in net.layers if hasattr(layer, "weights"))

    # An uneven split used to fail later with an obscure IndexError or
    # ZeroDivisionError; report the actual problem instead.
    if n_workers < n_layers or n_workers % n_layers != 0:
        raise ValueError(
            f"n_workers ({n_workers}) must be a positive multiple of the number of layers ({n_layers})"
        )
    workers_per_layer = n_workers//n_layers

    n_buffers = len(features_batches[0][0])//buffer_len
    n_mini_batches = buffer_len//mini_batch_size

    # Single-layers Training
    # Loop over the buffers until all the dataset is used
    test_accs = []
    # Defined up front so that the return value exists even without any buffer
    average_weights = [[] for _ in range(n_layers)]
    for buffer in range(n_buffers):
        # Repeat for the number of epochs
        for _ in range(epochs):
            # Loop over the mini batches in the buffer
            for mini_batch in range(n_mini_batches):
                # Compute the starting and ending index of the mini batch
                idx_start_batch = buffer * buffer_len + mini_batch * mini_batch_size
                idx_end_batch = idx_start_batch + mini_batch_size

                # Latest [weights, bias] of every worker, grouped by trained layer
                weights = [[] for _ in range(n_layers)]
                for n in range(n_workers):
                    # Consecutive groups of workers train consecutive layers
                    train_layer = n // workers_per_layer
                    # Extract the correct mini batch data and target from the training dataset
                    data = features_batches[n][0][idx_start_batch:idx_end_batch]
                    target = features_batches[n][1][idx_start_batch:idx_end_batch]
                    train_res = nets[n].tl_federated_fit_single_layers(data, target, train_layer, lr_inv)
                    weights[train_layer].append(train_res)

        # Average each layer over the workers that trained it
        average_weights = []
        for layer_idx in range(n_layers):
            w_mean = np.mean([res[0] for res in weights[layer_idx]], axis=0).astype(int)
            b_mean = np.mean([res[1] for res in weights[layer_idx]], axis=0).astype(int)
            average_weights.append([w_mean, b_mean])

        # Set the computed average weights to the layers of the networks.
        # Every worker gets its own copy: handing the same array object to all
        # workers would make their in-place updates act on one shared model
        # instead of on independent local models.
        for worker_net in nets:
            for layer_idx in range(n_layers):
                worker_net.layers[layer_idx].weights = average_weights[layer_idx][0].copy()
                worker_net.layers[layer_idx].bias = average_weights[layer_idx][1].copy()

        # Compute the test accuracy reached after this buffer
        test_accs.append(nets[0].tl_test(test_features, test_labels))
    return {f"{lr_inv}-{run}": [test_accs, average_weights]}

## Experiments

### TIFeD Full-Network

In [None]:
# Create the output folder before training starts: a missing folder would
# otherwise only be noticed when saving, after all the runs have completed
os.makedirs("out/full_network", exist_ok=True)

for n_workers in n_workers_list:

    # Split the train features into different batches:
    # worker n receives the n-th block of `size` consecutive samples and labels
    features_batches = [[] for _ in range(n_workers)]
    for n in range(n_workers):
        features_batches[n] = [train_features[n*size:(n+1)*size], train_labels[n*size:(n+1)*size]]

    # Parallelization of the training procedure using joblib
    # (one job per learning rate and run; results come back in submission order)
    fn_train_res = Parallel(n_jobs=-1)(delayed(fn_train)(run, lr_inv, features_batches, n_workers) for lr_inv in lrs_inv for run in range(n_runs))

    # Reorder and save the results: {lr_inv: [accuracies per run, weights per run]}
    fn_train_nets = {}
    for lr_idx in range(len(lrs_inv)):
        accs, W = [], []
        for run in range(n_runs):
            # Extract the results; the jobs were submitted learning rate first, run second
            index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
            # Accuracies are stored as fractions instead of percentages
            accs.append(np.divide(fn_train_res[index][dict_key][0], 100))
            W.append(fn_train_res[index][dict_key][1])
        fn_train_nets[lrs_inv[lr_idx]] = [accs, W]

    with open(f"out/full_network/fmnist_{n_workers}.pkl", "wb") as f:
        pickle.dump(fn_train_nets, f)

### TIFeD Single-Layers

In [None]:
# Create the output folder before training starts: a missing folder would
# otherwise only be noticed when saving, after all the runs have completed
os.makedirs("out/single_layers", exist_ok=True)

for n_workers in n_workers_list:

    # Split the train features into different batches:
    # worker n receives the n-th block of `size` consecutive samples and labels
    features_batches = [[] for _ in range(n_workers)]
    for n in range(n_workers):
        features_batches[n] = [train_features[n*size:(n+1)*size], train_labels[n*size:(n+1)*size]]

    # Parallelization of the training procedure using joblib
    # (one job per learning rate and run; results come back in submission order)
    sl_train_res = Parallel(n_jobs=-1)(delayed(sl_train)(run, lr_inv, features_batches, n_workers) for lr_inv in lrs_inv for run in range(n_runs))

    # Reorder and save the results: {lr_inv: [accuracies per run, weights per run]}
    sl_train_nets = {}
    for lr_idx in range(len(lrs_inv)):
        accs, W = [], []
        for run in range(n_runs):
            # Extract the results; the jobs were submitted learning rate first, run second
            index, dict_key = n_runs*lr_idx+run, f"{lrs_inv[lr_idx]}-{run}"
            # Accuracies are stored as fractions instead of percentages
            accs.append(np.divide(sl_train_res[index][dict_key][0], 100))
            W.append(sl_train_res[index][dict_key][1])
        sl_train_nets[lrs_inv[lr_idx]] = [accs, W]

    with open(f"out/single_layers/fmnist_{n_workers}.pkl", "wb") as f:
        pickle.dump(sl_train_nets, f)