In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import initializers
from tensorflow.keras import backend as K
from __future__ import print_function

import sys
import os
import time
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, Hinge
from tensorflow.keras.metrics import Mean, SparseCategoricalAccuracy
import numpy as np

import pandas as pd
from tensorflow.keras.layers import Input, Dropout, Flatten
from tensorflow.keras.models import Model

import numpy as np
np.random.seed(1234)  # for reproducibility

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle

from collections import OrderedDict

# Batch Norm Layer

In [None]:
class BatchNormLayer(Layer):
    """Batch normalization with learnable scale/shift and running statistics.

    In training mode the input is normalized with the statistics of the
    current batch, and the stored running mean/std are moved towards those
    statistics with exponential-moving-average factor ``alpha``.  Outside of
    training the stored running statistics are used instead, so the output
    for one sample no longer depends on the other samples in its batch.

    Args:
        axes: Axis (int) or axes (tuple) to normalize over.  ``None`` selects
            every axis except the second one.
        epsilon: Constant added to the standard deviation before dividing.
        alpha: Weight given to the newest batch statistics when updating the
            running averages.
        nonlinearity: Optional callable applied to the normalized output.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self, axes=None, epsilon=0.01, alpha=0.5, nonlinearity=None, **kwargs):
        super(BatchNormLayer, self).__init__(**kwargs)
        self.axes = axes
        self.epsilon = epsilon
        self.alpha = alpha
        self.nonlinearity = nonlinearity
        # Weights are created in build(), once the input shape is known.
        self.beta = None
        self.gamma = None
        self.mean = None
        self.std = None

    def build(self, input_shape):
        """Create beta/gamma and the running mean/std for ``input_shape``.

        Raises:
            ValueError: If an axis that is not normalized over has unknown size.
        """
        shape = list(input_shape)
        if self.axes is None:
            # default: normalize over all but the second axis
            self.axes = (0,) + tuple(range(2, len(shape)))
        elif isinstance(self.axes, int):
            self.axes = (self.axes,)
        # Parameters have size 1 along every normalized axis so that they
        # broadcast against the input.
        for axis in self.axes:
            shape[axis] = 1
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all dimensions/axes not normalized over.")
        # Learnable shift and scale
        self.beta = self.add_weight(name='beta', shape=shape,
                                    initializer='zeros', trainable=True)
        self.gamma = self.add_weight(name='gamma', shape=shape,
                                     initializer='ones', trainable=True)
        # Running statistics (not trained by gradient descent)
        self.mean = self.add_weight(name='mean', shape=shape,
                                    initializer=initializers.Constant(0),
                                    trainable=False)
        self.std = self.add_weight(name='std', shape=shape,
                                   initializer=initializers.Constant(1),
                                   trainable=False)
        super(BatchNormLayer, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        """Normalize ``inputs`` and apply the optional nonlinearity.

        Args:
            inputs: Tensor whose shape is compatible with the one given to build().
            training: Python bool, scalar tensor or ``None``.  ``None`` is
                resolved by ``K.in_train_phase`` (Keras call context, then the
                backend learning phase).
        """
        def batch_statistics():
            batch_mean = K.mean(inputs, axis=self.axes, keepdims=True)
            batch_std = K.std(inputs, axis=self.axes, keepdims=True)
            # Assign the moving averages directly: a list of (variable, value)
            # tuples handed to add_update() is never executed.
            self.mean.assign((1 - self.alpha) * self.mean + self.alpha * batch_mean)
            self.std.assign((1 - self.alpha) * self.std + self.alpha * batch_std)
            return batch_mean, batch_std

        def running_statistics():
            # tf.identity turns the variables into plain tensors so both
            # branches return the same kind of value.
            return tf.identity(self.mean), tf.identity(self.std)

        mean, std = K.in_train_phase(batch_statistics, running_statistics,
                                     training=training)

        # Normalize inputs using batch normalization formula
        normalized = (inputs - mean) * (self.gamma / (std + self.epsilon)) + self.beta
        if self.nonlinearity is None:
            return normalized
        return self.nonlinearity(normalized)

# Function to apply Batch Normalization to a layer
def batch_norm(layer):
    """Wrap ``layer`` in a BatchNormLayer, moving its activation behind the normalization.

    The activation found on ``layer`` (if any) is removed from it and handed
    to the BatchNormLayer as its ``nonlinearity``, so it is applied after
    normalizing instead of being dropped.  The bias of ``layer`` is disabled
    because the BatchNormLayer has its own learnable shift (``beta``).

    Args:
        layer: Object the BatchNormLayer is called on.
            NOTE(review): a Keras layer can only be called on tensors, so this
            presumably expects the output of a layer (or a layer-like object
            that Keras accepts as input) - confirm against the intended caller.

    Returns:
        Whatever ``BatchNormLayer(...)(layer)`` returns.
    """
    nonlinearity = getattr(layer, 'activation', None)
    if nonlinearity is not None:
        layer.activation = None
    # Keras layers guard the bias with `use_bias`; clearing only `bias` would
    # leave them adding `None` (or creating the bias later in build()).
    if getattr(layer, 'use_bias', False):
        layer.use_bias = False
    if hasattr(layer, 'bias'):
        layer.bias = None
    return BatchNormLayer(nonlinearity=nonlinearity)(layer)

# Binary Connect Layer

In [None]:


# Hard sigmoid function for binary weights
def hard_sigmoid(x):
    """Return ``(x + 1) / 2`` clipped to the interval [0, 1]."""
    shifted = (x + 1.0) / 2.0
    return tf.clip_by_value(shifted, clip_value_min=0, clip_value_max=1)

# BinaryConnect weight binarization function
def binarization(W, H, binary=True, deterministic=False, stochastic=False, seed=None):
    """Map real-valued weights ``W`` to the two values ``-H`` / ``+H``.

    Args:
        W: Real-valued weights.
        H: Magnitude of the binarized weights.
        binary: When false, ``W`` is returned unchanged.
        deterministic: When true together with ``stochastic``, ``W`` is
            returned unchanged (no sampling is done in that mode).
        stochastic: When true, each weight becomes ``+H`` with probability
            ``hard_sigmoid(W / H)``; otherwise that probability is rounded.
        seed: Shape-[2] integer seed for ``tf.random.stateless_binomial``.
            When ``None``, the sample is drawn with ``tf.random.uniform``
            instead, because the stateless op cannot run without a seed.

    Returns:
        ``W`` itself, or a float32 tensor holding ``-H`` / ``+H``.
    """
    if not binary or (deterministic and stochastic):
        return W

    prob_positive = hard_sigmoid(W / H)

    if stochastic:
        if seed is None:
            # Bernoulli(p) sample: a uniform draw in [0, 1) is below p with
            # probability p.
            draws = tf.random.uniform(tf.shape(prob_positive), dtype=prob_positive.dtype)
            is_positive = draws < prob_positive
        else:
            # Apply random binomial sampling for stochastic BinaryConnect
            sample = tf.random.stateless_binomial(
                shape=tf.shape(W), seed=seed, counts=1, probs=prob_positive)
            is_positive = sample > 0
    else:
        # Deterministic rounding for deterministic BinaryConnect
        is_positive = tf.round(prob_positive) > 0

    # Mapping binary values to original weight range
    return tf.dtypes.cast(tf.where(is_positive, H, -H), tf.float32)

# Custom DenseLayer with BinaryConnect
class DenseLayer(tf.keras.layers.Layer):
    """Bias-free dense layer whose kernel is binarized on every forward pass.

    The real-valued kernel ``W`` is the trainable variable.  Each call
    computes ``Wb = binarization(W, ...)``, stores it as ``self.Wb`` and
    returns ``inputs @ Wb``.

    Args:
        units: Number of output units (``None`` is treated as 1).
        binary: Forwarded to ``binarization``.
        stochastic: Forwarded to ``binarization``.
        H: Magnitude of the binarized weights; also the bound of the uniform
            kernel initializer.
        W_LR_scale: ``"Glorot"`` derives the scale from the layer sizes in
            build(); the string ``"none"`` (any case) means 1.0; any other
            value is stored unchanged.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self, units, binary=True, stochastic=True, H=1.0, W_LR_scale="Glorot", **kwargs):
        super(DenseLayer, self).__init__(**kwargs)
        self.units = units
        self.binary = binary
        self.stochastic = stochastic
        self.H = H
        self.W_LR_scale = W_LR_scale
        # Base seed; a fresh per-call seed is derived from it in call().
        self.seed = tf.constant([42, 42], dtype=tf.int32)
        self.W = None  # created in build()

        if W_LR_scale == "Glorot":
            self.W_LR_scale = None  # resolved in build(), needs the input size
        elif isinstance(W_LR_scale, str) and W_LR_scale.lower() == "none":
            self.W_LR_scale = 1.0

    def build(self, input_shape):
        """Create the kernel and resolve the Glorot learning-rate scale.

        Raises:
            ValueError: If the last input dimension is unknown.
        """
        if self.W is None:
            input_dim = tf.TensorShape(input_shape).as_list()[-1]
            if input_dim is None:
                raise ValueError("DenseLayer needs a known size for the last input axis.")
            self.units = self.units if self.units is not None else 1
            # Real-valued weights start uniformly inside [-H, H]
            self.W = self.add_weight(
                name='kernel',
                shape=(input_dim, self.units),
                initializer=tf.initializers.RandomUniform(-self.H, self.H),
                trainable=True
            )

            if self.W_LR_scale is None:
                # Stored as a Python float so it multiplies tensors of any
                # float dtype without a dtype clash.
                self.W_LR_scale = float(1.0 / np.sqrt(1.5 / (self.units + input_dim)))

        super(DenseLayer, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        """Return ``inputs @ Wb`` where ``Wb`` is the binarized kernel.

        ``training`` is used as a Python truth value: anything falsy
        (including ``None``) selects the deterministic mode of ``binarization``.
        """
        deterministic = not training
        seed = self.seed
        if self.binary and self.stochastic and not deterministic:
            # A stateless op with a constant seed returns the same noise on
            # every call.  Folding a freshly drawn integer into the base seed
            # gives each step (and each layer) its own sample.
            salt = tf.random.uniform([], minval=0, maxval=tf.int32.max, dtype=tf.int32)
            seed = tf.random.experimental.stateless_fold_in(self.seed, salt)

        # Apply BinaryConnect binarization to weights; kept on the layer so
        # gradients can be taken with respect to it.
        self.Wb = binarization(self.W, self.H, self.binary, deterministic, self.stochastic, seed)
        return tf.matmul(inputs, self.Wb)


# compute gradients for BinaryConnect layers
def compute_grads(loss, network):
    """Return d(loss)/d(Wb) for every layer of ``network`` that has a ``Wb`` attribute.

    The gradients are listed in ``network.layers`` order.  ``tf.gradients``
    is used, so this has to run in a graph context (e.g. inside a
    ``tf.function``).
    """
    binarized_weights = (layer.Wb for layer in network.layers if hasattr(layer, 'Wb'))
    return [tf.gradients(loss, weights)[0] for weights in binarized_weights]

# apply clipping and scaling to BinaryConnect layer updates
def clipping_scaling(updates, model):
    """Rescale and clip the kernels of the binary ``DenseLayer``s in ``model``.

    Call this after the optimizer has updated the kernels.  For every binary
    ``DenseLayer`` the kernel ``W`` is overwritten with

        clip(previous + W_LR_scale * (W - previous), -H, H)

    when the pre-update value ``previous`` is available, and with
    ``clip(W, -H, H)`` otherwise.

    Args:
        updates: Optional dict mapping ``layer.W.ref()`` to the kernel value
            from *before* the optimizer step.  Any non-dict value (such as the
            return value of ``optimizer.apply_gradients``) is treated as "no
            previous values", so only clipping is applied.
        model: Model whose ``layers`` are scanned.

    Returns:
        Dict mapping ``layer.W.ref()`` to the value assigned to that kernel.
    """
    previous_values = updates if isinstance(updates, dict) else {}
    clipped_updates = {}

    for layer in model.layers:
        if not isinstance(layer, DenseLayer) or not layer.binary:
            continue
        key = layer.W.ref()
        proposed = tf.convert_to_tensor(layer.W)
        previous = previous_values.get(key)
        if previous is not None:
            # Stretch the step the optimizer took by the layer's LR scale.
            proposed = previous + float(layer.W_LR_scale) * (proposed - previous)
        clipped = tf.clip_by_value(proposed, -layer.H, layer.H)
        # Write the result back; returning it alone leaves the weights unclipped.
        layer.W.assign(clipped)
        clipped_updates[key] = clipped

    return clipped_updates


# Fix Parameters and Load data

In [None]:

# BN parameters
batch_size = 100
alpha = 0.15  # exponential moving average factor
epsilon = 1e-4
print(f"batch_size = {batch_size}")
print(f"alpha = {alpha}")
print(f"epsilon = {epsilon}")

# MLP parameters
num_units = 2048
n_hidden_layers = 3
print(f"num_units = {num_units}")
print(f"n_hidden_layers = {n_hidden_layers}")

# Training parameters
num_epochs = 10
print(f"num_epochs = {num_epochs}")

# Dropout parameters (0. means no dropout)
dropout_in = 0.
dropout_hidden = 0.
print(f"dropout_in = {dropout_in}")
print(f"dropout_hidden = {dropout_hidden}")

# BinaryConnect
binary = True
stochastic = True    # change to false for deterministic setting
H = 1.
# W_LR_scale = 1.
W_LR_scale = "Glorot"  # using the coefficients from Glorot's paper
print(f"binary = {binary}")
print(f"stochastic = {stochastic}")
print(f"H = {H}")
print(f"W_LR_scale = {W_LR_scale}")

# Decaying LR: per-epoch factor that takes LR_start to LR_fin in num_epochs steps
LR_start = 0.001
LR_fin = 0.000003
LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
print(f"LR_start = {LR_start}")
print(f"LR_fin = {LR_fin}")
print(f"LR_decay = {LR_decay}")

print('Loading MNIST dataset...')

(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Images become float32 arrays of shape (N, 1, 28, 28)
X_train, X_test = (
    images.reshape(-1, 1, 28, 28).astype(np.float32)
    for images in (X_train, X_test)
)

# Targets: flatten the labels, one-hot encode them over 10 classes, then map
# {0, 1} to {-1, +1} for the hinge loss
y_train, y_test = (
    2 * np.float32(np.eye(10)[np.ravel(labels)]) - 1.
    for labels in (y_train, y_test)
)
print(X_train.shape)

print('Building the MLP...')


# Shape of a single input sample
input_shape = (1, 28, 28)

batch_size = 100
alpha = 0.15
epsilon = 0.0001
num_units = 2048
n_hidden_layers = 3
num_epochs = 10
dropout_in = 0.0
dropout_hidden = 0.5
binary = True
stochastic = True
H = 1.0
W_LR_scale = Glorot
LR_start = 0.001
LR_fin = 3e-06
LR_decay = 0.5593866859813431
Loading MNIST dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(30000, 1, 28, 28)
Building the MLP...


# Building and training model

In [None]:
## BUILDING THE MLP MODEL #########################################################
def build_model(input_shape, num_units, n_hidden_layers, binary, stochastic, H, dropout_in, dropout_hidden,
                hidden_nonlinearity=tf.nn.relu):
    """Build the BinaryConnect MLP.

    Layout: Flatten -> Dropout -> ``n_hidden_layers`` x (DenseLayer ->
    BatchNormLayer with ``hidden_nonlinearity`` -> Dropout) -> DenseLayer(10)
    -> BatchNormLayer without nonlinearity.

    The input is flattened *before* the first dense layer so that every dense
    layer sees one feature vector per sample and every BatchNormLayer gets one
    scale/shift per unit.  The hidden blocks carry a nonlinearity so that the
    stacked layers do not collapse into a linear map.

    Args:
        input_shape: Shape of one sample (without the batch axis).
        num_units: Units per hidden DenseLayer.
        n_hidden_layers: Number of hidden blocks.
        binary, stochastic, H: Forwarded to every DenseLayer.
        dropout_in: Dropout rate applied to the flattened input.
        dropout_hidden: Dropout rate applied after every hidden block.
        hidden_nonlinearity: Callable applied after each hidden
            BatchNormLayer; pass ``None`` for no nonlinearity.

    Returns:
        A Keras ``Model`` producing 10 outputs per sample.

    Note:
        ``epsilon`` and ``alpha`` for the BatchNormLayers are read from the
        module-level variables of the same name.
    """
    inputs = Input(shape=input_shape, name="input")
    x = Flatten()(inputs)
    x = Dropout(rate=dropout_in)(x)

    for i in range(n_hidden_layers):
        x = DenseLayer(units=num_units, binary=binary, stochastic=stochastic, H=H,
                       name=f'dense_layer_{i}')(x)
        x = BatchNormLayer(epsilon=epsilon, alpha=alpha, nonlinearity=hidden_nonlinearity)(x)
        x = Dropout(rate=dropout_hidden)(x)

    x = DenseLayer(units=10, binary=binary, stochastic=stochastic, H=H,
                   name='output_dense_layer')(x)
    outputs = BatchNormLayer(epsilon=epsilon, alpha=alpha, nonlinearity=None)(x)

    return Model(inputs=inputs, outputs=outputs)



# Create the model
model = build_model((1, 28, 28), num_units, n_hidden_layers, binary, stochastic, H, dropout_in, dropout_hidden)

# Loss function
loss_object = Hinge()

# Metrics: running means of the per-batch losses, reset once per epoch
train_loss = Mean(name="train_loss")
val_loss = Mean(name="val_loss")

# SGD with Nesterov momentum.  Keras SGD only uses the `nesterov` flag when
# `momentum` is greater than 0; with the default momentum of 0 it is plain SGD.
# NOTE(review): 0.9 is a conventional momentum value, not one taken from this
# notebook - tune as needed.
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=LR_start, momentum=0.9, nesterov=True)

@tf.function
def train_step(inputs, targets, LR):
    """Run one optimization step on a mini-batch and return its loss.

    Uses the module-level ``model``, ``loss_object``, ``optimizer`` and
    ``binary``.

    With ``binary`` set, the kernels of the layers that expose a binarized
    weight ``Wb`` receive the gradient taken with respect to ``Wb``.  After
    the optimizer step their update is stretched by the layer's
    ``W_LR_scale`` and the result is clipped to ``[-H, H]``.  All remaining
    trainable variables are updated with their ordinary gradients.

    Args:
        inputs: Batch of input samples.
        targets: Batch of targets, same shape as the model output.
        LR: Learning rate for this step (Python float or scalar tensor).
            Passing a tensor avoids retracing when the value changes.

    Returns:
        The scalar loss of the batch.
    """
    # Honor the learning rate requested by the caller.
    learning_rate = optimizer.learning_rate
    learning_rate.assign(tf.cast(LR, learning_rate.dtype))

    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = loss_object(targets, predictions)

    if binary:
        # Same selection and order as compute_grads(), so the gradients line
        # up with the kernels they belong to.
        binary_layers = [layer for layer in model.layers if hasattr(layer, 'Wb')]
        binary_params = [layer.W for layer in binary_layers]
        binary_refs = {param.ref() for param in binary_params}
        other_params = [var for var in model.trainable_variables
                        if var.ref() not in binary_refs]

        binary_grads = compute_grads(loss, model)
        other_grads = tape.gradient(loss, other_params)

        # Snapshot the kernels so the optimizer's step can be rescaled below.
        previous_values = [tf.identity(param) for param in binary_params]

        optimizer.apply_gradients(
            zip(binary_grads + other_grads, binary_params + other_params))

        for layer, previous in zip(binary_layers, previous_values):
            scaled = previous + float(layer.W_LR_scale) * (layer.W - previous)
            layer.W.assign(tf.clip_by_value(scaled, -layer.H, layer.H))
    else:
        # Pair every gradient with the variable it was computed for.
        variables = model.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

    return loss




@tf.function
def val_step(inputs, targets):
    """Return the loss of the module-level ``model`` on one batch in inference mode."""
    outputs = model(inputs, training=False)
    # The targets are cast to float32 to match the data type of the predictions
    float_targets = tf.cast(targets, dtype=tf.float32)
    return loss_object(float_targets, outputs)

# Training loop

import matplotlib.pyplot as plt

# Lists to store the training and validation losses for each epoch
train_losses = []
val_losses = []

# The learning rate starts at LR_start and is multiplied by LR_decay after
# every epoch.  `num_epochs` comes from the configuration cell, the same value
# LR_decay was derived from.
LR = LR_start
for epoch in range(num_epochs):
    # Set the rate on the optimizer and also hand it to train_step.  It is
    # passed as a tensor so the changing value does not retrace the function.
    optimizer.learning_rate = LR
    lr_tensor = tf.constant(LR, dtype=tf.float32)

    for i in range(0, len(X_train), batch_size):
        x_batch = X_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]

        loss = train_step(x_batch, y_batch, lr_tensor)
        train_loss(loss)

    for i in range(0, len(X_test), batch_size):
        x_val_batch = X_test[i:i + batch_size]
        y_val_batch = y_test[i:i + batch_size]

        loss = val_step(x_val_batch, y_val_batch)
        val_loss(loss)

    # Append the training and validation losses for the current epoch
    train_losses.append(train_loss.result().numpy())
    val_losses.append(val_loss.result().numpy())

    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss.result()}, Validation Loss: {val_loss.result()}')

    # Reset the metrics for the next epoch
    train_loss.reset_state()
    val_loss.reset_state()

    # Decay the learning rate for the next epoch
    LR *= LR_decay

# Creating a DataFrame
df_sgd_n = pd.DataFrame({'Epoch': range(1, num_epochs + 1), 'Training Loss': train_losses, 'Validation Loss': val_losses})

# Plotting the training and validation losses
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss over Epochs')
ax.legend()
plt.show()


In [None]:
# Save the per-epoch losses; the frame built by the training cell is `df_sgd_n`.
df_sgd_n.to_csv('dr_sgdn.csv')