# Optimizing Masks to create WTs

In [1]:
# importing necessary libraries and the cnn architecture I defined

from cnn_architecture import CNN2Model
from utils import *
from load_datasets import load_and_prep_dataset

import tensorflow_datasets as tfds
#import tensorflow_probability as tfp
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.io import loadmat
import copy

# all the extra stuff for supermasks

2024-05-27 14:35:27.992412: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tf.test.is_built_with_cuda()

True

all the variables i have to check their meaning:
- use bias
- dynamik scaling
- sigmoid bias
- use learning phase


In [3]:
class MaskedDense(tf.keras.layers.Dense):
    
    # untrainable normal Dense layer
    # trainable mask, that is sigmoided (maybe squished) and then multiplied to Dense
    
    def __init__(self, units,*args, **kwargs):
        super(MaskedDense, self).__init__(units, *args, **kwargs)

        
    def build(self, input_shape):

        super(MaskedDense, self).build(input_shape)

        print("The super build function was called.")
        print("self kernel: ", self.kernel)
        self.kernel_mask = tf.Variable(initial_value=tf.random.uniform(shape=self.kernel.shape,minval=-1, maxval=1, seed=None), trainable=True,name="mask")
        print("self kernel mask: ",self.kernel_mask)
        
        # make bias and weights untrainable
        self._trainable_weights.remove(self.kernel)
        self._non_trainable_weights.append(self.kernel)
        self._trainable_weights.remove(self.bias)
        self._non_trainable_weights.append(self.bias)

        print("trainable weights have been adapted.")
        
        # create mask and make it trainable
        mask_init = tf.random.uniform(shape=self.kernel.shape,minval=-1, maxval=1, seed=None)
        self.kernel_mask = tf.Variable(initial_value=mask_init,
                                        trainable=True,
                                        validate_shape=True,
                                        name='mask',
                                        dtype=self.dtype,
                                        shape=self.kernel.shape)
        
        self._trainable_weights.append(self.kernel_mask)

        print("trainable mask have been adapted.")
    
    @tf.function
    def call(self, inputs):

        tf.print("opened masked dense call function")
        tf.print(inputs.shape)
        tf.print(self.kernel_mask)

        #effective_mask = tf.cast(tfp.distributions.Bernoulli(probs=tf.nn.sigmoid(self.kernel_mask)).sample(), dtype=tf.float32)
        effective_mask = tf.math.round(self.kernel_mask * 0.5 + 0.5)
        effective_kernel = tf.math.multiply(self.kernel, effective_mask)

        tf.print(effective_kernel.shape)

        inputs = tf.convert_to_tensor(inputs)
        outputs = tf.linalg.matmul(inputs, effective_kernel)
        outputs = tf.nn.bias_add(outputs, self.bias)
        output =  self.activation(outputs)
        #output = self.activation(tf.matmul(inputs, tf.math.multiply(self.kernel, tf.math.round(tf.nn.sigmoid(self.kernel_mask)))))
        return output
    
    def get_mask(self):
        return  tf.nn.sigmoid(self.kernel_mask)
    
    def get_binary_mask(self):
        return tf.math.round(tf.nn.sigmoid(self.kernel_mask))

In [4]:
class CNN2ModelMasked(tf.keras.Model):
    
    # basic
    def __init__(self):
        super(CNN2ModelMasked, self).__init__()
        
        # set biases to a value that is not exactly 0.0, so they don't get handled like pruned values
        self.bias_in = tf.keras.initializers.Constant(value=0.0000000001)
        
        self.conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=3,activation="relu", padding="same",kernel_initializer='glorot_uniform', bias_initializer=self.bias_in) # [batchsize,32,32,64]
        self.conv2 = tf.keras.layers.Conv2D(filters=64, kernel_size=3,activation="relu", padding="same",kernel_initializer='glorot_uniform', bias_initializer=self.bias_in) # [batchsize,32,32,64]
        self.maxpool = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),strides=(2, 2),input_shape=(32, 32, 64)) # [batchsize,16,16,64]
        self.flatten = tf.keras.layers.Flatten() # [batch_size,16384]
        self.dense1 = MaskedDense(256, activation="relu",kernel_initializer='glorot_uniform', bias_initializer=self.bias_in) # [batch_size,256]
        self.dense2 = MaskedDense(256, activation="relu",kernel_initializer='glorot_uniform', bias_initializer=self.bias_in) # [batch_size,256]
        self.dense3 = MaskedDense(10, activation="softmax",kernel_initializer='glorot_uniform', bias_initializer=self.bias_in) # [batch_size,256]
        
        # Making the weights of the conv layers untrainable
        self.conv1.trainable = False
        self.conv2.trainable = False
    
    @tf.function
    def call(self, inputs):
        
        # adjust the dense layers to be multiplayed with trainable mask (which gets assigned binary values for this step)
        x = self.conv1(inputs)
        x = self.conv2(x)
        x = self.maxpool(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x
    
    def get_masks(self):
        return [self.dense1.get_mask(), self.dense2.get_mask(), self.dense3.get_mask()]
        
    def get_binary_masks(self):
        return [self.dense1.get_binary_mask(), self.dense2.get_binary_mask(), self.dense3.get_binary_mask()]

In [5]:
# modified train loop to also work with sparse networks (such that pruned weights remain frozen at 0.0)

def train_mask(train, test, model, num_epochs=5):
    
    # hyperparameters
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)
    loss_function= tf.keras.losses.CategoricalCrossentropy()
    
    # initializing training statistics
    train_accuracy = tf.keras.metrics.Accuracy(name='test_accuracy')
    test_accuracy = tf.keras.metrics.Accuracy(name='train_accuracy')
    train_losses = tf.keras.metrics.CategoricalCrossentropy(name='train_losses')
    test_losses = tf.keras.metrics.CategoricalCrossentropy(name='test_losses')
    train_acc = []
    test_acc = []
    train_l =[]
    test_l = []

    for epoch in tqdm(range(num_epochs), leave=False, desc="training epochs"):
        
        #train step
        for x, t in train:
            with tf.GradientTape() as tape:
                pred = model(x)
                loss = loss_function(t, pred)
                train_losses.update_state(t, pred)
                train_accuracy.update_state(tf.argmax(t,1), tf.argmax(pred,1))
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
        # test step
        for x, t in test:
            pred = model(x)
            test_accuracy.update_state(tf.argmax(t,1), tf.argmax(pred,1))
            test_losses.update_state(t, pred)
        
        # updataing training statistics
        train_acc.append(train_accuracy.result().numpy())
        test_acc.append(test_accuracy.result().numpy())
        train_l.append(train_losses.result().numpy())
        test_l.append(test_losses.result().numpy())
        train_accuracy.reset_state()
        test_accuracy.reset_state()
        train_losses.reset_state()
        test_losses.reset_state()
        
    # collecting losses in a dictionary
    losses = { "test loss":test_l , "training loss":train_l , "test accuracy":test_acc , "training accuracy":train_acc}
    
    return  losses

In [6]:
train_dataset, test_dataset = load_and_prep_dataset("CIFAR", batch_size=60, shuffle_size=512)

model = CNN2ModelMasked()
model(list(train_dataset)[0][0])
initial_weights = model.get_weights()
initial_mask = model.get_masks()
initial_b_mask = model.get_binary_masks()
print(initial_mask)
print(initial_b_mask)
print("pruning_rates: ", get_pruning_rates(initial_b_mask))
print(model.trainable_variables)
model.summary()

losses = train_mask(train_dataset, test_dataset, model)
plot_losses("CIFAR", "TestSuperMaskOptimization", losses,"CNN Loss and Accuracy for supermask model")

2024-05-27 14:35:32.503135: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-27 14:35:32.558406: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-27 14:35:32.558690: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

The super build function was called.
self kernel:  <KerasVariable shape=(16384, 256), dtype=float32, path=cnn2_model_masked/masked_dense/kernel>
self kernel mask:  <tf.Variable 'masked_dense/mask:0' shape=(16384, 256) dtype=float32>


AttributeError: 'MaskedDense' object has no attribute 'kernel_mask'

debugging to do:
- check paper for optimizer
- make the call function simpler
- research other examples of unusual trainable parameters in models