# Using the MANN Package to train a Convolutional Neural Network

In this notebook, the MANN package will be used to train pruned convolutional neural networks.  We will train two single-task networks on two separate tasks and one multitask network which performs both tasks.

In [1]:
# Load the MANN package and TensorFlow
import tensorflow as tf
import mann

In [2]:
# Download (or read from the local Keras cache) the two datasets: MNIST digits and Fashion-MNIST
(digit_x_train, digit_y_train), (digit_x_test, digit_y_test) = tf.keras.datasets.mnist.load_data()
(fashion_x_train, fashion_y_train), (fashion_x_test, fashion_y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Append a trailing channel axis to every image array and rescale the pixel values into [0, 1]
digit_x_train = digit_x_train[..., None]/255
digit_x_test = digit_x_test[..., None]/255
fashion_x_train = fashion_x_train[..., None]/255
fashion_x_test = fashion_x_test[..., None]/255

# Turn every label vector into a single-column 2-D array
digit_y_train = digit_y_train[:, None]
digit_y_test = digit_y_test[:, None]
fashion_y_train = fashion_y_train[:, None]
fashion_y_test = fashion_y_test[:, None]

# Early-stopping callback shared by every training run below: training stops after
# 3 epochs without an improvement of at least 0.01, and the best weights are restored
callback = tf.keras.callbacks.EarlyStopping(
    min_delta = 0.01,
    patience = 3,
    restore_best_weights = True
)

## Create the first model

This first model is a convolutional model which will perform MNIST digit recognition. It will be pruned utilizing the MANN package so that most of its weights are 0.

In [3]:
# Input layer for the digit task; its shape is one image (everything but the sample axis)
input_layer = tf.keras.layers.Input(digit_x_train.shape[1:])

# Settings shared by every masked convolution in the network
conv_settings = dict(
    kernel_size = 3,
    padding = 'same',
    strides = 1,
    activation = 'relu'
)

# Two convolutional blocks (32 filters, then 64 filters). Each block is two masked
# convolutions followed by a max-pooling layer.
# NOTE(review): pool_size = 2 with strides = 1 gives overlapping pooling windows,
# so the pooling layers do not halve the feature maps - confirm this is intended.
x = input_layer
for n_filters in (32, 64):
    for conv_idx in range(2):
        x = mann.layers.MaskedConv2D(filters = n_filters, **conv_settings)(x)
    x = tf.keras.layers.MaxPool2D(
        pool_size = 2,
        strides = 1,
        padding = 'valid'
    )(x)

# Flatten the feature maps and pass them through two masked hidden layers
x = tf.keras.layers.Flatten()(x)
for dense_idx in range(2):
    x = mann.layers.MaskedDense(256, activation = 'relu')(x)

# Softmax output with one unit per digit class
output_layer = mann.layers.MaskedDense(10, activation = 'softmax')(x)

# Create the model
model = tf.keras.Model(input_layer, output_layer)

2021-11-09 07:25:38.653338: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-09 07:25:38.654033: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1


In [4]:
# The model is compiled once before masking and once more afterwards (see below)
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Only a subset of the training data is used to calculate the gradients for masking
n_mask_samples = 1000

# Mask (prune) the model using the MANN package
model = mann.utils.mask_model(
    model = model,                       # The model to be pruned
    percentile = 80,                     # The percentile to be masked, e.g. a value of 90 masks 90% of weights
    method = 'gradients',                # The method to use to mask, either 'gradients' or 'magnitude'
    exclusive = True,                    # Whether weight locations must be exclusive to each task
    x = digit_x_train[:n_mask_samples],  # The input data
    y = digit_y_train[:n_mask_samples]   # The expected outputs
)

# Recompile the model returned by mask_model so it is ready for training
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [5]:
# Show the sparsity of the model by displaying the first weight array of the layer
# that directly follows the input layer (masked entries appear as zeros)
first_masked_layer = model.layers[1]
first_masked_layer.get_weights()[0]

array([[[[-0.        ,  0.        , -0.        , -0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,
          -0.        ,  0.        , -0.        ,  0.02906718,
           0.05429413,  0.        ,  0.        , -0.        ,
          -0.02267656, -0.        ,  0.        , -0.        ,
          -0.05906082, -0.12245995,  0.04115575,  0.        ,
           0.04951426,  0.        ,  0.        ,  0.        ,
          -0.        , -0.        , -0.        ,  0.        ]],

        [[-0.        ,  0.        , -0.        , -0.        ,
           0.        , -0.        ,  0.        ,  0.        ,
           0.        , -0.        ,  0.        ,  0.0459222 ,
           0.03733057, -0.        ,  0.05112445, -0.        ,
           0.05061556, -0.        , -0.        , -0.        ,
           0.1067455 ,  0.        , -0.00599049,  0.        ,
           0.02466581, -0.        , -0.0068549 ,  0.        ,
          -0.        , -0.        , -0.        ,  0.        ]],

  

In [6]:
# Train on the digit data, holding out 20% of it for validation; the
# early-stopping callback ends training well before the 100-epoch cap if it can
model.fit(
    x = digit_x_train,
    y = digit_y_train,
    batch_size = 128,
    epochs = 100,
    validation_split = 0.2,
    callbacks = [callback]
)

# Test accuracy: predicted class (argmax over the softmax outputs) compared with the true labels
digit_test_preds = model.predict(digit_x_test).argmax(axis = 1).flatten()
digit_test_hits = (digit_test_preds == digit_y_test.flatten()).sum()
print(f'Digit Model Accuracy: {digit_test_hits/digit_y_test.shape[0]}')

2021-11-09 07:26:22.954960: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-09 07:26:22.957940: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/100


2021-11-09 07:26:23.170031: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-11-09 07:26:56.007482: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


2021-11-09 07:29:54.027378: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Digit Model Accuracy: 0.9864


## Create the second model

The second model will have an identical architecture to the first, but it will be trained to perform MNIST fashion recognition instead.

In [7]:
# Input layer for the fashion task; its shape is one image (everything but the sample axis)
input_layer = tf.keras.layers.Input(fashion_x_train.shape[1:])

# Settings shared by the masked convolutions and by the pooling layers
conv_settings = dict(kernel_size = 3, padding = 'same', strides = 1, activation = 'relu')
# NOTE(review): pool_size = 2 with strides = 1 gives overlapping pooling windows,
# so the pooling layers do not halve the feature maps - confirm this is intended.
pool_settings = dict(pool_size = 2, strides = 1, padding = 'valid')

# Hidden layers, listed in the order in which they are applied
hidden_stack = [
    # First convolutional block
    mann.layers.MaskedConv2D(filters = 32, **conv_settings),
    mann.layers.MaskedConv2D(filters = 32, **conv_settings),
    tf.keras.layers.MaxPool2D(**pool_settings),
    # Second convolutional block
    mann.layers.MaskedConv2D(filters = 64, **conv_settings),
    mann.layers.MaskedConv2D(filters = 64, **conv_settings),
    tf.keras.layers.MaxPool2D(**pool_settings),
    # Fully connected part
    tf.keras.layers.Flatten(),
    mann.layers.MaskedDense(256, activation = 'relu'),
    mann.layers.MaskedDense(256, activation = 'relu')
]

# Chain the hidden layers together, starting from the input
x = input_layer
for hidden_layer in hidden_stack:
    x = hidden_layer(x)

# Softmax output with one unit per fashion class
output_layer = mann.layers.MaskedDense(10, activation = 'softmax')(x)

# Create the model
model = tf.keras.Model(input_layer, output_layer)

In [8]:
# The model is compiled once before masking and once more afterwards (see below)
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Only a subset of the training data is used to calculate the gradients for masking
n_mask_samples = 1000

# Mask (prune) the model using the MANN package
model = mann.utils.mask_model(
    model = model,                         # The model to be pruned
    percentile = 80,                       # The percentile to be masked, e.g. a value of 90 masks 90% of weights
    method = 'gradients',                  # The method to use to mask, either 'gradients' or 'magnitude'
    exclusive = True,                      # Whether weight locations must be exclusive to each task
    x = fashion_x_train[:n_mask_samples],  # The input data
    y = fashion_y_train[:n_mask_samples]   # The expected outputs
)

# Recompile the model returned by mask_model so it is ready for training
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [9]:
# Train on the fashion data, holding out 20% of it for validation; the
# early-stopping callback ends training well before the 100-epoch cap if it can
model.fit(
    x = fashion_x_train,
    y = fashion_y_train,
    batch_size = 128,
    epochs = 100,
    validation_split = 0.2,
    callbacks = [callback]
)

# Test accuracy: predicted class (argmax over the softmax outputs) compared with the true labels
fashion_test_preds = model.predict(fashion_x_test).argmax(axis = 1).flatten()
fashion_test_hits = (fashion_test_preds == fashion_y_test.flatten()).sum()
print(f'Fashion Model Accuracy: {fashion_test_hits/fashion_y_test.shape[0]}')

Epoch 1/100


2021-11-09 07:30:04.183111: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-11-09 07:30:35.875719: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


2021-11-09 07:35:15.092436: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Fashion Model Accuracy: 0.8995


## Create the MANN

The third and final model we create here will be a multitask model (MANN) which performs both the MNIST digit recognition and the MNIST fashion recognition tasks.

In [10]:
# Build the multitask model: one input layer per task
digit_input = tf.keras.layers.Input(digit_x_train.shape[1:])
fashion_input = tf.keras.layers.Input(fashion_x_train.shape[1:])

# Settings shared by every multi-task masked convolution
conv_settings = dict(
    kernel_size = 3,
    padding = 'same',
    strides = 1,
    activation = 'relu'
)

# Two convolutional blocks (32 filters, then 64 filters). Each block is two multi-task
# masked convolutions followed by a multi-task max-pooling layer. The first
# convolution receives the list holding both task inputs.
# NOTE(review): pool_size = 2 with strides = 1 gives overlapping pooling windows,
# so the pooling layers do not halve the feature maps - confirm this is intended.
x = [digit_input, fashion_input]
for n_filters in (32, 64):
    for conv_idx in range(2):
        x = mann.layers.MultiMaskedConv2D(filters = n_filters, **conv_settings)(x)
    x = mann.layers.MultiMaxPool2D(
        pool_size = 2,
        strides = 1,
        padding = 'valid'
    )(x)

# Pick each task's feature maps out of the multi-task output with a SelectorLayer
# (index 0 is the digit task, index 1 is the fashion task) and flatten them
task_features = []
for task_idx in (0, 1):
    selected = mann.layers.SelectorLayer(task_idx)(x)
    task_features.append(tf.keras.layers.Flatten()(selected))

# Two multi-task masked hidden layers followed by the softmax output layer
x = task_features
for dense_idx in range(2):
    x = mann.layers.MultiMaskedDense(256, activation = 'relu')(x)
output_layer = mann.layers.MultiMaskedDense(10, activation = 'softmax')(x)

# Create the model
model = tf.keras.Model([digit_input, fashion_input], output_layer)

In [11]:
# Compile the model so that it can be masked
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Only a subset of each task's training data is used to calculate the gradients for masking
n_mask_samples = 1000

# Mask (prune) the multitask model; inputs and targets are lists with one entry per task
model = mann.utils.mask_model(
    model = model,
    percentile = 80,
    method = 'gradients',
    exclusive = True,
    x = [digit_x_train[:n_mask_samples], fashion_x_train[:n_mask_samples]],
    y = [digit_y_train[:n_mask_samples], fashion_y_train[:n_mask_samples]]
)

# Recompile the model returned by mask_model so it is ready for training
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Train both tasks together, holding out 20% of the data for validation
model.fit(
    x = [digit_x_train, fashion_x_train],
    y = [digit_y_train, fashion_y_train],
    batch_size = 128,
    epochs = 100,
    validation_split = 0.2,
    callbacks = [callback]
)

Epoch 1/100


2021-11-09 07:35:24.650070: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-11-09 07:36:33.194804: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x165cc0940>

## Predict Using the MANN

Now that the MANN model has been trained, we can use it to get predictions just as we would with a traditional model. In this case, a list of predictions is returned, with each index corresponding to a task: index 0 holds the digit predictions and index 1 holds the fashion predictions.

In [12]:
# The multitask model returns one array of class probabilities per task
task_probabilities = model.predict([digit_x_test, fashion_x_test])

# Reduce each task's probabilities to the index of the most likely class
digit_preds, fashion_preds = (probs.argmax(axis = 1) for probs in task_probabilities)

# Per-task test accuracy: the number of correct predictions over the number of test labels
digit_hits = (digit_preds.flatten() == digit_y_test.flatten()).sum()
fashion_hits = (fashion_preds.flatten() == fashion_y_test.flatten()).sum()

print(f'Multitask Model Digit Accuracy: {digit_hits/digit_y_test.flatten().shape[0]}')
print(f'Multitask Model Fashion Accuracy: {fashion_hits/fashion_y_test.flatten().shape[0]}')

2021-11-09 07:46:41.027169: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Multitask Model Digit Accuracy: 0.9883
Multitask Model Fashion Accuracy: 0.9164
