In [2]:
# Import the necessary modules
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_compression as tfc
import tensorflow_datasets as tfds

# Define the hyperparameters
batch_size = 128
num_epochs = 10

# Load the Fashion MNIST dataset
transform = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)
trainset = tfds.load('fashion_mnist', split='train', as_supervised=True).map(lambda x, y: (transform(x), y)).batch(batch_size)
testset = tfds.load('fashion_mnist', split='test', as_supervised=True).map(lambda x, y: (transform(x), y)).batch(batch_size)
classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')

# Define the teacher model (a small convolutional neural network)
class Teacher(tf.keras.Model):
    def __init__(self):
        super(Teacher, self).__init__()
        self.conv1 = tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu')
        self.pool1 = tf.keras.layers.MaxPool2D(2, 2)
        self.conv2 = tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu')
        self.pool2 = tf.keras.layers.MaxPool2D(2, 2)
        self.flatten = tf.keras.layers.Flatten()
        self.fc1 = tf.keras.layers.Dense(64, activation='relu')
        self.fc2 = tf.keras.layers.Dense(10)

    def call(self, x):
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.conv2(x))
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Define another teacher model (a small deep neural network)
class Teacher_DNN(tf.keras.Model):
    def __init__(self):
        super(Teacher_DNN, self).__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.fc1 = tf.keras.layers.Dense(32, activation='relu')
        self.fc2 = tf.keras.layers.Dense(32, activation='relu')
        self.fc3 = tf.keras.layers.Dense(10)

    def call(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

# Define the student model (a smaller fully-connected neural network)
class Student(tf.keras.Model):
    def __init__(self):
        super(Student, self).__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.fc1 = tf.keras.layers.Dense(32, activation='relu')
        self.fc2 = tf.keras.layers.Dense(10)

    def call(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x


2023-11-17 09:20:05.628624: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 09:20:05.944322: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-17 09:20:05.944379: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-17 09:20:05.946749: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-17 09:20:06.112011: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 09:20:06.113842: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [3]:
# Instantiate the models
teacher_cnn = Teacher()
teacher_dnn = Teacher_DNN()
student_simple = Student()
student_mimic = Student()

In [4]:
# train the teacher CNN model
# Define the loss function and the optimizer
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Train the teacher model on the original hard targets
teacher_cnn.compile(optimizer=optimizer, loss=criterion, metrics=['accuracy'])
teacher_cnn.fit(trainset, epochs=num_epochs, validation_data=testset)
print('Finished training the teacher model')

# Evaluate the teacher model on the test set
teacher_cnn.evaluate(testset)
print('Accuracy of the teacher model on the test set: %.2f %%' % (teacher_cnn.metrics[-1].result() * 100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Finished training the teacher model
Accuracy of the teacher model on the test set: 90.02 %


In [5]:
# train the teacher DNN model
# Define the loss function and the optimizer
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Train the teacher model on the original hard targets
teacher_dnn.compile(optimizer=optimizer, loss=criterion, metrics=['accuracy'])
teacher_dnn.fit(trainset, epochs=num_epochs, validation_data=testset)
print('Finished training the teacher model')

# Evaluate the teacher model on the test set
teacher_dnn.evaluate(testset)
print('Accuracy of the teacher model on the test set: %.2f %%' % (teacher_dnn.metrics[-1].result() * 100))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Finished training the teacher model
Accuracy of the teacher model on the test set: 86.61 %


In [8]:
temperature = 10 # A scaling factor for the soft targets
alpha = 0.9 # A weighting factor for the soft loss

student_simple = Student()
# Train the student model on the original hard targets
# Define the loss function and the optimizer
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Train the student model on the original hard targets from the teacher model
student_simple.compile(optimizer=optimizer, loss=criterion, metrics=['accuracy'])
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainset):
        with tf.GradientTape() as tape:
            outputs = student_simple(inputs)
            with tf.GradientTape(watch_accessed_variables=False) as tape2:
                tape2.watch(outputs)
                teacher_outputs = teacher_cnn(inputs)
            # Compute the soft loss and the hard loss
            loss = criterion(labels, outputs)
        # Apply the gradients
        grads = tape.gradient(loss, student_simple.trainable_variables)
        optimizer.apply_gradients(zip(grads, student_simple.trainable_variables))
        running_loss += loss.numpy()
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0
print('Finished training the simple student model')

# Evaluate the simple student model on the test set
student_simple.evaluate(testset)
print('Accuracy of the simple student model on the test set: %.2f %%' % (student_simple.metrics[-1].result() * 100))

[1,   200] loss: 0.859
[1,   400] loss: 0.550
[2,   200] loss: 0.484
[2,   400] loss: 0.469
[3,   200] loss: 0.443
[3,   400] loss: 0.437
[4,   200] loss: 0.420
[4,   400] loss: 0.416
[5,   200] loss: 0.405
[5,   400] loss: 0.402
[6,   200] loss: 0.393
[6,   400] loss: 0.390
[7,   200] loss: 0.382
[7,   400] loss: 0.379
[8,   200] loss: 0.373
[8,   400] loss: 0.369
[9,   200] loss: 0.365
[9,   400] loss: 0.361
[10,   200] loss: 0.357
[10,   400] loss: 0.354
Finished training the simple student model
Accuracy of the simple student model on the test set: 85.89 %


In [9]:
# Train the student model on the combined soft and hard targets
# Define the loss function and the optimizer
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Get the full list of trainable variables
trainable_variables = student_mimic.trainable_variables

# Build the optimizer with the full list of trainable variables
optimizer.build(trainable_variables)

# Define the optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Train the student model on the soft targets from the teacher model
student_mimic.compile(optimizer=optimizer, loss=criterion, metrics=['accuracy'])
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainset):
        with tf.GradientTape() as tape:
            outputs = student_mimic(inputs)
            with tf.GradientTape(watch_accessed_variables=False) as tape2:
                tape2.watch(outputs)
                teacher_outputs = teacher_cnn(inputs)
            # Compute the soft loss and the hard loss
            soft_loss = tf.keras.losses.KLDivergence()(tf.nn.softmax(teacher_outputs / temperature), tf.nn.softmax(outputs / temperature))
            hard_loss = criterion(labels, outputs)
            # Combine the soft loss and the hard loss with a weighting factor
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
        # Apply the gradients
        grads = tape.gradient(loss, student_mimic.trainable_variables)
        optimizer.apply_gradients(zip(grads, student_mimic.trainable_variables))
        running_loss += loss.numpy()
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0
print('Finished training the student model')

# Evaluate the student model on the test set
student_mimic.evaluate(testset)
print('Accuracy of the student model on the test set: %.2f %%' % (student_mimic.metrics[-1].result() * 100))


[1,   200] loss: 0.269
[1,   400] loss: 0.159
[2,   200] loss: 0.121
[2,   400] loss: 0.110
[3,   200] loss: 0.099
[3,   400] loss: 0.094
[4,   200] loss: 0.087
[4,   400] loss: 0.086
[5,   200] loss: 0.081
[5,   400] loss: 0.080
[6,   200] loss: 0.077
[6,   400] loss: 0.076
[7,   200] loss: 0.074
[7,   400] loss: 0.073
[8,   200] loss: 0.071
[8,   400] loss: 0.071
[9,   200] loss: 0.069
[9,   400] loss: 0.069
[10,   200] loss: 0.068
[10,   400] loss: 0.068
Finished training the student model
Accuracy of the student model on the test set: 82.29 %
