In [1]:
import keras
from keras.datasets import mnist
from keras.layers import Activation, Input, Embedding, LSTM, Dense, Lambda, GaussianNoise, concatenate
from keras.models import Model
import numpy as np
from keras.utils import np_utils
from keras.layers.core import Dense, Dropout, Activation
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, merge
from keras.optimizers import SGD, Adam, RMSprop
from keras.constraints import max_norm
from keras.layers import MaxPooling2D, Dropout, Dense, Flatten, Activation, Conv2D
from keras.models import Sequential
from keras.losses import categorical_crossentropy as logloss
from keras.metrics import categorical_accuracy
import matplotlib.pyplot as plt


Using TensorFlow backend.


# Load the data

In [2]:
# Number of digit classes in MNIST.
nb_classes = 10

# Download (or read from the local cache) the train/test split.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# One-hot encode the integer labels so they can be used with categorical cross-entropy.
Y_train, Y_test = (np_utils.to_categorical(labels, nb_classes) for labels in (y_train, y_test))

In [3]:
# Inspect the shape of the raw training images before any preprocessing.
X_train.shape

(60000, 28, 28)

In [4]:
# Show the one-hot encoded label of the first training sample.
Y_train[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

# Preprocessing the Data

In [None]:
# Bring the images into the layout the models expect:
# (num_samples, width, height, channels) as float32, scaled by 1/255.
#
# The dtype guard makes this cell safe to re-run: once the arrays are float32
# they have already been scaled, and dividing by 255 a second time would
# silently shrink every pixel value.
if X_train.dtype != np.float32:
    # -1 lets NumPy infer the sample count instead of hard-coding 60000 / 10000.
    X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
    X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255

print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

60000 train samples
10000 test samples


# Define Teacher Model

In [None]:
# Teacher model: a small CNN whose outputs are later distilled into the student.

input_shape = (28, 28, 1) # Input shape of each image

# Hyperparameters
nb_filters = 64 # number of convolutional filters in the second conv layer
pool_size = (2, 2) # size of pooling area for max pooling
kernel_size = (3, 3) # convolution kernel size

teacher = Sequential()
teacher.add(Conv2D(32, kernel_size=kernel_size,
                 activation='relu',
                 input_shape=input_shape))
teacher.add(Conv2D(nb_filters, kernel_size, activation='relu'))
teacher.add(MaxPooling2D(pool_size=pool_size))

teacher.add(Dropout(0.25)) # For regularization

teacher.add(Flatten())
teacher.add(Dense(128, activation='relu'))
teacher.add(Dropout(0.5)) # For regularization

# The logits (Dense) and the softmax (Activation) are kept as two separate
# layers so the pre-softmax output can be read from the Dense layer later.
teacher.add(Dense(nb_classes))
teacher.add(Activation('softmax'))

teacher.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
teacher.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1179776   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________

# Define Student Model

In [None]:
# Stand-alone student: a single hidden-layer MLP trained without the teacher.
# Its accuracy is the baseline the teacher-trained students are compared against.
student = Sequential([
    Flatten(input_shape=input_shape),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(nb_classes),
    Activation('softmax'),
])

student.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

student.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                25120     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                330       
_________________________________________________________________
activation_2 (Activation)    (None, 10)                0         
Total params: 25,450
Trainable params: 25,450
Non-trainable params: 0
_________________________________________________________________


# Training the Teacher model

In [None]:
# Train the teacher model as usual
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

# TensorBoard logs and periodic best-weights checkpoints for the teacher run.
log_dir = 'logs/teacher/'
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=3)

epochs = 500
batch_size = 256
# The callbacks must be passed to fit(); creating them alone has no effect.
# This matches how every student run in this notebook is trained.
teacher.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_test, Y_test),
          callbacks=[logging, checkpoint])

Train on 60000 samples, validate on 10000 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500

# Define a new model that outputs only teacher logits

In [None]:
# Build a view of the trained teacher that stops before the softmax, so that
# the raw logits can be collected and softened at a chosen temperature.

# Set a temperature value
temp = 1

# The teacher's last layer is the softmax Activation, so the layer before it is
# the Dense layer that produces the logits. Index from the end instead of using
# the auto-generated name 'dense_2': Keras numbers layer names per session, so
# the name changes as soon as the model cells are re-run.
teacher_WO_Softmax = Model(teacher.input, teacher.layers[-2].output)


In [None]:
# Print the layer table of the logits-only teacher model.
teacher_WO_Softmax.summary()

# Define a manual softmax function

In [None]:
# Define a manual softmax function
def softmax(x):
    """Return the softmax of `x` computed along its last axis.

    Args:
        x: array-like of logits. A 1-D input is treated as a single vector;
            for higher-rank input every slice along the last axis is
            normalized independently, so each row of a (batch, classes)
            array sums to 1.

    Returns:
        numpy.ndarray with the same shape as `x`.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from becoming nan).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

# Understanding the concept of temperature in softmax activation

In [None]:
# Take one test image (index 9) and look at how raising the temperature
# softens the teacher's predicted probabilities.
intermediate_output = teacher_WO_Softmax.predict(X_test[9].reshape(1,28,28,1))
print(softmax(intermediate_output))

pixels = X_test[9]
pixels = pixels.reshape((28, 28))
plt.imshow(pixels)
plt.savefig('Kimg.jpg')
plt.show()


# logits of the selected test image
x = intermediate_output[0]
plt.figure(figsize=(20, 10));

temperature = [1,3,7,10,20,50]

# Use a dedicated loop variable. Looping over `temp` would overwrite the global
# temperature that the student's Lambda layer and the distillation loss read,
# leaving it at 50 if the later cells are run out of order.
for T in temperature:
    plt.plot((softmax(x/T)), label='$T='+str(T)+'$', linewidth=2);
# One legend call after all curves are drawn is enough.
plt.legend();
plt.xlabel('classes ->');
plt.ylabel('probability');
plt.xlim([0, 10]);
plt.savefig('Kgraph.jpg')
plt.show()


# Prepare the soft targets and the target data for student to be trained upon

In [None]:
# Distillation temperature and the epoch budget shared by the student runs below.
temp = 1
epochs = 500

# The softmax-free teacher returns raw logits for every image.
teacher_train_logits = teacher_WO_Softmax.predict(X_train)
teacher_test_logits = teacher_WO_Softmax.predict(X_test)

# Divide the logits by the temperature before applying softmax.
train_logits_T = teacher_train_logits / temp
test_logits_T = teacher_test_logits / temp

# Apply softmax one sample at a time so that every row is normalized on its own.
Y_train_soft = np.array([softmax(row) for row in train_logits_T])
Y_test_soft = np.array([softmax(row) for row in test_logits_T])

# Stack hard labels and soft targets side by side: the first 10 columns are the
# one-hot label, the last 10 the teacher's soft target.
Y_train_new = np.concatenate((Y_train, Y_train_soft), axis=1)
Y_test_new = np.concatenate((Y_test, Y_test_soft), axis=1)

In [None]:
# Inspect the first combined target: one-hot label followed by the teacher's soft target.
Y_train_new[0]

In [None]:
# Fresh stand-alone student with the same architecture as the student defined earlier.
# It is trained on the hard labels only and serves as the comparison
# point for the teacher-trained students.
student = Sequential([
    Flatten(input_shape=input_shape),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(nb_classes),
    Activation('softmax'),
])

student.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# TensorBoard logs and best-weights checkpoints for the baseline student run.
log_dir = 'logs/pure_student/'
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(
    log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    period=3,
)

# Train on the hard labels and validate on the test split.
student.fit(
    X_train, Y_train,
    validation_data=(X_test, Y_test),
    epochs=epochs,
    batch_size=256,
    callbacks=[logging, checkpoint],
    verbose=1,
)

# Student

In [None]:
# Student with the same architecture as the baseline. It is fitted directly on
# the teacher's soft targets using plain categorical cross-entropy.
studentX = Sequential([
    Flatten(input_shape=input_shape),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(nb_classes),
    Activation('softmax'),
])

studentX.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# TensorBoard logs and best-weights checkpoints for the soft-target-only run.
log_dir = 'logs/no_loss_function/'
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(
    log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    period=3,
)

# The training targets are the teacher's soft targets, while validation uses the
# hard one-hot labels, so the training and validation losses are not directly comparable.
studentX.fit(
    X_train, Y_train_soft,
    validation_data=(X_test, Y_test),
    epochs=epochs,
    batch_size=256,
    callbacks=[logging, checkpoint],
    verbose=1,
)

# StudentA

In [None]:
# Base network for student A, with the same architecture as the baseline student.
studentA = Sequential([
    Flatten(input_shape=input_shape),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(nb_classes),
    Activation('softmax'),
])

studentA.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

studentA.summary()

In [None]:
# Rewire student A so it emits two probability vectors side by side:
# the regular softmax output and a temperature-softened one.

# Tap the pre-softmax Dense layer (second to last layer) to get the logits.
logits = studentA.layers[-2].output # This is a symbolic tensor, so it is passed through new layers below
probs = Activation('softmax')(logits)

# Softened probabilities: divide the logits by the global `temp` before the softmax.
logits_T = Lambda(lambda x: x / temp)(logits)
probs_T = Activation('softmax')(logits_T)

# Columns [0, nb_classes) hold `probs`, the remaining columns hold `probs_T`.
output = concatenate([probs, probs_T])


# Rebind studentA to the two-headed model; it shares the input of the Sequential model above.
studentA = Model(studentA.input, output)

studentA.summary()

In [None]:
# Sanity check: run one training image through the rewired student A and
# inspect its concatenated (probs, probs_T) output.
studentA.predict( X_train[0].reshape(1,28,28,1) )

In [None]:
# This will be a teacher trained student model. 
# --> This uses a knowledge distillation loss function

# Declare knowledge distillation loss
def knowledge_distillation_loss(y_true, y_pred, alpha):
    """Weighted sum of the soft-target loss and the hard-label loss.

    Both arguments hold two blocks of `nb_classes` columns each.

    Args:
        y_true: first block is the one-hot label, second block is the
            teacher's soft target.
        y_pred: first block is the student's regular softmax output, second
            block is its temperature-softened softmax output.
        alpha: weight of the soft-target term; the hard-label term is
            weighted by (1 - alpha).

    Returns:
        The per-sample loss as returned by `logloss`.
    """
    hard_targets, soft_targets = y_true[:, :nb_classes], y_true[:, nb_classes:]
    hard_probs, soft_probs = y_pred[:, :nb_classes], y_pred[:, nb_classes:]

    # The teacher's soft targets are matched against the student's softened
    # output; the one-hot labels against the regular output.
    soft_term = logloss(soft_targets, soft_probs)
    hard_term = logloss(hard_targets, hard_probs)

    # NOTE(review): Hinton et al. scale the soft term by temp**2; the original
    # factor of `temp` is kept here — confirm the intended weighting.
    loss = (alpha * temp * soft_term) + ((1 - alpha) * hard_term)

    return loss

# For testing use regular output probabilities - without temperature
def acc(y_true, y_pred):
    """Categorical accuracy computed on the first `nb_classes` columns only.

    Those columns hold the one-hot label in `y_true` and the regular
    (non-softened) probabilities in `y_pred`.
    """
    hard_labels = y_true[:, :nb_classes]
    plain_probs = y_pred[:, :nb_classes]
    return categorical_accuracy(hard_labels, plain_probs)
  
# For testing use regular output probabilities - without temperature
def true_loss(y_true, y_pred):
    """Cross-entropy between the one-hot labels and the regular probabilities.

    Only the first `nb_classes` columns of both arguments are used.
    """
    return logloss(y_true[:, :nb_classes], y_pred[:, :nb_classes])

def logits_loss(y_true, y_pred):
    """Cross-entropy between the teacher's soft targets and the regular probabilities.

    The soft targets are the columns of `y_true` from `nb_classes` onward;
    the predictions are the first `nb_classes` columns of `y_pred`.
    """
    soft_targets = y_true[:, nb_classes:]
    plain_probs = y_pred[:, :nb_classes]
    return logloss(soft_targets, plain_probs)

# Compile the two-headed student A. The loss wraps knowledge_distillation_loss
# with alpha fixed at 0.5; `acc` reports accuracy on the hard-label columns.
studentA.compile(
    #optimizer=optimizers.SGD(lr=1e-1, momentum=0.9, nesterov=True),
    optimizer='adadelta',
    loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 0.5),
    #loss='categorical_crossentropy',
    metrics=[acc] )#,true_loss,logits_loss


In [None]:
# TensorBoard logs and best-weights checkpoints for the student A run.
log_dir = 'logs/loss_function_a/'
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(
    log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    period=3,
)

# Train and validate against the combined (hard label + soft target) arrays.
studentA.fit(
    X_train, Y_train_new,
    validation_data=(X_test, Y_test_new),
    epochs=epochs,
    batch_size=256,
    callbacks=[logging, checkpoint],
    verbose=1,
)

# StudentB

In [None]:
# Base network for student B, with the same architecture as the baseline student.
studentB = Sequential([
    Flatten(input_shape=input_shape),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(nb_classes),
    Activation('softmax'),
])

studentB.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
studentB.summary()

In [None]:
# Rewire student B so it emits two probability vectors side by side.

# Tap the pre-softmax Dense layer (second to last layer) to get the logits.
logits = studentB.layers[-2].output # This is a symbolic tensor, so it is passed through new layers below
probs = Activation('softmax')(logits)

# No temperature scaling is applied here (the Lambda division is disabled),
# so probs_T is a second softmax over the same logits as `probs`.
#logits_T = Lambda(lambda x: x / temp)(logits)
probs_T = Activation('softmax')(logits)#(logits_T)

# Columns [0, nb_classes) hold `probs`, the remaining columns hold `probs_T`.
output = concatenate([probs, probs_T])


# Rebind studentB to the two-headed model; it shares the input of the Sequential model above.
studentB = Model(studentB.input, output)

studentB.summary()

In [None]:
# Sanity check: run one training image through the rewired student B and
# inspect its concatenated (probs, probs_T) output.
studentB.predict( X_train[0].reshape(1,28,28,1) )

In [None]:
# This will be a teacher trained student model. 
# --> This uses a knowledge distillation loss function

# Declare knowledge distillation loss
def knowledge_distillation_loss(y_true, y_pred, alpha, beta, gamma):
    """Three-term distillation loss used for student B.

    Both arguments hold two blocks of `nb_classes` columns each; only the
    first block of `y_pred` is used.

    Args:
        y_true: one-hot label block followed by the teacher soft-target block.
        y_pred: the student's output; its first `nb_classes` columns are scored.
        alpha: weight of the teacher term (labels vs. teacher soft targets).
            This term does not involve `y_pred`.
        beta: weight of the student term (labels vs. student output).
        gamma: weight of the apprentice term (teacher soft targets vs. student output).

    Returns:
        The weighted sum of the three `logloss` terms.
    """
    hard_targets = y_true[:, :nb_classes]
    teacher_probs = y_true[:, nb_classes:]
    student_probs = y_pred[:, :nb_classes]

    teacher_term = logloss(hard_targets, teacher_probs)
    student_term = logloss(hard_targets, student_probs)
    apprentice_term = logloss(teacher_probs, student_probs)

    return (alpha * teacher_term) + (beta * student_term) + (gamma * apprentice_term)

# For testing use regular output probabilities - without temperature
def acc(y_true, y_pred):
    """Categorical accuracy on the first `nb_classes` columns of both arguments.

    Those columns hold the one-hot label in `y_true` and the regular
    probabilities in `y_pred`.
    """
    return categorical_accuracy(y_true[:, :nb_classes], y_pred[:, :nb_classes])
  
# For testing use regular output probabilities - without temperature
def teacher_loss(y_true, y_pred):
    """Cross-entropy between the one-hot labels and the teacher's soft targets.

    Both quantities live in `y_true`: the first `nb_classes` columns are the
    labels and the remaining columns are the teacher's soft targets.

    Args:
        y_true: label block followed by the teacher soft-target block.
        y_pred: unused; present only to satisfy the (y_true, y_pred) signature.

    Returns:
        `logloss(labels, teacher_soft_targets)`.
    """
    # Take both slices from the original tensor. Slicing the soft targets out
    # of the already-truncated labels would give an empty tensor.
    hard_targets = y_true[:, :nb_classes]
    teacher_probs = y_true[:, nb_classes:]
    return logloss(hard_targets, teacher_probs)
  
def student_loss(y_true, y_pred):
    """Cross-entropy between the one-hot labels and the student's regular output.

    Only the first `nb_classes` columns of both arguments are used.
    """
    hard_targets = y_true[:, :nb_classes]
    student_probs = y_pred[:, :nb_classes]
    return logloss(hard_targets, student_probs)

def apprentice_loss(y_true, y_pred):
    """Cross-entropy between the teacher's soft targets and the student's regular output.

    The soft targets are the columns of `y_true` from `nb_classes` onward;
    the predictions are the first `nb_classes` columns of `y_pred`.
    """
    return logloss(y_true[:, nb_classes:], y_pred[:, :nb_classes])

# Compile the two-headed student B. The loss wraps knowledge_distillation_loss
# with (alpha, beta, gamma) = (1, 0.5, 0.5); `acc` reports accuracy on the
# hard-label columns.
studentB.compile(
    #optimizer=optimizers.SGD(lr=1e-1, momentum=0.9, nesterov=True),
    optimizer='adadelta',
    loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 1,0.5,0.5),
    #loss='categorical_crossentropy',
    metrics=[acc] )


In [None]:
# TensorBoard logs and best-weights checkpoints for the student B run.
log_dir = 'logs/loss_function_b/'
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(
    log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    period=3,
)

# Train and validate against the combined (hard label + soft target) arrays.
studentB.fit(
    X_train, Y_train_new,
    validation_data=(X_test, Y_test_new),
    epochs=epochs,
    batch_size=256,
    callbacks=[logging, checkpoint],
    verbose=1,
)