In [1]:
%cd ../..

c:\Users\aldion\Desktop\Development\neural-net-quantization-and-pruning


In [2]:
import os
import random
from dataclasses import asdict

import numpy as np
import tensorflow as tf
import tensorflow_model_optimization as tfmot

from src.models.baseline_model import CompileParams, build_baseline_model
from src.models.qat_model import build_qat_model
from src.models.tflite_model import TFLiteModel, convert_and_save_model_as_tflite
from src.utils.load_data import load_data
from src.utils.metrics import (
    MetricsStore,
    compare_model_sizes,
    compute_zipped_file_sizes,
    
    get_model_metrics,
)
from src.utils.model_files import ModelFiles, ProjectDirs, calculate_file_path

In [3]:
seed_value=100
tf.random.set_seed(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

### Data Loading and Preprocessing

In [4]:
data = load_data()
data.X_train.max(), data.X_test.max() 

(255, 255)

In [5]:
data.preprocess()
data.X_train.max(), data.X_test.max() 

(1.0, 1.0)

### Training the baseline model

In [6]:
baseline_model = build_baseline_model()

In [7]:
baseline_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 12)        120       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 12)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2028)              0         
                                                                 
 dense (Dense)               (None, 10)                20290     
                                                                 
Total params: 20,410
Trainable params: 20,410
Non-trainable params: 0
____________________________________________________

In [8]:
COMPILE_PARAMS = CompileParams(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [9]:
baseline_model.compile(**asdict(COMPILE_PARAMS))

In [10]:
os.makedirs(ProjectDirs.MODELS_DIR, exist_ok=True)

In [11]:
BASELINE_MODEL_WEIGHTS_PATH = calculate_file_path(ModelFiles.BASELINE_MODEL_WEIGHTS)
baseline_model.save_weights(BASELINE_MODEL_WEIGHTS_PATH)

In [12]:
baseline_model.fit(data.X_train, data.y_train, epochs=1, shuffle=False)



<keras.callbacks.History at 0x26003afbf40>

In [13]:
_, baseline_model_accuracy = baseline_model.evaluate(data.X_test, data.y_test)



In [14]:
NON_QUANTIZED_H5_PATH = calculate_file_path(ModelFiles.NON_QUANTIZED_H5)
baseline_model.save(NON_QUANTIZED_H5_PATH, include_optimizer=False)

In [15]:
baseline_model_metrics = get_model_metrics(NON_QUANTIZED_H5_PATH, baseline_model_accuracy)
baseline_model_metrics

Metrics(model_file_name='non_quantized.h5', model_size=98968, model_accuracy=0.9509999752044678)

In [16]:
metrics_store = MetricsStore()

In [17]:
metrics_store.update(baseline_model_metrics)
metrics_store.display()

Unnamed: 0,model_file_name,model_size,model_accuracy
0,non_quantized.h5,98968,0.951


### Model Conversion to TFLite Format

In [18]:
NON_QUANTIZED_TFLITE_PATH = calculate_file_path(ModelFiles.NON_QUANTIZED_TFLITE)
convert_and_save_model_as_tflite(baseline_model, NON_QUANTIZED_TFLITE_PATH)



INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpepyof5r2\assets


INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpepyof5r2\assets


### Loading and Evaluation of the TFLite Model

In [19]:
tflite_no_quantized_model =TFLiteModel(NON_QUANTIZED_TFLITE_PATH)

In [20]:
tflite_no_quantized_accuracy = tflite_no_quantized_model.evaluate(data.X_test, data.y_test)
tflite_no_quantized_accuracy

0.951

In [21]:
tflite_no_quantized_model_metrics = get_model_metrics(NON_QUANTIZED_TFLITE_PATH, tflite_no_quantized_accuracy)
tflite_no_quantized_model_metrics

Metrics(model_file_name='non_quantized.tflite', model_size=85012, model_accuracy=0.951)

In [22]:
metrics_store.update(tflite_no_quantized_model_metrics)
metrics_store.display()

Unnamed: 0,model_file_name,model_size,model_accuracy
0,non_quantized.h5,98968,0.951
1,non_quantized.tflite,85012,0.951


- We see a slight reduction in model size when Keras model is converted to TFLite Model

### Post - Training Quantization

In [23]:
POST_TRAINING_QUANTIZED_PATH = calculate_file_path(ModelFiles.POST_TRAINING_QUANTIZED_TFLITE)
convert_and_save_model_as_tflite(baseline_model, POST_TRAINING_QUANTIZED_PATH, quantize=True)



INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpji2jd83h\assets


INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpji2jd83h\assets


In [24]:
tflite_post_training_quantized_model =TFLiteModel(POST_TRAINING_QUANTIZED_PATH)

In [25]:
tflite_post_training_quantized_accuracy = tflite_post_training_quantized_model.evaluate(data.X_test, data.y_test)
tflite_post_training_quantized_accuracy

0.9502

In [26]:
tflite_post_training_quantized_metrics = get_model_metrics(POST_TRAINING_QUANTIZED_PATH, tflite_post_training_quantized_accuracy)
tflite_post_training_quantized_metrics

Metrics(model_file_name='post_training_quantized.tflite', model_size=24256, model_accuracy=0.9502)

In [27]:
metrics_store.update(tflite_post_training_quantized_metrics)
metrics_store.display()

Unnamed: 0,model_file_name,model_size,model_accuracy
0,non_quantized.h5,98968,0.951
1,non_quantized.tflite,85012,0.951
2,post_training_quantized.tflite,24256,0.9502


- Quantization results in significant reduction on model size without any change in accuracy

### Quantization Aware Training

In [28]:
quantization_aware_model = build_qat_model(BASELINE_MODEL_WEIGHTS_PATH)
quantization_aware_model.compile(**asdict(COMPILE_PARAMS))

In [29]:
quantization_aware_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer (QuantizeLay  (None, 28, 28)           3         
 er)                                                             
                                                                 
 quant_reshape_1 (QuantizeWr  (None, 28, 28, 1)        1         
 apperV2)                                                        
                                                                 
 quant_conv2d_1 (QuantizeWra  (None, 26, 26, 12)       147       
 pperV2)                                                         
                                                                 
 quant_max_pooling2d_1 (Quan  (None, 13, 13, 12)       1         
 tizeWrapperV2)                                                  
                                                                 
 quant_flatten_1 (QuantizeWr  (None, 2028)            

In [30]:
quantization_aware_model.fit(data.X_train, data.y_train, epochs=1, shuffle=False)



<keras.callbacks.History at 0x26005e5fee0>

In [31]:
_, quantization_aware_model_accuracy = quantization_aware_model.evaluate(data.X_test, data.y_test)
quantization_aware_model_accuracy



0.9501000046730042

In [33]:
QUANTIZED_AWARE_TRAINED_TFLITE_PATH = calculate_file_path(ModelFiles.QUANTIZED_AWARE_TRAINED_TFLITE)
convert_and_save_model_as_tflite(baseline_model, QUANTIZED_AWARE_TRAINED_TFLITE_PATH, quantize=True)



INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpde1ekvht\assets


INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpde1ekvht\assets


In [34]:
quantized_aware_trained_model =TFLiteModel(QUANTIZED_AWARE_TRAINED_TFLITE_PATH)
quantized_aware_trained_model_accuracy = quantized_aware_trained_model.evaluate(data.X_test, data.y_test)
quantized_aware_trained_model_accuracy

0.9502

In [35]:
quantization_aware_trained_model_metrics = get_model_metrics(QUANTIZED_AWARE_TRAINED_TFLITE_PATH, quantized_aware_trained_model_accuracy)
quantization_aware_trained_model_metrics

Metrics(model_file_name='quantization_aware_trained.tflite', model_size=24256, model_accuracy=0.9502)

In [36]:
metrics_store.update(quantization_aware_trained_model_metrics)
metrics_store.display()

Unnamed: 0,model_file_name,model_size,model_accuracy
0,non_quantized.h5,98968,0.951
1,non_quantized.tflite,85012,0.951
2,post_training_quantized.tflite,24256,0.9502
3,quantization_aware_trained.tflite,24256,0.9502


### Pruning

In [37]:
batch_size = 128
epochs = 2
validation_split = 0.1 

num_images = int(data.X_train.shape[0] * (1 - validation_split))
end_step = np.floor(num_images / batch_size).astype(np.int32) * epochs

In [38]:
pruning_schedule = tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.50, final_sparsity=0.80, begin_step=0, end_step=end_step)

In [39]:
pruned_model = tfmot.sparsity.keras.prune_low_magnitude(baseline_model, pruning_schedule=pruning_schedule)
pruned_model.compile(**asdict(COMPILE_PARAMS))

In [40]:
pruned_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 prune_low_magnitude_reshape  (None, 28, 28, 1)        1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_conv2d   (None, 26, 26, 12)       230       
 (PruneLowMagnitude)                                             
                                                                 
 prune_low_magnitude_max_poo  (None, 13, 13, 12)       1         
 ling2d (PruneLowMagnitude)                                      
                                                                 
 prune_low_magnitude_flatten  (None, 2028)             1         
  (PruneLowMagnitude)                                            
                                                                 
 prune_low_magnitude_dense (  (None, 10)               4

In [41]:
callbacks = [
  tfmot.sparsity.keras.UpdatePruningStep(),
]

pruned_model.fit(data.X_train, data.y_train,
                  epochs=epochs, validation_split=validation_split,
                  callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x260224d8250>

In [42]:
exported_model = tfmot.sparsity.keras.strip_pruning(pruned_model)
exported_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         


                                                                 
 conv2d (Conv2D)             (None, 26, 26, 12)        120       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 12)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2028)              0         
                                                                 
 dense (Dense)               (None, 10)                20290     
                                                                 
Total params: 20,410
Trainable params: 20,410
Non-trainable params: 0
_________________________________________________________________


In [43]:
exported_model.weights[0]

<tf.Variable 'conv2d/kernel:0' shape=(3, 3, 1, 12) dtype=float32, numpy=
array([[[[ 0.        ,  0.        , -0.        , -0.7950687 ,
          -0.        , -0.        , -0.        , -0.        ,
           0.        , -0.        ,  0.        , -0.        ]],

        [[-0.        ,  0.        , -0.        , -1.053268  ,
           0.        , -0.        , -0.        , -0.        ,
           0.        , -0.        , -0.        , -0.        ]],

        [[-0.        ,  0.        , -0.        , -1.1284305 ,
           0.        , -0.        , -0.        , -0.        ,
           0.        , -0.        , -0.        , -0.        ]]],


       [[[ 0.        ,  0.        , -0.        ,  0.72856486,
          -0.        ,  0.815222  , -1.2257017 , -0.        ,
           0.        ,  0.86109215,  0.        , -0.        ]],

        [[-0.        ,  0.        ,  0.92507166,  0.        ,
          -0.        ,  0.        , -0.        , -1.6080204 ,
           0.        , -0.        , -0.      

- The pruned model has approximately 80% of it's weights reduced to 0

In [44]:
PRUNED_MODEL_H5_PATH = calculate_file_path(ModelFiles.PRUNED_MODEL_H5)
exported_model.save(PRUNED_MODEL_H5_PATH, include_optimizer=False)





In [45]:
compare_model_sizes(PRUNED_MODEL_H5_PATH, NON_QUANTIZED_H5_PATH)

Unnamed: 0,model,size
0,pruned_model.h5,98968
1,non_quantized.h5,98968


In [46]:
compute_zipped_file_sizes(PRUNED_MODEL_H5_PATH, NON_QUANTIZED_H5_PATH)

Unnamed: 0,model,size
0,non_quantized_h5.zip,78072
1,pruned_model_h5.zip,25819


- Pruning results in models which are much easier to compress

In [47]:
PRUNED_QUANTIZED_TFLITE_PATH = calculate_file_path(ModelFiles.PRUNED_QUANTIZED_TFLITE)
convert_and_save_model_as_tflite(exported_model, PRUNED_QUANTIZED_TFLITE_PATH, quantize=True)



INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpaaeoe3is\assets


INFO:tensorflow:Assets written to: C:\Users\aldion\AppData\Local\Temp\tmpaaeoe3is\assets


In [48]:
pruned_quantized_model = TFLiteModel(PRUNED_QUANTIZED_TFLITE_PATH)

In [49]:
pruned_quantized_model_accuracy = pruned_quantized_model.evaluate(data.X_test, data.y_test)

In [50]:
pruned_quantized_model_metrics = get_model_metrics(PRUNED_QUANTIZED_TFLITE_PATH, pruned_quantized_model_accuracy)
metrics_store.update(pruned_quantized_model_metrics)
metrics_store.display()

Unnamed: 0,model_file_name,model_size,model_accuracy
0,non_quantized.h5,98968,0.951
1,non_quantized.tflite,85012,0.951
2,post_training_quantized.tflite,24256,0.9502
3,quantization_aware_trained.tflite,24256,0.9502
4,pruned_quantized.tflite,24256,0.9666


### Conclusion

The results of this project demonstrate the effects of pruning and quantization in reducing model size. We can clearly see that quantization can result in a 3x - 4x reduction in model size, and is hence an excellent tool for resource constrained environment. We also see that pruning significantly increases the compressibility of a model and can even have a regularizing effect causing the model to generalize better to uneen data.