# Pose classification model optimization

In [1]:
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt
from tensorflow import keras
from keras.utils.data_utils import get_file
from tensorflow.python.saved_model import tag_constants

import json
import numpy as np
import time
from tqdm import tqdm

from pose_classification_kit.datasets import bodyDataset, BODY18, dataAugmentation

1 Physical GPUs, 1 Logical GPUs


## Dataset & Keras model import

In [2]:
# Expose every dataset entry as a notebook-level variable named after its key.
# Assigning through globals() has the same effect as the former exec() call
# without running a string assembled from the dataset keys as code.
for key, val in bodyDataset(testSplit = .3, bodyModel = BODY18).items():
    val = np.array(val)
    globals()[key] = val
    print(key+':', val.shape)

# Indices (within the BODY18 keypoint mapping) of the hip, knee and ankle
# keypoints handed to the augmentation as `remove_specific_keypoints`.
lower_body_keypoints = np.where(np.isin(BODY18.mapping, [
    "left_knee", "right_knee", "left_ankle", "right_ankle", "left_hip", "right_hip"
]))[0]

# Augmented version of the test split, reported later as the
# "partial input" testing dataset.
x_test_aug, y_test_aug = dataAugmentation(
    x_test, y_test_onehot,
    augmentation_ratio = 1.,
    remove_specific_keypoints = lower_body_keypoints,
)

x_train: (7040, 18, 2)
y_train: (7040,)
y_train_onehot: (7040, 20)
x_test: (3000, 18, 2)
y_test: (3000,)
y_test_onehot: (3000, 20)
labels: (20,)


In [3]:
# Restore the trained Keras classifier from its HDF5 file.
model_keras = keras.models.load_model('./Robust_BODY18.h5')

# The companion JSON file carries the label list associated with the model.
with open('Robust_BODY18_Info.json') as info_file:
    model_info = json.load(info_file)
model_labels = np.array(model_info['labels'])

# Stop here if the model's labels do not match the dataset's labels.
assert np.array_equiv(model_labels, labels)

In [4]:
def benchmark_keras(keras_model, ds_x, ds_y):
    """Measure top-1 accuracy and mean per-sample latency of a Keras model.

    Samples are sent to the model one at a time (batch of size 1) and only
    the ``predict`` call itself is timed.

    Args:
        keras_model: Object exposing ``predict(batch)``; row 0 of the result
            is compared, through ``argmax``, with the expected target.
        ds_x: Array of input samples; ``ds_x.shape[0]`` is the sample count.
        ds_y: Per-sample targets, compared through ``argmax`` (e.g. one-hot
            vectors). Must hold exactly one target per sample.

    Returns:
        Tuple ``(accuracy, mean_duration)`` where ``accuracy`` is the
        fraction of correctly classified samples and ``mean_duration`` is the
        average time spent in ``predict`` per sample, in seconds.

    Raises:
        ValueError: If the dataset is empty or if ``ds_x`` and ``ds_y`` do
            not contain the same number of entries.
    """
    nb_samples = ds_x.shape[0]
    if nb_samples == 0:
        raise ValueError('Cannot benchmark on an empty dataset.')
    # zip() would silently truncate to the shorter sequence while the results
    # are still divided by nb_samples, which would skew both metrics.
    if len(ds_y) != nb_samples:
        raise ValueError(
            'ds_x and ds_y must have the same length, got {} and {}.'.format(
                nb_samples, len(ds_y)))

    nb_correct = 0
    dur_tot = 0.
    for x, y in tqdm(zip(ds_x, ds_y), total=nb_samples):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        output = keras_model.predict(np.array([x]))
        dur_tot += time.perf_counter() - start_time
        if np.argmax(output[0]) == np.argmax(y):
            nb_correct += 1
    return nb_correct / nb_samples, dur_tot / nb_samples

# Evaluate the Keras model on the raw test split and on its augmented version.
accuracy, duration = benchmark_keras(model_keras, x_test, y_test_onehot)
accuracy_aug, duration_aug = benchmark_keras(model_keras, x_test_aug, y_test_aug)

# One row per dataset: (section title, accuracy, latency template, mean latency in seconds).
report_rows = (
    ('--- Raw testing dataset ---', accuracy, 'Inference time: {}ms', duration),
    ('--- Partial input testing dataset ---', accuracy_aug, 'Inference time: {}ms \n', duration_aug),
)
for section_title, section_accuracy, latency_template, latency_seconds in report_rows:
    print(section_title)
    print('Accuracy: {:.2%}'.format(section_accuracy))
    # Latency is reported in whole milliseconds (truncated).
    print(latency_template.format(int(latency_seconds*1000)))

 10%|█         | 315/3000 [01:15<10:41,  4.18it/s] 


KeyboardInterrupt: 

Save the Keras model in the standard TensorFlow SavedModel format so that it can be used as input for the TensorRT optimization.

In [4]:
# Export the Keras model to the './Robust_BODY18_In' directory, which the
# TF-TRT converter cells below read through `input_saved_model_dir`.
model_keras.save('./Robust_BODY18_In')

INFO:tensorflow:Assets written to: ./Robust_BODY18_In/assets


## TF-TRT Comparisons

In [5]:
def benchmark_trt(inference_function, ds_x, ds_y, output_key='dense_20'):
    """Measure top-1 accuracy and mean per-sample latency of a TF-TRT model.

    Each sample is expanded to a batch of size 1, converted to a float32
    tensor and passed to ``inference_function``; only that call is timed.

    Args:
        inference_function: Callable taking one input tensor and returning a
            mapping from output names to batched outputs (e.g. a SavedModel
            serving signature).
        ds_x: Array of input samples; ``ds_x.shape[0]`` is the sample count.
        ds_y: Per-sample targets, compared through ``argmax`` (e.g. one-hot
            vectors). Must hold exactly one target per sample.
        output_key: Name of the entry to read in the returned mapping.
            Defaults to ``'dense_20'``, the key previously hard-coded here.

    Returns:
        Tuple ``(accuracy, mean_duration)`` where ``accuracy`` is the
        fraction of correctly classified samples and ``mean_duration`` is the
        average time spent in ``inference_function`` per sample, in seconds.

    Raises:
        ValueError: If the dataset is empty or if ``ds_x`` and ``ds_y`` do
            not contain the same number of entries.
    """
    nb_samples = ds_x.shape[0]
    if nb_samples == 0:
        raise ValueError('Cannot benchmark on an empty dataset.')
    # zip() would silently truncate to the shorter sequence while the results
    # are still divided by nb_samples, which would skew both metrics.
    if len(ds_y) != nb_samples:
        raise ValueError(
            'ds_x and ds_y must have the same length, got {} and {}.'.format(
                nb_samples, len(ds_y)))

    nb_correct = 0
    dur_tot = 0.
    for x, y in tqdm(zip(ds_x, ds_y), total=nb_samples):
        # The tensor conversion is deliberately kept outside the timed span.
        x = tf.constant(np.expand_dims(x, axis=0), dtype=tf.float32)
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        output = inference_function(x)
        dur_tot += time.perf_counter() - start_time
        # NOTE(review): if inference_function returns before the device has
        # finished computing, the span above may under-report latency; confirm.
        if np.argmax(output[output_key][0]) == np.argmax(y):
            nb_correct += 1
    return nb_correct / nb_samples, dur_tot / nb_samples

### FP32

In [6]:
# Generator handed to `converter.build`: yields a single random float32 input
# of shape (1, 18, 2) so the TensorRT engines can be built ahead of time.
def input_fn():
    random_pose = np.random.normal(size=(1, 18, 2))
    yield [random_pose.astype(np.float32)]

# FP32 conversion settings: 32 MiB workspace (1 << 25 bytes), up to 64 cached engines.
params_32 = tf.experimental.tensorrt.ConversionParams(
    precision_mode='FP32',
    max_workspace_size_bytes=1 << 25,
    maximum_cached_engines=64,
)

converter = trt.TrtGraphConverterV2(
    conversion_params=params_32,
    input_saved_model_dir='./Robust_BODY18_In',
)

# Step 1: partition the graph and optimize the TensorRT-compatible segments.
converter.convert()

# Step 2 (optional): pre-build the engines to save time at runtime.
# The built engines are GPU specific.
converter.build(input_fn=input_fn)

# Step 3: write the converted model to disk.
converter.save('./Robust_BODY18_FP32')

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: ./Robust_BODY18_FP32/assets


In [7]:
# Reload the FP32 TF-TRT SavedModel with the SERVING tag and keep its
# 'serving_default' signature: this is the callable passed to benchmark_trt.
model_FP32 = tf.saved_model.load('./Robust_BODY18_FP32', tags=[tag_constants.SERVING])
infer_FP32 = model_FP32.signatures['serving_default']

In [8]:
# Evaluate the FP32 engine on the raw test split and on its augmented version.
accuracy, duration = benchmark_trt(infer_FP32, x_test, y_test_onehot)
accuracy_aug, duration_aug = benchmark_trt(infer_FP32, x_test_aug, y_test_aug)

# One row per dataset: (section title, accuracy, latency template, mean latency in seconds).
report_rows = (
    ('--- Raw testing dataset ---', accuracy, 'Inference time: {}ms', duration),
    ('--- Partial input testing dataset ---', accuracy_aug, 'Inference time: {}ms \n', duration_aug),
)
for section_title, section_accuracy, latency_template, latency_seconds in report_rows:
    print(section_title)
    print('Accuracy: {:.2%}'.format(section_accuracy))
    # Latency is reported in whole milliseconds (truncated).
    print(latency_template.format(int(latency_seconds*1000)))

100%|██████████| 3000/3000 [00:23<00:00, 126.73it/s]
100%|██████████| 3000/3000 [00:23<00:00, 127.66it/s]

--- Raw testing dataset ---
Accuracy: 98.37%
Inference time: 6ms
--- Partial input testing dataset ---
Accuracy: 95.37%
Inference time: 5ms 






### FP16

In [None]:
# Generator handed to `converter.build`: yields a single random float32 input
# of shape (1, 18, 2) so the TensorRT engines can be built ahead of time.
def input_fn():
    random_pose = np.random.normal(size=(1, 18, 2))
    yield [random_pose.astype(np.float32)]

# FP16 conversion settings: 32 MiB workspace (1 << 25 bytes), up to 64 cached engines.
params_16 = tf.experimental.tensorrt.ConversionParams(
    precision_mode='FP16',
    max_workspace_size_bytes=1 << 25,
    maximum_cached_engines=64,
)

converter = trt.TrtGraphConverterV2(
    conversion_params=params_16,
    input_saved_model_dir='./Robust_BODY18_In',
)

# Step 1: partition the graph and optimize the TensorRT-compatible segments.
converter.convert()

# Step 2 (optional): pre-build the engines to save time at runtime.
# The built engines are GPU specific.
converter.build(input_fn=input_fn)

# Step 3: write the converted model to disk.
converter.save('./Robust_BODY18_FP16')

In [None]:
# Reload the FP16 TF-TRT SavedModel with the SERVING tag and keep its
# 'serving_default' signature: this is the callable passed to benchmark_trt.
model_FP16 = tf.saved_model.load('./Robust_BODY18_FP16', tags=[tag_constants.SERVING])
infer_FP16 = model_FP16.signatures['serving_default']

In [None]:
# Evaluate the FP16 engine on the raw test split and on its augmented version.
accuracy, duration = benchmark_trt(infer_FP16, x_test, y_test_onehot)
accuracy_aug, duration_aug = benchmark_trt(infer_FP16, x_test_aug, y_test_aug)

# One row per dataset: (section title, accuracy, latency template, mean latency in seconds).
report_rows = (
    ('--- Raw testing dataset ---', accuracy, 'Inference time: {}ms', duration),
    ('--- Partial input testing dataset ---', accuracy_aug, 'Inference time: {}ms \n', duration_aug),
)
for section_title, section_accuracy, latency_template, latency_seconds in report_rows:
    print(section_title)
    print('Accuracy: {:.2%}'.format(section_accuracy))
    # Latency is reported in whole milliseconds (truncated).
    print(latency_template.format(int(latency_seconds*1000)))

### INT8

In [None]:
# INT8 conversion settings: 32 MiB workspace (1 << 25 bytes), up to 64 cached engines.
params_8 = tf.experimental.tensorrt.ConversionParams(
    precision_mode='INT8',
    max_workspace_size_bytes=(1<<25),
    maximum_cached_engines=64
)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='./Robust_BODY18_In',
    conversion_params=params_8)

# Upper bound on the number of real training samples fed to the INT8 calibrator.
NB_CALIBRATION_SAMPLES = 256

# INT8 calibration needs inputs that are representative of real data, so it is
# driven by actual training poses rather than a single draw of Gaussian noise.
# Indices are spread evenly over x_train so the subset does not depend on how
# the training set happens to be ordered.
calibration_indices = np.linspace(
    0, len(x_train) - 1,
    num=min(NB_CALIBRATION_SAMPLES, len(x_train)),
    dtype=int,
)

def calibration_input_fn():
    """Yield training poses one at a time as (1, 18, 2) float32 batches."""
    for sample in x_train[calibration_indices]:
        yield [np.expand_dims(sample, axis=0).astype(np.float32)]

# Optionally, build TensorRT engines before deployment to save time at runtime
# Note that this is GPU specific, and as a rule of thumb, we recommend building at runtime
def input_fn():
    yield [np.random.normal(size=(1, 18, 2)).astype(np.float32)]

# Converter method used to partition and optimize TensorRT compatible segments;
# in INT8 mode it also runs the calibration on the representative samples.
converter.convert(calibration_input_fn=calibration_input_fn)

converter.build(input_fn=input_fn)

# Save the model to the disk
converter.save('./Robust_BODY18_INT8')

In [None]:
# Reload the INT8 TF-TRT SavedModel with the SERVING tag and keep its
# 'serving_default' signature: this is the callable passed to benchmark_trt.
model_INT8 = tf.saved_model.load('./Robust_BODY18_INT8', tags=[tag_constants.SERVING])
infer_INT8 = model_INT8.signatures['serving_default']

In [None]:
# Evaluate the INT8 engine on the raw test split and on its augmented version.
accuracy, duration = benchmark_trt(infer_INT8, x_test, y_test_onehot)
accuracy_aug, duration_aug = benchmark_trt(infer_INT8, x_test_aug, y_test_aug)

# One row per dataset: (section title, accuracy, latency template, mean latency in seconds).
report_rows = (
    ('--- Raw testing dataset ---', accuracy, 'Inference time: {}ms', duration),
    ('--- Partial input testing dataset ---', accuracy_aug, 'Inference time: {}ms \n', duration_aug),
)
for section_title, section_accuracy, latency_template, latency_seconds in report_rows:
    print(section_title)
    print('Accuracy: {:.2%}'.format(section_accuracy))
    # Latency is reported in whole milliseconds (truncated).
    print(latency_template.format(int(latency_seconds*1000)))