In [1]:
# Define whether to use CPU or GPU for benchmarking
import os


CPU = True

if CPU:
    print('Using CPU for inference......')
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
else:
    print('Using GPU for inference......')


%reload_ext tensorboard

import datetime

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optimizers

import tensorflow_datasets as tfds

from tensorflow.data import AUTOTUNE

if not CPU:
    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(device_name))

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
        except RuntimeError as e:
            print(e)

Using CPU for inference......


C:\Users\andre\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\andre\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
# Necessity for model loading and compiling
import tensorflow.keras.layers as layers
import tensorflow.keras.losses as losses
import tensorflow.keras.metrics as metrics
import tensorflow.keras.models as models
import tensorflow.keras.optimizers as optimizers
import tensorflow_addons as tfa
import json
import numpy as np
import cv2


def class_map_road(seg):
    # map class 0=anything, 1=road
    return tf.where(seg == 7, [0, 1.0], [1.0, 0])


def cityscapes_prep(output_shape, input_shape=(512,256,3), class_map_func=None, float_range=True):
    def prep_map(sample):
        img = sample['image_left']
        seg = sample['segmentation_label']

        if float_range:
            img /= 255

        img = tf.image.resize(img, input_shape[0:2])
        seg = tf.image.resize(seg, output_shape[0:2])
        
        if callable(class_map_func):
            seg = class_map_func(seg)
        else:
            seg = tf.one_hot(tf.cast(seg, tf.int32), output_shape[-1], axis=2)
            seg = tf.cast(seg, tf.float32)
            seg = tf.squeeze(seg)
            #seg = tf.keras.utils.to_categorical(seg, num_classes=output_shape[-1])

        return img, seg

    return prep_map

def bisenetv2_output_shape(num_classes, scale, input_shape=(512,256,3)):
    return ((input_shape[0] // 8) * scale, 
            (input_shape[1] // 8) * scale, 
            num_classes)

class ArgmaxMeanIOU(metrics.MeanIoU):
    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(tf.argmax(y_true, axis=-1), tf.argmax(y_pred, axis=-1), sample_weight)


decay_steps=10e3
momentum=0.9
weight_decay=0.0005

schedule = optimizers.schedules.PolynomialDecay( initial_learning_rate=5e-2, decay_steps=decay_steps, power=0.9)

sgd = tfa.optimizers.SGDW(weight_decay=weight_decay, learning_rate=schedule, momentum=momentum)
cce = losses.CategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=5e-6)

In [16]:
# Predict segmentation and overlay output with an image

import cv2
import time
def img_pred(src,model,image_size):
    src = cv2.cvtColor(src, cv2.COLOR_BGR2RGB)
    #cv2_imshow(src)
    image = cv2.resize(src,image_size,interpolation=cv2.INTER_CUBIC)
    #cv2_imshow(src)
    image = image/255
    data = tf.convert_to_tensor(image, dtype=tf.float32)
    data = tf.expand_dims(data, axis=0)
    s = time.time()
    pred = model.predict(data)
    e = time.time()
    fps = 1/(e-s)
    seg = tf.argmax(pred[0], axis=-1)
    seg = seg[..., tf.newaxis]
    seg = tf.keras.preprocessing.image.array_to_img(seg)
    #plt.imshow(seg)
    seg = cv2.cvtColor(np.array(seg), cv2.COLOR_BGR2RGB)
    seg = cv2.resize(seg,(src.shape[1],src.shape[0]),interpolation=cv2.INTER_CUBIC)
    result = cv2.addWeighted(np.array(src), 0.6, seg, 0.5, 0, dtype = cv2.CV_8U)
    return src, seg, result, fps

In [12]:
# Produce Concatenate video of six models
import statistics as stat
from tqdm import notebook

def gen_seg_vid(cap,out,model,subtitle):
    length = 1000
    progress = notebook.tqdm(total = length)
    FPS = [0,0,0]
  #while(cap.isOpened()):
    for _ in range(length):
        ret, frame = cap.read()
        if ret == True:
            img_out = []
            for i in range(len(model)):
                total_fps = []
                m = model[i]
                shape = (256,128)
                result = img_pred(frame,m,shape)
                img_seg = result[2]
                fps = result[3]
                total_fps.append(fps)
                avg_fps = stat.mean(total_fps)
                FPS[i] = avg_fps
                x = 'AVG_FPS:'
                y = 'FPS: '
                img_seg = cv2.resize(img_seg,(512,256),interpolation=cv2.INTER_CUBIC)
                text = "{}{:.3f}".format(x,avg_fps)
                text1 = "{}{:.3f}".format(y,fps)
                font = cv2.FONT_HERSHEY_DUPLEX
                img_out.append(cv2.cvtColor(img_seg, cv2.COLOR_BGR2RGB))
                cv2.putText(img_out[i], text , (350,230), font, 0.5, (0, 255, 255), 1, cv2.LINE_AA)
                cv2.putText(img_out[i], text1 , (350,210), font, 0.5, (0, 255, 255), 1, cv2.LINE_AA)
                cv2.putText(img_out[i], subtitle[i] , (140,20), font, 0.7, (0, 255, 0), 2, cv2.LINE_AA)
            #Verti_512 = np.concatenate((img_out[0], img_out[1]), axis=0)
            #Verti_128 = np.concatenate((img_out[2], img_out[3]), axis=0)
            #Verti_128_tr = np.concatenate((img_out[4], img_out[5]), axis=0)
            img_out[0] = cv2.resize(img_out[0],(1024,512),interpolation=cv2.INTER_CUBIC)
            out_img = np.concatenate((img_out[1], img_out[2]), axis=1)
            out_img = np.concatenate((out_img, img_out[0]), axis=0)
            out_img = cv2.resize(out_img,(1800,1300),interpolation=cv2.INTER_CUBIC)
            
            out.write(out_img)
            progress.update(1)
            if cv2.waitKey(100) & 0xFF == ord('q'):
                break
        else:
            break
    cap.release()
    out.release()
    cv2.destroyAllWindows()
    return FPS

In [5]:
# Helper function to run inference on a TFLite model with dataset
import statistics as stat
from tqdm import notebook

import cv2
import time
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

def run_tflite_model_valid_MIoU(tflite_file, test, size, ALL):
    cnt = 0
    progress = notebook.tqdm(total = len(test))
    for sample in test:
        miou = []
        total_fps = []
        cnt += 1
        gt = sample[1]
        test_image = sample[0]
        
        gt = tf.image.resize(gt, size)
        gt = tf.argmax(gt, axis=-1)
        gt = gt[..., tf.newaxis]
  
  # Initialize the interpreter
        interpreter = tf.lite.Interpreter(model_path=str(tflite_file),num_threads=4)
        interpreter.allocate_tensors()

        input_details = interpreter.get_input_details()[0]
        output_details = interpreter.get_output_details()[0]

        test_image = cv2.cvtColor(np.array(test_image), cv2.COLOR_BGR2RGB)
        test_image = cv2.resize(test_image,size,interpolation=cv2.INTER_CUBIC)
        image = test_image/255
        data = tf.convert_to_tensor(image, dtype=tf.float32)
        data = tf.expand_dims(data, axis=0)
        interpreter.set_tensor(input_details["index"], data)
  
        s = time.time()
        interpreter.invoke()
        e = time.time()
        fps = 1/(e-s)
        
        output = interpreter.get_tensor(output_details["index"])[0]

        seg = tf.argmax(output, axis=-1)
        seg = seg[..., tf.newaxis]
        m = tf.keras.metrics.MeanIoU(num_classes=2)
        m.update_state(seg, gt)
        miou.append(m.result().numpy())
        total_fps.append(fps)
        
        if not ALL:
            x = 'fps: '
            text = "{}{:.3f}".format(x,fps)
            y = "MIoU: "
            text2 = "{}{:.3f}".format(y,m.result().numpy())
            z = "#"
            text3 = "{}{}".format(z,cnt)
        
            seg = tf.keras.preprocessing.image.array_to_img(seg)

            seg = cv2.cvtColor(np.array(seg), cv2.COLOR_BGR2RGB)
            result = cv2.addWeighted(test_image, 0.6, seg, 0.5, 0, dtype = cv2.CV_8U)
            if size == (256,128):
                result = cv2.resize(result,(512,256),interpolation=cv2.INTER_CUBIC)
            font = cv2.FONT_HERSHEY_SIMPLEX
            cv2.putText(result, text , (100,250), font, 0.7, (0, 255, 255), 2, cv2.LINE_AA)
            cv2.putText(result, text2 , (250,250), font, 0.7, (100, 100, 255), 2, cv2.LINE_AA)
            cv2.putText(result, text3 , (10,50), font, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
            cv2.imshow('result',result)
        
        progress.update(1)
        a = cv2.waitKey(300) if not ALL else cv2.waitKey(0)
        if a & 0xFF == ord('q') or cnt == 100 and ALL == False:
            break
    mean_MIoU = np.round(stat.mean(miou)*100,3)
    mean_FPS = np.round(stat.mean(total_fps),3)
    cv2.destroyAllWindows()
    return mean_MIoU, mean_FPS


In [4]:
INPUT_SHAPE = [(256,512,3),(128, 256, 3)]
NUM_CLASSES = 2
SCALE = 2
OUTPUT_SHAPE = [(256,512,2),(128,256,2)]

In [7]:
cityscapes = tfds.load('cityscapes/semantic_segmentation',data_dir="E:\EE\project\FPGA\cityscapes",download=False)

valid_ds_1 = cityscapes['validation'].map(cityscapes_prep(OUTPUT_SHAPE[0], INPUT_SHAPE[0], class_map_road))
valid_ds_2 = cityscapes['validation'].map(cityscapes_prep(OUTPUT_SHAPE[1], INPUT_SHAPE[1], class_map_road))


BATCH_SIZE = 16

valid_ds_1 = valid_ds_1.batch(BATCH_SIZE).prefetch(AUTOTUNE)
valid_ds_2 = valid_ds_2.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [5]:
# load in all tensorflow models for evaluating

m1 = tf.keras.models.load_model('E:/EE/project/FPGA/benchmark_model/stripped_pruned_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m1.compile(sgd, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )
m1_pc = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_2_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m1_pc.compile(opt, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )
m2 = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/original_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m2.compile(sgd, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )
m2_pc = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_3_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m2_pc.compile(opt, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )
m3 = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/bisenet_small.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m3.compile(sgd, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )
m3_pc = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m3_pc.compile(opt, loss=cce, metrics=['accuracy', ArgmaxMeanIOU(NUM_CLASSES)] )

model = [m1, m1_pc, m2, m2_pc, m3, m3_pc]

if model is not None:
    print('Loading is Successful : )')
else:
    print('Loading has Failed : (')

Loading is Successful : )


In [None]:
# Test Image
src = cv2.imread('E:/EE/project/FPGA/test_01.png')
start = time.time()
result = img_pred(src,model[1],(256,128))[2]
end = time.time()
result = cv2.resize(result,(1024,512))
fps = 1/(end-start)
print('fps =',fps)
cv2.imshow("result",result)
cv2.waitKey (0)

fps = 11.520566043436583


In [16]:
print("Evaluate bise small 2 on valid data")
results1 = model[0].evaluate(valid_ds_2)
print("Evaluate P small 2 20% on valid data")
results2 = model[1].evaluate(valid_ds_2)
print("Evaluate model(256x128) on valid data")
results1 = model[2].evaluate(valid_ds_2)
print("Evaluate pruned bise small 70% on valid data")
results2 = model[3].evaluate(valid_ds_2)
print("Evaluate bise small on valid data")
results1 = model[4].evaluate(valid_ds_2)
print("Evaluate P bise sbmall on valid data")
results2 = model[5].evaluate(valid_ds_2)

Evaluate bise small 2 on valid data
Evaluate P small 2 20% on valid data
Evaluate model(256x128) on valid data
Evaluate pruned bise small 70% on valid data
Evaluate bise small on valid data
Evaluate P bise sbmall on valid data


In [17]:
# load in all tensorflow models for evaluating

m1 = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_2_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m2 = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_3_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)
m3 = tf.keras.models.load_model('E:/EE/project/FPGA/128x256/stripped_pruned_small_model.tf',custom_objects={'ArgmaxMeanIOU': ArgmaxMeanIOU}, compile=False)

model = [m1, m2, m3]

if model is not None:
    print('Loading is Successful : )')
else:
    print('Loading has Failed : (')

Loading is Successful : )


In [None]:
cap = cv2.VideoCapture('E:\EE\project\FPGA\data_rural.mp4')
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
out = cv2.VideoWriter('E:/EE/project/FPGA/Original_rural_CPU.mp4',fourcc,10,(1800,1300))

subtitle = ['PC Model(512x256)', 'PC Model(128x256)', 'PC Transfer(128x256)']

FPS = gen_seg_vid(cap,out,model,subtitle)

print('FPS of Pruned clustered model(512x256): ', np.round(FPS[0],3))
print('FPS of Pruned clustered model(256x128): ', np.round(FPS[1],3))
print('FPS of Pruned clustered transfer model(256x128): ', np.round(FPS[2],3))

In [18]:
cap = cv2.VideoCapture('E:\EE\project\FPGA\Driving_data.mp4')
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
out = cv2.VideoWriter('E:/EE/project/FPGA/Original_CPU.mp4',fourcc,10,(1800,1300))

subtitle = ['BM 2 20% Model(128x256)', 'BM2 70% (128x256)', 'BM 20% model(128x256)']

FPS = gen_seg_vid(cap,out,model,subtitle)

print('FPS of BM 2 20% Model: ', np.round(FPS[0],3))
print('FPS of BM2 70% model: ', np.round(FPS[1],3))
print('FPS of BM 20% model): ', np.round(FPS[2],3))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

FPS of BM 2 20% Model:  8.874
FPS of BM2 70% model:  8.835
FPS of BM 20% model):  9.035


In [None]:
cap = cv2.VideoCapture('E:\EE\project\FPGA\Drive_Taiwan.mp4')
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
out = cv2.VideoWriter('E:/EE/project/FPGA/Original_Taiwan_CPU.mp4',fourcc,10,(1800,1300))

subtitle = ['PC Model(512x256)', 'PC Model(128x256)', 'PC Transfer(128x256)']

FPS = gen_seg_vid(cap,out,model,subtitle)

print('FPS of Pruned clustered model(512x256): ', np.round(FPS[0],3))
print('FPS of Pruned clustered model(256x128): ', np.round(FPS[1],3))
print('FPS of Pruned clustered transfer model(256x128): ', np.round(FPS[2],3))

In [None]:
import os 

def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

print('Size of 512x256 model: ' , np.round(get_size('E:/EE/project/FPGA/512x256/model8.tf')/1e6,3), 'MB')
print('Size of 512x256 PC model: ' , np.round(get_size('E:/EE/project/FPGA/512x256/stripped_sparsity_clustered_model.tf')/1e6,3), 'MB')
print('Size of 256x128 model: ' , np.round(get_size('E:/EE/project/FPGA/128x256/original_model.tf')/1e6,3), 'MB')
print('Size of 256x128 PC model: ' , np.round(get_size('E:/EE/project/FPGA/128x256/stripped_sparsity_clustered_model.tf')/1e6,3), 'MB')
print('Size of 256x128 transfer model: ' , np.round(get_size('E:/EE/project/FPGA/128x256/transfer_model.tf')/1e6,3), 'MB')
print('Size of 256x128 transfer PC model: ' , np.round(get_size('E:/EE/project/FPGA/128x256/stripped_sparsity_clustered_transfer_model.tf')/1e6,3), 'MB')

print('\nUnquant model has size ', np.round(os.stat('E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_notquant.tflite').st_size/1e6,3), ' MB')
print('Quant model with sparsity INT8/float32 has size ', np.round(os.stat('E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_EXPiof32.tflite').st_size/1e6,3), ' MB')
print('Quant model with full INT8 has size ', np.round(os.stat('E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_fullyINT8.tflite').st_size/1e6,3), ' MB')


In [24]:
cityscapes = tfds.load('cityscapes/semantic_segmentation',data_dir="E:\EE\project\FPGA\cityscapes",download=False)
test_ds = cityscapes['validation'].map(cityscapes_prep(OUTPUT_SHAPE[1], INPUT_SHAPE[1], class_map_road, float_range=False))
test = test_ds.shuffle(len(test_ds)).take(100)

path1 = "E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_f32_small_2_pc.tflite"
path2 = "E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_f32_small_2.tflite"
path3 = "E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_f32_small_3.tflite"
#path3 = "E:/EE/project/FPGA/128x256/cityscapes_tflite_models/cityscapes_notquant.tflite"

Test_on_all = True


R = run_tflite_model_valid_MIoU(path1,test_ds,(256,128),Test_on_all)
print('MIoU of bsmall 70% PC model: ', R[0], '%')
print('Mean FPS of bisenet 80% model: ', R[1])

R = run_tflite_model_valid_MIoU(path2,test_ds,(256,128),Test_on_all)
print('MIoU of bsmall 20% model: ', R[0], '%')
print('Mean FPS of bsmall 20% model: ', R[1])

R = run_tflite_model_valid_MIoU(path3,test_ds,(256,128),Test_on_all)
print('MIoU of bsmall 70%  model: ', R[0], '%')
print('Mean FPS of bsmall 70%  model: ', R[1])


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))

MIoU of bsmall 70% PC model:  72.239 %
Mean FPS of bisenet 80% model:  0.417


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))

MIoU of bsmall 20% model:  72.82 %
Mean FPS of bsmall 20% model:  0.421


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))

MIoU of bsmall 70%  model:  72.698 %
Mean FPS of bsmall 70%  model:  0.406
