# Part 1. Gesture Recognition with MobileNet(v1)
In this tutorial, we are going to train a classifier that recognizes gestures. 

## Import packages
* First of all, import all packages used in this tutorial.

In [None]:
import os
import cv2
import keras
import random
import itertools
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from keras import layers
from google.colab.output import eval_js
from base64 import b64decode, b64encode
from sklearn.metrics import confusion_matrix
from IPython.display import display, Javascript, HTML, Image

## Function declaration
* Next, define all the functions used later. 
* Just directly run the cell 

In [None]:
def show_inference_info(iterator, confidences, predictions):
  """Plot nine randomly chosen samples with predicted and true labels.

  Args:
    iterator: Keras image iterator over the evaluated samples. One batch
      holding all ``iterator.n`` samples is requested from it.
    confidences: per-sample score vectors; the maximum of each vector is
      shown as the confidence percentage.
    predictions: per-sample predicted class indices.

  Titles are drawn in green for a correct prediction and red otherwise.
  Class names come from the notebook-level ``class_names`` list.

  NOTE(review): pairing ``confidences[idx]`` with the idx-th image assumes
  the iterator yields samples in the same order used for prediction (i.e. it
  was created with ``shuffle=False``) -- confirm at the call site.
  """
  # Temporarily widen the batch so a single call returns every sample, then
  # put the caller's batch size back so the iterator stays usable afterwards.
  original_batch_size = iterator.batch_size
  iterator.batch_size = iterator.n
  try:
    # Built-in next() goes through the iterator protocol instead of relying
    # on a ``.next()`` method being present.
    batchX, batchy = next(iterator)
  finally:
    iterator.batch_size = original_batch_size

  # randomly pick 9 results to show
  fig = plt.figure(figsize=(10, 10))
  fig.suptitle('Predictions with confidence:', fontsize=16)
  for i in range(9):
    idx = random.randrange(iterator.n)
    confidence = confidences[idx]
    y_gt = batchy[idx].astype('int')
    y_pred = predictions[idx]

    plt.subplot(3, 3, i + 1)
    # Pixel values are scaled back by 255 before display.
    plt.imshow((batchX[idx] * 255).astype("uint8"))
    title = '{} ({:.2f} %) Label: {}.'.format(class_names[y_pred],
                                              100 * np.max(confidence),
                                              class_names[y_gt])
    plt.title(title, color='red' if y_pred != y_gt else 'green')
    plt.axis("off")

def show_train_info(history, epochs):
  """Draw accuracy and loss curves for training and validation side by side.

  Args:
    history: object whose ``history`` dict holds the 'accuracy',
      'val_accuracy', 'loss' and 'val_loss' series.
    epochs: number of epochs; ``range(epochs)`` is used as the x axis.
  """
  epoch_axis = range(epochs)
  figure = plt.figure(figsize=(8, 8))
  figure.suptitle('KPI & Objective function', fontsize = 16)

  # One entry per panel: series keys with their labels, legend corner, title.
  panels = (
      ('accuracy', 'Training Accuracy',
       'val_accuracy', 'Validation Accuracy',
       'lower right', 'Training and Validation Accuracy'),
      ('loss', 'Training Loss',
       'val_loss', 'Validation Loss',
       'upper right', 'Training and Validation Loss'),
  )
  for column, panel in enumerate(panels, start=1):
    train_key, train_label, val_key, val_label, legend_corner, heading = panel
    plt.subplot(1, 2, column)
    plt.plot(epoch_axis, history.history[train_key], label=train_label)
    plt.plot(epoch_axis, history.history[val_key], label=val_label)
    plt.legend(loc=legend_corner)
    plt.title(heading)
  plt.show()
  
def show_data_info(iterator):
  """Print statistics of one batch from ``iterator`` and plot its images.

  Args:
    iterator: Keras directory iterator yielding ``(images, labels)`` batches
      and exposing ``class_indices`` (class name -> label index).

  One batch is drawn from the iterator. Its shape and value range, the class
  mapping, the raw labels and their one-hot encoding are printed, and up to
  nine images of the batch are shown titled with their class name.
  """
  batchX, batchy = next(iterator)

  # Resolve names through the iterator that was actually passed in, so the
  # output is right for any dataset and not only for the first one loaded.
  index_to_name = {index: name for name, index in iterator.class_indices.items()}

  print('Input batch shape=%s, min=%.3f, max=%.3f' % (batchX.shape, batchX.min(), batchX.max()))
  print('Output batch shape=%s, min=%d, max=%d' % (batchy.shape, batchy.min(), batchy.max()))
  print('Class_name: {}' .format(iterator.class_indices))
  print('Lable sample: {}' .format(batchy))
  print('One-Hot sample:')
  print(keras.utils.to_categorical(batchy))

  # Show augmented image samples
  fig = plt.figure(figsize=(10, 10))
  fig.suptitle('Augmented image samples', fontsize=16)
  # A batch may hold fewer than nine images (small batch size or last batch).
  for i in range(min(9, len(batchX))):
    plt.subplot(3, 3, i + 1)
    plt.imshow((batchX[i] * 255).astype("uint8"))
    plt.title(index_to_name[int(batchy[i])])
    plt.axis("off")

def plot_confusion_matrix(cm, classes, cmap=plt.cm.Blues):
  """Render a confusion matrix as a colour map with the count in every cell.

  Args:
    cm: 2-D array of counts, indexed as ``cm[true, predicted]`` for the axis
      labels used here.
    classes: tick labels, used for both axes.
    cmap: matplotlib colour map for the cell background.
  """
  plt.imshow(cm, interpolation='nearest', cmap=cmap)
  plt.title('Confusion matrix')
  plt.colorbar()

  tick_positions = np.arange(len(classes))
  plt.xticks(tick_positions, classes, rotation=45)
  plt.yticks(tick_positions, classes)

  # Cells darker than half of the maximum get white text for contrast.
  half_of_max = cm.max() / 2.
  for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
      text_color = "white" if cm[row, col] > half_of_max else "black"
      plt.text(col, row, cm[row, col],
               horizontalalignment="center",
               color=text_color)

  plt.tight_layout()
  plt.ylabel('True label')
  plt.xlabel('Predicted label')

def record_video(filename='video.mp4'):
  """Record a webcam clip in the browser and save it to ``filename``.

  A JavaScript function ``recordVideo`` is injected into the notebook output.
  It shows a live camera preview with a "Start Recording" button that is
  swapped for "Stop Recording" once clicked, and returns the recorded data as
  a base64 string. That string is decoded here and written to ``filename``.

  Args:
    filename: path of the file the recording is written to.

  Returns:
    ``filename``. It is returned even when recording or saving failed; in
    that case the error text is printed and the file may not exist.

  NOTE(review): the recorder is configured with a ``video/webm`` mimeType
  while the default file name ends in ``.mp4`` -- confirm downstream tools
  detect the container from content rather than from the extension.
  """
  js = Javascript("""
    async function recordVideo() {
      const options = { mimeType: "video/webm; codecs=vp9" };
      const div = document.createElement('div');
      const capture = document.createElement('button');
      const stopCapture = document.createElement("button");
      capture.textContent = "Start Recording";
      capture.style.background = "green";
      capture.style.color = "white";

      stopCapture.textContent = "Stop Recording";
      stopCapture.style.background = "red";
      stopCapture.style.color = "white";
      div.appendChild(capture);

      const video = document.createElement('video');
      const recordingVid = document.createElement("video");
      video.style.display = 'block';

      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      let recorder = new MediaRecorder(stream, options);
      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;

      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      await new Promise((resolve) => {
        capture.onclick = resolve;
      });
      recorder.start();
      capture.replaceWith(stopCapture);
      // use a promise to tell it to stop recording
      await new Promise((resolve) => stopCapture.onclick = resolve);
      recorder.stop();

      let recData = await new Promise((resolve) => recorder.ondataavailable = resolve);
      let arrBuff = await recData.data.arrayBuffer();
      
      // stop the stream and remove the video element
      stream.getVideoTracks()[0].stop();
      div.remove();

      let binaryString = "";
      let bytes = new Uint8Array(arrBuff);
      bytes.forEach((byte) => {
        binaryString += String.fromCharCode(byte);
      })
      return btoa(binaryString);
    }
    """)
  try:
    # Register the JS function in the output frame, then call it and wait
    # for the base64-encoded recording it returns.
    display(js)
    data = eval_js('recordVideo({})')
    binary = b64decode(data)
    with open(filename, "wb") as video_file:
      video_file.write(binary)
    print(
        f"Finished recording video. Saved binary in directory: {filename}"
    )
  except Exception as err:
      # In case any exceptions arise
      # Best effort: the error is only printed, never re-raised.
      print(str(err))
  return filename

def take_photo(filename='photo.jpg', quality=0.8):
  """Capture one webcam frame in the browser and save it as a JPEG.

  A JavaScript function ``takePhoto`` is injected into the notebook output.
  It shows a live camera preview with a "Capture" button; on click it draws
  a square region of the current frame (side = video height, horizontally
  centred) onto a canvas and returns it as a JPEG data URL. The payload of
  that URL is decoded here, written to ``filename`` and displayed.

  Args:
    filename: path of the image file to write.
    quality: value passed to ``canvas.toDataURL('image/jpeg', quality)``.

  Returns:
    None. Any exception is caught and its message printed.
  """
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoHeight;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 
                                        (video.videoWidth - video.videoHeight)/2, 0, video.videoHeight, video.videoHeight,
                                        0, 0, video.videoHeight, video.videoHeight);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  try:
    display(js)
    data = eval_js('takePhoto({})'.format(quality))
    # The result is a data URL; the base64 payload follows the first comma.
    binary = b64decode(data.split(',')[1])
    with open(filename, 'wb') as f:
      f.write(binary)
    print('Saved to {}'.format(filename))

    # Show the image which was just taken.
    display(Image(filename))

  except Exception as err:
    # Best effort: the error is only printed, never re-raised.
    print(str(err))

def create_directory(dir):
  """Create the directory ``dir`` if it does not exist yet.

  Missing parent directories are created as well, and calling this for a
  directory that already exists is a no-op.

  Args:
    dir: path of the directory to create. (The name shadows the builtin but
      is kept so that keyword callers keep working.)

  Raises:
    FileExistsError: if ``dir`` exists but is not a directory.
  """
  # makedirs(exist_ok=True) avoids the check-then-create race of
  # isdir() + mkdir() and also handles nested paths.
  os.makedirs(dir, exist_ok=True)

## Dataset: [Hand Gesture Datasets](https://lttm.dei.unipd.it/downloads/gesture/)
![](https://drive.google.com/uc?export=view&id=1K5NWR8S3XITwZ4fwQpJ1IaW51bbzaWv_)
There are 11 different kinds of gestures (classes) in total. In this tutorial, we only pick 4 of them. \
*   4 classes
  1.   G0
  2.   G2
  3.   G3
  4.   G5 \

The customized dataset is prepared and available on google drive. Let's download it then unzip. 

In [None]:
# Download the prepared dataset archive from Google Drive and save it as
# senz_light.zip. NOTE(review): --no-check-certificate turns off TLS
# certificate verification for this download.
!wget --no-check-certificate 'https://drive.google.com/u/0/uc?id=13p3HrbooMoyqJ261RrhYtF954ZwV6WqD&export=download' -O senz_light.zip
# Unpack only when the extracted folder is not present yet, so re-running the
# cell does not unzip over an existing copy.
if os.path.isdir('senz_light') is False:
  !unzip senz_light.zip
# List the working directory to confirm the archive and folder are there.
!ls

## Data Preprocessing
*   For MobileNet, the input must be a square image whose side length is one of the following: 
  - [128, 160, 192, 224]
*   Data Augmentation
  - [keras.preprocessing.image.ImageDataGenerator](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator)
  - [keras.layers.experimental.preprocessing](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing)

In [None]:
# Input side length and mini-batch size shared by every iterator below.
size = 128
batch = 16
assert size in [128, 160, 192, 224]

path_train = 'senz_light/train'
path_val = 'senz_light/val'
path_test = 'senz_light/test'

# Training images: pixel values multiplied by 1/255 plus random augmentation
# (shear, zoom, shifts and rotation).
generator_train = keras.preprocessing.image.ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.1,
    rotation_range=30,
)
# Validation / test images: rescaling only, no augmentation.
generator_inf = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

# Options that are identical for all three directory iterators.
_flow_options = dict(
    target_size=(size, size),
    color_mode='rgb',
    batch_size=batch,
    class_mode='binary',
)

train_it = generator_train.flow_from_directory(path_train, **_flow_options)
# Shuffling is turned off for the evaluation iterators.
validation_it = generator_inf.flow_from_directory(path_val, shuffle=False, **_flow_options)
test_it = generator_inf.flow_from_directory(path_test, shuffle=False, **_flow_options)

class_names = list(train_it.class_indices)
num_classes = len(class_names)

# confirm the iterator works
show_data_info(train_it)

## Ways to create a keras model
* ([keras.applications](https://keras.io/api/applications/))
* Sequential
* Functional
* [Model Subclassing](https://www.tensorflow.org/guide/keras/custom_layers_and_models)

## MobileNet-v1
[MobileNets: Efficient Convolutional Neural Networks for Mobile Vision
Applications](https://arxiv.org/pdf/1704.04861.pdf) \
* Standard convolution --> [Depthwise separable convolution](https://medium.com/@chih.sheng.huang821/%E6%B7%B1%E5%BA%A6%E5%AD%B8%E7%BF%92-mobilenet-depthwise-separable-convolution-f1ed016b3467) \
![](https://drive.google.com/uc?export=view&id=1bzmYbgk33IpW8EmSWSZeh_Cu159N6P2o)
* Depthwise separable convolution = Depthwise convolution + Pointwise convolution \
![](https://miro.medium.com/max/1838/1*e_oU-f6hX4FSSD-1OukP6Q.png)
* Network Structure(size=224, alpha=1.0) \
![](https://drive.google.com/uc?export=view&id=1vGVELlYIKtHyKIm7Qa52qoYHYf3I7NhF)
* Instead of (size=224, alpha=1.0), we choose (size=128, alpha=0.25) in this case.
 - **size**: *Resolution* in **[128, 160, 192, 224]**
 - **alpha**: *Width Multiplier* in **[0.25, 0.5, 0.75, 1.0]**
![](https://drive.google.com/uc?export=view&id=1jB7DgWm4DSNSb06lSWAhOYJCbJjINpiL)


In [None]:
class MobileNet:
  """MobileNet-v1 backbone with pretrained weights and a new softmax head.

  Construction runs these steps in order: build the convolutional backbone
  (ending in global average pooling), download and load the matching
  pretrained "no top" weight file, then append a Dense + Softmax classifier
  with ``classes`` outputs. The finished network is returned by
  ``get_model()``.

  Args:
    size: side length of the square RGB input; also part of the weight file
      name that is downloaded.
    alpha: width multiplier; every block's filter count is scaled by it.
      Must be one of 0.25, 0.5, 0.75 or 1.0.
    classes: number of outputs of the classification head.

  Raises:
    ValueError: if ``alpha`` is not one of the supported multipliers.
  """

  # Release that hosts the pretrained weight files.
  _BASE_WEIGHT_PATH = ('https://github.com/fchollet/deep-learning-models/'
                       'releases/download/v0.6/')
  # Width multiplier -> tag used inside the weight file name.
  _ALPHA_TEXT = {1.0: '1_0', 0.75: '7_5', 0.50: '5_0', 0.25: '2_5'}
  # (pointwise filters before scaling, strides) of separable blocks 1..13.
  _BLOCKS = (
      (64, (1, 1)),
      (128, (2, 2)), (128, (1, 1)),
      (256, (2, 2)), (256, (1, 1)),
      (512, (2, 2)), (512, (1, 1)), (512, (1, 1)),
      (512, (1, 1)), (512, (1, 1)), (512, (1, 1)),
      (1024, (2, 2)), (1024, (1, 1)),
  )

  def __init__(self, size=128, alpha=0.25, classes=1000):
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if alpha not in self._ALPHA_TEXT:
      raise ValueError(
          'alpha must be one of 0.25, 0.5, 0.75 or 1.0, got %r' % (alpha,))

    self.size = size
    self.alpha = alpha
    self.classes = classes
    self.img_input = layers.Input(shape=(size, size, 3))
    self.model = self.build_model()
    # Weights are loaded before the new head is attached: the weight file
    # requested is the "no top" variant.
    self.load_weights()
    self.add_dense_layer()

  def get_model(self):
    """Return the assembled Keras model (backbone plus classification head)."""
    return self.model

  def add_dense_layer(self):
    """Replace ``self.model`` with backbone -> Dense(classes) -> Softmax."""
    x = self.model.layers[-1].output
    x = layers.Dense(self.classes)(x)
    x = layers.Softmax(name='softmax')(x)
    self.model = keras.Model(inputs=self.img_input, outputs=x,
                             name='mobilenet_%0.2f_%s' % (self.alpha, self.size))

  def build_model(self):
    """Build the backbone: stem conv, 13 separable blocks, global avg pool."""
    x = self._conv_block(self.img_input, 32, strides=(2, 2))
    for block_id, (filters, strides) in enumerate(self._BLOCKS, start=1):
      x = self._depthwise_conv_block(x, filters, strides=strides, block_id=block_id)

    x = layers.GlobalAveragePooling2D()(x)

    # Create model.
    return keras.models.Model(self.img_input, x,
                              name='mobilenet_%0.2f_%s' % (self.alpha, self.size))

  def load_weights(self):
    """Download (via ``keras.utils.get_file``) and load the backbone weights.

    The file name is derived from this instance's ``alpha`` and ``size``.
    """
    alpha_text = self._ALPHA_TEXT[self.alpha]
    # Use the instance's own size; a same-named notebook global may differ
    # from the size this model was built with.
    model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, self.size)
    weight_url = self._BASE_WEIGHT_PATH + model_name
    weights_path = keras.utils.get_file(model_name, weight_url, cache_subdir='models')
    self.model.load_weights(weights_path)

  def _conv_block(self, x, filters, kernel=(3, 3), strides=(1, 1)):
    """Stem block: zero-pad (bottom/right), conv, batch norm, ReLU capped at 6.

    ``filters`` is scaled by ``self.alpha`` before use.
    """
    filters = int(filters * self.alpha)
    x = layers.ZeroPadding2D(padding=((0, 1), (0, 1)), name='conv1_pad')(x)
    x = layers.Conv2D(filters, kernel,
                      padding='valid',
                      use_bias=False,
                      strides=strides,
                      name='conv1')(x)
    x = layers.BatchNormalization(axis=-1, name='conv1_bn')(x)
    return layers.ReLU(6., name='conv1_relu')(x)

  def _depthwise_conv_block(self, x, pointwise_conv_filters, strides=(1, 1), block_id=1):
    """Depthwise-separable block: 3x3 depthwise conv then 1x1 pointwise conv.

    Each convolution is followed by batch normalisation and a ReLU capped at
    6. ``pointwise_conv_filters`` is scaled by ``self.alpha``; ``block_id``
    only feeds the layer names.
    """
    pointwise_conv_filters = int(pointwise_conv_filters * self.alpha)
    is_strided = strides != (1, 1)

    # Strided blocks pad explicitly (bottom/right) and then convolve with
    # 'valid' padding; unstrided blocks simply use 'same'.
    if is_strided:
      x = layers.ZeroPadding2D(((0, 1), (0, 1)), name='conv_pad_%d' % block_id)(x)
    x = layers.DepthwiseConv2D((3, 3),
                               padding='valid' if is_strided else 'same',
                               depth_multiplier=1,
                               strides=strides,
                               use_bias=False,
                               name='conv_dw_%d' % block_id)(x)
    x = layers.BatchNormalization(axis=-1, name='conv_dw_%d_bn' % block_id)(x)
    x = layers.ReLU(6., name='conv_dw_%d_relu' % block_id)(x)

    x = layers.Conv2D(pointwise_conv_filters, (1, 1),
                      padding='same',
                      use_bias=False,
                      strides=(1, 1),
                      name='conv_pw_%d' % block_id)(x)
    x = layers.BatchNormalization(axis=-1, name='conv_pw_%d_bn' % block_id)(x)
    return layers.ReLU(6., name='conv_pw_%d_relu' % block_id)(x)

# Build the alpha=0.25 MobileNet for the chosen input size with one output per
# gesture class, then print its layer summary.
model = MobileNet(size =size, alpha=0.25, classes = num_classes).get_model()
model.summary()

## An alternative way: keras.applications API
[Keras Applications](https://keras.io/api/applications/) \
Keras Applications are deep learning models that are made available alongside pre-trained weights. These models can be used for prediction, feature extraction, and fine-tuning. \
[tf.keras.applications.MobileNet](https://www.tensorflow.org/api_docs/python/tf/keras/applications/MobileNet) \
Instantiates the MobileNet architecture.


In [None]:
# Disabled alternative: the whole cell is a bare string literal, so none of
# the code inside it is executed. Remove the surrounding quotes to build the
# model through keras.applications instead of the hand-written class.
'''
mobile = keras.applications.mobilenet.MobileNet(input_shape=(size, size, 3), alpha=0.25)
x = mobile.layers[-6].output
predictions = layers.Dense(num_classes, activation='softmax')(x)
model = keras.Model(inputs=mobile.input, outputs= predictions)
model.summary()
'''

## Train & Validation
* Note that validation data is only used for inference, only training data 
influences the update of trainable variables.
* [Optimizer](https://medium.com/chung-yi/ml%E5%85%A5%E9%96%80-%E5%8D%81%E4%BA%8C-sgd-adagrad-momentum-rmsprop-adam-optimizer-e331ef3cf5cf): Although the original paper uses RMSprop, we use Adam with a learning rate of 0.0002. \
* Loss function: [CrossEntropyLoss](https://medium.com/@chih.sheng.huang821/%E6%A9%9F%E5%99%A8-%E6%B7%B1%E5%BA%A6%E5%AD%B8%E7%BF%92-%E5%9F%BA%E7%A4%8E%E4%BB%8B%E7%B4%B9-%E6%90%8D%E5%A4%B1%E5%87%BD%E6%95%B8-loss-function-2dcac5ebb6cb) 
  - Why not apply **error rate**(1-accuracy) as objective function?
* Epoch: Number of times that model traverses the whole training data

In [None]:
# The model already ends in a softmax layer, so its outputs are probabilities
# and the loss must be told so (from_logits=False); declaring them as logits
# would apply a second softmax inside the loss.
# `learning_rate` is the current name of the deprecated `lr` argument.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0002),
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

epochs = 12
# Integer division: a trailing partial batch is not counted as a step.
history = model.fit(train_it,
                    validation_data=validation_it,
                    steps_per_epoch=train_it.n // batch,
                    validation_steps=validation_it.n // batch,
                    epochs=epochs)

You will get about 95% validation accuracy if everything works fine. \
Let's check the improvement during the training procedure. 

In [None]:
# Plot the accuracy and loss curves recorded by the fit call above.
show_train_info(history, epochs)

## Evaluate & Inference on test set
We have already trained the model on our customized dataset. \
Now let's see how it works on test set (Unseen data).

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(test_it)

# Per-class scores for every test image, reduced to the highest-scoring class.
confidences = model.predict(test_it)
predictions = np.argmax(confidences, axis=1)

# Confusion matrix over the entire test set; test_it was created with
# shuffle=False, so test_it.classes lines up with the prediction order.
cm = confusion_matrix(test_it.classes, predictions)
plot_confusion_matrix(cm, class_names)

# Nine random test images with predicted label, confidence and true label.
show_inference_info(test_it, confidences, predictions)

## Confidence score: [Softmax](https://towardsdatascience.com/softmax-activation-function-explained-a7e1bc3ad60)
![](https://drive.google.com/uc?export=view&id=15tZjd6hoAPA22B3I4J7tyLCNuPY6K2CC)
![](https://miro.medium.com/max/1050/1*ReYpdIZ3ZSAPb2W8cJpkBg.jpeg)

# Part 2: Implementation on instant captured images
* In this part, we are going to train on data that we generate ourselves.
* For each class, image frames are extracted from the corresponding captured video. 
* **Trial & Error**: Create your own classifier.

## Data collection
*  Let's start from 2 classes: one video record for each class.
*  You can try more than 2 classes if you want. (Just modify variable **num_classes**).
*  For each class video, **a recording longer than 10 seconds is recommended.** 
  * Sufficient samples for more variation
  * Avoid overfitting

In [None]:
capture_dir_pic, capture_dir_vid = 'captured_pictures', 'captured_videos'
# Start from a clean slate: delete the output of any earlier run, then
# recreate both (empty) folders.
!rm -rf {capture_dir_pic}
!rm -rf {capture_dir_vid}
create_directory(capture_dir_pic)
create_directory(capture_dir_vid)

# Number of gesture classes to record; one video is captured per class.
num_classes =  2
video_paths = []
for i in range(num_classes):
  filename = os.path.join(capture_dir_vid, 'video_class{}.mp4'.format(i))
  print("Click button 'Start Recording' to start capturing data for class{}".format(i))
  # record_video returns the path it was given, so the list keeps one video
  # path per class, indexed by class number.
  video_paths.append(record_video(filename=filename)) 

## Conversion & Central crop
* We have prepared class videos, now let's extract image frames from them.  
* Note that only square images are available for MobileNet. 
  * However, the size of captured video is 480*720.
  * Therefore, we apply central crop then get 480*480 square samples.
* Both format conversion and central crop can be done by [ffmpeg](https://ffmpeg.org/) in a single command.

In [None]:
for i, video_path in enumerate(video_paths):
  # create directory & name the file
  class_folder = os.path.join(capture_dir_pic, 'class_{}'.format(i))
  !rm -rf {class_folder}
  create_directory(class_folder)
  out_jpgs = os.path.join(class_folder, '%d.jpg')
  print('Video frames for class{}:'.format(i))

  # Review saved videos
  video_file = open(video_path, "r+b").read()
  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  display(HTML(f"""<video width=300 controls><source src="{video_url}"></video>"""))

  # crop the frames during converting mp4 to jpgs
  vcap = cv2.VideoCapture(video_path)
  h, w = vcap.get(cv2.CAP_PROP_FRAME_HEIGHT), vcap.get(cv2.CAP_PROP_FRAME_WIDTH)
  !ffmpeg -i {video_path} -filter:v "crop={h}:{h}:{(w-h)//2}:0" -vf fps=20 -crf 19 {out_jpgs}

## Dataset split & preprocessing
* Train:Validation = 8:2 or you can choose your own **validation_split**.
* Make sure that the augmentation settings make sense for your data.
  * Horizontal flip
  * Vertical flip
  * Rotation

In [None]:
# Both generators use the same validation_split so that the "training" and
# "validation" subsets requested below come from the same 80/20 partition;
# only the training generator augments.
generator_train = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255,
                                                                validation_split=0.2,
                                                                shear_range=0.2,
                                                                zoom_range=0.2,
                                                                width_shift_range=0.1,
                                                                height_shift_range=0.1,
                                                                rotation_range=30)
generator_val = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255,
                                                             validation_split=0.2)

iterator_train = generator_train.flow_from_directory(capture_dir_pic,
                                                     target_size=(size, size),
                                                     color_mode='rgb',
                                                     batch_size=batch,
                                                     class_mode='binary',
                                                     subset="training",
                                                     seed=123)

iterator_val = generator_val.flow_from_directory(capture_dir_pic,
                                                 target_size=(size, size),
                                                 color_mode='rgb',
                                                 batch_size=batch,
                                                 class_mode='binary',
                                                 subset="validation",
                                                 shuffle=False,
                                                 seed=123)

# The display helpers read the notebook-level `class_names`; refresh it so
# this part shows the captured classes instead of the earlier dataset's names.
class_names = list(iterator_train.class_indices.keys())

# confirm the iterator works
show_data_info(iterator_train)

* This time we get MobileNet from keras API  

In [None]:
# Request the backbone without its classification top and with global average
# pooling, instead of indexing into the full model with layers[-6]: the
# number and order of top layers is an implementation detail of the installed
# Keras version, so a fixed negative index may pick the wrong tensor.
mobile = keras.applications.mobilenet.MobileNet(input_shape=(size, size, 3),
                                                alpha=0.25,
                                                include_top=False,
                                                pooling='avg')
x = mobile.output
# New head: one softmax output per captured class.
predictions = layers.Dense(num_classes, activation='softmax')(x)
model = keras.Model(inputs=mobile.input, outputs=predictions)
model.summary()

## Start training
* Feel free to tune your own **epochs** and **learning rate(lr)**.
* Various [optimizers](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers) are available to choose. (Optional)

In [None]:
# The Dense head uses activation='softmax', so the model outputs
# probabilities and the loss must be told so (from_logits=False); declaring
# them as logits would apply a second softmax inside the loss.
# `learning_rate` is the current name of the deprecated `lr` argument.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0002),
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

epochs = 10
# Integer division: a trailing partial batch is not counted as a step.
history = model.fit(iterator_train,
                    validation_data=iterator_val,
                    steps_per_epoch=iterator_train.n // batch,
                    validation_steps=iterator_val.n // batch,
                    epochs=epochs)

## Review the training procedure
* Check if the model is well trained.  
* If not, figure out the problem and try again.  

In [None]:
# Plot the accuracy and loss curves recorded by the fit call above.
show_train_info(history, epochs)

## Inference on validation set 

In [None]:
# Inference on the validation split. iterator_val was created with
# shuffle=False, so the prediction order matches iterator_val.classes.
confidences = model.predict(iterator_val)
predictions = confidences.argmax(axis=1)

# Label the plots with the classes of the captured dataset; the notebook-level
# `class_names` may still hold the names of the dataset used earlier.
class_names = list(iterator_val.class_indices.keys())

# draw confusion matrix of the entire validation set
cm = confusion_matrix(y_true=iterator_val.classes, y_pred=predictions)
plot_confusion_matrix(cm=cm, classes=class_names)

show_inference_info(iterator_val, confidences, predictions)

## Try with a new unseen picture

In [None]:
filename = 'photo.jpg'
print("Capture the test image by clicking 'Capture' button.")
take_photo(filename)

img = cv2.imread(filename)
# cv2.imread returns None instead of raising when the file cannot be read
# (e.g. the capture above failed), so fail here with a clear message.
if img is None:
  raise FileNotFoundError('Could not read captured image: {}'.format(filename))
# OpenCV loads images as BGR, while the iterators were created with
# color_mode='rgb'; convert so the channel order matches training.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (size, size), interpolation=cv2.INTER_CUBIC) #resize
img = img.astype('float32') / 255 # rescale
img_in = tf.expand_dims(img, axis=0) # expand batch dimension

scores = model.predict(img_in)
# The model's last layer is already a softmax, so these are probabilities;
# applying softmax again would flatten them and understate the confidence.
max_score = scores[0]
# Class names of the captured dataset, in label-index order.
captured_class_names = list(iterator_train.class_indices.keys())
print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(captured_class_names[np.argmax(max_score)], 100 * np.max(max_score))
)