<a href="https://colab.research.google.com/github/AndreaZoccatelli/aircontrol/blob/master/mediapipe_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MediaPipe fine-tuning

## Install Required libraries + Import

In [None]:
# Colab setup: upgrade pip, then install the packages imported below
# (opendatasets, the MediaPipe Model Maker toolkit and OpenCV).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# different releases than the ones this notebook was written against.
!pip install --upgrade pip
!pip install opendatasets
!pip install mediapipe-model-maker
!pip install opencv-python

In [None]:
from google.colab import files
import opendatasets as od
import os
import shutil
import tensorflow as tf
import cv2 as cv
import numpy as np
assert tf.__version__.startswith('2')
from random import sample, seed

from mediapipe_model_maker import gesture_recognizer
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import json
from tqdm import tqdm

## Download Dataset

The dataset used here is a subset of [HaGRID](https://github.com/hukenovs/hagrid)

In [None]:
# Download the Kaggle-hosted HaGRID subset used for training.
hagrid_subset_url = "https://www.kaggle.com/datasets/innominate817/hagrid-classification-512p-127k"
od.download(hagrid_subset_url)

## Dataset Preparation

The original dataset contains pictures of standing people. This could be a problem, given that the desired use case is hand sign recognition near the PC.

To make the training set more relevant for the final objective, the images are zoomed in using the bounding box detected with the MediaPipe hand landmarker. To make the model more robust to different lighting conditions, the images are also modified with the function `alter_light`.

In [None]:
# Fetch the pre-trained hand landmarker bundle next to the notebook.
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

# Build a hand landmarker limited to one hand per image; `detector` is later
# passed to prepare_dataset as the model used to locate the hand before cropping.
# NOTE(review): `options` is re-bound by the training cell further down, so
# re-run this cell before re-creating the detector.
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=1)
detector = vision.HandLandmarker.create_from_options(options)

In [None]:
def bounding_box_resize(imgs_path, model, zoom_factor, failed=0, resize=True):
  """Crop an image around the first detected hand and optionally resize it back.

  Args:
    imgs_path: path of the single image file to load.
    model: object exposing ``detect(image)`` whose result carries
      ``hand_landmarks`` (a list with one landmark list per detected hand).
    zoom_factor: fraction used to push each edge of the landmark box outwards.
    failed: running count of images for which no usable crop was produced.
    resize: when True, the crop is scaled to the original image width/height.

  Returns:
    ``(image, failed)``: the crop and the unchanged counter on success, or the
    untouched original image and ``failed + 1`` when no hand was detected or
    the crop could not be produced.
  """
  image = mp.Image.create_from_file(imgs_path)
  detection_result = model.detect(image)
  # Bound before the try block so the fallback below can always return it.
  img = image.numpy_view()
  height, width = img.shape[0], img.shape[1]

  try:
    # IndexError here means no hand was detected.
    landmarks = detection_result.hand_landmarks[0]
    x_list = [k.x for k in landmarks]
    y_list = [k.y for k in landmarks]

    # Landmark coordinates are scaled by the image size to get pixels.
    # NOTE(review): the margin is proportional to the coordinate itself, not to
    # the box size, so it grows with the distance from the top-left corner.
    # Kept as in the original; confirm this is intended.
    min_x = min(x_list)*width
    max_x = max(x_list)*width
    min_y = min(y_list)*height
    max_y = max(y_list)*height

    # Clamp to the image: a negative lower bound would otherwise be treated as
    # "count from the end" by the slice and yield a wrong or empty crop.
    min_x = max(0, int(round(min_x-zoom_factor*min_x)))
    max_x = min(width, int(round(max_x+zoom_factor*max_x)))
    min_y = max(0, int(round(min_y-zoom_factor*min_y)))
    max_y = min(height, int(round(max_y+zoom_factor*max_y)))

    cropped_image = img[min_y:max_y, min_x:max_x]
    if cropped_image.size == 0:
      raise ValueError('empty crop')
    if resize:
      # cv.resize expects dsize as (width, height), the reverse of img.shape[:2].
      cropped_image = cv.resize(cropped_image, (width, height))

    return cropped_image, failed

  except Exception:
    # Best effort: count the failure and hand back the unmodified image.
    failed+=1
    return img, failed

In [None]:
def alter_light(img, alpha, beta):
  """Return a brightened and a darkened-plus-noise variant of an image.

  Args:
    img: image array with values in the 0-255 range.
    alpha: weight applied to the darkened image in the blend.
    beta: weight applied to the Gaussian noise in the blend.

  Returns:
    ``(img_add_b, img_sub_b)``: the image scaled by 1.5 and the image scaled by
    0.5 blended with noise, both clipped to 0-255 and returned as uint8.
  """
  img_add_b=np.clip(img * 1.5, 0, 255).astype(np.uint8)
  img_sub_b=np.clip(img * 0.5, 0, 255)
  # Keep the noise signed and in floating point: casting negative samples to
  # uint8 before blending wraps them to large positive values (or is undefined,
  # depending on the platform) instead of darkening the pixel.
  noise=np.random.normal(loc=0, scale=20, size=img.shape)
  blended=alpha*img_sub_b + beta*noise
  img_sub_b=np.clip(np.rint(blended), 0, 255).astype(np.uint8)
  return img_add_b, img_sub_b

In [None]:
def _rgb_to_bgr(img):
  """Reverse the colour channel order so cv.imwrite stores the right colours."""
  if img.ndim == 3 and img.shape[2] == 3:
    return np.ascontiguousarray(img[:, :, ::-1])
  if img.ndim == 3 and img.shape[2] == 4:
    return np.ascontiguousarray(img[:, :, [2, 1, 0, 3]])
  return img


def prepare_dataset(dataset_path, chosen_gestures, imgs_path, sample_size, random_zoom_in, v_augment, model=None):
  """Build the training folder structure for the chosen gestures.

  For every gesture a sub-folder is created under ``dataset_path`` and filled
  with images taken from ``imgs_path/<gesture>``. An empty ``none`` sub-folder
  is created as well.

  Args:
    dataset_path: root folder of the dataset being built.
    chosen_gestures: names of the gesture sub-folders to process.
    imgs_path: root folder of the source images (one sub-folder per gesture).
    sample_size: number of images to draw per gesture, or None to use all of
      them; capped at the number of available images.
    random_zoom_in: when True, half of the images of each gesture are cropped
      around the hand with ``bounding_box_resize`` and the other half is copied
      unchanged; when False every image is copied unchanged.
    v_augment: when True, two ``alter_light`` variants are written next to
      every copied or cropped image.
    model: detector forwarded to ``bounding_box_resize``; needed only when
      ``random_zoom_in`` is True.
  """
  np.random.seed(123)
  seed(123)
  os.makedirs(os.path.join(dataset_path, 'none'), exist_ok=True)

  for gesture in tqdm(chosen_gestures):
    gesture_subfolder=os.path.join(dataset_path, gesture)
    os.makedirs(gesture_subfolder, exist_ok=True)

    gesture_imgs_path=os.path.join(imgs_path, gesture)
    # os.listdir order is arbitrary; sort so the seeded sampling is repeatable.
    gesture_imgs=sorted(os.listdir(gesture_imgs_path))
    if sample_size is not None:
      gesture_imgs=sample(gesture_imgs, min(sample_size, len(gesture_imgs)))

    failed=0
    if random_zoom_in:
      # Split by index so that an image is either zoomed or copied, never both.
      zoom_idx=sample(range(0,len(gesture_imgs)), round(0.5*len(gesture_imgs)))
      zoom_set=set(zoom_idx)
      to_zoom=[gesture_imgs[k] for k in zoom_idx]
      regular=[name for k, name in enumerate(gesture_imgs) if k not in zoom_set]
    else:
      to_zoom=[]
      regular=list(gesture_imgs)

    for i in regular:
      src=os.path.join(gesture_imgs_path, i)
      if v_augment:
        img=cv.imread(src)
        # cv.imread returns None for files it cannot decode; skip augmentation then.
        if img is not None:
          beta=np.random.choice([0.1,0.2,0.3])
          alpha=1-beta
          img_add_b, img_sub_b = alter_light(img, alpha, beta)
          cv.imwrite(os.path.join(gesture_subfolder, 'add_b_'+i), img_add_b)
          cv.imwrite(os.path.join(gesture_subfolder, 'sub_b_'+i), img_sub_b)
      shutil.copy(src, os.path.join(gesture_subfolder, i))

    for i in to_zoom:
      zoom_factor=np.random.choice([0.2,0.3,0.4])
      cropped_image,failed=bounding_box_resize(os.path.join(gesture_imgs_path, i), model, zoom_factor, failed=failed, resize=True)
      if cropped_image is None:
        continue
      # bounding_box_resize works on an mp.Image view, which MediaPipe documents
      # as RGB(A); cv.imwrite expects BGR(A), so swap the channels before saving.
      # When detection fails the uncropped image is returned and is still
      # written with the 'cropped_' prefix, as in the original flow.
      cropped_bgr=_rgb_to_bgr(cropped_image)
      if v_augment:
        beta=np.random.choice([0.1,0.2,0.3])
        alpha=1-beta
        img_add_b, img_sub_b = alter_light(cropped_bgr, alpha, beta)
        cv.imwrite(os.path.join(gesture_subfolder, 'add_b_cropped_'+i), img_add_b)
        cv.imwrite(os.path.join(gesture_subfolder, 'sub_b_cropped_'+i), img_sub_b)
      cv.imwrite(os.path.join(gesture_subfolder, 'cropped_'+i), cropped_bgr)

    if random_zoom_in and to_zoom:
      # Report a real percentage; the plain ratio rounds to 0 or 1.
      print(f'{round(100*failed/len(to_zoom))}% no bounding box detected')

In [None]:
# Start from a clean dataset folder; skip silently on a fresh runtime where the
# folder has not been created yet (a bare rmtree raises FileNotFoundError).
if os.path.isdir('/content/Dataset'):
  shutil.rmtree('/content/Dataset')

In [None]:
# Gestures the model is trained on -- edit this list to change the classes.
chosen_gestures=[
    'one',
    'peace',
    'three2',
    'four',
    'palm',
    'fist',
]

# Source images (downloaded above) and destination folder for the prepared set.
imgs_path='/content/hagrid-classification-512p-127k/hagrid-classification-512p-127k'
dataset_path=os.path.join('/content', 'Dataset')
if not os.path.exists(dataset_path):
  os.makedirs(dataset_path)

# Use every image, zoom half of them on the hand, no lighting augmentation.
prepare_dataset(
    dataset_path,
    chosen_gestures,
    imgs_path,
    sample_size=None,
    random_zoom_in=True,
    v_augment=False,
    model=detector,
)

In [None]:
# Root folder of the prepared dataset. NOTE(review): this repeats the value set
# in the preparation cell, presumably so the cells below can run without
# re-running it -- confirm and drop if not needed.
dataset_path='/content/Dataset'

In [None]:
# Fill the 'none' class with images sampled from every gesture folder that is
# NOT one of the chosen gestures.
seed(123)
none_subfolder=os.path.join(dataset_path, 'none')
os.makedirs(none_subfolder, exist_ok=True)
others=os.listdir(imgs_path)
others=np.setdiff1d(others, chosen_gestures)

for folder_name in tqdm(others):
  folder=os.path.join(imgs_path, folder_name)
  if not os.path.isdir(folder):
    # Stray files next to the gesture folders cannot be listed; skip them.
    continue
  # Named folder_imgs rather than `files` so the imported google.colab `files`
  # module is not shadowed. Sorted so the seeded sample is repeatable.
  folder_imgs=sorted(os.listdir(folder))
  # Take up to 200 images per folder; smaller folders contribute all they have.
  folder_imgs=sample(folder_imgs, min(200, len(folder_imgs)))

  # Copy each sampled image into the 'none' folder.
  for img_name in folder_imgs:
      shutil.copy(os.path.join(folder, img_name), none_subfolder)

In [None]:
# Load the prepared folder (one sub-folder per class) with the default hand
# preprocessing parameters.
preprocessing_params=gesture_recognizer.HandDataPreprocessingParams()
data=gesture_recognizer.Dataset.from_folder(dirname=dataset_path, hparams=preprocessing_params)

# 80% for training; the remaining 20% is halved into validation and test.
train_data, rest_data=data.split(0.8)
validation_data, test_data=rest_data.split(0.5)

## Model training


In [None]:
# Remove a previous export, if any; on a fresh runtime the folder does not
# exist yet and a bare rmtree would raise FileNotFoundError.
if os.path.isdir('/content/exported_model'):
  shutil.rmtree('/content/exported_model')

In [None]:
# Training hyper-parameters, classifier head options and the combined options
# object handed to the Model Maker.
hparams=gesture_recognizer.HParams(learning_rate=0.001, export_dir="exported_model", shuffle=True, batch_size=200)
model_options=gesture_recognizer.ModelOptions(dropout_rate=0.1, layer_widths=[256,128,64])
options=gesture_recognizer.GestureRecognizerOptions(model_options=model_options, hparams=hparams)

# Bind the trained recognizer to `model`: the evaluation and export cells below
# read that name, which was never defined when the result was stored only as
# `model_2` (NameError on Restart & Run All).
model=gesture_recognizer.GestureRecognizer.create(
    train_data=train_data,
    validation_data=validation_data,
    options=options
)
# Keep the original variable name available for any code that still uses it.
model_2=model

In [None]:
# Evaluate on the held-out test split and report loss and accuracy.
loss, accuracy=model.evaluate(test_data)
print(f"Test loss:{loss}, Test accuracy:{accuracy}")

In [None]:
# Export the trained recognizer; presumably written to the `export_dir`
# configured in the training hyper-parameters -- confirm against the
# Model Maker documentation.
model.export_model()