# Hand Detection using CNN
![Hand Detection](https://user-images.githubusercontent.com/50156227/166168918-a5caff72-e068-41ac-b1cb-91b79a8a1311.gif)
<br>
<h4><b>The model is ready to be used in apps & APIs (see the last cell, 'How to use').</b></h4>

In [None]:
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from pathlib import Path
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, Dense, Add, Conv2D, SeparableConv2D
from tensorflow.keras.layers import BatchNormalization, AveragePooling2D
from tensorflow.keras.layers import LeakyReLU, MaxPooling2D, Flatten
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras import backend as K

In [None]:
# Notebook-wide configuration.

# Resolution images are resized to before entering the network
# (used as target_size for the generators and as the model's Input shape).
IMG_HEIGHT = 416
IMG_WIDTH = 416
# Ratios that map box coordinates from the dataset's image size to the
# network's input size.
X_FACTOR = IMG_WIDTH / 600 # 600 is dataset images shape
Y_FACTOR = IMG_HEIGHT / 600
# Number of images per batch produced by the data generators.
BATCH_SIZE = 128
# Number of passes over the training data given to model.fit.
EPOCHS = 35
# Initial learning rate handed to the Adam optimizer.
learning_rate = 0.0032
# Folder that is searched recursively for the .png images.
dataset_path = '../input/hand-detection-dataset-factory/hands'

In [None]:
def grabPaths(filepath):
    """Build a one-column DataFrame of file paths.

    Args:
        filepath: sequence of paths (e.g. ``pathlib.Path`` objects or strings).

    Returns:
        A ``pandas.DataFrame`` with a single column named ``'path'`` that
        holds every entry of *filepath* converted to ``str``.
    """
    # The previous version also derived a list of file names that was never
    # used; only the string-typed path column is needed.
    paths = pd.Series(filepath, name='path').astype(str)
    return pd.DataFrame(paths)

def rescale_boxes(boxes, x_factor=None, y_factor=None):
    """Scale box coordinates to match resized images.

    Args:
        boxes: 2-D array-like whose columns 0 and 2 are horizontal values
            and columns 1 and 3 are vertical values (used as x, y, w, h
            elsewhere in this file).
        x_factor: horizontal scale; defaults to the module-level ``X_FACTOR``.
        y_factor: vertical scale; defaults to the module-level ``Y_FACTOR``.

    Returns:
        A new ``float32`` NumPy array with the scaled values; the input is
        left untouched.
    """
    if x_factor is None:
        x_factor = X_FACTOR
    if y_factor is None:
        y_factor = Y_FACTOR

    # np.array(...) copies, so the caller's data is never modified.
    boxes = np.array(boxes).astype(np.float32)
    boxes[:, [0, 2]] = boxes[:, [0, 2]] * x_factor
    boxes[:, [1, 3]] = boxes[:, [1, 3]] * y_factor
    return boxes

def inverse_rescale_boxes(boxes, x_factor=None, y_factor=None):
    """Undo :func:`rescale_boxes`, mapping boxes back to the source image scale.

    Args:
        boxes: 2-D array-like whose columns 0 and 2 are horizontal values
            and columns 1 and 3 are vertical values.
        x_factor: horizontal scale to divide by; defaults to the module-level
            ``X_FACTOR``. Pass a per-image factor when the target image does
            not have the dataset's size.
        y_factor: vertical scale to divide by; defaults to the module-level
            ``Y_FACTOR``.

    Returns:
        A new floating-point NumPy array with the rescaled values. The input
        is not modified (the previous version wrote into the caller's array
        and silently truncated results for integer arrays).
    """
    if x_factor is None:
        x_factor = X_FACTOR
    if y_factor is None:
        y_factor = Y_FACTOR

    boxes = np.array(boxes)  # copy; also accepts lists / DataFrames
    if not np.issubdtype(boxes.dtype, np.floating):
        # Division results must not be truncated back to integers.
        boxes = boxes.astype(np.float32)
    boxes[:, [0, 2]] = boxes[:, [0, 2]] / x_factor
    boxes[:, [1, 3]] = boxes[:, [1, 3]] / y_factor
    return boxes


def plot_bbox(image, yt_box, yp_box=None, norm=False):
    """Draw bounding boxes on an image and return it as a PIL image.

    Args:
        image: NumPy image array.
        yt_box: ground-truth box as ``(x, y, w, h)``; drawn in green.
        yp_box: optional predicted box as ``(x, y, w, h)``; drawn in red.
        norm: when true, *image* is multiplied by 255 and cast to ``uint8``
            before drawing (i.e. it is expected to hold values scaled by
            1/255).

    Returns:
        The ``PIL.Image.Image`` with the rectangle outlines drawn on it.
    """
    if norm:
        image = (image * 255.).astype("uint8")

    try:
        pil_img = Image.fromarray(image)
    except TypeError:
        # Pillow raises TypeError for array dtypes it cannot map to an image
        # mode; retry with 8-bit data instead of swallowing every exception.
        pil_img = Image.fromarray(image.astype('uint8'))

    draw_img = ImageDraw.Draw(pil_img)

    for box, colour in ((yt_box, 'green'), (yp_box, 'red')):
        if box is None:
            continue
        x1, y1, w, h = box
        # Boxes are stored as corner + size; PIL wants two corners.
        draw_img.rectangle((x1, y1, x1 + w, y1 + h), outline=colour)
    return pil_img


def convblock(previous_layer, n_filters, filter_windows=(3,3,3), padding='same', pool=None):
    """Stack three separable-convolution stages with a skip connection.

    Layout:
        stage 1: SeparableConv2D -> BatchNorm -> LeakyReLU
        stage 2: SeparableConv2D -> BatchNorm -> LeakyReLU
        stage 3: SeparableConv2D -> BatchNorm -> Add(stage-1 conv output)
                 -> LeakyReLU
        then an optional 2x2 pooling layer.

    Args:
        previous_layer: tensor the block is applied to.
        n_filters: indexable with the filter count of stages 1..3. The Add
            layer combines the stage-3 output with the raw stage-1
            convolution output.
        filter_windows: indexable with the kernel size of stages 1..3.
        padding: padding mode passed to every SeparableConv2D.
        pool: ``'max'`` or ``'avg'`` to append a 2x2 pooling layer; any other
            value appends nothing.

    Returns:
        The output tensor of the block.
    """
    # Stage 1 - the raw convolution output doubles as the skip branch.
    skip = SeparableConv2D(n_filters[0], filter_windows[0], padding=padding)(previous_layer)
    out = BatchNormalization()(skip)
    out = LeakyReLU()(out)

    # Stage 2
    out = SeparableConv2D(n_filters[1], filter_windows[1], padding=padding)(out)
    out = BatchNormalization()(out)
    out = LeakyReLU()(out)

    # Stage 3 - merge with the skip branch before the activation.
    out = SeparableConv2D(n_filters[2], filter_windows[2], padding=padding)(out)
    out = BatchNormalization()(out)
    out = Add()([out, skip])
    out = LeakyReLU()(out)

    # Pick the pooling layer class (if any) without instantiating unused layers.
    pooling_cls = MaxPooling2D if pool == 'max' else AveragePooling2D if pool == 'avg' else None
    if pooling_cls is not None:
        out = pooling_cls(pool_size=(2,2))(out)

    return out


def custom_mse(y_true, y_pred):
    """Mean squared error restricted to samples where an object exists.

    A sample is kept when the sum of its ``y_true`` values along axis 1 is
    not zero; all other rows are excluded from both tensors before the MSE
    is computed.

    Args:
        y_true: ground-truth box tensor.
        y_pred: predicted box tensor.

    Returns:
        The value produced by ``tf.keras.losses.MeanSquaredError`` on the
        retained rows.
    """
    has_object = K.not_equal(K.sum(y_true, axis=1), 0.0)
    loss_fn = tf.keras.losses.MeanSquaredError()
    return loss_fn(y_true[has_object], y_pred[has_object]) # * 0.3


def visualize_samples(datagen, row_col_len=4, figsize=None):
    """Preview random positive samples of the dataset in a square grid.

    Args:
        datagen: indexable batch source; ``datagen[k]`` must return
            ``(images, targets)`` where ``targets`` has the keys
            ``'class_out'`` and ``'box_out'``, and ``len(datagen)`` must give
            the number of batches.
        row_col_len: number of rows (and columns) of the grid.
        figsize: matplotlib figure size; defaults to 4 units per grid cell.

    Raises:
        ValueError: if *datagen* contains no batches.
    """
    if figsize is None:
        figsize = np.array((row_col_len, row_col_len)) * 4

    n_batches = len(datagen)
    if n_batches == 0:
        raise ValueError("datagen contains no batches to visualize")

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
    fig, ax = plt.subplots(row_col_len, row_col_len, figsize=figsize, squeeze=False)
    for i in range(row_col_len):
        for j in range(row_col_len):
            ax[i, j].set_axis_off()

            # Draw the batch index from the real number of batches and load
            # the batch once (every access re-reads the images).
            batch_index = np.random.randint(0, n_batches)
            images, targets = datagen[batch_index]

            positives = np.where(np.array(targets['class_out']) == 1)[0]
            if positives.size == 0:
                # No sample with an object in this batch: leave the cell blank.
                continue

            sample_index = positives[np.random.randint(0, positives.size)]
            plotted_box = plot_bbox(
                images[sample_index], targets['box_out'][sample_index], norm=True
            )
            ax[i, j].imshow(plotted_box)
    plt.show()
    

def visualize_prediction(model, data):
    """Show the model's prediction for one sample of the first batch.

    The first sample of ``data[0]`` whose class label is 1 is used; when the
    batch has no such sample, sample 0 is used instead. The image is shown
    with the ground-truth and predicted boxes drawn by ``plot_bbox``, and the
    true / predicted class is printed.

    Args:
        model: model whose ``predict`` returns ``(class_prediction,
            box_prediction)`` for a batch of images.
        data: indexable batch source; ``data[0]`` must return
            ``(images, targets)`` where ``targets`` has the keys
            ``'class_out'`` and ``'box_out'``.
    """
    # Load the batch once; every `data[0]` access re-reads the images.
    images, targets = data[0]

    # Select a sample where an object exists (fall back to the first sample).
    positives = np.where(np.array(targets['class_out']) == 1)[0]
    sample_index = positives[0] if positives.size else 0

    # Keep a leading batch axis of 1 for model.predict.
    image = np.array([images[sample_index]])

    # Set y_true & y_pred for class & bounding box
    yt_box = np.array(targets['box_out'][sample_index])
    yt_class = np.array([targets['class_out'][sample_index]])
    yp_class, yp_box = model.predict(image)

    # Plot bounding box on image & show it
    image_plotted = plot_bbox(image[0], yt_box, yp_box[0], norm=True)
    plt.imshow(image_plotted)
    plt.axis('off')

    # Print y_true class & y_pred class; take the scalar explicitly rather
    # than calling int() on a multi-dimensional array.
    predicted_class = int(np.ravel(yp_class)[0] >= 0.5)
    print("Class: y_true=", yt_class, " | y_pred=", predicted_class)
    plt.show()

In [None]:
# Read the annotation CSV (one row per image)
dataset = pd.read_csv('../input/hand-detection-dataset-factory/dataset.csv')

# UNCOMMENT THE LINE BELOW TO TRAIN ON A RANDOM SUBSET INSTEAD OF ALL SAMPLES
# dataset = dataset.sample(n=15000)


# Give the columns the names the rest of the notebook relies on
dataset.columns = ['path', 'object_exists', 'x', 'y', 'w', 'h']


# List directories of files
train_image_dir_hand = Path(dataset_path)
train_filepaths_hand = list(train_image_dir_hand.glob(r'**/*.png'))

# Create dataframe of image paths found on disk
train_df_hand = grabPaths(train_filepaths_hand)

# Turn the CSV's relative file names into full paths under dataset_path
dataset['path'] = dataset_path + "/" + dataset['path']

# Resize boxes to the network's input resolution. Assigning by column name
# replaces the columns with the rescaled float values instead of writing
# floats into the existing columns through positional indexing.
dataset[['x', 'y', 'w', 'h']] = rescale_boxes(dataset[['x', 'y', 'w', 'h']])

In [None]:
# Hold out 20% of the rows for validation. No random_state is passed, so the
# split is different on every run.
train_df, test_df = train_test_split(dataset, test_size=0.2)

In [None]:
class MultiOutputGen(tf.keras.utils.Sequence):
    """Pair batches of images with their class and box labels.

    Labels are matched to images purely by row position, so *output_gen*
    must list its rows in exactly the order in which *input_gen* yields
    images (i.e. the image iterator must not shuffle or drop rows).

    Args:
        input_gen: indexable image batch source; ``input_gen[i]`` returns an
            array of images and ``len(input_gen)`` the number of batches.
        output_gen: DataFrame whose first column is the class label and whose
            next four columns are the box values x, y, w, h.
    """

    def __init__(self, input_gen, output_gen):
        super().__init__()
        self.inpgen = input_gen
        self.outgen = output_gen

    def __len__(self):
        """Return the number of batches."""
        return len(self.inpgen)

    def __getitem__(self, i):
        """Return ``(images, {'class_out': ..., 'box_out': ...})`` for batch *i*."""
        images = self.inpgen[i]
        n_images = images.shape[0]

        # The row offset must use the nominal batch size: the last batch can
        # be shorter, and multiplying by its length would select the wrong
        # label rows. Fall back to the batch length when the image source
        # does not expose `batch_size`.
        batch_size = getattr(self.inpgen, 'batch_size', None) or n_images
        start = i * batch_size
        end = start + n_images

        rows = self.outgen.iloc[start:end]
        classes_num = rows.iloc[:, 0].values
        boxes = rows.iloc[:, 1:5].values  # shape (n_images, 4): x, y, w, h
        return images, {'class_out': classes_num, 'box_out': boxes}

    def on_epoch_end(self):
        """Forward the end-of-epoch hook to the wrapped image source."""
        self.inpgen.on_epoch_end()
    
class VisualOutput(tf.keras.callbacks.Callback):
    """Callback that visualizes one prediction at the end of every epoch.

    Args:
        data: batch source handed to ``visualize_prediction``. When omitted,
            the module-level ``custom_test_gen`` is looked up at the end of
            each epoch, which keeps ``VisualOutput()`` working as before.
    """

    def __init__(self, data=None):
        super().__init__()
        self.data = data

    def on_epoch_end(self, epoch, logs=None):
        """Plot the true and predicted box for a sample using the current model."""
        # Reading a module-level name needs no `global` statement.
        data = custom_test_gen if self.data is None else self.data
        visualize_prediction(self.model, data)

In [None]:
# Image pipelines. Both rescale pixel values by 1/255; only the training
# pipeline additionally applies a random brightness change.
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    brightness_range=(0.8, 1.2),
    rescale = 1./255.,
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale = 1./255.,
)

# class_mode=None makes the iterators yield images only; the labels are
# attached by MultiOutputGen below.
# NOTE(review): MultiOutputGen matches labels to images by row position, so
# shuffle must stay False and the iterator must yield one image per dataframe
# row - confirm that every path in the dataframe exists on disk.
train_images = train_generator.flow_from_dataframe(
    dataframe=train_df,
    x_col='path',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode='rgb',
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

test_images = test_generator.flow_from_dataframe(
    dataframe=test_df,
    x_col='path',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode='rgb',
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
)


# Combine images with their labels; column 0 of each dataframe (the path) is
# dropped so the label frame starts with the class column.
custom_train_gen = MultiOutputGen(train_images, train_df.iloc[:,1:])
custom_test_gen = MultiOutputGen(test_images, test_df.iloc[:,1:])

In [None]:
# Show a grid of random training images with their ground-truth boxes.
visualize_samples(custom_train_gen)

In [None]:
# BEST MODEL
# Two-headed network: a shared convolutional backbone followed by one dense
# head for the box values and one for the hand / no-hand probability.

inp = Input(shape=(IMG_HEIGHT,IMG_WIDTH,3), name='image')

# Stem: two separable convolutions, each followed by 2x2 max pooling
# (no activation or normalization is applied between them).
X = SeparableConv2D(64, (7,7), strides=2, padding='valid')(inp)
X = MaxPooling2D(pool_size=(2,2), strides=2)(X)

X = SeparableConv2D(192, (3,3), strides=1, padding='same')(X)
X = MaxPooling2D(pool_size=(2,2), strides=2)(X)

# Residual blocks with a growing filter count; all but the last one end with
# 2x2 max pooling.
X = convblock(X, [16, 16, 16], pool='max')
X = convblock(X, [32, 32, 32], pool='max')
X = convblock(X, [64, 64, 64], pool='max')
X = convblock(X, [128, 128, 128], pool='max')
X = convblock(X, [256, 256, 256])

X = Flatten()(X)

# Box head
Xbox = Dense(1024)(X)
Xbox = LeakyReLU()(Xbox)

# Class head
Xhand = Dense(512)(X)
Xhand = LeakyReLU()(Xhand)


# 'box_out' is a linear 4-value output; 'class_out' is a sigmoid probability.
# The layer names must match the keys of the label dict produced by
# MultiOutputGen and the keys used in model.compile.
box_output = Dense(4, name='box_out')(Xbox)
class_output = Dense(1, activation='sigmoid', name='class_out')(Xhand)

# Output order (class, box) is the order model.predict returns them in.
model = Model(inp, [class_output, box_output])
model.summary()

In [None]:
adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Keep the weights of the epoch with the lowest total validation loss.
save_best_model = tf.keras.callbacks.ModelCheckpoint(
    filepath='./best_hand_detection.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# Shrink the learning rate when the validation box loss stops improving.
# `factor` is passed as a plain Python float (the callback multiplies the
# current learning rate by it) instead of a TensorFlow tensor.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_box_out_loss",
    factor=float(np.exp(-0.15)),
    patience=2,
    verbose=1,
    mode="min",
    min_lr=0.0004,
)

# One loss per output: masked MSE for the box, binary cross-entropy for the
# class; accuracy is tracked for the class output only.
model.compile(
    loss={'box_out':custom_mse, 'class_out':'binary_crossentropy'},
    optimizer=adam,
    metrics={'class_out':'accuracy'}
)

In [None]:
# Train for EPOCHS epochs, validating on the held-out split after each one.
# Callbacks: learning-rate reduction, a per-epoch prediction plot, and
# checkpointing of the best model.
history = model.fit(
    custom_train_gen,
    epochs=EPOCHS,
    validation_data=custom_test_gen,
    callbacks=[
        reduce_lr,
        VisualOutput(),
        save_best_model
    ]
)

In [None]:
# Loss Graph
# Plots the per-epoch training and validation loss recorded by model.fit.

%matplotlib inline

# NOTE(review): 'loss' / 'val_loss' are the model's overall loss, and the
# model is compiled with a box MSE and a class binary cross-entropy - confirm
# whether the "(MSE)" in the title below is still accurate.
loss = history.history['loss']
val_loss = history.history['val_loss']

# One x position per recorded epoch.
epochs = range(len(loss))

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss (MSE)')
plt.legend()

plt.show()

In [None]:
# Save model for later use
# Note: this saves the model as it is after the last epoch; the best
# checkpoint is written separately by the ModelCheckpoint callback.

# for python apps (& APIs)
model.save("Hand Detection.h5")

# for mobile apps & microcontrollers (TensorFlow Lite)
# Convert the in-memory Keras model and write the resulting bytes to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open("Hand Detection Lite.tflite", 'wb') as file:
    file.write(tflite_model)

# How to use the model with a camera (and as an API)

1- For APIs and Python apps
Download 'Hand Detection.h5'.

2- For non-Python apps (e.g. mobile, embedded)
Download the Lite version ('Hand Detection Lite.tflite')
and use a TensorFlow Lite package for your platform (e.g. package:tflite for Flutter & Java).

To try the Python model with a webcam, run the following code in the same folder as the model.

In [None]:
# import cv2
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras import backend as K
# from tensorflow.keras.models import load_model

# # Function to calculate MSE Loss function
# # for samples where object exists
# def custom_mse(y_true, y_pred):
#     mask = K.not_equal(K.sum(y_true, axis=1), 0.0)
#     y_true_custom = y_true[mask]
#     y_pred_custom = y_pred[mask]
#     mse = tf.keras.losses.MeanSquaredError()
#     result = mse(y_true_custom, y_pred_custom) # * 0.3
#     return result

# model = load_model("Hand Detection.h5", custom_objects={'custom_mse':custom_mse})

# IMG_HEIGHT = 416
# IMG_WIDTH = 416

# cam = cv2.VideoCapture(0)
# while True:
#     ret, frame = cam.read()
#     if ret:
        
#         # How to use as API (given img:frame as input)
#         image = cv2.resize(frame, (IMG_WIDTH, IMG_HEIGHT))
#         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#         image = image / 255.
#         yp_class, yp_box = model.predict(np.array([image]))
#         x, y, w, h= yp_box[0]        
#         # Map the predicted box from the network's input size back to the frame size
#         x, w = round(x / (IMG_WIDTH / frame.shape[1])), round(w / (IMG_WIDTH / frame.shape[1]))
#         y, h = round(y / (IMG_HEIGHT / frame.shape[0])), round(h / (IMG_HEIGHT / frame.shape[0])) # / 560
#         # END: how to use as API
        
#         if yp_class[0] >= 0.5:
#              frame= cv2.rectangle(frame, (x, y), (x+w, y+h), thickness=2, color=(0,255,0))
#         cv2.imshow("mywindow", frame)
#     if cv2.waitKey(1) == ord('q'):
#         break
# cam.release()
# cv2.destroyAllWindows()