# **0. Setup** 

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow.keras as keras

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

!pip install mediapipe
import mediapipe as mp

Collecting mediapipe
  Downloading mediapipe-0.8.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 52 kB/s 
Installing collected packages: mediapipe
Successfully installed mediapipe-0.8.9


In [None]:
from zipfile import ZipFile
# Path of the zip archive holding the MP4 clips (under /content/drive,
# presumably a mounted Google Drive -- confirm for your environment).
file_name = "/content/drive/MyDrive/Colab Notebooks/ASL/data/MP4.zip"
# Open the archive read-only; the context manager closes it afterwards.
with ZipFile(file_name, 'r') as zipp:
    print('Extracting all the files now...')
    # No target path is given, so the members are extracted relative to the
    # current working directory. Later cells read from "/content/MP4".
    zipp.extractall()
    print('Done!')

Extracting all the files now...
Done!


# **1. Data Pre-processing**

* **Extract Frames**
* **Extract and pre-process the keypoint locations**
* **Pad the keypoints**
* **Pre-process labels**
* **Do all of the above for 1 video**
* **Do all of the above for all videos**
* **Fix the keypoint padding**



## **Extract Frames**

In [None]:
def extract_frames(video_path):
    '''Read every frame of a video and return them as a list.

    Args:
        video_path: The path of the video.

    Returns:
        A list with one frame (as returned by `cap.read()`) per decoded
        frame, in reading order. The list is empty when the capture could
        not be opened or yields no frames.
    '''
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # `ret` is False once no further frame can be read.
                break
            frames.append(frame)
    finally:
        # Always free the underlying video handle, even if reading fails;
        # otherwise one handle leaks per processed video.
        cap.release()
    return frames

In [None]:
def remove_empty_frames(frames):
  '''Return the frames that are not completely zero-valued.

    Args:
        frames: a list of video frames

    Returns:
        A new list with the same frame objects, in the same order, minus
        every frame whose only distinct value is 0.
  '''
  def _is_blank(frame):
    # A frame counts as empty when 0 is the single distinct value in it.
    distinct = np.unique(frame)
    return len(distinct) == 1 and distinct[0] == 0

  return [frame for frame in frames if not _is_blank(frame)]

## **Extract and pre-process the keypoint locations**

In [None]:
def zip_three_lists(x, y, z):
  '''Combine three sequences into one array of (x, y, z) rows.

    Args:
        x: sequence that determines the number of rows.
        y: sequence indexed at every position of `x`.
        z: sequence indexed at every position of `x`.

    Returns:
        `np.array` with one `(x[i], y[i], z[i])` row per element of `x`.
  '''
  rows = [(x_val, y[idx], z[idx]) for idx, x_val in enumerate(x)]
  return np.array(rows)

In [None]:
def clean_keypoints(keypoints, precision=5):
  '''Parse x/y/z coordinates out of the text form of `keypoints`.

    The object is converted with `str()` and scanned for the letters
    "x", "y" and "z" that directly follow a space. For each hit, the
    `precision` characters starting three positions after the letter
    (i.e. right after "x: ") are converted to a float.

    Args:
        keypoints: any object whose `str()` lists coordinates in the form
            " x: <number>", " y: <number>", " z: <number>".
        precision: how many characters of each number are read, sign and
            decimal point included.

    Returns:
        The result of `zip_three_lists`: an array with one (x, y, z) row per
        parsed coordinate triple, or an empty array when nothing matched
        (e.g. `str(None)`).

    Raises:
        ValueError: if a sliced character window is not a valid float.
  '''
  text = str(keypoints)
  values = {"x": [], "y": [], "z": []}
  # Start at 1: index 0 has no preceding character, and text[-1] would
  # silently wrap around to the last character of the string.
  for i in range(1, len(text)):
    axis = text[i]
    if axis in values and text[i - 1] == " ":
      # NOTE(review): a fixed-width window truncates the number; a value
      # printed in scientific notation would be misread -- confirm the
      # text format never uses exponents.
      values[axis].append(float(text[i + 3: i + 3 + precision]))

  return zip_three_lists(values["x"], values["y"], values["z"])

In [None]:
def extract_pose(frame, output_shape=(224, 224, 3), output="list"):
  '''Run `mediapipe.solutions.pose.Pose()` on one frame and return its keypoint coordinates.

    Args:
        frame: a single image; it is converted with `cv2.COLOR_BGR2RGB`
            before being processed.
        output_shape: the frame is resized to
            `(output_shape[0], output_shape[1])` before being processed.
        output: "list" (default) returns a list of per-keypoint rows; any
            other value returns the array itself.

    Returns:
        The coordinates parsed by `clean_keypoints` from
        `results.pose_landmarks`; this is empty when nothing could be parsed.
        If any step raises, a zero-filled (33, 3) result is returned instead,
        in the container type selected by `output`.
  '''
  try:
    with mp.solutions.pose.Pose(static_image_mode=True, model_complexity=2, min_detection_confidence=0.5) as pose:
      image = cv2.resize(frame, (output_shape[0], output_shape[1]))
      # Convert the BGR image to RGB before processing.
      results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    location = clean_keypoints(results.pose_landmarks)
  except Exception:
    # Deliberate best-effort: a frame that cannot be processed is
    # represented by all-zero keypoints instead of aborting the whole video.
    location = np.zeros((33, 3))

  # Apply the requested container type to the fallback as well, so both
  # paths return the same kind of object.
  if output != "list":
    return location
  return list(location)

# Trial
# location1 = extract_pose(extract_frames("/content/MP4/(ON_THE)LEFT 01.mp4")[5], output="array")

## **Pad the keypoints**

In [None]:
def pad_coordinates(coordinates, maxlen=80, pad_value=-2.0):
  '''Pad (or crop) a sequence of per-frame keypoints to exactly `maxlen` frames.

    Shorter sequences get constant-valued frames added before and after the
    real frames; when the missing count is odd, the extra frame goes at the
    end. Longer sequences are cropped around their centre.

    Args:
        coordinates: numpy array whose first axis is the frame axis.
        maxlen: number of frames in the result.
        pad_value: value that fills the padding frames. The default -2.0 is
            the value the previous implementation produced (`ones - 3`).

    Returns:
        A list of `maxlen` per-frame arrays.
  '''
  frames = coordinates.shape[0]
  # Use the per-frame shape of the data; an empty input (no frames at all)
  # carries no such shape, so fall back to 33 keypoints x 3 coordinates.
  frame_shape = tuple(coordinates.shape[1:]) if coordinates.ndim == 3 else (33, 3)

  if frames > maxlen:
    # Too long: keep the central `maxlen` frames.
    skip = (frames - maxlen) // 2
    return list(coordinates[skip: skip + maxlen])

  difference = maxlen - frames
  # Integer split, so start + end == difference for even and odd counts.
  start = difference // 2
  end = difference - start
  pad_values_start = list(np.full((start,) + frame_shape, pad_value))
  pad_values_end = list(np.full((end,) + frame_shape, pad_value))
  return pad_values_start + list(coordinates) + pad_values_end

## **Pre-process labels**

In [None]:
 # Distinct label names: the part of every file name in /content/MP4 before
 # its first "0", stripped of surrounding whitespace. np.unique returns the
 # names sorted and without duplicates.
 labels = np.unique([name.split("0")[0].strip() for name in os.listdir("/content/MP4")])

In [None]:
def preprocess_label(label):
  '''Extract the label name from the path (or file name) of a video.

    The name is derived the same way the module-level `labels` array is
    built: the part of the file name before its first "0", stripped of
    surrounding whitespace. This works for any numbering suffix ("01",
    "02", ...) and does not depend on the directory part of the path.

    Args:
        label: a single string. the path or file name of a video, e.g.
            "/content/MP4/(ON_THE)LEFT 01.mp4"

    Returns:
        The label name as a string, e.g. "(ON_THE)LEFT".
  '''
  return os.path.basename(label).split("0")[0].strip()

In [None]:
#@title
# def encode_labels(label, video_length, labels_array=labels):
#   '''A function that converts a label to an array of numbers, that is appended to another array for `video_length` number of times
    
#     Args:
#         label: a single string. the label to a video
#         video_length: the length of the video to which the label corresponds to
#         labels_array: an array of all labels 
#   '''

#   labels_binary = []
#   bool_array = preprocess_label(label) == labels_array
#   numerical_labels = list(bool_array.astype(int))
#   for i in range(video_length):
#     labels_binary.append(numerical_labels)
#   return np.array(labels_binary)

In [None]:
def get_labels(label):
  '''Encode the label of a video as a 0/1 vector over the global `labels` array.

    Args:
        label: a single string, passed to `preprocess_label`.

    Returns:
        A list with one integer per entry of `labels`: 1 where the entry
        equals the pre-processed label, 0 elsewhere.
  '''
  matches = preprocess_label(label) == labels
  return list(matches.astype(int))

## **Do all of the above for 1 video**

In [None]:
# Full path of every entry found in the extracted /content/MP4 folder.
videos = [f"/content/MP4/{file_name_}" for file_name_ in os.listdir("/content/MP4")]

In [None]:
def preprocess_single_video(video_path):
  '''Turn one video into padded keypoint coordinates and its label vector.

    Args:
        video_path: The path of the video.

    Returns:
        A tuple `(x, y)`: `x` is the output of `pad_coordinates` for the
        per-frame keypoints, `y` is the list returned by `get_labels`.
  '''
  per_frame = []
  for frame in extract_frames(video_path):
    pose = extract_pose(frame)
    # An empty result means no keypoints were parsed for this frame; use an
    # all-zero 33x3 placeholder so every frame has the same shape.
    if np.array(pose).shape == (0, ):
      pose = list(np.zeros((33, 3)))
    per_frame.append(pose)

  padded = pad_coordinates(np.array(per_frame))
  target = list(get_labels(video_path))
  return padded, target

# Trial
# X, Y = preprocess_single_video(videos[9])

## **Do all of the above for all videos**

In [None]:
def preprocess_videos(video_paths):
  '''Pre-process several videos and collect their features and labels.

    Args:
        video_paths: iterable of video paths, each passed to
            `preprocess_single_video`.

    Returns:
        A tuple `(X, Y)` of lists holding, per video and in input order, the
        padded coordinates and the label vector.
  '''
  X, Y = [], []
  for count, path in enumerate(video_paths, start=1):
    features, target = preprocess_single_video(path)
    X.append(features)
    Y.append(target)
    # Progress indicator: number of videos finished so far.
    print(count)

  return X, Y

# Pre-process only the first five videos of the `videos` list.
X, Y = preprocess_videos(videos[:5])

Downloading model to /usr/local/lib/python3.7/dist-packages/mediapipe/modules/pose_landmark/pose_landmark_heavy.tflite
1
2
3
4
5


## **Fix the keypoint padding**

In [None]:
def standardize_shapes(ragged_X):
  '''Drop the last frame of every sample shaped (80, 33, 3) and stack all samples.

    Args:
        ragged_X: iterable of samples (sequences of per-frame keypoints).

    Returns:
        `np.array` built from the samples, where each sample of shape
        (80, 33, 3) has lost its final frame and every other sample is
        passed through unchanged.
  '''
  def _trim(sample):
    if np.array(sample).shape != (80, 33, 3):
      return sample
    # Remove the trailing frame so the sample has 79 frames.
    return list(sample)[:-1]

  return np.array([_trim(sample) for sample in ragged_X])

# Bring the samples to a common number of frames and stack them into one array.
X = standardize_shapes(X)
# One label vector per pre-processed video.
y = np.array(Y)

In [None]:
# Show the shapes of the feature array and the label array.
X.shape, y.shape

((5, 79, 33, 3), (5, 42))