<a href="https://colab.research.google.com/github/ADA-SITE-JML/sign-lang/blob/main/jamal/Data_Organizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd 
import cv2
import torch

In [None]:
import sys
import subprocess

def pip_install(package):
  """Install `package` with pip, using the interpreter that runs this notebook.

  Invoking `<python> -m pip` through `sys.executable` targets the environment
  of the current kernel. `subprocess.check_call` raises
  `subprocess.CalledProcessError` when pip exits with a non-zero status.
  """
  pip_command = [sys.executable, '-m', 'pip', 'install', package]
  subprocess.check_call(pip_command)

In [None]:
# Packages installed at runtime; both are imported further down in the file.
for package_name in ('mediapipe', 'pytorchvideo'):
  pip_install(package_name)

# Optional OpenCV-based transforms (currently not installed):
# https://github.com/jbohnslav/opencv_transforms
#pip_install('opencv_transforms')

In [None]:
# Dataset root on the mounted Google Drive. The path is relative, so it only
# resolves when the working directory is the parent of the mount point
# ('/content', see drive.mount below).
drive_folder = 'drive/MyDrive/SLR/Data/'

# os.path.join copes with the trailing slash of drive_folder, so the derived
# paths no longer contain a doubled separator ('Data//Video').
video_folder = os.path.join(drive_folder, 'Video')
train_csv_path = os.path.join(drive_folder, 'sentences_all.csv')
camera_source = 'Cam2' # Cam1 - side-top, Cam2 - front
output_folder = os.path.join(drive_folder, 'jamal/Video_features')

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Number of frames every processed clip is trimmed or padded to.
max_frames = 64

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = 'cuda:0'
else:
  device = 'cpu'
print('Device:', device)

Device: cuda:0


In [None]:
import mediapipe as mp
from torchvision import transforms

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)

# keeps only informative frames
def keep_frames_with_hands(video_data, crop_size: int = None,
                           mp_min_detection_confidence: float = 0.8, mp_min_tracking_confidence: float = 0.9): # initially 0.5, 0.2 / 0.5 0.7
  """Keep only the frames in which MediaPipe detects at least one hand.

  Args:
    video_data: iterable of frames in (H, W, C) layout, e.g. a (T, H, W, C)
      tensor. Each frame must support `.numpy()` and `.permute()`.
      Assumes uint8 RGB values in 0..255 -- TODO confirm against the caller.
    crop_size: when truthy, every kept frame is center-cropped to
      `crop_size` x `crop_size`; otherwise frames keep their original size.
    mp_min_detection_confidence: forwarded to MediaPipe Hands.
    mp_min_tracking_confidence: forwarded to MediaPipe Hands.
      NOTE(review): the MediaPipe documentation states that this value is
      ignored when `static_image_mode=True`, which is what is used here.

  Returns:
    A tensor of shape (K, 3, H, W) on `device`, holding the K kept frames
    divided by 255. K is 0 when no frame contains a hand.
  """
  # `transforms` is imported next to this function, so the crop does not rely
  # on names (Compose / CenterCrop) that are only imported further down.
  crop = transforms.CenterCrop(crop_size) if crop_size else None

  kept_frames = []
  # The context manager releases the native MediaPipe resources on exit.
  with mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=2,
                                min_detection_confidence=mp_min_detection_confidence,
                                min_tracking_confidence=mp_min_tracking_confidence) as hands:
    for frame in video_data:
      hand_results = hands.process(frame.numpy())
      if hand_results.multi_hand_landmarks is None:
        continue

      # (H, W, C) -> (C, H, W), the layout torchvision transforms expect.
      chw_frame = frame.permute(2, 0, 1)
      if crop is not None:
        chw_frame = crop(chw_frame)
      kept_frames.append(chw_frame)

  if not kept_frames:
    # Nothing to stack: return an empty batch with a sensible spatial size.
    if crop_size:
      height = width = crop_size
    elif getattr(video_data, 'ndim', 0) == 4:
      height, width = video_data.shape[1], video_data.shape[2]
    else:
      height, width = 960, 1280  # historical default resolution
    return torch.zeros((0, 3, height, width)).to(device)

  # Stack once instead of concatenating inside the loop (which re-copies the
  # whole buffer on every kept frame), then scale to the 0..1 range.
  return torch.stack(kept_frames).to(device) / 255.0

In [None]:
def apply_video_transforms(resize_size: int = 224):
    """Build (not apply) the transform pipeline used on the kept frames.

    Args:
        resize_size: side length of the square the frames are resized to.

    Returns:
        A `Compose` whose only active step resizes the input to
        `resize_size` x `resize_size`.
    """
    target_size = (resize_size, resize_size)
    steps = [
        Resize(size=target_size),
        # Steps that are currently disabled:
        # UniformTemporalSubsample(25),
        #ColorJitter(brightness=0.5, contrast=0.5),
        # RandomShortSideScale(min_size=256, max_size=512),
        #RandomHorizontalFlip(p=0.5),
    ]
    return Compose(steps)

In [None]:
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
    Permute,   
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    CenterCrop,
    RandomAdjustSharpness,
    Resize,
    ColorJitter,
    RandomHorizontalFlip
)

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)

In [None]:
def get_video_frames(video_path):
  """Load a video and return `max_frames` resized frames that show hands.

  Steps: decode the video, drop frames without detected hands (center-cropped
  to 600 px), resize with `apply_video_transforms()`, then trim or pad along
  the frame axis so that the clip has `max_frames` frames.

  Args:
    video_path: path of the video file to decode.

  Returns:
    A tensor of shape (max_frames, C, H, W) on `device`. When no frame with
    hands is found there is nothing to pad with, so fewer frames (possibly
    zero) are returned.
  """
  # Local import: the file only does `from torchvision import ...`, which
  # does not bind the name `torchvision` itself.
  import torchvision.io

  reader, _, _ = torchvision.io.read_video(video_path, output_format="THWC")
  hands_only = keep_frames_with_hands(reader, crop_size=600).to(device) # initially: 960

  apply_trans = apply_video_transforms()
  hands_only = apply_trans(hands_only)

  n = hands_only.shape[0]

  # When frames are more than we need but not that much (just trim it from the start and end)
  if (n > max_frames) and (n < 2*max_frames):
    left = (n-max_frames)//2
    # Take exactly max_frames frames from the middle of the clip.
    hands_only_new = hands_only[left:left+max_frames,:,:,:]
  # If we have much more frames than we need
  elif (n > max_frames):
    # Cut 5 frames from start/end and then skip every n-th
    slice_step = ((n-10)//max_frames+1)
    hands_only_new = hands_only[5:(n-5):slice_step,:,:,:]
  else:
    hands_only_new = hands_only

  # If we have less frames than we need: fill with the last frames of the clip
  missing = max_frames - hands_only_new.shape[0]
  if missing > 0 and n > 0:
    compliment_arr = hands_only[-missing:,:,:,:]
    still_missing = missing - compliment_arr.shape[0]
    if still_missing > 0:
      # The clip is shorter than the gap: repeat its final frame for the rest.
      repeated_last = hands_only[-1:,:,:,:].expand(still_missing, -1, -1, -1)
      compliment_arr = torch.cat((compliment_arr, repeated_last), 0)
    hands_only_new = torch.cat((hands_only_new, compliment_arr), 0)

  # Returned on every path; previously the return sat inside the padding
  # branch, so clips that needed no padding produced None.
  return hands_only_new

In [None]:
from torchvision.models import squeezenet1_1
from torchvision.models.feature_extraction import create_feature_extractor

# SqueezeNet 1.1 with pretrained weights, placed on the selected device.
model = squeezenet1_1(pretrained=True)
model = model.to(device)

# Expose the output of graph node 'features.12.cat' under the key 'layer12'.
return_nodes = {'features.12.cat': 'layer12'}

# Wrap the network so that a forward pass returns only the requested node,
# and switch it to evaluation mode.
pretrained_model = create_feature_extractor(model, return_nodes=return_nodes)
pretrained_model = pretrained_model.to(device)
pretrained_model.eval()

def frame_to_feats(pretrained_model, frames):
  """Run the feature extractor on a clip and flatten the per-frame features.

  Args:
    pretrained_model: callable that maps a (N, C, H, W) batch to a dict
      containing the key 'layer12'.
    frames: tensor whose last three dimensions are (C, H, W). Any leading
      dimensions are merged into one batch axis of N frames.

  Returns:
    A tensor of shape (1, N, F), where F is the product of the three trailing
    dimensions of the 'layer12' output. It stays on the device the model
    produced it on and carries no autograd history.
  """
  # Merge leading dims into the batch axis. Unlike squeeze(), this keeps the
  # batch axis for a single-frame clip and still accepts (1, N, C, H, W).
  batch = frames.reshape(-1, *frames.shape[-3:])

  # Pure inference: skip building the autograd graph.
  with torch.no_grad():
    features = pretrained_model(batch)['layer12']

  # (N, c, h, w) -> (1, N, c*h*w)
  return features.flatten(start_dim=1).unsqueeze(0)



In [None]:
# Read the CSV file that lists the sentences.
sentences = pd.read_csv(train_csv_path)

# generate (video file name, encoding list)
# Good recommendation on not to iterate over DFs like this:
# https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# but it's not my case - I have fewer rows and one to many with videos.
df = pd.DataFrame(columns=["id", "video_file","encoding"])

for _, row in sentences.iterrows():
    # Column 0 holds the sentence id (column 2 holds the phrase text, which
    # is not needed for feature extraction). `.iloc` makes the positional
    # access explicit instead of relying on the deprecated integer fallback
    # of `row[0]`.
    sentence_id = int(row.iloc[0])

    # there is a grouping of videos in production.
    group_folder = '1-250' if (sentence_id < 251) else ''

    sentence_dir = os.path.join(video_folder, camera_source, group_folder, str(sentence_id))
    if not os.path.isdir(sentence_dir):
        # A missing folder should not abort the whole extraction run.
        print('Video folder not found: ', sentence_dir)
        continue

    # Features are stored under the same sentence id.
    feat_dir = os.path.join(output_folder, str(sentence_id))

    # iterate over video folders; sorted so that the 1.pt, 2.pt, ... numbering
    # is reproducible (os.listdir returns entries in arbitrary order)
    fidx = 1
    for filename in sorted(os.listdir(sentence_dir)):
        video_path = os.path.join(sentence_dir, filename)
        # checking if it is a file
        if not os.path.isfile(video_path):
            continue

        print(sentence_id, video_path)

        # Best-effort batch job: report the failing video and keep going.
        try:
            frames = get_video_frames(video_path)
            feats = frame_to_feats(pretrained_model, frames)

            os.makedirs(feat_dir, exist_ok = True)
            torch.save(feats, os.path.join(feat_dir, str(fidx) + '.pt'))
            # Only successfully processed videos consume an index.
            fidx += 1
        except Exception as exp:
            print('There was an error: ', video_path, exp)
            continue



2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-19 15-17-13.mp4




2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-19 15-45-12.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-21 17-25-09.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-22 12-11-21.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-23 11-56-14.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-25 13-04-30.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-25 13-30-11.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-26 12-58-07.mp4
There was an error:  drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-04-26 12-58-07.mp4 'NoneType' object has no attribute 'squeeze'
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-05-19 14-57-23.mp4
There was an error:  drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-05-19 14-57-23.mp4 'NoneType' object has no attribute 'squeeze'
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-05-19 14-58-55.mp4
2 drive/MyDrive/SLR/Data//Video/Cam2/1-250/2/2022-05-19 14-59-23.mp4
There was an error:  drive/MyDrive/SLR/Data/