<a href="https://colab.research.google.com/github/ADA-SITE-JML/sign-lang/blob/main/jamal/Data_Organizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd 
import cv2
import torch

In [2]:
import sys
import subprocess

def pip_install(package):
  """Install `package` with pip, using the interpreter that runs this notebook.

  Raises subprocess.CalledProcessError when pip exits with a non-zero status.
  """
  # Going through sys.executable targets the kernel's own environment.
  command = [sys.executable, '-m', 'pip', 'install']
  command.append(package)
  subprocess.check_call(command)

In [3]:
# Third-party packages that the Colab runtime does not ship by default.
# Optional and currently disabled: 'opencv_transforms'
# (https://github.com/jbohnslav/opencv_transforms)
for required_package in ('mediapipe', 'pytorchvideo'):
  pip_install(required_package)

In [4]:
# Root of the dataset on the mounted Google Drive.
drive_folder = '/content/drive/MyDrive/SLR/Data/'
# os.path.join is used for every derived path: drive_folder already ends with
# '/', so plain concatenation with '/Video' produced 'Data//Video'.
video_folder = os.path.join(drive_folder, 'Video')
train_csv_path = os.path.join(drive_folder, 'sentences_all.csv')
camera_source = 'Cam2' # Cam1 - side-top, Cam2 - front
output_folder = os.path.join(drive_folder, 'jamal', 'Video_features')

# Mount Google Drive so that the paths above become readable/writable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Number of frames every clip is trimmed or padded to.
max_frames = 64

# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda:0'
else:
  device = 'cpu'
print('Device:', device)

Device: cuda:0


In [6]:
import mediapipe as mp
import torchvision
from torchvision import transforms

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)

# keeps only informative frames
def keep_frames_with_hands(video_data, crop_size: int = None,
                           mp_min_detection_confidence: float = 0.7, mp_min_tracking_confidence: float = 0.8): # initially 0.5, 0.2 / 0.5 0.7
  """Keep only the frames in which MediaPipe detects at least one hand.

  Args:
    video_data: iterable of frames; each frame is permuted with (2, 0, 1) and
      passed to MediaPipe through `.numpy()`, i.e. an HWC CPU tensor per frame.
    crop_size: when set, every kept frame is center-cropped to
      crop_size x crop_size; when None, frames keep their original size.
    mp_min_detection_confidence: forwarded to MediaPipe Hands.
    mp_min_tracking_confidence: forwarded to MediaPipe Hands.

  Returns:
    Float tensor on `device` with the kept frames stacked along dim 0 in
    channel-first layout, pixel values divided by 255. When no frame
    qualifies, an empty tensor with a leading dimension of 0 is returned.
  """
  transform = Compose([CenterCrop(crop_size)]) if crop_size else None

  kept_frames = []
  # The context manager closes the MediaPipe solution once processing is done
  # instead of leaving one open detector behind per processed video.
  with mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=2,
                                min_detection_confidence=mp_min_detection_confidence,
                                min_tracking_confidence=mp_min_tracking_confidence) as hands:
    for frame in video_data:
      hand_results = hands.process(frame.numpy())
      if hand_results.multi_hand_landmarks is None:
        continue

      frame_chw = frame.permute(2, 0, 1)
      if transform is not None:
        frame_chw = transform(frame_chw)

      # Collect the frames and concatenate once at the end; concatenating
      # inside the loop re-copies everything gathered so far on every hit.
      kept_frames.append(torch.unsqueeze(frame_chw, dim=0).to(device) / 255.0)

  if kept_frames:
    return torch.cat(kept_frames, 0)

  # No frame qualified: return an empty batch whose spatial size matches what
  # kept frames would have had.
  if crop_size:
    height = width = crop_size
  elif len(getattr(video_data, 'shape', ())) == 4:
    height, width = video_data.shape[1], video_data.shape[2]
  else:
    # Historical default resolution used by this notebook.
    height, width = 960, 1280
  return torch.zeros((0, 3, height, width)).to(device)



In [7]:
from torchvision.transforms import (
    Compose,
    CenterCrop,
    Resize
)

def apply_video_transforms(resize_size: int = 224):
    """Build the transform pipeline applied to the extracted frames.

    The pipeline currently consists of a single resize to a square of
    resize_size x resize_size pixels.
    """
    target_size = (resize_size, resize_size)
    # Steps that were tried earlier and are disabled for now:
    # UniformTemporalSubsample(25), ColorJitter(brightness=0.5, contrast=0.5),
    # RandomShortSideScale(min_size=256, max_size=512), RandomHorizontalFlip(p=0.5)
    pipeline_steps = [Resize(size=target_size)]
    return Compose(pipeline_steps)

In [8]:
def get_video_frames(video_path):
  """Load a video and return its hand-containing frames, fitted to max_frames.

  The video is read with torchvision, frames without detected hands are
  dropped (center crop 600), and the rest go through apply_video_transforms().
  The clip is then fitted to `max_frames` along dim 0:
    * slightly too long (< 2*max_frames): keep the centered max_frames window;
    * much too long: drop 5 frames at each end and subsample with a fixed step;
    * too short: append frames taken from the end of the clip.

  Returns:
    Tensor with `max_frames` frames along dim 0. If no frame with hands was
    found, the empty tensor is returned unchanged.
  """
  reader, _, _ = torchvision.io.read_video(video_path, output_format="THWC")
  hands_only = keep_frames_with_hands(reader, crop_size=600).to(device) # initially: 960

  apply_trans = apply_video_transforms()
  hands_only = apply_trans(hands_only)

  n = hands_only.shape[0]

  # When frames are more than we need but not that much (just trim it from the start and end)
  if (n > max_frames) and (n < 2*max_frames):
    left = (n-max_frames)//2
    # Exactly max_frames frames; the previous end index (n-left-1) dropped one
    # frame too many whenever the surplus was even.
    hands_only_new = hands_only[left:left+max_frames,:,:,:]
  # If we have much more frames than we need
  elif (n > max_frames):
    # Cut 5 frames from start/end and then skip every n-th
    slice_step = ((n-10)//max_frames+1)
    hands_only_new = hands_only[5:(n-5):slice_step,:,:,:]
  else:
    hands_only_new = hands_only

  missing = max_frames - hands_only_new.shape[0]

  # If we have less frames than we need: fill with the last frames of the clip.
  if missing > 0 and n > 0:
    compliment_arr = hands_only[-missing:,:,:,:]
    # A clip shorter than the gap cannot supply enough tail frames; top up by
    # repeating its final frame so the result always has max_frames frames.
    shortfall = missing - compliment_arr.shape[0]
    if shortfall > 0:
      last_frame = hands_only[-1:,:,:,:]
      compliment_arr = torch.cat((compliment_arr, last_frame.repeat(shortfall, 1, 1, 1)), 0)
    hands_only_new = torch.cat((hands_only_new, compliment_arr), 0)

  # Returned on every path; previously the return sat inside the padding
  # branch, so clips that needed no padding produced None.
  return hands_only_new

def tensor2list(mdim_tensor):
  """Split a tensor along its first dimension into a Python list of slices."""
  first_dim = mdim_tensor.shape[0]
  return [mdim_tensor[position] for position in range(first_dim)]

In [9]:
# Selects the feature extractor prepared by this cell:
#   'i3d'          -> I3D extractor from the v-iashin/video_features repository
#   anything else  -> torchvision SqueezeNet 1.1 wrapped as a feature extractor
# Only one of `extractor` / `pretrained_model` is defined after this cell runs,
# depending on the branch taken.
feature_type = 'i3d' # or 'pretrained'

if feature_type == 'i3d':
  # For I3D features
  # Fetch the extractor code and the omegaconf version pinned for it.
  !git clone https://github.com/v-iashin/video_features.git
  !pip install omegaconf==2.0.6

  # The imports below are resolved relative to the cloned repository, so the
  # working directory has to be changed first.
  %cd video_features

  from models.i3d.extract_i3d import ExtractI3D
  from models.raft.raft_src.raft import RAFT, InputPadder
  from utils.utils import build_cfg_path
  from omegaconf import OmegaConf

  # Load and patch the config
  args = OmegaConf.load(build_cfg_path(feature_type))
  # args.show_pred = True
  # args.stack_size = 24
  # args.step_size = 24
  # args.extraction_fps = 30
  args.flow_type = 'raft' # 'pwc' is not supported on Google Colab (cupy version mismatch)
  # args.streams = 'flow'

  # Load the model
  extractor = ExtractI3D(args)
else:
  from torchvision.models import squeezenet1_1
  from torchvision.models.feature_extraction import create_feature_extractor

  # SqueezeNet 1.1 with pretrained weights, moved to the selected device.
  model = squeezenet1_1(pretrained=True).to(device)
  # Expose the graph node 'features.12.cat' under the output key 'layer12';
  # frame_to_feats reads the features through that key.
  return_nodes = {
        'features.12.cat': 'layer12'
        }
  pretrained_model = create_feature_extractor(model, return_nodes=return_nodes).to(device)
  # Put the extractor in evaluation mode.
  pretrained_model.eval()

def frame_to_feats(pretrained_model, frames):
  """Run frames through the feature extractor and flatten each frame's features.

  Args:
    pretrained_model: callable returning a dict whose 'layer12' entry is a
      4-D feature tensor (first dimension = frames).
    frames: frame tensor; singleton dimensions are squeezed away before the
      forward pass.

  Returns:
    2-D tensor on `device`: one row per frame, holding that frame's
    flattened 'layer12' features.
  """
  # Pure inference: without autograd bookkeeping no graph is retained and the
  # returned (and later saved) features do not require grad.
  with torch.no_grad():
    features = pretrained_model(frames.squeeze())['layer12'].to(device=device)

  feat_shape = features.shape
  # Collapse the three trailing dimensions into a single feature axis.
  feat_flat = torch.reshape(features, (feat_shape[0], feat_shape[1]*feat_shape[2]*feat_shape[3]))
  return feat_flat

Cloning into 'video_features'...
remote: Enumerating objects: 1299, done.[K
remote: Counting objects: 100% (420/420), done.[K
remote: Compressing objects: 100% (189/189), done.[K
remote: Total 1299 (delta 264), reused 322 (delta 215), pack-reused 879[K
Receiving objects: 100% (1299/1299), 288.63 MiB | 17.19 MiB/s, done.
Resolving deltas: 100% (671/671), done.
Updating files: 100% (177/177), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting omegaconf==2.0.6
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Installing collected packages: omegaconf
Successfully installed omegaconf-2.0.6
/content/video_features


In [None]:
import gc

REQ_FEATS = 5 # required number of I3D feature vectors per video
FIRST_SENTENCE_ID = 360 # sentences with a smaller id are skipped

# read csv file
# %cd $drive_folder
sentences = pd.read_csv(train_csv_path)

# generate (video file name, encoding list)
# Good recommendation on not to iterate over DFs like this:
# https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# but it's not my case - I have fewer rows and one to many with videos.
# NOTE: this frame is created here but not filled in by this cell.
df = pd.DataFrame(columns=["id", "video_file","encoding"])

for _, row in sentences.iterrows():
  # The first CSV column holds the sentence id. Positional access goes through
  # .iloc because the row Series is labelled by column name.
  sentence_id = int(row.iloc[0])

  if sentence_id < FIRST_SENTENCE_ID:
    continue

  # there is a grouping of videos in production.
  pre_folder = '/1-250/' if (sentence_id < 251) else '/'

  video_dir = video_folder+'/' + camera_source + pre_folder + str(sentence_id)
  # A missing folder is reported and skipped instead of aborting the whole run.
  if not os.path.isdir(video_dir):
    print('There was an error: ', video_dir, 'video folder not found')
    continue

  # Output files of one sentence are numbered 1, 2, ... in processing order.
  fidx = 1

  if str(device).startswith('cuda'):
    torch.cuda.empty_cache()

  # Sorted so that the numbering of the saved files is reproducible;
  # os.listdir returns entries in arbitrary order.
  for filename in sorted(os.listdir(video_dir)):
    f = os.path.join(video_dir, filename)
    # checking if it is a file
    if not os.path.isfile(f):
      continue

    print(sentence_id, f)

    try:
      feat_dir = output_folder + '/' + str(sentence_id)
      if feature_type == 'i3d':
        # Passing pre-selected frames to the extractor was tried and did not work
        # (get_video_frames + tensor2list + extractor.run_on_a_stack with
        # InputPadder), so the extractor reads the video file itself.
        feature_dict = extractor.extract(f)

        f_num, f_size = feature_dict['rgb'].shape

        feats_rgb = torch.from_numpy(feature_dict['rgb'])
        feats_flow = torch.from_numpy(feature_dict['flow'])

        # Trim extra features: keep the last REQ_FEATS rows.
        # Trim shall be applied on each stream, since we need an equal number of
        # RGB and FLOW features; trimming after concatenation would favour one.
        if f_num > REQ_FEATS:
          feats_rgb = feats_rgb[-REQ_FEATS:,:]
          feats_flow = feats_flow[-REQ_FEATS:,:]

        # Concatenate the features
        feats = torch.cat((feats_rgb,feats_flow),1)

        # Apply zero padding if needed.
        # Zero padding is done after the concatenation - zero rows shall come at
        # the end, not after each type (RGB and FLOW).
        if f_num < REQ_FEATS:
          padarr = torch.zeros((REQ_FEATS-f_num,f_size*2))
          feats = torch.cat((feats,padarr),0)
      else:
        frames = get_video_frames(f)
        feats = frame_to_feats(pretrained_model,frames)
        print('feat shape:',feats.shape)

      # Save the features
      os.makedirs(feat_dir, exist_ok = True)
      torch.save(feats, os.path.join(feat_dir, str(fidx) + '.pt'))

      gc.collect()
      fidx += 1
    except Exception as exp:
      # Best effort: report the failing video and carry on with the next one.
      print('There was an error: ',f,exp)
      continue

