In [1]:

import pandas as pd
from pathlib import Path
from typing import List, Tuple
import numpy as np
import skvideo.io  


from video_download import download_partial_video_from_youtube



In [2]:
path = Path("../data/")

In [3]:
path.mkdir(exist_ok=True)

In [4]:
df_train = pd.read_csv(path/"avspeech_train.csv", names=["id", "start", "end", "x_center", "y_center"], header=None)

In [5]:
df_train.head()

Unnamed: 0,id,start,end,x_center,y_center
0,CJoOwXcjhds,233.266,239.367,0.780469,0.670833
1,AvWWVOgaMlk,90.0,93.566667,0.586719,0.311111
2,Y8HMIm8mdns,171.607767,174.607767,0.505729,0.240741
3,akwvpAiLFk0,144.68,150.0,0.698438,0.288889
4,Swss72CHSWg,90.023267,97.2972,0.230729,0.20463


In [8]:
for i, row in df_train.sample(10000).iterrows():
    if row.end - row.start < 3:
        continue
    try:
        download_partial_video_from_youtube(str(path), file_base_name=row.id, youtube_id=row.id, start_time=row.start, duration=3)
    except FileExistsError as e:
        print(e)
    except FileNotFoundError as e:
        print(e)
    except TimeoutExpired as e:
        print(e)

Could not download Video! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not get audio and video URL! (Return code was not 0)
../data/f_a0mqZ3cu8.mp4 already exists!
Could not get audio and video URL! (Return code was not 0)
Could not download Video! (Return code was not 0)
Could not download Video! (Re

NameError: name 'TimeoutExpired' is not defined

In [6]:
vids = list(path.iterdir())

In [17]:
file = vids[2]

In [18]:
file

PosixPath('../data/lZhHLNsi1WI.mp4')

In [60]:
import os
from pathlib import Path
from typing import Optional, Union

from facenet_pytorch import MTCNN, InceptionResnetV1
import torch


In [204]:
class FaceCropper:
    """
    Class that provides utility methods to crop a face out of a video.
    
    Attributes:
        out_dir: directory to save the cropped videos to
        num_detect_points: the number of frames for which face detection is performed.
        model (torch.Module): a face detection network
    
    """
    
    def __init__(self, out_dir: Optional[str] = "./", num_detect_points:Optional[int]=2, device: Optional[torch.device]=None):
        """
        Constructor of FaceCropper
        
        Arguments:
            out_dir: directory to save the cropped videos to
            num_detect_points: the number of frames for which face detection is performed.
                Has to have a minimum number of 2.
            device: torch device for the model
            
        """
        
        if device is None:
            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            device = device
            
        if num_detect_points < 2:
            raise ValueError("Face detection has to be performed for at least two frames in order to interpolate its position")
        
        self.out_dir = out_dir
        self.num_detect_points = num_detect_points
        self.model = MTCNN(keep_all=True, device=device)
            
    
    def cut_face_from_video(self, filename: Union[str, Path]):
        """
        TODO WIP
        """
        
        filename = Path(filename)
        
        # Load video
        videodata = skvideo.io.vread(filename)
        
        frame_count = len(videodata)
        
        face_detect_frame_idxs = np.linspace(0, frame_count - 1, min(self.num_detect_points, frame_count), dtype=np.int)
        
        # Detect faces in first and last frame
        boxes, _ = self.model.detect(videodata[face_detect_frame_idxs])
        for b in boxes:
            # Make sure that there is exactly one face detected
            if b is None or len(b) != 1:
                return 1
        
        
        # Interpolate center coordinates in between two frames for
        # which face detection has been performed
        center_coords_interp = []
        
        for box_start, box_end, interp_start_idx, interp_end_idx in zip(boxes[:-1], boxes[1:], face_detect_frame_idxs[:-1], face_detect_frame_idxs[1:]):

            # We select index 0 because we already checked
            coords_start = self.get_center(box_start[0])
            coords_end = self.get_center(box_end[0])
            
            num_frames = interp_end_idx - interp_start_idx
            
            center_coords_interp.extend(self.interpolate_coords(coords_start, coords_end, num_frames))
        
        # Determine height and width from the face detected in the first frame
        height, width = self.get_height_width(boxes[0][0])
        cropped_frames = np.zeros((len(videodata), height, width, 3), dtype=np.int)
        
        _, video_height, video_width, _ = videodata.shape

        # Crop the faces out of the frames using the interpolated center coordinates
        # and the height and widht determined from the first frame
        for frame_idx, (frame, center) in enumerate(zip(videodata, center_coords_interp)):
            upper = center[0] - height // 2
            lower = upper + height
            left = center[1] - width // 2
            right = left + width
            
            #pdb.set_trace()

            cropped_face = self.pad_crop(
                frame,
                upper,
                lower,
                left,
                right,
                video_height,video_width,
                filename
            )

            cropped_frames[frame_idx, ...] = cropped_face

        skvideo.io.vwrite(os.path.join(self.out_dir, f"cropped_{filename.name}"), cropped_frames)
        return 0

    @staticmethod
    def interpolate_coords(coords_start: Tuple[int], coords_end: Tuple[int], total_steps, step_size=1) -> List[Tuple[int]]:
        """
        TODO WIP
        """
        t = [0, total_steps]
        x = [coords_start[1], coords_end[1]]
        y = [coords_start[0], coords_end[0]]

        x_interp = np.interp(np.arange(0, total_steps, step_size), t, x)
        y_interp = np.interp(np.arange(0, total_steps, step_size), t, y)

        return [(int(y), int(x)) for x, y in zip(x_interp, y_interp)]
    
    @staticmethod
    def pad_crop(frame: np.array, upper, lower, left, right, frame_height, frame_width, filename):
        
        delta_upper = abs(min(0, upper))
        delta_lower = max(0, lower - frame_height)
        delta_left = abs(min(0, left))
        delta_right = max(0, right - frame_width)
        
        # TODO remove this after debugging
        if delta_upper > 0 or delta_lower > 0 or delta_left > 0 or delta_right > 0:
            print(filename)
        
        cropped_face = frame[upper:lower, left:right]
        
        return np.pad(
            cropped_face,
            ((delta_upper, delta_lower), (delta_left, delta_right), (0, 0)),
            'constant', constant_values=0
        )

    @staticmethod
    def get_center(coords: np.array):
        """
        TODO WIP
        """
        if len(coords) != 4:
            raise ValueError("This should have been 4 coordinates")
        
        left = coords[0]
        right = coords[2]
        lower = coords[1]
        upper = coords[3]

        return int((lower + upper) // 2), int((left + right) // 2)

    @staticmethod
    def get_height_width(coords: np.array):
        """
        TODO WIP
        """
        if len(coords) != 4:
            raise ValueError("This should have been 4 coordinates")
            
        left = coords[0]
        right = coords[2]
        lower = coords[1]
        upper = coords[3]

        return int(upper - lower), int(right - left)

In [205]:
cutter = FaceCropper(num_detect_points=10)

In [186]:
from glob import glob
from functools import partial

In [187]:
files = glob("../data/*.mp4")

In [206]:
cutter.cut_face_from_video("../data/UoW-O1khURA.mp4")

../data/UoW-O1khURA.mp4


ValueError: could not broadcast input array from shape (90,162,3) into shape (230,162,3)

In [188]:
list(map(cutter.cut_face_from_video, files))

../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/_rl4qZPf4ac.mp4
../data/UoW-O1khURA.mp4


ValueError: could not broadcast input array from shape (90,162,3) into shape (230,162,3)

In [150]:
%debug

> [0;32m<ipython-input-136-f0b236fa5909>[0m(86)[0;36mcut_face_from_video[0;34m()[0m
[0;32m     84 [0;31m            [0mright[0m [0;34m=[0m [0mleft[0m [0;34m+[0m [0mwidth[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     85 [0;31m[0;34m[0m[0m
[0m[0;32m---> 86 [0;31m            [0mcropped_frames[0m[0;34m[[0m[0mframe_idx[0m[0;34m,[0m [0;34m...[0m[0;34m][0m [0;34m=[0m [0mframe[0m[0;34m[[0m[0mupper[0m[0;34m:[0m[0mlower[0m[0;34m,[0m [0mleft[0m[0;34m:[0m[0mright[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     87 [0;31m[0;34m[0m[0m
[0m[0;32m     88 [0;31m        [0mskvideo[0m[0;34m.[0m[0mio[0m[0;34m.[0m[0mvwrite[0m[0;34m([0m[0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mout_dir[0m[0;34m,[0m [0;34mf"cropped_{filename.name}"[0m[0;34m)[0m[0;34m,[0m [0mcropped_frames[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> len(videodata)
75
ipdb> len(cropped_frame