In [1]:
import cv2
from facenet_pytorch import MTCNN

from pytube import YouTube
import cv2
import numpy as np

class VideoProcessor:
    def __init__(self, batch_size=50, skip_frames=1):
        self.batch_size = batch_size
        self.skip_frames = skip_frames

    def load_video_from_youtube(self, yt_link):
        self.yt_link = yt_link
        # Download the YouTube video and get the highest resolution stream
        yt_video = YouTube(self.yt_link)
        stream = yt_video.streams.get_highest_resolution()

        # Open the video stream using OpenCV
        self.video = cv2.VideoCapture(stream.url)

        # Get available and used resolution - Debugging
        #self.available_resolutions = [streams.resolution for streams in yt_video.streams.filter(type="video", progressive=True)]
        #self.used_resolution = stream.resolution

        # Get the number of frames in the video
        self.frame_count = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Get the frame rate of the video
        self.fps = int(self.video.get(cv2.CAP_PROP_FPS))

        # Get the height and width of the video frames
        self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))

        # Calculate the total number of frames to process after skipping frames
        self.total_frames = len(range(0, self.frame_count, self.skip_frames))

        # Calculate the number of batches required to process all the frames
        self.num_batches = int(np.ceil(self.total_frames / self.batch_size))

    def get_batches(self):
        # Initialize an empty numpy array to hold the frames
        frames = np.empty((self.batch_size, self.height, self.width, 3), np.dtype('uint8'))

        self.frames_read = 0

        # Read the frames in batches and fill up the numpy array
        for batch_start in range(0, self.frame_count, self.batch_size * self.skip_frames):
            batch_end = min(batch_start + (self.batch_size * self.skip_frames), self.frame_count)
            batch_index = 0

            for i in range(batch_start, batch_end):
                ret, frame = self.video.read()
                self.frames_read +=1
                if not ret:
                    break

                if i % self.skip_frames == 0:
                    frames[batch_index] = frame
                    batch_index += 1

            # Resize the numpy array to fit the actual number of frames in the batch
            if batch_index < self.batch_size:
                frames = frames[:batch_index]

            # Yield the current batch of frames
            yield frames

        # Release the video stream
        self.video.release()

class FaceDetector:
    def __init__(self, detection_type="mtcnn"):
        self.detection_type = detection_type

        if self.detection_type == "mtcnn":
            # Initialize the MTCNN face detector
            self.face_detector = MTCNN()
        # elif self.detection_type == "haarcascade":
        #     # Load the Haar Cascade face detector
        #     self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
        # elif self.detection_type == "cnn":
        #     # Load the CNN face detector
        #     self.face_detector = cv2.dnn.readNetFromCaffe("deploy.prototxt", "res10_300x300_ssd_iter_140000.caffemodel")

    def detect_faces(self, frames):
        # Detect faces in the frames using the selected face detection model
        if self.detection_type == "mtcnn":
            # Use MTCNN face detector
            return_boxes, _ = self.face_detector.detect(frames)
            return return_boxes
            boxes = []
            for i, bbox in enumerate(return_boxes[0]):
                xmin, ymin, xmax, ymax = bbox
                box = [ymin, ymax, xmin, xmax]
                boxes.append((i, box))
        # elif self.detection_type == "haarcascade":
        #     # Use Haar Cascade face detector
        #     gray = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
        #     boxes = self.face_detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
        #     boxes = [[y, y+h, x, x+w] for (x,y,w,h) in boxes]
        # elif self.detection_type == "cnn":
        #     # Use CNN face detector
        #     blob = cv2.dnn.blobFromImages(frames, 1.0, (300, 300), (104.0, 177.0, 123.0))
        #     self.face_detector.setInput(blob)
        #     detections = self.face_detector.forward()
        #     boxes = []
        #     for i in range(detections.shape[2]):
        #         confidence = detections[0, 0, i, 2]
        #         if confidence > 0.5:
        #             box = detections[0, 0, i, 3:7] * np.array([frames.shape[2], frames.shape[1], frames.shape[2], frames.shape[1]])
        #             box = box.astype(int)
        #             ymin, xmin, ymax, xmax = box
        #             box = [ymin, ymax, xmin, xmax]
        #             boxes.append(box)

        return boxes



In [74]:
#yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons
yt_link = 'https://www.youtube.com/watch?v=UECCHwh7bZE'

my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
#my_emotion_detector = EmotionDetector
batchlist = []
boxlist = []
my_test.load_video_from_youtube(yt_link)
for idx, batch in enumerate(my_test.get_batches()):
    print(f'{idx}: {len(batch)}')
    batchlist.append(batch)
    #boxes = my_face_detector.detect_faces(batch)
    #boxlist.append(boxes)
    

0: 50
1: 50
2: 50
3: 50
4: 50
5: 50
6: 50
7: 50
8: 50
9: 50
10: 11


In [2]:
yt_link = 'https://www.youtube.com/watch?v=wo6K1GWEx84'

my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
my_test.load_video_from_youtube(yt_link)
for idx, batch in enumerate(my_test.get_batches()):
    boxes = my_face_detector.detect_faces(batch)
    

  if method is "Min":
  if method is "Min":
  if method is "Min":


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (50,) + inhomogeneous part.

In [70]:
len(batchlist)

1

In [71]:
len(boxlist)

0

In [75]:
yt_link = 'https://www.youtube.com/watch?v=wo6K1GWEx84'
my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
#my_emotion_detector = EmotionDetector
batchlist = []
boxlist = []
my_test.load_video_from_youtube(yt_link)
for idx, batch in enumerate(my_test.get_batches()):
    print(f'{idx}: {len(batch)}')
    batchlist.append(batch)
    boxes = my_face_detector.detect_faces(batch)
    boxlist.append(boxes)

0: 50


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (50,) + inhomogeneous part.

In [3]:
batch.shape

(50, 720, 1280, 3)

In [128]:
face_detector_mtcnn = MTCNN()
a = face_detector_mtcnn.detect(batch[3:12])
print(a)

(array([[[ 800.64   ,  155.62999, 1065.035  ,  477.59427]],

       [[ 787.31104,  160.48291, 1047.8341 ,  486.15082]],

       [[ 736.2914 ,  159.95483,  996.07245,  482.7968 ]],

       [[ 724.2704 ,  159.92754,  975.6582 ,  487.37204]],

       [[ 715.5035 ,  159.72684,  956.12256,  473.42117]],

       [[ 205.15468,  180.71413,  391.8848 ,  428.30347]],

       [[ 213.19006,  178.01108,  399.1574 ,  423.1865 ]],

       [[ 227.02106,  182.68604,  408.90018,  420.32648]],

       [[ 761.26526,  181.73456, 1000.58075,  492.46167]]], dtype=float32), array([[0.9998952 ],
       [0.9997003 ],
       [0.99998546],
       [0.99999976],
       [0.9988213 ],
       [0.9996246 ],
       [0.99990594],
       [0.99990046],
       [0.99999857]], dtype=float32))


In [130]:
a[0].shape

(9, 1, 4)

In [4]:
import face_detection
# Initialize detector
detector = face_detection.build_detector("DSFDDetector", confidence_threshold=.5, nms_iou_threshold=.3)




In [None]:

# Getting detections
detections = detector.detect(batch)

In [5]:
detections = detector.batched_detect(batch)

KeyboardInterrupt: 

In [10]:
!pip install retina-face --no-deps

Collecting retina-face
  Using cached retina_face-0.0.13-py3-none-any.whl (16 kB)
Installing collected packages: retina-face
Successfully installed retina-face-0.0.13


In [12]:
batch.shape

(50, 720, 1280, 3)

In [16]:
# Initialize detector
detector = cv2.FaceDetectorYN.create("/models/face_detection_yunet_2022mar.onnx", "", (320, 320))


error: OpenCV(4.7.0) /Users/xperience/GHA-OCV-Python/_work/opencv-python/opencv-python/opencv/modules/dnn/src/onnx/onnx_importer.cpp:270: error: (-5:Bad argument) Can't read ONNX file: /models/face_detection_yunet_2022mar.onnx in function 'ONNXImporter'


In [None]:
# Set input size
detector.setInputSize((1280, 720))
# Getting detections
detections = detector.detect(batch[0])

In [116]:
face_detector_mtcnn = MTCNN()
for i in range(50):
    a = face_detector_mtcnn.detect(batch[i])
    print(a)

(array([[ 213.83145,  219.39542,  415.32016,  483.84225],
       [ 834.7785 ,  208.32353, 1019.23663,  466.85956]], dtype=float32), array([0.99976796, 0.99993265], dtype=float32))
(array([[ 224.82489,  210.4985 ,  420.9738 ,  463.00748],
       [ 835.11536,  201.44376, 1015.41327,  461.78235]], dtype=float32), array([0.9999089 , 0.99991834], dtype=float32))
(array([[ 791.4241 ,  163.10724, 1054.3726 ,  494.60812]], dtype=float32), array([0.9998498], dtype=float32))
(array([[ 800.64   ,  155.63005, 1065.035  ,  477.5943 ]], dtype=float32), array([0.9998952], dtype=float32))
(array([[ 787.31104,  160.48288, 1047.8341 ,  486.1508 ]], dtype=float32), array([0.9997003], dtype=float32))
(array([[736.2914 , 159.95485, 996.07245, 482.7968 ]], dtype=float32), array([0.99998546], dtype=float32))
(array([[724.2704 , 159.92757, 975.6582 , 487.37204]], dtype=float32), array([0.99999976], dtype=float32))
(array([[715.5035 , 159.7268 , 956.12256, 473.4211 ]], dtype=float32), array([0.9988213], dtype=

In [115]:
a

(array([[ 224.82489,  210.4985 ,  420.9738 ,  463.00748],
        [ 835.11536,  201.44376, 1015.41327,  461.78235]], dtype=float32),
 array([0.9999089 , 0.99991834], dtype=float32))

In [89]:
batch[-1]

array([[[ 58,  28,  19],
        [ 58,  28,  19],
        [ 58,  28,  19],
        ...,
        [ 13,  90,   5],
        [ 16,  89,   5],
        [ 16,  89,   5]],

       [[ 58,  28,  19],
        [ 58,  28,  19],
        [ 56,  27,  18],
        ...,
        [ 13,  90,   5],
        [ 16,  89,   5],
        [ 16,  89,   5]],

       [[ 59,  30,  22],
        [ 59,  30,  22],
        [ 59,  30,  22],
        ...,
        [ 12,  90,   2],
        [ 15,  89,   2],
        [ 15,  89,   2]],

       ...,

       [[251, 221, 202],
        [251, 221, 202],
        [251, 221, 202],
        ...,
        [171, 157, 146],
        [171, 157, 146],
        [171, 157, 146]],

       [[251, 221, 202],
        [251, 221, 202],
        [251, 221, 202],
        ...,
        [172, 158, 147],
        [172, 158, 147],
        [172, 158, 147]],

       [[251, 221, 202],
        [251, 221, 202],
        [251, 221, 202],
        ...,
        [172, 158, 147],
        [172, 158, 147],
        [172, 158, 147]]

In [84]:
type(batch[0][0][0][0])

numpy.uint8

In [85]:
batch.shape

(50, 720, 1280, 3)

In [78]:
face_detector_mtcnn = MTCNN()
bboxes, probs = face_detector_mtcnn.detect(batch)
faces = []
for bbox in bboxes:
    xmin, ymin, xmax, ymax = bbox[0]
    box = [ymin, ymax, xmin, xmax]
    faces.append(box)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (50,) + inhomogeneous part.

In [77]:
batch

array([[[[ 31, 116,  40],
         [ 31, 116,  40],
         [ 31, 116,  40],
         ...,
         [242, 221, 194],
         [242, 221, 194],
         [242, 221, 194]],

        [[ 31, 116,  40],
         [ 31, 116,  40],
         [ 31, 116,  40],
         ...,
         [242, 221, 194],
         [242, 221, 194],
         [242, 221, 194]],

        [[ 33, 116,  40],
         [ 33, 116,  40],
         [ 33, 116,  40],
         ...,
         [242, 221, 194],
         [242, 221, 194],
         [242, 221, 194]],

        ...,

        [[211, 185, 168],
         [211, 185, 168],
         [210, 183, 167],
         ...,
         [ 31,   6,  65],
         [ 32,   9,  63],
         [ 33,  10,  65]],

        [[211, 185, 168],
         [211, 185, 168],
         [210, 183, 167],
         ...,
         [ 31,   8,  62],
         [ 31,   9,  61],
         [ 32,  10,  62]],

        [[211, 185, 168],
         [211, 185, 168],
         [210, 183, 167],
         ...,
         [ 31,   8,  62],
        

In [67]:
boxes.shape

(50, 1, 4)

In [76]:
boxlist

[]

In [39]:
boxes_list = []
for i, bbox in enumerate(boxes):
    xmin, ymin, xmax, ymax = bbox
    box = [ymin, ymax, xmin, xmax]
    boxes_list.append((i, box))

ValueError: not enough values to unpack (expected 4, got 1)

In [36]:
boxes_list

[(0, [62.388275, 314.8143, 534.76434, 730.4663])]

In [73]:
yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'
my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
my_test.load_video_from_youtube(yt_link)
my_test.total_frames


179

In [None]:
# Initialize an empty numpy array to hold the frames
frames = np.empty((my_test.total_frames, my_test.height, my_test.width, 3), np.dtype('uint8'))

my_test.frames_read = 0
while True:
    ret, frame = self.video.read()
    
    if not ret:
    break

# Read the frames in batches and fill up the numpy array
for batch_start in range(0, self.frame_count, self.batch_size * self.skip_frames):
    batch_end = min(batch_start + (self.batch_size * self.skip_frames), self.frame_count)
    batch_index = 0

    for i in range(batch_start, batch_end):
        ret, frame = self.video.read()
        self.frames_read +=1
        if not ret:
            break

        if i % self.skip_frames == 0:
            frames[batch_index] = frame
            batch_index += 1

    # Resize the numpy array to fit the actual number of frames in the batch
    if batch_index < self.batch_size:
        frames = frames[:batch_index]

    # Yield the current batch of frames
    yield frames

# Release the video stream
self.video.release()

In [89]:
face_detector_hcc = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
gray = cv2.cvtColor(batch, cv2.COLOR_BGR2GRAY)
boxes = face_detector_hcc.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
boxes = [[y, y+h, x, x+w] for (x,y,w,h) in boxes]

error: OpenCV(4.7.0) /Users/xperience/GHA-OCV-Python/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.simd_helpers.hpp:92: error: (-2:Unspecified error) in function 'cv::impl::(anonymous namespace)::CvtHelper<cv::impl::(anonymous namespace)::Set<3, 4, -1>, cv::impl::(anonymous namespace)::Set<1, -1, -1>, cv::impl::(anonymous namespace)::Set<0, 2, 5>, cv::impl::(anonymous namespace)::NONE>::CvtHelper(cv::InputArray, cv::OutputArray, int) [VScn = cv::impl::(anonymous namespace)::Set<3, 4, -1>, VDcn = cv::impl::(anonymous namespace)::Set<1, -1, -1>, VDepth = cv::impl::(anonymous namespace)::Set<0, 2, 5>, sizePolicy = cv::impl::(anonymous namespace)::NONE]'
> Invalid number of channels in input image:
>     'VScn::contains(scn)'
> where
>     'scn' is 1


In [90]:
batch

array([[[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        ...,

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]],


       [[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0,

In [61]:
my_test.__dict__

{'batch_size': 50,
 'skip_frames': 10,
 'yt_link': 'https://www.youtube.com/watch?v=vtT78TfDfXU',
 'video': < cv2.VideoCapture 0x14e376470>,
 'frame_count': 1785,
 'fps': 24,
 'height': 720,
 'width': 1280,
 'total_frames': 179,
 'num_batches': 4,
 'frames_read': 1785}

In [63]:
my_face_det = FaceDetector(detection_type="mtcnn")

In [62]:
batch[0].shape

(720, 1280, 3)

In [11]:
yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons

my_test = VideoProcessor(skip_frames=5)
my_test.get_video(yt_link)
fourcc = cv2.VideoWriter_fourcc(*'mp4v') 
writer = cv2.VideoWriter('/Users/ben/neuefische/capstone/NeuralXpresso/notebooks/outputs/Output_video_3.mp4', fourcc, my_test.fps, (my_test.width, my_test.height))
counter = 0
frames = []
for idx, batch in enumerate(my_test.get_batches()):
    for frame in batch:
        frames.append(frame)
        writer.write(frame)
    counter +=1
writer.release()

In [None]:
yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons

my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
fourcc = cv2.VideoWriter_fourcc(*'mp4v') 
writer = cv2.VideoWriter('/Users/ben/neuefische/capstone/NeuralXpresso/notebooks/outputs/Output_video_3.mp4', fourcc, my_test.fps, (my_test.width, my_test.height))
counter = 0
my_test.get_video(yt_link)
for idx, batch in enumerate(my_test.get_batches()):
    print(f'{idx}: {len(batch)}')
    boxes = my_face_detector.detect_faces(batch)
    break

In [None]:
yt_link = 'https://www.youtube.com/watch?v=vtT78TfDfXU'                   # 1 Actor
#yt_link = 'https://www.youtube.com/watch?v=embYkODkzcs'                 # 7 basic emotions
#yt_link = 'https://www.youtube.com/watch?v=m70UInZKJjU'                    # Two persons

my_test = VideoProcessor(skip_frames=10)
my_face_detector = FaceDetector(detection_type='mtcnn')
#my_emotion_detector = EmotionDetector
batchlist = []
boxlist = []
my_test.get_video(yt_link)
for idx, batch in enumerate(my_test.get_batches()):
    print(f'{idx}: {len(batch)}')
    batchlist.append(batch)
    boxes = my_face_detector.detect_faces(batch)
    boxlist.append(boxes)
    #faces = extract_faces(batch, boxes, my_emotion_detector.input_shape)
    #emotions = my_emotion_detector.predict(faces)

In [102]:
len(frames)

357

In [None]:
def output_video(video, filename):
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(video.get(cv2.CAP_PROP_FPS))
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    return cv2.VideoWriter(filename, fourcc, 10, (width,height))

In [25]:
yt_video = YouTube(yt_link)
stream = yt_video.streams.get_highest_resolution()   
video =  cv2.VideoCapture(stream.url)


frame_count = 200
batch_size = 52

batches = []

for batch_start in range(0, frame_count, batch_size):
    batch_end = min(batch_start + batch_size, frame_count)
    frames = []



    # Read the frames in the current batch
    for i in range(batch_start, batch_end):
        counter = batch_start
        ret, frame = video.read()
        if not ret:
            break

        frames.append(frame)
        counter+=1

        if counter == batch_end:
            break
    batches.append(frames)

In [17]:
#     # Load the CNN face detector
face_detector = cv2.dnn.readNetFromCaffe("deploy.prototxt", "res10_300x300_ssd_iter_140000.caffemodel")

error: OpenCV(4.7.0) /Users/xperience/GHA-OCV-Python/_work/opencv-python/opencv-python/opencv/modules/dnn/src/caffe/caffe_io.cpp:1126: error: (-2:Unspecified error) FAILED: fs.is_open(). Can't open "deploy.prototxt" in function 'ReadProtoFromTextFile'
