# Ingest a video and produce the feature tensors needed by the trained neural network



In [None]:
from google.colab import drive
import cv2
import numpy as np
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def video_to_frames(video, out_folder):
    """Save one still image per second of a video into a folder.

    Every ``fps``-th frame, starting with frame 0, is written to
    ``out_folder`` as ``frame_<index>.jpg``, where ``<index>`` is the
    zero-based position of the frame in the video.

    Args:
        video: Path of the video file handed to ``cv2.VideoCapture``.
        out_folder: Folder that receives the JPEG stills. It is created if
            it does not exist yet.

    Returns:
        The frame rate reported by OpenCV, truncated to an int.

    Raises:
        IOError: If OpenCV cannot open ``video``.
        ValueError: If the reported frame rate truncates to zero or less, so
            no sampling step can be derived from it.
    """
    import os  # function-scope import keeps this cell self-contained

    vidcap = cv2.VideoCapture(video)
    try:
        if not vidcap.isOpened():
            raise IOError(f"Could not open video: {video}")

        fps = int(vidcap.get(cv2.CAP_PROP_FPS))
        if fps <= 0:
            raise ValueError(f"Video reports an unusable frame rate: {fps}")

        # cv2.imwrite signals a missing folder only through its return
        # value, so make sure the folder exists before writing.
        os.makedirs(out_folder, exist_ok=True)

        count = 0
        num_frames = 0
        while True:
            read_image, img = vidcap.read()
            # Stop before touching `img`: a failed read yields no frame.
            if not read_image:
                break

            # Only keep one frame per second of video.
            if count % fps == 0:
                frame_path = os.path.join(out_folder, 'frame_%d.jpg' % count)
                # Count a frame only when it was actually written.
                if cv2.imwrite(frame_path, img):
                    num_frames += 1

            count += 1
    finally:
        # Free the decoder/file handle even when an error is raised.
        vidcap.release()

    print(f"Retrieved {num_frames} frames")
    return fps

In [None]:
# Extract one still per second from the video stored on Drive.
# NOTE(review): vid_path is assigned here but is not used by the call below,
# which writes into the relative folder "frames" instead. Confirm which
# output folder is intended.
vid_path = '/content/gdrive/MyDrive/video_data/frames'
# The input path is the file that the yt_download cell further down writes,
# so that cell has to be run first on a fresh kernel.
video_to_frames('/content/gdrive/MyDrive/video_data/vid1.mp4', "frames")

FPS: 29
Retrieved 23 frames


29

In [None]:
# DOWNLOADING FROM URL
!pip install mhyt --quiet

from mhyt import yt_download
# Source video to download. The frame-extraction cell above reads the same
# vid1.mp4 path that is passed here, so run this download before it.
# NOTE(review): the second argument is presumably the output file path;
# confirm against the mhyt documentation.
url = 'https://www.youtube.com/watch?v=LTXzcJNpxNw'
yt_download(url, '/content/gdrive/MyDrive/video_data/vid1.mp4')

# Computing features needed for model
* Generate CLIP embeddings of all frames. 
* Store the object detections from DETR whose confidence is above 70%.

In [None]:
import math

from PIL import Image
import requests
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

import ipywidgets as widgets
from IPython.display import display, clear_output

import torch
from torch import nn
from torchvision.models import resnet50
import torchvision.transforms as T
torch.set_grad_enabled(False);

In [None]:
# Preprocessing applied to every image before it is passed to the model:
# resize, convert to a tensor, then normalize the three channels.
# NOTE(review): the mean/std values look like the standard ImageNet channel
# statistics; confirm they match what the detector was trained with.
transform = T.Compose(
    [
        T.Resize(800),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: Tensor whose last dimension holds the four values
            ``(x_c, y_c, w, h)``. Any number of leading dimensions is
            accepted, e.g. ``(4,)``, ``(N, 4)`` or ``(B, N, 4)``.

    Returns:
        Tensor of the same shape as ``x`` whose last dimension holds
        ``(xmin, ymin, xmax, ymax)``.
    """
    # Split and re-stack along the last dimension so the conversion does not
    # depend on how many leading (batch) dimensions the input has.
    x_c, y_c, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h]
    return torch.stack(corners, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by image size.

    Args:
        out_bbox: Tensor of ``(x_c, y_c, w, h)`` boxes, passed to
            ``box_cxcywh_to_xyxy``. The values are multiplied by the image
            width/height, so they are presumably relative (0..1)
            coordinates; confirm against the model output.
        size: ``(width, height)`` of the image.

    Returns:
        Tensor of ``(xmin, ymin, xmax, ymax)`` boxes, with x values
        multiplied by the width and y values by the height.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Build the scale on the same device as the boxes; a CPU-only scale
    # tensor would raise a device-mismatch error for boxes on a GPU.
    scale = torch.tensor(
        [img_w, img_h, img_w, img_h], dtype=torch.float32, device=b.device
    )
    return b * scale

In [None]:
# Fetch the DETR ResNet-50 detector through torch.hub and switch it to
# evaluation mode right away (eval() returns the module itself).
model = torch.hub.load(
    'facebookresearch/detr',
    'detr_resnet50',
    pretrained=True,
).eval()

In [None]:
def get_detections(img, threshold=0.7):
    """Run the detector on one image and return its confident detections.

    Args:
        img: PIL image to analyse. Images that are not in RGB mode are
            converted first.
        threshold: Minimum class probability a prediction needs in order to
            be kept.

    Returns:
        A list with one ``(class_index, class_prob, (xmin, ymin, xmax, ymax))``
        tuple per kept prediction. ``class_index`` is an int, ``class_prob``
        a float, and the box is the output of ``rescale_bboxes`` for the
        image's ``size``. The list is empty when nothing passes the
        threshold.
    """
    # `transform` normalizes exactly three channels, so grayscale or RGBA
    # images have to be brought into RGB before preprocessing.
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Preprocess the image and add the batch dimension the model expects.
    proc_img = transform(img).unsqueeze(0)
    outputs = model(proc_img)

    # Class probabilities for the single image in the batch. The last logit
    # column is dropped before ranking (presumably DETR's "no object" class;
    # confirm against the model documentation).
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]

    # One pass gives both the best class and its probability per prediction.
    scores, labels = probas.max(-1)
    keep = scores > threshold

    bounded_bxs = rescale_bboxes(outputs['pred_boxes'][0, keep], img.size)

    return [
        (class_index, class_prob, tuple(box))
        for class_index, class_prob, box in zip(
            labels[keep].tolist(), scores[keep].tolist(), bounded_bxs.tolist()
        )
    ]

In [None]:
# No visible earlier cell defines `im`, so load the first still written by
# the video_to_frames call above (frame 0 is always saved). This keeps the
# cell runnable after a kernel restart.
im = Image.open('frames/frame_0.jpg')
test = get_detections(im, threshold=0.7)

# Print class index, probability and box for each detection, separated by a
# blank line.
for cl_index, cl_prob, (xmin, ymin, xmax, ymax) in test:
    print(cl_index)
    print(cl_prob)
    print((xmin, ymin, xmax, ymax))
    print()

In [None]:
# CLIP EMBEDDING
# For CLIP model and cosine function
!pip install sentence-transformers --quiet
import sys
sys.path.append('/content/gdrive/MyDrive')

import sentence_transformers
from sentence_transformers import SentenceTransformer, util
import glob
import pickle
import zipfile
from collections import defaultdict
import tqdm.notebook as tq
from io import BytesIO