In [56]:
"""Test pre-trained RGB model on a single video.

Date: 01/15/18
Authors: Bolei Zhou and Alex Andonian

This script accepts an mp4 video as the command line argument --video_file
and averages ResNet50 (trained on Moments) predictions on num_segment equally
spaced frames (extracted using ffmpeg).

Alternatively, one may instead provide the path to a directory containing
video frames saved as jpgs, which are sorted and forwarded through the model.

ResNet50 trained on Moments is used to predict the action for each frame,
and these class probabilities are averaged to produce a video-level prediction.

Optionally, one can generate a new video --rendered_output from the frames
used to make the prediction with the predicted category in the top-left corner.

"""

import os
import argparse
import moviepy.editor as mpy

import torch.optim
import torch.nn.parallel
from torch.nn import functional as F
import sys
sys.path.append("../moments_models/")

import numpy as np 
import cv2


import models
from utils import extract_frames, load_frames, render_frames


In [57]:
# Load the model identified as 'resnet50' through the project's `models` helper.
# NOTE(review): whether load_model() already puts the network in eval mode is
# not visible from this notebook — confirm before trusting batch predictions.
model = models.load_model('resnet50')

# Get the dataset categories and record the position of the "eating" label;
# later cells use eat_idx to pick that class's column out of the model output.
categories = models.load_categories()
eat_idx = categories.index("eating")

In [58]:
# Video to analyse (path is relative to the notebook's working directory).
video_file = "mrbean.mp4"

# Per-frame preprocessing transform provided by the project's `models` module.
transform = models.load_transform()

# Ask the extraction helper for 50 frames from the video.
frames = extract_frames(video_file, 50)

# Transform every frame and stack the results into a single batch tensor
# (original author's note on the shape: [num_frames, 3, 224, 224]).
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because the prediction cell further down reads it.
input = torch.stack(list(map(transform, frames)))

In [66]:
# Forward the frame batch through the model with gradient tracking disabled,
# then normalise the logits with a softmax over dimension 1.
with torch.no_grad():
    logits = model(input)
    probs = F.softmax(logits, dim=1)

# Take the column at eat_idx (the "eating" category) for every frame and
# convert it from a Tensor to a plain Python list.
eat_probs = probs[:, eat_idx].tolist()


50


50


In [64]:
def render_frames_2(frames, predictions):
    """Overlay a per-frame text label near the top-left corner of each frame.

    Unlike a single video-level caption, every frame gets its own label taken
    from ``predictions`` at the same position.

    Args:
        frames: Iterable of images. Each one is copied into a new NumPy array
            with ``np.array`` before drawing, so the objects passed in are
            not drawn on.
        predictions: Indexable sequence where ``predictions[i]`` is the label
            for the i-th frame. Entries that are not already strings (for
            example raw float probabilities) are converted with ``str``.

    Returns:
        list: One NumPy array per frame with the label drawn in green,
        starting at x=1 with its baseline one eighth of the frame height
        from the top.

    Raises:
        IndexError: If ``predictions`` is a list/tuple with fewer entries
            than there are frames.
    """
    rendered_frames = []
    for index, frame in enumerate(frames):
        img = np.array(frame)
        # Only the height is needed for placement. Reading shape[0] instead of
        # unpacking three values means a 2-D (single-channel) frame no longer
        # fails on the shape unpack.
        height = img.shape[0]
        # cv2.putText requires a string, so coerce numeric labels here rather
        # than forcing every caller to pre-format them.
        label = str(predictions[index])
        cv2.putText(img, label,
                    (1, int(height / 8)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0), 3)
        rendered_frames.append(img)
    return rendered_frames

In [69]:

# Format each frame's "eating" probability as text, rounded to 5 decimals.
prediction = [str(round(prob, 5)) for prob in eat_probs]

# Draw each label on its frame, then assemble the labelled frames into a clip
# (the clip itself is constructed with fps=1).
rendered_frames = render_frames_2(frames, prediction)
clip = mpy.ImageSequenceClip(rendered_frames, fps=1)

# Display the clip inline in the notebook; this call is given fps=2, loop=1
# and autoplay=1. It must stay the last expression so the cell shows the video.
clip.ipython_display(fps=2, loop=1, autoplay=1)


t:  17%|█▋        | 17/100 [00:00<00:00, 158.74it/s, now=None]

<class 'moviepy.video.io.ImageSequenceClip.ImageSequenceClip'>
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                              

Moviepy - Done !
Moviepy - video ready __temp__.mp4


