[16.10.20]
Frame-by-frame classification
=======================================================

* Having a collection of videos with pre-defined "true" categories and a pretrained classifier:
    1. extract each frame of each video;
    2. perform a classification of each individual frame;
    3. extract the **per-frame accuracies** for the true category.
* Frame extractor variants:
    * [decord](https://github.com/dmlc/decord) [implemented]:
        ```
        vr = decord.VideoReader()
        frame_id_list = range(start, stop, pace)
        video_frames = vr.get_batch(frame_id_list).asnumpy()
        ```
    * [OpenCV](https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_gui/py_video_display/py_video_display.html) [TODO]
 

# Moments in Time videos

In [None]:
%%bash
# Map `uname -s` onto a short platform tag; decord is pip-installed only on
# Linux, every other platform gets a pointer to the manual install steps.
unameOut="$(uname -s)"
case "${unameOut}" in
    Linux*)     machine=Linux;;
    Darwin*)    machine=Mac;;
    CYGWIN*)    machine=Cygwin;;
    MINGW*)     machine=MinGw;;
    *)          machine="UNKNOWN:${unameOut}";;
esac

# Quote the expansion and use the portable `=` so `[` never sees a split or
# empty operand.
if [ "${machine}" = "Linux" ]
then
    pip install --upgrade decord
else
    echo "Install decord following the steps in https://github.com/dmlc/decord#mac-os"
fi

Clone the [frame-by-frame](https://github.com/CogNeuroUR/frame-by-frame) repo:

In [None]:
# Fetch the project repo into ./frame-by-frame; later cells read the label file
# and the test video from paths below that folder.
!git clone https://github.com/CogNeuroUR/frame-by-frame.git

In [None]:
# Imports
import os
from decord import VideoReader
from decord import cpu #, gpu
import decord
# Use decord's 'native' bridge; later cells call .asnumpy() on the returned
# batches before handing frames to the torchvision transforms.
decord.bridge.set_bridge('native') # Seems to be the fastest option

from pathlib import Path

## Frame-by-frame classification

##### ResNet50-MiT (torch)

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.nn import functional as F

from PIL import Image
from os import listdir
from os.path import isfile, join

import numpy as np
from random import sample, randint
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root of the cloned repo: <current working directory>/frame-by-frame.
path_prefix = Path.cwd() / 'frame-by-frame'

In [None]:
# Randomly initialised ResNet50 with a 339-way output layer, moved to the GPU.
# The weights are filled in by the load_state_dict cell below.
# NOTE(review): 339 presumably equals the number of lines in category_momentsv1.txt — confirm.
resnet50 = models.resnet50(pretrained=False, progress=True, num_classes=339).to('cuda')

[Down]-load MiT-pretrained model:

In [None]:
# Colab only: mount Google Drive at /content/drive (the commented-out
# path_model in the next cell points into it).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the MiT-pretrained ResNet50 checkpoint into the current working directory.
!curl http://moments.csail.mit.edu/moments_models/resnet50_moments-fd0c4436.pth --output resnet50_moments-fd0c4436.pth
#path_model = '/content/drive/My Drive/resnet50_moments-fd0c4436.pth'
# NOTE(review): absolute Colab path — it only matches the curl output above when the cwd is /content.
path_model = '/content/resnet50_moments-fd0c4436.pth'
# Alias, not a copy: loading the weights below also updates `resnet50`.
resnet50_moments = resnet50
resnet50_moments.load_state_dict(torch.load(path_model))
# Put the model in evaluation mode before running inference.
resnet50_moments.eval()

In [None]:
# Per-frame preprocessing: ndarray -> PIL -> resize -> 224x224 centre crop ->
# tensor -> channel-wise normalisation with the mean/std given below.
transformation = transforms.Compose([
                                     transforms.ToPILImage(mode='RGB'), # required if the input image is an np.ndarray
                                     transforms.Resize(224), # NOTE(review): an int size should already keep the aspect ratio (shorter edge -> 224) — confirm against the installed torchvision
                                     transforms.CenterCrop((224, 224)),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                          std=[0.229, 0.224, 0.225])
])

In [None]:
# Download the MiT v1 category list into the current working directory.
# NOTE(review): load_categories() below reads path_prefix / 'labels/category_momentsv1.txt',
# not this downloaded copy — confirm which file is meant to be used.
#!curl https://raw.githubusercontent.com/zhoubolei/moments_models/master/category_momentsv1.txt --output category_momentsv1.txt
!curl https://raw.githubusercontent.com/vrabiutz/my_images/master/category_momentsv1.txt --output category_momentsv1.txt
def load_categories(path=None):
    """Load the category labels, one per line.

    Args:
        path: Optional label file to read. When omitted, the module-level
            ``path_prefix / 'labels/category_momentsv1.txt'`` is used, which
            is what a call without arguments has always read.

    Returns:
        list[str]: The lines of the file in order, with trailing whitespace
        (including the newline) stripped. The list position is used as the
        class index by the classification cells, so no line is dropped or
        reordered.
    """
    if path is None:
        path = path_prefix / 'labels/category_momentsv1.txt'
    with open(path) as f:
        # Iterate the file object directly; no need to materialise readlines().
        return [line.rstrip() for line in f]

# Load the label list once; classification cells index it with the sorted class ids.
categories = load_categories()

###### Single video

In [None]:
# Load video
# path_prefix has to be a Path: joining with `/` below raises TypeError on a plain str.
path_prefix = Path('/content/frame-by-frame')
video_fname = str(path_prefix / 'data/test/abseiling_k400.mp4')
vr = VideoReader(video_fname)#, ctx=cpu(0))
print('video frames:', len(vr))

Test on random frame

In [None]:
# Accuracy test: classify one randomly chosen frame of the loaded video.
video_frames = vr.get_batch(list(range(len(vr))))
# randint() includes both end points, so the upper bound must be the last valid index.
img = video_frames.asnumpy()[randint(0, len(vr) - 1)]

# Image transformation
t_img = transformation(img).to('cuda')
print(t_img.shape)

# Show the first channel of the normalised tensor
plt.imshow(t_img[0].cpu())
plt.show()

# Classification (inference only, so no gradient bookkeeping is needed):
with torch.no_grad():
    logit = resnet50_moments.forward(t_img.unsqueeze(0))
h_x = F.softmax(logit, 1).data.squeeze()
probs, idx = h_x.sort(0, True)

# output the prediction of action category
print('--Top Actions:')
for i in range(0, 5):
    print('{:.3f} -> {}'.format(probs[i], categories[idx[i]]))

Test on a range of frames

In [None]:
pred_accuracies = []
true_category = 'climbing'
# Position of the true category in the model output. Raises ValueError right
# away if the label is not part of the loaded category list.
true_idx = categories.index(true_category)

# Convert the decoded batch to numpy once instead of once per frame.
frames_np = video_frames.asnumpy()

for frame_nr, frame in enumerate(frames_np):
    frame_tensor = transformation(frame).to('cuda')

    # Classification (inference only):
    with torch.no_grad():
        logit = resnet50_moments.forward(frame_tensor.unsqueeze(0))
    h_x = F.softmax(logit, 1).data.squeeze()

    # At each 20th frame, make a plot and print the top 5 categories with
    # the corresponding softmax score
    if frame_nr % 20 == 0:
        probs, idx = h_x.sort(0, True)
        print(frame_nr)
        plt.imshow(frame_tensor[0].cpu())
        plt.show()
        print('--Top Actions:')
        for rank in range(0, 5):
            print('{:.3f} -> {}'.format(probs[rank], categories[idx[rank]]))

    # Store a 1-element tensor per frame so the plotting cell receives one
    # vector per frame.
    pred_accuracies.append(h_x[[true_idx]].cpu())

Per frame accuracies

In [None]:
# One bar per frame: softmax score of the true category.
ax = sns.barplot(data=pred_accuracies)
for ind, label in enumerate(ax.get_xticklabels()):
    # Keep every 20th tick label so the frame axis stays readable.
    label.set_visible(ind % 20 == 0)
ax.set_title(f'Prediction accuracy for TRUE category "{true_category}"')
ax.set_xlabel('Frame nr.')
ax.set_ylabel('Prediction accuracy (softmax)')

plt.show()

###### Multi-video frame extraction

Go through each folder of the `MIT_sampleVideos_RAW` and
* Take the name of the category folder
* Iterate over the videos of that category AND extract the frame-by-frame accuracy

In [None]:
# Collect the names of all video files and their path in the dataset
import os

path_prefix = 'path/to/MIT_sampleVideos_RAW/'


def collect_videos_per_category(dataset_root):
    """Map every category folder under `dataset_root` to its video files.

    Only the immediate sub-folders of `dataset_root` count as categories, so
    nested folders inside a category are not mistaken for categories of their
    own. Files are gathered recursively inside each category folder.

    Args:
        dataset_root: Directory whose sub-folders are named after categories.

    Returns:
        dict[str, dict[str, str]]: ``{category_name: {file_name: file_path}}``
        with categories inserted in sorted order. Empty if `dataset_root`
        does not exist or has no sub-folders.
    """
    # The first os.walk() item describes the top level only. A missing root
    # yields nothing, so fall back to "no categories" instead of raising.
    _, category_dirs, _ = next(os.walk(dataset_root), (dataset_root, [], []))

    files_per_category = {}
    for category_name in sorted(category_dirs):
        print(category_name)
        category_path = os.path.join(dataset_root, category_name)

        # Iterate over files in category_path
        videos = {}
        for folder, _subdirs, file_names in os.walk(category_path, topdown=False):
            for file_name in file_names:
                videos[file_name] = os.path.join(folder, file_name)
        files_per_category[category_name] = videos
    return files_per_category


# Dictionary of file names per category
d_files_per_category = collect_videos_per_category(path_prefix)


Routine to extract TRUE category accuracy per frame

In [None]:
import time
from torch.nn import functional as F

# {category_name: {video_file_name: [1-element array per frame]}}
accuracies_per_category = {}
# Wall-clock seconds spent classifying each video
times_per_file = []

n_categories = len(d_files_per_category)
# Iterate over the categories collected in d_files_per_category
for z, category_name in enumerate(d_files_per_category, start=1):
    print(f'{category_name} {z}/{n_categories}')
    accuracies_per_category[category_name] = {}

    if category_name not in categories:
        # Folder name is not a known label: there is no "true" score to read,
        # so leave the category empty instead of storing empty arrays.
        print(f'\tSkipping: "{category_name}" is not in the category list.')
        continue
    true_idx = categories.index(category_name)
    true_category = category_name

    # Iterate over the files of this category
    for video, video_fname in d_files_per_category[category_name].items():
        print('\t', video)

        # Load video with Decord.VideoReader and decode every frame
        vr = VideoReader(video_fname, ctx=cpu(0))
        video_frames = vr.get_batch(range(0, len(vr), 1))
        # Convert to numpy once per video, not once per frame.
        frames_np = video_frames.asnumpy()
        n_frames = frames_np.shape[0]

        pred_accuracies = []
        start_time = time.time()
        with torch.no_grad():  # inference only
            for frame in frames_np:
                frame_tensor = transformation(frame).to('cuda')

                # Classification:
                logit = resnet50_moments.forward(frame_tensor.unsqueeze(0))
                h_x = F.softmax(logit, 1).data.squeeze()
                # Save the softmax score of the true category as a 1-element
                # array (later cells index the stored values with [0]).
                pred_accuracies.append(h_x[[true_idx]].cpu().numpy())
        elapsed = time.time() - start_time

        # Average duration per frame; max() guards against zero-frame videos.
        print('\tAvg duration per frame: %4.4f seconds.' % (elapsed / max(n_frames, 1)))

        accuracies_per_category[category_name][video] = pred_accuracies
        times_per_file.append(elapsed)

In [None]:
# Save dictionary to pickle file
import pickle
# NOTE(review): absolute path at the filesystem root — confirm it should not be relative to the repo.
dict_path = '/frame-by-frame/saved/accuracies_per_category_ResNet50_MiTv1.pkl'
# `with` closes (and flushes) the file even if pickling fails part-way.
with open(dict_path, 'wb') as f:
    pickle.dump(accuracies_per_category, f)

In [None]:
# Load
import pickle
dict_path = '/frame-by-frame/saved/accuracies_per_category_ResNet50_MiTv1.pkl'
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(dict_path, 'rb') as f:
    accuracies_per_category = pickle.load(f)

Test plot for single file

In [None]:
import seaborn as sns
# Name the plotted entry once so the data and the title cannot drift apart.
plot_category = 'breaking'
plot_video = 'giphy-LirabC73gVe92_3.mp4'
ax = sns.barplot(data=accuracies_per_category[plot_category][plot_video])
for ind, label in enumerate(ax.get_xticklabels()):
    # Keep every 10th tick label
    label.set_visible(ind % 10 == 0)
ax.set_title(f'Prediction accuracy for TRUE category "{plot_category}"')
ax.set_xlabel('Frame nr.')
ax.set_ylabel('Prediction accuracy (softmax)')

plt.show()

###### Multi-video, multi-category per-frame accuracy extraction

In [None]:
import pickle
import time
from torch.nn import functional as F

In [None]:
# {category_name: {video_file_name: [full softmax vector (list) per frame]}}
accuracies_per_category = {}
# Wall-clock seconds spent classifying each video
times_per_file = []

START = 250 # Index of the first category to process (resume point)
N = 50 # Multiples at which to save the dictionary


def _save_accuracies(accuracies, counter):
    """Pickle `accuracies` to the checkpoint file numbered `counter`."""
    dict_path = f'/frame-by-frame/saved/accuracies_per_category_full_{counter}.pkl'
    with open(dict_path, 'wb') as f:
        pickle.dump(accuracies, f)


z = START # For verbose
all_categories = list(d_files_per_category.keys())

# Iterate over the categories from the resume point onwards
for category_name in all_categories[START:]:
    print(f'{category_name} {z}/{len(all_categories)}')
    z += 1
    accuracies_per_category[category_name] = {}

    # Iterate over the files of this category
    for video, video_fname in d_files_per_category[category_name].items():
        print('\t', video)

        # Load video with Decord.VideoReader and decode every frame
        vr = VideoReader(video_fname, ctx=cpu(0))
        video_frames = vr.get_batch(range(0, len(vr), 1))
        # Convert to numpy once per video, not once per frame.
        frames_np = video_frames.asnumpy()
        n_frames = frames_np.shape[0]

        pred_accuracies = []
        true_category = category_name

        start_time = time.time()
        with torch.no_grad():  # inference only
            for frame in frames_np:
                frame_tensor = transformation(frame).to('cuda')

                # Classification: keep the whole softmax vector for this frame
                logit = resnet50_moments.forward(frame_tensor.unsqueeze(0))
                pred_accuracies.append(F.softmax(logit, 1).data.squeeze().tolist())
        elapsed = time.time() - start_time

        # Average duration per frame; max() guards against zero-frame videos.
        print('\tAvg duration per frame: %4.4f seconds.' % (elapsed / max(n_frames, 1)))

        accuracies_per_category[category_name][video] = pred_accuracies
        times_per_file.append(elapsed)

    # Save at every N-th category:
    if z % N == 0:
        _save_accuracies(accuracies_per_category, z)

# Final checkpoint: without it, the categories processed after the last
# multiple of N would never be written to disk.
if accuracies_per_category and z % N != 0:
    _save_accuracies(accuracies_per_category, z)

Test plot

In [None]:
import seaborn as sns
# Name the plotted entry once so the data and the title cannot drift apart.
# NOTE(review): the preceding extraction cell starts at category index 250 and
# stores full softmax vectors — confirm 'breaking' is present and that a bar
# plot of that structure is what is wanted here.
plot_category = 'breaking'
plot_video = 'giphy-LirabC73gVe92_3.mp4'
ax = sns.barplot(data=accuracies_per_category[plot_category][plot_video])
for ind, label in enumerate(ax.get_xticklabels()):
    # Keep every 10th tick label
    label.set_visible(ind % 10 == 0)
ax.set_title(f'Prediction accuracy for TRUE category "{plot_category}"')
ax.set_xlabel('Frame nr.')
ax.set_ylabel('Prediction accuracy (softmax)')

plt.show()

##### I3D (Kinetics) (**TODO**)

In [None]:
%%bash
# Upgrade pip, then install/upgrade the TensorFlow + Sonnet stack used by the I3D cells.
pip install --upgrade pip
pip install --upgrade numpy
pip install --upgrade tensorflow tensorflow-gpu tensorflow-probability
pip install --upgrade dm-sonnet tqdm
# NOTE(review): a later cell runs `cd kinetics-i3d/`, which needs this clone — it is currently disabled.
#git clone https://github.com/deepmind/kinetics-i3d

In [None]:
import sonnet as snt
import tensorflow as tf

In [None]:
# Report the installed library versions as a sanity check on the install cell.
print("TensorFlow version: {}".format(tf.__version__))
print("    Sonnet version: {}".format(snt.__version__))

In [None]:
class MLP(snt.Module):
  """Small Sonnet multi-layer perceptron.

  Flattens its input, applies two 1024-unit linear layers with ReLU
  activations, and finishes with a 10-unit linear layer.
  """

  def __init__(self):
    """Create the flatten step and the three linear layers."""
    super().__init__()
    self.flatten = snt.Flatten()
    self.hidden1 = snt.Linear(1024, name="hidden1")
    self.hidden2 = snt.Linear(1024, name="hidden2")
    self.logits = snt.Linear(10, name="logits")

  def __call__(self, images):
    """Run the network on `images` and return the output of the last layer."""
    flat = self.flatten(images)
    first_hidden = tf.nn.relu(self.hidden1(flat))
    second_hidden = tf.nn.relu(self.hidden2(first_hidden))
    return self.logits(second_hidden)

In [None]:
# Instantiate the toy network and display its repr as the cell output.
mlp = MLP()
mlp

In [None]:
%%bash
# Run the test script that ships with the kinetics-i3d repo.
# NOTE(review): needs a local kinetics-i3d checkout; the `git clone` for it is commented out in the install cell.
cd kinetics-i3d/
python i3d_test.py

In [None]:
import tensorflow_hub as hub

In [None]:
# Load the I3D Kinetics-400 module from TF-Hub and keep its 'default' signature as the callable model.
i3d = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-400/1").signatures['default']

In [None]:
# Kinetics label names, one per line; the list position is used as the class index in predict().
_LABEL_MAP_PATH = '/content/kinetics-i3d/data/label_map.txt'
# `with` closes the file handle, which a bare open() inside the comprehension never did.
with open(_LABEL_MAP_PATH) as label_file:
    kinetics_classes = [line.strip() for line in label_file]

In [None]:
import cv2
import numpy as np

In [None]:
# Utilities to open video files using CV2
def crop_center_square(frame):
  """Return the largest centred square region of `frame`.

  Args:
    frame: Array whose first two axes are (rows, columns); any further axes
      are kept untouched.

  Returns:
    A slice of `frame` whose first two axes both have length
    min(rows, columns).
  """
  height, width = frame.shape[:2]
  side = min(height, width)
  top = height // 2 - side // 2
  left = width // 2 - side // 2
  return frame[top:top + side, left:left + side]

In [None]:
def load_video(path, max_frames=0, resize=(224, 224)):
  """Read a video with OpenCV into a float array scaled by 1/255.

  Each frame is centre-cropped to a square, resized to `resize`, and has its
  last axis reordered with indices (2, 1, 0) — presumably BGR -> RGB.

  Args:
    path: Video file handed to cv2.VideoCapture.
    max_frames: Stop after this many frames. The default 0 never matches the
      frame count, so the whole video is read.
    resize: Target size passed to cv2.resize.

  Returns:
    np.ndarray of the collected frames divided by 255.0.
  """
  capture = cv2.VideoCapture(path)
  collected = []
  try:
    ok, image = capture.read()
    while ok:
      image = cv2.resize(crop_center_square(image), resize)
      collected.append(image[:, :, [2, 1, 0]])

      if len(collected) == max_frames:
        break
      ok, image = capture.read()
  finally:
    # Always release the capture handle, even if a frame fails to process.
    capture.release()
  return np.array(collected) / 255.0

In [None]:
# Load up to 250 frames of whichever file `video_fname` currently points to.
# NOTE(review): `video_fname` is left over from earlier cells, and the name `sample`
# shadows `random.sample` imported at the top of the notebook.
sample = load_video(video_fname, max_frames=250)

In [None]:
def predict(sample_video, labels):
    """Classify one video with the I3D model and print the five best labels.

    Args:
        sample_video: Frames of a single video; converted to a float32 tensor
            and given a leading batch axis.
        labels: Sequence of class names indexed by class id.

    Returns:
        The logits returned by the model under the 'default' key (batch axis
        included).
    """
    # Add a batch axis to the sample video.
    batch = tf.constant(sample_video, dtype=tf.float32)[tf.newaxis, ...]

    logits = i3d(batch)['default']
    probabilities = tf.nn.softmax(logits[0])
    print(logits.shape)

    print("Top 5 actions:")
    # Class ids ordered from highest to lowest probability, first five only.
    top_five = np.argsort(probabilities)[::-1][:5]
    for class_id in top_five:
        print(f"  {labels[class_id]:22}: {probabilities[class_id] * 100:5.2f}%")
    return logits

In [None]:
# Classify the loaded clip; prints the top-5 Kinetics labels and keeps the raw logits.
logits = predict(sample, kinetics_classes)

In [None]:
# Display the shape of the logits returned by predict().
logits.shape

##### I3D (OanaIgnat) (**TODO**)

In [None]:
# Fetch the Keras I3D implementation into ./i3d_keras (the next cell runs its main.sh).
!git clone https://github.com/OanaIgnat/i3d_keras.git

In [None]:
%%bash
# Run the entry script of the cloned i3d_keras repo from inside its folder.
cd i3d_keras/
./main.sh

In [None]:
# NOTE(review): copies a file named `abseiling_k400` to `id`; its purpose is not evident from this notebook — confirm or remove.
!cp abseiling_k400 id

## Visualizations

After extracting the per-frame accuracies for all categories and saving them as nested dictionaries in `.pkl` files under `frame-by-frame/saved/`, it is useful to visualize these quantities. \\
For that, load the nested dictionary from pickle file:

In [None]:
# Imports

In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Define paths
path_prefix = Path().parent.absolute() / 'frame-by-frame'
dict_path = path_prefix / 'saved/ResNet50_MiTv1_accuracies_per_category.pkl'
# Load from file; `with` closes the handle, and only trusted pickles should be loaded.
with open(dict_path, 'rb') as f:
    accuracies_per_category = pickle.load(f)
# Print the categories
print(sorted(accuracies_per_category))

###### Max-accuracy frame

Example of per-frame accuracies from a given file: 

In [None]:
# Per-frame values of one example video, flattened to a 1-D vector
example_values = accuracies_per_category['cuddling']['yt-bsmpimPzz4c_8.mp4']
x = np.array(example_values).reshape((len(example_values), ))
print(x)
# Locate the best frame once and reuse the index for the value lookup
best = np.argmax(x)
print(f'Max accuracy at frame: {best} with value: {x[best]}')

Sweep through categories and through files and extract the frame number where the accuracy is maximal:

In [None]:
# {category_name: {video_file_name: (frame index, value at that frame)}}
best_frame_dict = {}
worst_frame_dict = {}

n_categories = len(accuracies_per_category)
# Iterate over the categories in alphabetical order; z is the 1-based progress counter
for z, category_name in enumerate(sorted(accuracies_per_category), start=1):
    print(f'{category_name} {z}/{n_categories}')

    best_frame_dict[category_name] = {}
    worst_frame_dict[category_name] = {}
    # Iterate over the files of this category
    for video_fname, frame_values in accuracies_per_category[category_name].items():

        # Keep the stored layout (presumably (n_frames, 1)): indexing a row
        # then yields a 1-element array, which later cells read with [1][0].
        per_frame_accuracies = np.array(frame_values)

        # Locate the extremes once and reuse the indices below
        idx_max = np.argmax(per_frame_accuracies)
        idx_min = np.argmin(per_frame_accuracies)
        value_max = per_frame_accuracies[idx_max]
        value_min = per_frame_accuracies[idx_min]

        # The label order matches the printed values: max first, then min
        print(f'\t{video_fname} : Max/Min accuracy at frame:' \
        f' {idx_max}/{idx_min}' \
        f' with value: {value_max}' \
        f' / {value_min}')

        # Write the frame index and its value to the dicts
        best_frame_dict[category_name][video_fname] = (idx_max, value_max)
        worst_frame_dict[category_name][video_fname] = (idx_min, value_min)
        

In [None]:
# [category_name, mean of the best-frame values over its videos] per category
per_category = []
z = 1
n_categories = len(list(accuracies_per_category.keys()))
for category_name in sorted(accuracies_per_category):
    print(f'{category_name} {z}/{n_categories}')
    z += 1

    # Best-frame value of every video in this category
    vals = []
    for video_fname in accuracies_per_category[category_name]:
        best_value = best_frame_dict[category_name][video_fname][1]
        vals.append(best_value[0])

    # Categories without videos contribute nothing
    if len(vals) > 0:
        per_category.append([category_name, np.mean(vals)]) # np.amax(vals) for max value
print(per_category)

In [None]:
import pandas as pd
# Build the frame straight from the [label, value] rows. Going through
# np.array() first turned the values into strings and failed on an empty list.
acc_df = pd.DataFrame(per_category, columns=['labels', 'accuracies'])
acc_df['accuracies'] = acc_df['accuracies'].astype('float')
# Sort values, best category first
acc_df = acc_df.sort_values(by=['accuracies'], ascending=False)
print(acc_df.head())

In [None]:
# Bar plot of the per-category values in acc_df, in the order acc_df was sorted.
from matplotlib import rc
import matplotlib.ticker as ticker
rc('text', usetex=False) # change to True, if TeX is installed

sns.set_style("whitegrid", {'grid.linestyle': '--'})

# Wide figure: one bar per category
plt.figure(figsize=(30, 5))
ax = sns.barplot(x = 'labels', y = 'accuracies',
                 data=acc_df)

# Rotate and shrink the category labels so they do not overlap
plt.xticks(rotation=90, size=6)
# Major y ticks every 0.1
ax.yaxis.set_major_locator(ticker.MultipleLocator(0.1))
ax.set_title(f'Ordered classification accuracies per category (average over videos) w/ ResNet50-MiTv1')
ax.set_xlabel('Categories')
ax.set_ylabel('Classification accuracy (softmax)')
#plt.savefig('ordered_accuracies_RN50_MiTv1.pdf')
plt.show()

###### Best vs. Worst frame

In [None]:
# Imports
# decord is needed again here to read the best and worst frames back from the video files.
from decord import VideoReader
from decord import cpu #, gpu
import decord
decord.bridge.set_bridge('native') # Seems to be the fastest option

In [None]:
c_name = 'jogging' # because it's loaded in repo
f_name =  'yt-cN0DqxL3bgM_52.mp4' #'yt-mKCXk3Gu-eg_74.mp4'

# Per-frame values of the selected video, flattened to a 1-D vector
x = np.array(accuracies_per_category[c_name][f_name])
x = x.reshape((len(x), ))
print(x)

# Report the extremes of the selected video itself, not of whichever video a
# previous loop happened to leave in `video_fname` / `per_frame_accuracies`.
idx_max, idx_min = np.argmax(x), np.argmin(x)
print(f'{f_name} : Max/Min accuracy at frame:' \
        f' {idx_max}/{idx_min}' \
        f' with value: {x[idx_max]}' \
        f' / {x[idx_min]}')

In [None]:
import seaborn as sns
# One bar per frame for the selected category/video
ax = sns.barplot(data=accuracies_per_category[c_name][f_name])
for ind, label in enumerate(ax.get_xticklabels()):
    # Keep every 10th tick label
    label.set_visible(ind % 10 == 0)
ax.set_title(f'Prediction accuracy for "{c_name}"')
ax.set_xlabel('Frame nr.')
ax.set_ylabel('Prediction accuracy (softmax)')

plt.show()

In [None]:
# Load file
# Open the raw video of the selected category/file from the repo's data folder.
path_2_file = path_prefix / f'data/MIT_sampleVideos_RAW/{c_name}/{f_name}'
vr = VideoReader(str(path_2_file))

In [None]:
# Load best and worst frames
best_idx  = best_frame_dict[c_name][f_name][0]
worst_idx = worst_frame_dict[c_name][f_name][0]

best_frame = vr.get_batch([best_idx])
worst_frame = vr.get_batch([worst_idx])

In [None]:
# Composite figure: worst and best frame on top, per-frame bar plot underneath.
sns.set_style("whitegrid", {'grid.linestyle': '--'})

plt.figure(figsize=(20, 10))

# Bottom row (2x1 grid, slot 2). sns.barplot is called without `ax=`, so it
# relies on this subplot being the current axes — keep the statement order.
ax1 = plt.subplot(212)
#ax1.margins(0.05)          
ax1 = sns.barplot(data=accuracies_per_category[c_name][f_name])
ax1.set_title('Per-frame accuracies')
ax1.set_xlabel('Frame nr.')
ax1.set_ylabel('Prediction accuracy (softmax)')


# Top-left (2x2 grid, slot 1): worst frame
ax2 = plt.subplot(221)
#ax2.margins(2, 2)           
ax2.imshow(worst_frame.asnumpy()[0])
ax2.set_title(f'Worst frame ({worst_idx})')

# Top-right (2x2 grid, slot 2): best frame
ax3 = plt.subplot(222)
#ax3.margins(x=0, y=-0.25)  
ax3.imshow(best_frame.asnumpy()[0])
ax3.set_title(f'Best frame ({best_idx})')

# Save before show()
plt.savefig(str(path_prefix / 'plots/best_vs_worst_frame_example.pdf'))
plt.show()

In [None]:
# Same best-vs-worst figure as the previous cells, for a second example video.
# NOTE(review): this duplicates the load/plot code above; a shared helper taking
# (c_name, f_name, video path, output name) would remove the copy.
c_name = 'throwing' # because it's loaded in repo
f_name =  'yt-_9SB27cLFKo_6.mp4' #'yt-mKCXk3Gu-eg_74.mp4'

# Load file
# NOTE(review): absolute Colab path, unlike the repo-relative path used for the first example.
path_2_file =  f'/content/{c_name}/{f_name}'
vr = VideoReader(str(path_2_file))

# Load best and worst frames
# Element [0] of each stored tuple is the frame index.
best_idx  = best_frame_dict[c_name][f_name][0]
worst_idx = worst_frame_dict[c_name][f_name][0]

best_frame = vr.get_batch([best_idx])
worst_frame = vr.get_batch([worst_idx])

# Plot
sns.set_style("whitegrid", {'grid.linestyle': '--'})

plt.figure(figsize=(20, 10))

# Bottom row. sns.barplot is called without `ax=`, so it relies on this
# subplot being the current axes — keep the statement order.
ax1 = plt.subplot(212)
#ax1.margins(0.05)          
ax1 = sns.barplot(data=accuracies_per_category[c_name][f_name])
ax1.set_title('Per-frame accuracies')
ax1.set_xlabel('Frame nr.')
ax1.set_ylabel('Prediction accuracy (softmax)')


# Top-left: worst frame
ax2 = plt.subplot(221)
#ax2.margins(2, 2)           
ax2.imshow(worst_frame.asnumpy()[0])
ax2.set_title(f'Worst frame ({worst_idx})')

# Top-right: best frame
ax3 = plt.subplot(222)
#ax3.margins(x=0, y=-0.25)  
ax3.imshow(best_frame.asnumpy()[0])
ax3.set_title(f'Best frame ({best_idx})')

# Save before show()
plt.savefig(str(path_prefix / 'plots/best_vs_worst_frame_example_throwing.pdf'))
plt.show()