# Most Informative Frame (MIF) extraction

**Task:**

Having a given set of videos, perform frame-wise classification of these videos using a pre-trained model.
In this case, the model is a ResNet50 pretrained on Moments in Time v1 (MiTv1).

## Preparation

In [None]:
# Install decord if necessary
# AB: Why? What does it do?
# AB: Generally check out the comments in '01.mif_extraction_exploratory' and copy some infor here as well; this should be sufficient
!pip install --upgrade decord

Collecting decord
[?25l  Downloading https://files.pythonhosted.org/packages/64/5e/e2be6a3a3a46275059574d9c6a1d422aa6c7c3cbf6614939b8a3c3f8f2d5/decord-0.5.2-py3-none-manylinux2010_x86_64.whl (14.1MB)
[K     |████████████████████████████████| 14.1MB 250kB/s 
Installing collected packages: decord
Successfully installed decord-0.5.2


#### Load pretrained model

In [2]:
#%% Define and load model
from pathlib import Path
import torch
import torch.nn as nn
import torchvision.models as models

with_cuda = False
# Define and load model
from pathlib import Path
import torch
import torch.nn as nn
import torchvision.models as models

with_cuda = False
path_prefix = Path('..')

if with_cuda:
    resnet50 = models.resnet50(pretrained=False, progress=True, num_classes=339).to('cuda')
else:
    resnet50 = models.resnet50(pretrained=False, progress=True, num_classes=339)

# AB: where from?
# Load pretrained weights (MiTv1)
#path_model = Path('/content/drive/MyDrive/resnet50_moments-fd0c4436.pth')
path_model = path_prefix / 'models/weights/resnet50_moments-fd0c4436.pth'
resnet50.load_state_dict(torch.load(path_model))

# AB: source / structure
# Evaluation mode
resnet50.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

#### Define transformations

In [3]:
#%% Transformations
import torchvision.transforms as transforms
transformation = transforms.Compose([
                                     transforms.ToPILImage(mode='RGB'), # required if the input image is a nd.array
                                     transforms.Resize(224), # To be changed to rescale to keep the aspect ration?
                                     transforms.CenterCrop((224, 224)), # AB: cropped to 224x224px
                                     transforms.ToTensor(), # AB: creates Tensor (?)
                                     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                          std=[0.229, 0.224, 0.225])
])

#### Load categories

In [7]:
# %% Load categories
# AB: How/Where stored? Just the names/labels of the categories? --> make clear
path_labels = path_prefix / 'models/labels/category_momentsv1.txt'

def load_categories():
    """Load categories."""
    with open(path_labels) as f:
        return [line.rstrip() for line in f.readlines()]

# load categories
categories = load_categories()

#### Load videos

In [9]:
#%% Sweep through files in subfolders of path_input
import os
path_input = path_prefix / 'input_data/TRIMMING/MP4s'

l_videos = []
for path, subdirs, files in os.walk(path_input):
  for name in files:
    if name[-3:] == 'mp4':
      l_videos.append([path.split('/')[-1],   # category
                       name])                 # file name
    else:
      print('Ignored: ', name)

if l_videos:
  l_videos = sorted(l_videos)
print('Total nr. of MP4s: ', len(l_videos))


Ignored:  .DS_Store
Total nr. of MP4s:  834


In [10]:
print(l_videos[:10])

[['aiming', 'yt-0gwUV4Ze-Hs_390.mp4'], ['aiming', 'yt-0qYbATyHm2A_59.mp4'], ['aiming', 'yt-iVSy96zolvw_23.mp4'], ['applauding', 'yt-06tUmXhgnSY_4.mp4'], ['applauding', 'yt-A70byjNkwdA_4.mp4'], ['applauding', 'yt-e9YwItc0qUc_5.mp4'], ['arresting', 'yt-A8r4MK3R4PI_175.mp4'], ['arresting', 'yt-aAVfUYxx12g_18.mp4'], ['arresting', 'yt-wY-LUhSZtv8_16.mp4'], ['ascending', 'vine_R-Mah0Jqm9OXZ_1.mp4']]


## TRUE class & MIF only

Extract prediction accuracy for the TRUE category on MIFs (most informative frame, i.e. frame w/ highest prediction accuracy)

**Output**

```
pandas.DataFrame w/ columns ['category', 'fname', 'mif_idx', 'softmax[category]']
```

Extract prediction accuracy for the TRUE category on MIFs (most informative frame, i.e. frame w/ highest prediction accuracy)

In [13]:
# %% Sweep through videos
import time
import decord
decord.bridge.set_bridge('native') # Seems to be the fastest option
from decord import cpu, gpu
from decord import VideoReader
from torch.nn import functional as F
import numpy as np
import pandas as pd

start = time.time()

l_mifs = []

# Iterate over entries in l_videos:
for j in range(len(l_videos)):
  # Verbose
  if j%50 == 0:
    print(f'{j}/{len(l_videos)}')
  
  
  # Define path
  category, file_name = l_videos[j]
  cat_idx = categories.index(category)
  path_input_file = str(path_input / category/ file_name)
  
  print(category, cat_idx)
  """
  # Load video with Decord.VideoReader
  vr = VideoReader(path_input_file)
  video_frames = vr.get_batch(range(0, len(vr), 1)).asnumpy()
  
  # Define empty array for accuracies
  pred_accuracies = np.zeros((video_frames.shape[0], ))

  # Iterate over frames
  for i in range(video_frames.shape[0]):
    if with_cuda:
      input = transformation(video_frames[i]).to('cuda')
    else:
      input = transformation(video_frames[i])
    
    # Classification:
    logit = resnet50.forward(input.unsqueeze(0))      # extract output to given input 
    h_x = F.softmax(logit, 1).data.squeeze()[cat_idx] # transform to softmax
    pred_accuracies[i]= h_x

  
  # Append to output list
  l_mifs.append([category, file_name,
                 np.argmax(pred_accuracies),
                 pred_accuracies[np.argmax(pred_accuracies)]])
   """
stop = time.time()
duration = stop-start
print(f'\nTime elapsed:: {duration:.4f}s (~ {duration/j:.2f}s per file)')

0/834
aiming 35
aiming 35
aiming 35
applauding 206
applauding 206
applauding 206
arresting 260
arresting 260
arresting 260
ascending 328
ascending 328
ascending 328
asking 50
asking 50
asking 50
assembling 217
assembling 217
assembling 217
attacking 174
attacking 174
attacking 174
autographing 267
autographing 267
autographing 267
baking 273
baking 273
baking 273
balancing 113
balancing 113
balancing 113
baptizing 198
baptizing 198
baptizing 198
barbecuing 231
barbecuing 231
barbecuing 231
bathing 104
bathing 104
bathing 104
bending 143
bending 143
bending 143
bicycling 152
bicycling 152
bicycling 152
biting 294
biting 294
biting 294
blocking 317
blocking 317
50/834
blocking 317
blowing 278
blowing 278
blowing 278
boarding 20
boarding 20
boarding 20
boating 196
boating 196
boating 196
bouncing 290
bouncing 290
bouncing 290
bowing 263
bowing 263
bowing 263
boxing 144
boxing 144
boxing 144
breaking 18
breaking 18
breaking 18
brushing 117
brushing 117
brushing 117
bubbling 102
bubbling 10

In [14]:
categories[327]

'speaking'

Stack together and save to csv as: \\
  `category, fname, mif_idx`

In [9]:
import pandas as pd

# Additional information comparing pandas data frame and pickle files see also at the end.

df = pd.DataFrame(l_mifs, columns=['category', 'fname', 'mif_idx', 'softmax[category]'])
print(df)
df.to_csv(path_prefix / 'temp/mifs.csv')

     category                   fname  mif_idx  softmax[category]
0      aiming  yt-0gwUV4Ze-Hs_390.mp4       58           0.607241
1      aiming   yt-0qYbATyHm2A_59.mp4        4           0.068653
2      aiming  yt-2yYb3iQCivw_130.mp4       41           0.407697
3      aiming   yt-chT_6aIyhD4_47.mp4        8           0.019289
4      aiming  yt-fG9wZzs4jis_124.mp4        1           0.354655
5      aiming   yt-fM2iXUuaP7U_48.mp4        0           0.051184
6      aiming   yt-iVSy96zolvw_23.mp4       43           0.270948
7  applauding    yt-06tUmXhgnSY_4.mp4       48           0.062576
8  applauding    yt-A70byjNkwdA_4.mp4        0           0.026515
9  applauding   yt-E14-2TmbCD8_12.mp4        1           0.540867


#### Testing

Using a previously computed dictionary of softmax values, check if same values were collected

In [16]:
from pathlib import Path
import pickle
import numpy as np

# AB: A bit more info considering the pickle file and where it comes from (was this in '01.mif_extraction_exploratory' initially?)

dict_path = path_prefix / 'temp/ResNet50_MiTv1_accuracies_per_category.pkl'
# Load from file
f = open(dict_path, 'rb')
accuracies_per_category = pickle.load(f)

l_categories = categories

# AB: Example ./aiming/yt-0gwUV4Ze-Hs_390.mp4

category_name = 'aiming'
video_fname = 'yt-0gwUV4Ze-Hs_390.mp4'

per_frame_accuracies = np.array(accuracies_per_category[category_name][video_fname])

print(f'\t{video_fname} : Max/Min accuracy at frame:' \
f' {np.argmax(per_frame_accuracies)}/{np.argmin(per_frame_accuracies)}' \
f' with value: {per_frame_accuracies[np.argmax(per_frame_accuracies)]}' \
f' / {per_frame_accuracies[np.argmin(per_frame_accuracies)]}')

	yt-0gwUV4Ze-Hs_390.mp4 : Max/Min accuracy at frame: 58/23 with value: [0.60724086] / [0.083207]


In [29]:
df.loc[lambda df: df['fname'] == video_fname]

Unnamed: 0,category,fname,mif_idx,softmax[category]
0,aiming,yt-0gwUV4Ze-Hs_390.mp4,58,0.607241


## ALL classes & ALL frames

**Output**

    ```
    {
        category_i : {
            video_j : [list of per-frame accuracies for all categories]
        }
    }
    ```

Time duration:
* i5-7400 CPU @ 3.00GHz; 16GB RAM; No CUDA: **>15h**
* Google Colab + GPU (CUDA) acceleration: **3-5h**

In [None]:
import time
from torch.nn import functional as F

accuracies_per_category = {}
z = 1 # For verbose

time_init = time.time()

# Iterate over categories in path_prefix
for category_name in list(d_files_per_category.keys()):
    print(f'{category_name} {z}/{len(list(d_files_per_category.keys()))}'); z += 1
    
    # Define empty dictionary entry for current category
    accuracies_per_category[category_name] = {}

    # Iterate over files in cateogory_path
    for video in list(d_files_per_category[category_name]):
        # verbose
        print('\t', video)
 
        video_fname = d_files_per_category[category_name][video]
        
        # Load video with Decord.VideoReader
        vr = VideoReader(video_fname, ctx=cpu(0))
        frame_id_list = range(0, len(vr), 1)
        video_frames = vr.get_batch(frame_id_list)

        pred_accuracies = []
        true_category = category_name

        start_time = time.time()
        # Iterate through frames
        for i in range(video_frames.shape[0]):
            # Load frame
            frame = video_frames.asnumpy()[i]

            # Transform
            if with_cuda:
                input = transformation(frame).to('cuda')
            else:
                input = transformation(frame)
            
            # Classification:
            logit = resnet50_moments.forward(input.unsqueeze(0))
            h_x = F.softmax(logit, 1).data.squeeze()
            probs, idx = h_x.sort(0, True)

            # Save accuracies for the all classes (TYPE #2)
            pred_accuracies.append(probs.cpu().numpy())
        
        # Calculate avg duration per frame
        end_time = time.time()
        print('\tAvg duration per frame: %4.4f seconds.' % ((end_time - start_time)/(video_frames.shape[0])))
        
        # Add computed list of accuracies as entry in the output dictionary
        accuracies_per_category[category_name][video] = pred_accuracies

time_final = time.time()
duration = time_final-time_init
print(f'\nTime elapsed: {duration:.2f}s (~ {duration/z:.2f}s per file)')
        

In [None]:
# Save dictionary to pickle file

# Difference: pandas data frame || pickle file
# The pickle (.pkl) file is different form the pandas dataframe used earlier.

# Where the dataframe contains the foruc columns:
# category fname mif_idx  softmax[category]

# the pickle file contains the accuracy per category for all files (massive data)
# additionally it contains the nested structure of the data
# (the mif indices could also restored from the larger pickle file again,
# but not the other way round)

import pickle
dict_path = path_prefix / 'temp/accuracies_per_category_mitv1_fps-25.pkl'

if dict_path.is_file():
    print(f'File {dict_path} already exists. Change dict_path and re-run cell!')
else:
    with open(dict_path, 'wb') as f:
        pickle.dump(accuracies_per_category, f)
        f.close()
    print(f'Saved at {dict_path.absolute()}')

Difference: pandas data frame || pickle file

The pickle (.pkl) file is different form the pandas dataframe used earlier.

Where the dataframe contains the foruc columns:
category fname mif_idx  softmax[category]

the pickle file contains the accuracy per category for all files (massive data) additionally it contains the nested structure of the data
(the mif indices could also restored from the larger pickle file again, but not the other way round)