# D3D Helper

Unofficial example code for using a pre-trained Distilled 3D Network (D3D) for video classification. For more details, please refer to the paper:

"D3D: Distilled 3D Networks for Video Action Recognition."  
_Jonathan C. Stroud, David A. Ross, Chen Sun, Jia Deng, and Rahul Sukthankar._  
[arXiv 2018](https://arxiv.org/abs/1812.08249)

## Prerequisites

1. Tensorflow 1.12
2. imageio
3. skimage
4. Download the model checkpoints from [Google Drive](https://drive.google.com/drive/folders/1Yb-g-Ae_B4tyM7N7bk0kRd2r5ChO-oa3?usp=sharing) and place them in `d3dhelper/weights/<model_name>`.

For example, if you download `d3d_kinetics_600` and unzip it in the correct folder, the output tree will be:
```
weights
└── d3d_kinetics_600
    ├── model.ckpt.data-00000-of-00001
    ├── model.ckpt.index
    └── model.ckpt.meta
```    
Make sure to change the variable `_MODEL_CKPT` to load the new model, along with any other variables that must match it (for example, `_DATASET` for the input dataset and `_NET_TYPE` for the network type).

In [None]:
import sys
import imageio
import numpy as np
import skimage.transform
import tensorflow as tf
import matplotlib.pyplot as plt
import time
%matplotlib inline

sys.path.append('./models/research/slim')
from nets import i3d
from nets import s3dg
arg_scope = tf.contrib.framework.arg_scope

# Model options
# _NET_TYPE selects which slim network builder is used when the graph is built.
_NET_TYPE = 's3dg'  # Options: 's3dg', 'i3d'
# _DATASET determines the number of output classes (400 or 600).
_DATASET = 'kinetics_600'  # Options: 'kinetics_400', 'kinetics_600'
# Checkpoint prefix passed to the saver's restore call; the files on disk are
# model.ckpt.data-*, model.ckpt.index and model.ckpt.meta.
_MODEL_CKPT = './weights/d3d_kinetics_600/model.ckpt'

## Load sample videos

In [None]:
def load_video(fn, num_frames, input_size):
    """Load the leading frames of a video as a square, center-cropped clip.

    Every frame is divided by 255, resized so that its shorter side is
    ``input_size`` pixels, and then center-cropped to
    ``input_size`` x ``input_size``.

    Args:
        fn: Video location, passed unchanged to ``imageio.get_reader``.
        num_frames: Number of frames in the returned clip. Only the first
            ``num_frames`` frames of the video are read.
        input_size: Side length, in pixels, of each output frame.

    Returns:
        A float32 array of shape ``(1, num_frames, input_size, input_size, 3)``.
        If the video has fewer than ``num_frames`` frames, the remaining
        frames are all zeros.
    """
    # Zero-initialised so that frames missing from a short video are
    # well-defined instead of uninitialised memory.
    video = np.zeros((1, num_frames, input_size, input_size, 3), np.float32)

    # The context manager closes the underlying file/decoder even if a frame
    # fails to process.
    with imageio.get_reader(fn) as reader:
        # range() comes first so zip() stops before pulling (and discarding)
        # an extra frame from the reader.
        for i, im in zip(range(num_frames), reader):
            # Convert to float
            im = im / 255.0

            # Scale so the shorter side becomes input_size. The max() guards
            # against float error truncating that side to input_size - 1,
            # which would make the crop below too small.
            h, w = im.shape[:2]
            scale_factor = float(input_size) / min(h, w)
            new_h = max(input_size, int(h * scale_factor))
            new_w = max(input_size, int(w * scale_factor))
            im = skimage.transform.resize(im, (new_h, new_w))

            # Center crop
            top = (new_h - input_size) // 2
            left = (new_w - input_size) // 2
            video[:, i] = im[top:top + input_size, left:left + input_size]

    return video

In [None]:
# Video options for the two bundled sample clips: output side length and
# number of frames passed to load_video.
_INPUT_SIZE = 224
_NUM_FRAMES = 64

abseiling = load_video('abseiling.mp4', _NUM_FRAMES, _INPUT_SIZE)
airdrum = load_video('airdrum.mp4', _NUM_FRAMES, _INPUT_SIZE)

# Custom part: a separate clip loaded with its own frame count and size.
# These two values are also used for the input placeholder shape.
framesInput = 300
inputSize = 224
volleyball = load_video('volleyball.mp4', framesInput, inputSize)

In [None]:
def show_video(video, num_frames):
    """Show three frames of a clip side by side in one matplotlib figure.

    The frames at one quarter, one half and three quarters of ``num_frames``
    are drawn left to right with the axes hidden.

    Args:
        video: Clip array indexed as ``video[:, frame_index]``; singleton
            dimensions are squeezed away before display.
        num_frames: Frame count used to pick the three preview positions.
    """
    frame_indices = (num_frames // 4, num_frames // 2, 3 * num_frames // 4)

    for column, frame_idx in enumerate(frame_indices, start=1):
        plt.subplot(1, 3, column)
        plt.imshow(video[:, frame_idx].squeeze())
        plt.axis('off')

    plt.show()

In [None]:
# Preview the two sample clips, using the frame count they were loaded with.
show_video(abseiling, _NUM_FRAMES)
show_video(airdrum, _NUM_FRAMES)

# Custom clip, previewed with its own frame count.
show_video(volleyball, framesInput)

## Initialize model architecture

In [None]:
# Input placeholder: one clip of framesInput frames, each
# inputSize x inputSize with 3 channels.
inputs = tf.placeholder(tf.float32, (1, framesInput, inputSize, inputSize, 3))

# Number of output classes is determined by the dataset the checkpoint
# was trained on.
if _DATASET == 'kinetics_400':
    num_classes = 400
elif _DATASET == 'kinetics_600':
    num_classes = 600
else:
    # Fail here rather than with a NameError on num_classes further down.
    raise ValueError('Dataset not supported: %r' % (_DATASET,))

# Build the selected network in inference mode under its own arg scope.
if _NET_TYPE == 's3dg':
    sc = s3dg.s3dg_arg_scope()
    with arg_scope(sc):
        net, end_points = s3dg.s3dg(inputs, num_classes, is_training=False)
elif _NET_TYPE == 'i3d':
    sc = i3d.i3d_arg_scope()
    with arg_scope(sc):
        net, end_points = i3d.i3d(inputs, num_classes, is_training=False)
else:
    # Without a network, `net` would be undefined and later cells would fail
    # with an unrelated NameError; report the real cause immediately.
    raise ValueError('Network type not supported: %r' % (_NET_TYPE,))

## Load D3D weights and run model

In [None]:
# Saver created with default arguments; used below to load the checkpoint
# into the session.
saver = tf.train.Saver()

with tf.Session() as sess:
    # Load the pre-trained weights from the checkpoint prefix.
    saver.restore(sess, _MODEL_CKPT)
    
    # NOTE(review): the sample clips are loaded with _NUM_FRAMES (64) frames,
    # while `inputs` is declared with framesInput (300) frames, so these feeds
    # presumably would not match the placeholder shape -- confirm before
    # re-enabling.
    # net_eval_abseiling = sess.run(net, feed_dict={inputs: abseiling})
    # net_eval_airdrum = sess.run(net, feed_dict={inputs: airdrum})
    # Timing starts after the restore, so it covers only the forward pass.
    start_time = time.time()
    
    net_eval_volley = sess.run(net, feed_dict={inputs: volleyball})
    
    end_time = time.time()
    print("--- Execution time: %s seconds ---" % (end_time - start_time))

## Check results

"abseiling" and "air drumming" are classes 0 and 1 in Kinetics-400, respectively.

In [None]:
# print(net_eval_abseiling.argmax())

In [None]:
# print(net_eval_airdrum.argmax())

In [None]:
# Custom clip: index of the largest entry in the network output
# (presumably the predicted class id -- confirm against the dataset label map).
print(net_eval_volley.argmax())