# Video Classification/Action Recognition using CNN and RNN

## Data collection

Dataset: UCF101 Action Recognition dataset (https://www.crcv.ucf.edu/data/UCF101.php)

To keep data collection simple, split 1 of the UCF101 dataset has already been downloaded to Google Drive.

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive/ so that files
# stored on Drive (the dataset archive, saved features) are reachable by path.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Copy the UCF101 archive from the mounted Drive into /content.
!cp /content/gdrive/MyDrive/ucf101.tar.gz /content

In [None]:
# Unpack the archive into the current working directory (expected to be /content).
# NOTE(review): presumably this provides the train/ and test/ video folders and
# the train.csv / test.csv listings read further down; confirm archive layout.
!tar xf ucf101.tar.gz

## Setup

In [None]:
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os
import pickle
import math

## Define hyperparameters

In [None]:
# --- Frame geometry and feature size ---------------------------------------
# Every sampled frame is resized to IMG_SIZE x IMG_SIZE pixels.
IMG_SIZE = 224
# Length of the feature vector stored for each frame.
NUM_FEATURES = 1280

# --- Temporal sampling -----------------------------------------------------
# Target sampling rate. UCF101 videos are standardized to 25 fps, so the number
# of source frames skipped between two kept frames is floor(25 / FPS) - 1.
# With FPS = 6 that is 3: source frames 4, 8, 12, 16, 20, 24, ... are kept.
FPS = 6
Frames_to_skip = math.floor(25 / FPS) - 1

# Frame cap passed to load_video. load_video stops only when the number of
# collected frames equals this value, which never happens for 0, so 0 means
# "no cap".
MAX_FRAMES = 0
# Number of timesteps kept per video. 20 kept frames span 80 source frames,
# i.e. about 3.2 seconds of a 25 fps video.
MAX_SEQ_LENGTH = 20

## Data preparation

In [None]:
# Load the train/test split listings. Each row names one video file
# ("video_name") and its action class ("tag"); both columns are read later
# by prepare_all_videos.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Report split sizes as a quick sanity check.
print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

# Show 10 random training rows (rendered because it is the cell's last expression).
train_df.sample(10)

Total videos for training: 9537
Total videos for testing: 3783


Unnamed: 0,video_name,tag
3826,v_HighJump_g21_c03.avi,HighJump
669,v_BaseballPitch_g24_c04.avi,BaseballPitch
7481,v_Shotput_g11_c05.avi,Shotput
4354,v_JugglingBalls_g10_c02.avi,JugglingBalls
8546,v_TaiChi_g10_c01.avi,TaiChi
8171,v_StillRings_g25_c06.avi,StillRings
7797,v_Skijet_g22_c04.avi,Skijet
7616,v_SkateBoarding_g20_c02.avi,SkateBoarding
6265,v_PlayingTabla_g08_c03.avi,PlayingTabla
4918,v_Lunges_g11_c04.avi,Lunges


In [None]:
# Helper functions to load, extract and preprocess frames from video

def crop_center_square(frame):
    """Return the largest centered square region of ``frame``.

    The side of the square equals the shorter of the first two dimensions
    of ``frame`` (rows, columns); any trailing dimensions are left intact.
    The result is a slice of the input, not a copy.
    """
    height, width = frame.shape[:2]
    side = min(height, width)

    # Offsets that center a `side`-sized window along each axis.
    top = height // 2 - side // 2
    left = width // 2 - side // 2

    return frame[top : top + side, left : left + side]

# Returns frames of a video
def load_video(path, max_frames=MAX_FRAMES, resize=(IMG_SIZE, IMG_SIZE)):
    """Read a video and return a subsampled stack of its frames.

    Frames are read sequentially with OpenCV. ``Frames_to_skip`` frames are
    dropped between every kept frame. Each kept frame is center-cropped to a
    square, resized to ``resize`` and has its channel order reversed
    (OpenCV decodes as BGR, so the stored frames are RGB).

    Args:
        path: Location of the video file.
        max_frames: Stop as soon as exactly this many frames were kept.
            A value of 0 never matches, so the whole video is read.
        resize: Target size passed to ``cv2.resize``.

    Returns:
        ``np.array`` built from the list of kept frames. It is empty when
        no frame could be read or the video is shorter than one sampling
        interval.
    """
    capture = cv2.VideoCapture(path)
    kept = []
    try:
        skipped = 0  # frames dropped since the last kept frame
        while True:
            ok, image = capture.read()
            if not ok:
                break

            # Still inside the skip window: drop this frame and move on.
            if skipped != Frames_to_skip:
                skipped += 1
                continue

            skipped = 0  # start a new skip window after this frame
            image = cv2.resize(crop_center_square(image), resize)
            kept.append(image[:, :, [2, 1, 0]])  # swap first and last channel

            if len(kept) == max_frames:
                break
    finally:
        # Always free the decoder handle, even if processing raised.
        capture.release()
    return np.array(kept)


## Defining Feature Extractor

We use the MobileNetV2 model, pre-trained on the ImageNet-1k dataset, to extract meaningful features from the sampled frames.

In [None]:

def build_feature_extractor():
    """Build the per-frame CNN feature extractor.

    Wraps an ImageNet-pretrained MobileNetV2 (classification head removed,
    global average pooling on top) behind MobileNetV2's own
    ``preprocess_input``, so callers can feed frames of shape
    ``(IMG_SIZE, IMG_SIZE, 3)`` directly.

    Returns:
        A ``keras.Model`` named ``"feature_extractor"`` mapping a batch of
        frames to one pooled feature vector per frame.
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)
    mobilenet = keras.applications.mobilenet_v2

    # Backbone is created before the Input layer, as in the original
    # construction order.
    backbone = mobilenet.MobileNetV2(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=frame_shape,
    )

    raw_frames = keras.Input(frame_shape)
    features = backbone(mobilenet.preprocess_input(raw_frames))
    return keras.Model(raw_frames, features, name="feature_extractor")


# Module-level CNN instance; prepare_all_videos calls it to embed frames.
feature_extractor = build_feature_extractor()   # keras.Model built from MobileNetV2

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5


Encoding class labels

In [None]:
# Get class names
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
class_names = label_processor.get_vocabulary()
print(class_names)

# save class names
with open('class_name_list.pickle', 'wb') as f:
  pickle.dump(class_names, f)
!cp class_name_list.pickle /content/gdrive/MyDrive

['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen', 'Diving', 'Drumming', 'Fencing', 'FieldHockeyPenalty', 'FloorGymnastics', 'FrisbeeCatch', 'FrontCrawl', 'GolfSwing', 'Haircut', 'HammerThrow', 'Hammering', 'HandstandPushups', 'HandstandWalking', 'HeadMassage', 'HighJump', 'HorseRace', 'HorseRiding', 'HulaHoop', 'IceDancing', 'JavelinThrow', 'JugglingBalls', 'JumpRope', 'JumpingJack', 'Kayaking', 'Knitting', 'LongJump', 'Lunges', 'MilitaryParade', 'Mixing', 'MoppingFloor', 'Nunchucks', 'ParallelBars', 'PizzaTossing', 'PlayingCello', 'PlayingDaf', 'PlayingDhol', 'PlayingFlute', 'PlayingGuitar', 'PlayingPiano', 'PlayingSitar', 'PlayingTabla', 'P

Finally, we can put all the pieces together to create our data processing utility.

In [None]:

def prepare_all_videos(df, root_dir):
    """Turn every video listed in ``df`` into fixed-length CNN features.

    For each row, the video ``root_dir/<video_name>`` is loaded with
    ``load_video``, its first ``MAX_SEQ_LENGTH`` sampled frames are embedded
    with ``feature_extractor``, and the result is stored in a zero-padded
    array together with a mask marking the real (non-padding) timesteps.

    Args:
        df: DataFrame with a ``"video_name"`` column (file names relative to
            ``root_dir``) and a ``"tag"`` column (class names).
        root_dir: Directory containing the video files.

    Returns:
        ``((frame_features, frame_masks), labels)`` where
        ``frame_features`` is float32 of shape
        ``(len(df), MAX_SEQ_LENGTH, NUM_FEATURES)``, ``frame_masks`` is bool
        of shape ``(len(df), MAX_SEQ_LENGTH)`` (True = real frame,
        False = padding) and ``labels`` is the output of ``label_processor``
        applied to the tags.
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    # The lookup layer is fed one tag per row, hence the added trailing axis.
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        frames = load_video(os.path.join(root_dir, path))

        # Keep at most MAX_SEQ_LENGTH frames; shorter videos stay zero-padded.
        length = min(MAX_SEQ_LENGTH, frames.shape[0])
        if length == 0:
            # No frame could be sampled (unreadable or very short video):
            # leave the row all-zero and fully masked instead of calling
            # predict on an empty batch.
            continue

        # One batched forward pass per video instead of one `predict` call
        # per frame; the frames are independent, so the output rows line up
        # with the timesteps.
        frame_features[idx, :length] = feature_extractor.predict(frames[:length])
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels

### Extract Features

In [None]:
# Sample frames and extract CNN features for every video listed in train_df,
# reading the files from the "train" directory.
# train_data is the (frame_features, frame_masks) pair returned by prepare_all_videos.
train_data, train_labels = prepare_all_videos(train_df, "train")

# Sanity-check the shapes of the two arrays.
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (9537, 20, 1280)
Frame masks in train set: (9537, 20)


Save extracted features from train data

In [None]:
with open('train_data.pickle', 'wb') as f:
    pickle.dump(train_data, f)

# Save train labels
with open('train_labels.pickle', 'wb') as f:
    pickle.dump(train_labels, f)

# zip and save to drive
!tar cf train_features.tar.gz train_data.pickle train_labels.pickle
!cp train_features.tar.gz /content/gdrive/MyDrive

 Extract features from test set

In [None]:
# Run the same frame-sampling and feature-extraction pipeline on the videos
# listed in test_df, reading the files from the "test" directory.
test_data, test_labels = prepare_all_videos(test_df, "test")

Save extracted features from test data

In [None]:
# Save extracted features from test data
with open('test_data.pickle', 'wb') as f:
    pickle.dump(test_data, f)

# Save test labels
with open('test_labels.pickle', 'wb') as f:
    pickle.dump(test_labels, f)

# zip and save to drive
!tar cf test_features.tar.gz test_data.pickle test_labels.pickle
!cp test_features.tar.gz /content/gdrive/MyDrive