# Video Classifier Using CNN and RNN
#!dir

In [2]:
# Mount Google Drive inside the Colab VM; the dataset is read from
# /content/gdrive/MyDrive/... in the cells that follow.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

# Root folder of the dataset on the mounted drive; every entry directly
# inside it is treated as one class folder by the cells below.
DATASET_DIR = '/content/gdrive/MyDrive/no_fight'

# List the folder once and reuse the result: the original listed the same
# Drive directory twice, which costs an extra round trip on a network mount
# and could in principle return the entries in a different order.
dataset_path = os.listdir(DATASET_DIR)

# The first two entries are the class labels.
label_types = dataset_path[0:2]
print(label_types)

['noFight', 'fight']


# Preparing Training Data

In [5]:
# Collect one (label, full video path) pair for every file found inside
# each class folder of the dataset.
rooms = [
    (item, '/content/gdrive/MyDrive/no_fight' + '/' + item + '/' + room)
    for item in dataset_path
    for room in os.listdir('/content/gdrive/MyDrive/no_fight' + '/' + item)
]

# Tabulate the pairs and preview both ends of the table.
train_df = pd.DataFrame(data=rooms, columns=['tag', 'video_name'])
print(train_df.head())
print(train_df.tail())

       tag                                         video_name
0  noFight  /content/gdrive/MyDrive/no_fight/noFight/nofi1...
1  noFight  /content/gdrive/MyDrive/no_fight/noFight/nofi1...
2  noFight  /content/gdrive/MyDrive/no_fight/noFight/nofi0...
3  noFight  /content/gdrive/MyDrive/no_fight/noFight/nofi0...
4  noFight  /content/gdrive/MyDrive/no_fight/noFight/nofi0...
       tag                                        video_name
295  fight  /content/gdrive/MyDrive/no_fight/fight/fi143.mp4
296  fight  /content/gdrive/MyDrive/no_fight/fight/fi059.mp4
297  fight  /content/gdrive/MyDrive/no_fight/fight/fi132.mp4
298  fight  /content/gdrive/MyDrive/no_fight/fight/fi085.mp4
299  fight  /content/gdrive/MyDrive/no_fight/fight/fi134.mp4


In [6]:
# Put the path column first, then persist the table for the training cells.
df = train_df.loc[:, ['video_name', 'tag']]

# index=False keeps the positional index out of the file; without it the
# index is written as an unnamed first column and comes back from
# pd.read_csv as a spurious "Unnamed: 0" column.
df.to_csv('train.csv', index=False)

# Preparing Test Data

In [5]:
# dataset_path = os.listdir('/content/gdrive/MyDrive/no_fight')
# print(dataset_path)

# room_types = os.listdir('dataset/test')
# print("Types of activities found: ", len(dataset_path))

# rooms = []

# for item in dataset_path:
#  # Get all the file names
#  all_rooms = os.listdir('dataset/test' + '/' +item)

#  # Add them to the list
#  for room in all_rooms:
#     rooms.append((item, str('dataset/test' + '/' +item) + '/' + room))
    
# # Build a dataframe        
# test_df = pd.DataFrame(data=rooms, columns=['tag', 'video_name'])
# print(test_df.head())
# print(test_df.tail())

# df = test_df.loc[:,['video_name','tag']]
# df
# df.to_csv('test.csv')

In [7]:
# Install tensorflow_docs straight from its GitHub repository; it provides
# the `embed` helper imported in the next cell.
!pip install git+https://github.com/tensorflow/docs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/tensorflow/docs
  Cloning https://github.com/tensorflow/docs to /tmp/pip-req-build-j7pltgcl
  Running command git clone --filter=blob:none --quiet https://github.com/tensorflow/docs /tmp/pip-req-build-j7pltgcl
  Resolved https://github.com/tensorflow/docs to commit f2427a69cf3b706192288403c09d19d2b3dfc231
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting astor
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Building wheels for collected packages: tensorflow-docs
  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone
  Created wheel for tensorflow-docs: filename=tensorflow_docs-0.0.0.dev0-py3-none-any.whl size=182612 sha256=c1acb376b94245584812e94dba6fad1dce6bbbe3b4216f9497cf93bcdfb2d207
  Stored in directory: /tmp/pip-ephem-wheel-cache-pal8u23u/wheels/fc/f8/3b/5d21409a59cb1be9b1ade11f682039ced75b84de9dd6a0c8de
Successful

In [9]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [10]:
# If a GPU is present, expose the first one to TensorFlow as a single
# virtual device with memory_limit=5120 instead of the whole card.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
  except RuntimeError as e:
    # The error is only printed, not re-raised, so later cells still run.
    # NOTE(review): presumably raised when the GPU was already initialised
    # before this cell ran - confirm against the TensorFlow docs.
    print(e)

# Data preparation

In [11]:
# Reload the table of (video path, label) rows written to train.csv earlier.
train_df = pd.read_csv("train.csv")
# test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
# print(f"Total videos for testing: {len(test_df)}")

# Shuffle the rows so both classes are mixed before training: fit() is later
# called with validation_split, which holds out the tail of the arrays.
# The shuffle is derived from the frame that was just read (not from a `df`
# left over from an earlier cell) and uses a fixed seed, so re-running this
# cell always produces the same order and the same train/validation split.
df = train_df[["video_name", "tag"]].sample(frac=1, random_state=42)

df

Total videos for training: 300


Unnamed: 0,video_name,tag
50,/content/gdrive/MyDrive/no_fight/noFight/nofi1...,noFight
270,/content/gdrive/MyDrive/no_fight/fight/fi068.mp4,fight
65,/content/gdrive/MyDrive/no_fight/noFight/nofi1...,noFight
45,/content/gdrive/MyDrive/no_fight/noFight/nofi1...,noFight
38,/content/gdrive/MyDrive/no_fight/noFight/nofi0...,noFight
...,...,...
101,/content/gdrive/MyDrive/no_fight/noFight/nofi0...,noFight
203,/content/gdrive/MyDrive/no_fight/fight/fi011.mp4,fight
86,/content/gdrive/MyDrive/no_fight/noFight/nofi0...,noFight
223,/content/gdrive/MyDrive/no_fight/fight/fi106.mp4,fight


# Feed the videos to a network:


In [12]:
# Both helpers in this cell are taken from the TensorFlow Hub tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
IMG_SIZE = 224


def crop_center_square(frame):
    """Return the centred square region of ``frame``.

    The first two axes of ``frame`` are treated as (rows, columns); the
    shorter of the two becomes the side length of the crop. Any further
    axes are left untouched.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top : top + side, left : left + side]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video file into an array of square, resized frames.

    Frames are read one after another with OpenCV, centre-cropped to a
    square, resized to ``resize`` and have their channel axis reordered
    with indices [2, 1, 0] (OpenCV's BGR order becomes RGB).

    Reading stops at the end of the stream, or as soon as ``max_frames``
    frames have been collected. With the default ``max_frames=0`` that
    second condition never matches, so the whole video is read.

    Returns ``np.array`` of the collected frames; the capture handle is
    always released, even if decoding raises.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        ok, image = capture.read()
        while ok:
            image = cv2.resize(crop_center_square(image), resize)
            collected.append(image[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            ok, image = capture.read()
    finally:
        capture.release()
    return np.array(collected)

   ### Feature Extraction

In [13]:
def build_feature_extractor():
    """Wrap an ImageNet-pretrained InceptionV3 as a per-frame feature extractor.

    The returned model accepts (IMG_SIZE, IMG_SIZE, 3) images, applies the
    InceptionV3 ``preprocess_input`` step, and outputs the average-pooled
    activations of the network built without its classification top.
    """
    image_shape = (IMG_SIZE, IMG_SIZE, 3)
    backbone = keras.applications.InceptionV3(
        input_shape=image_shape,
        include_top=False,
        weights="imagenet",
        pooling="avg",
    )

    inputs = keras.Input(image_shape)
    scaled = keras.applications.inception_v3.preprocess_input(inputs)
    return keras.Model(inputs, backbone(scaled), name="feature_extractor")


feature_extractor = build_feature_extractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


### Label Encoding
The `StringLookup` layer encodes the class labels as integers.

In [14]:
# Build the label encoder from the distinct tag strings in the dataframe.
# num_oov_indices=0 means no index is reserved for out-of-vocabulary labels.
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(df["tag"]))
print(label_processor.get_vocabulary())

# Encode the tags as a column of integer class indices and display them.
# This `labels` array is for inspection only: prepare_all_videos recomputes
# the labels from the dataframe it is given.
labels = df["tag"].values
labels = label_processor(labels[..., None]).numpy()
labels

['fight', 'noFight']


array([[1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
    

Finally, we can put all the pieces together to create our data processing utility.

In [None]:
#print(train_data[0].shape)
#train_data[0]

In [16]:
#Define hyperparameters

IMG_SIZE = 224  # frame side length in pixels; repeats the value already set next to load_video
BATCH_SIZE = 64  # NOTE(review): not referenced in the visible code - fit() is called with batch_size=32
EPOCHS = 100  # number of epochs passed to fit(); re-assigned to the same value in later cells

MAX_SEQ_LENGTH = 20  # frames kept per video; shorter videos are zero-padded and masked
NUM_FEATURES = 2048  # length of the feature vector stored for each frame

In [18]:
def prepare_all_videos(df, root_dir):
    """Turn every video listed in ``df`` into padded frame features, masks and labels.

    Args:
        df: DataFrame with a "video_name" column (video paths) and a "tag"
            column (class label strings known to ``label_processor``).
        root_dir: Directory joined in front of each video path with
            ``os.path.join``. An absolute path in "video_name" overrides it.

    Returns:
        ``((frame_features, frame_masks), labels)`` where

        * ``frame_features`` is float32 with shape
          (len(df), MAX_SEQ_LENGTH, NUM_FEATURES), zero-padded past the end
          of each video;
        * ``frame_masks`` is bool with shape (len(df), MAX_SEQ_LENGTH),
          True for real frames and False for padding;
        * ``labels`` holds the integer class index of each video, one row
          per video.
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()

    # Encode the class label strings of the "tag" column as integer indices.
    labels = label_processor(df["tag"].values[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence
    # model. `frame_masks` marks which timesteps hold real frames.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of loading the whole video into memory.
        frames = load_video(os.path.join(root_dir, path), max_frames=MAX_SEQ_LENGTH)
        length = min(MAX_SEQ_LENGTH, len(frames))

        if length == 0:
            # Unreadable or empty video: keep its row all-zero and fully
            # masked, but say so rather than passing silently.
            print(f"Warning: no frames could be read from {path}")
            continue

        # One batched forward pass per video instead of one predict() call per
        # frame. verbose=0 stops thousands of progress bars from flooding the
        # cell output.
        frame_features[idx, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels


# Extract features, masks and labels for every row of the shuffled dataframe.
# NOTE(review): the "train" root_dir has no effect with this dataset - the
# video_name entries are absolute paths, and os.path.join drops earlier
# components when a later one is absolute.
train_data, train_labels = prepare_all_videos(df, "train")
# test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")



print(f"train_labels in train set: {train_labels.shape}")

# print(f"test_labels in train set: {test_labels.shape}")

# The trailing dimensions are MAX_SEQ_LENGTH (20) and NUM_FEATURES (2048), both set in the hyperparameters cell.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Frame features in train set: (300, 20, 2048)
Frame masks in train set: (300, 20)
train_labels in train set: (300, 1)


In [21]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

EPOCHS = 100

# Stop training once val_accuracy has gone 25 epochs without improving,
# and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=25,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 20 epochs in which val_accuracy
# did not improve by more than 0.0001.
reduce_learningrate = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.2,
    patience=20,
    verbose=1,
    min_delta=0.0001,
)

# Checkpoint location; only used by the disabled ModelCheckpoint below.
filepath = "./tmp/video_classifier"

# checkpoint = keras.callbacks.ModelCheckpoint(
#         filepath, save_weights_only=True, save_best_only=True, verbose=1
#     )

callbacks_list = [early_stopping, reduce_learningrate]

# The sequence model
Now, we can feed this data to a sequence model consisting of recurrent layers like GRU.

In [22]:
# Utility for our sequence model.
def get_sequence_model():
    """Build and compile the GRU classifier that consumes per-frame features.

    The model takes two inputs - a (MAX_SEQ_LENGTH, NUM_FEATURES) feature
    sequence and a boolean (MAX_SEQ_LENGTH,) mask - and ends in a softmax
    with one unit per entry of the ``label_processor`` vocabulary. It is
    compiled with sparse categorical cross-entropy, Adam and accuracy.
    """
    num_classes = len(label_processor.get_vocabulary())

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )

    # Remaining layers, applied in order on top of the first GRU's output.
    head = [
        keras.layers.Dropout(0.3),
        keras.layers.GRU(8),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(1024, activation="relu"),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(num_classes, activation="softmax"),
    ]
    for layer in head:
        x = layer(x)

    rnn_model = keras.Model([frame_features_input, mask_input], x)
    rnn_model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return rnn_model

EPOCHS = 100


# Utility for running experiments.
def run_experiment():
    """Train a fresh sequence model on the prepared training arrays.

    Fits on ``train_data`` / ``train_labels`` for ``EPOCHS`` epochs with a
    batch size of 32, holding out 20% of the samples for validation and
    applying ``callbacks_list``.

    Returns:
        ``(history, model)`` - the Keras training history and the trained model.
    """
    seq_model = get_sequence_model()

    features, masks = train_data[0], train_data[1]
    fit_options = dict(
        validation_split=0.2,
        epochs=EPOCHS,
        callbacks=callbacks_list,
        batch_size=32,
    )
    history = seq_model.fit([features, masks], train_labels, **fit_options)

    # seq_model.load_weights(filepath)
    # _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    # print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

NameError: ignored

In [43]:
# Persist the trained model twice: once under the extension-less path
# "my_model" and once as an .h5 file.
# NOTE(review): which on-disk format the extension-less path selects depends
# on the installed Keras version - confirm before relying on it.
sequence_model.save("my_model")
sequence_model.save("trained_model.h5")



# Inference

In [19]:
def prepare_single_video(frames):
    """Convert the frames of one video into model-ready features and mask.

    Args:
        frames: Array of video frames with the frame index on the first
            axis, as returned by ``load_video``. May be empty.

    Returns:
        ``(frame_features, frame_mask)`` where ``frame_features`` is float32
        with shape (1, MAX_SEQ_LENGTH, NUM_FEATURES) and ``frame_mask`` is
        bool with shape (1, MAX_SEQ_LENGTH). Only the first MAX_SEQ_LENGTH
        frames are used; unused timesteps stay zero and masked.
    """
    frames = np.asarray(frames)
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    length = min(MAX_SEQ_LENGTH, len(frames))
    if length > 0:
        # One batched forward pass instead of one predict() call per frame;
        # verbose=0 keeps the per-call progress bars out of the output.
        frame_features[0, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_mask[0, :length] = True  # True = not masked, False = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    """Print the class probabilities for one video and return its frames.

    The video is loaded from ``os.path.join("test", path)``, converted to
    features and a mask, and passed through ``sequence_model``. Classes are
    printed from most to least probable.
    """
    class_vocab = label_processor.get_vocabulary()

    video_file = os.path.join("test", path)
    frames = load_video(video_file)
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    # Indices sorted by descending probability.
    ranking = np.argsort(probabilities)[::-1]
    for class_index in ranking:
        print(f"  {class_vocab[class_index]}: {probabilities[class_index] * 100:5.2f}%")
    return frames

# Video to classify. The path is defined once and reused for both the printed
# message and the prediction; previously a random pick was stored in
# `test_video` but never used, while the same literal was hard-coded twice.
# NOTE: this file comes from the training folder, so the prediction is a
# sanity check rather than a held-out test.
# To spot-check a random training video instead, use:
#   test_video = np.random.choice(train_df["video_name"].values.tolist())
test_video = '/content/gdrive/MyDrive/no_fight/noFight/nofi035.mp4'
print(f"Test video path: {test_video}")

test_frames = sequence_prediction(test_video)


Test video path: /content/gdrive/MyDrive/no_fight/noFight/nofi035.mp4


NameError: ignored