In [None]:
import numpy as np
import tensorflow as tf
import cv2
import os
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import deque
from moviepy.editor import VideoFileClip

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Root folder of the dataset; one sub-folder per class name is read from here.
DATASET_DIR = r"C:\Users\dines\Downloads\AI Project\UCF50\UCF50"

# Classes used for training; a sample's label is the index of its class here.
CLASSES_LIST = [
    "Biking",
    "Punch",
    "Diving",
]

# Size (in pixels) every video frame is resized to.
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64

# Number of frames taken from each video to form one sequence.
SEQUENCE_LENGTH = 20

def frames_extraction(video_path):
    """Sample up to SEQUENCE_LENGTH evenly spaced, pre-processed frames from a video.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A list of frames, each resized to IMAGE_HEIGHT x IMAGE_WIDTH pixels and
        divided by 255. The list is shorter than SEQUENCE_LENGTH (possibly
        empty) when the video cannot be opened or a frame cannot be read.
    """
    frames_list = []
    video_reader = cv2.VideoCapture(video_path)
    try:
        video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        # Stride between sampled frames; never below 1 so short videos still advance.
        skip_frames_window = max(int(video_frames_count / SEQUENCE_LENGTH), 1)
        for frame_counter in range(SEQUENCE_LENGTH):
            # Jump straight to the next sampled frame instead of decoding every frame.
            video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)
            success, frame = video_reader.read()
            if not success:
                break
            # cv2.resize takes dsize as (width, height); the resulting array is
            # (IMAGE_HEIGHT, IMAGE_WIDTH, channels).
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
            normalized_frame = resized_frame / 255
            frames_list.append(normalized_frame)
    finally:
        # Release the capture even if reading or resizing raised.
        video_reader.release()
    return frames_list

def create_dataset():
    """Build the feature and label arrays from the videos under DATASET_DIR.

    Every entry of CLASSES_LIST is expected to be a sub-folder of DATASET_DIR.
    Each file in it is passed to frames_extraction(); videos that do not yield
    exactly SEQUENCE_LENGTH frames are skipped.

    Returns:
        features: Array of the kept frame sequences, one entry per video.
        labels: Array of class indices (positions in CLASSES_LIST), aligned
            with ``features``.
        video_files_paths: List of the paths of the kept videos, in the same order.
    """
    features = []
    labels = []
    video_files_paths = []
    for class_index, class_name in enumerate(CLASSES_LIST):
        print(f'Extracting Data of Class: {class_name}')
        class_dir = os.path.join(DATASET_DIR, class_name)
        # os.listdir returns entries in arbitrary order; sorting makes the sample
        # order (and therefore any seeded split done on it) reproducible.
        for file_name in sorted(os.listdir(class_dir)):
            video_file_path = os.path.join(class_dir, file_name)
            frames = frames_extraction(video_file_path)
            # Keep only videos that produced a complete sequence.
            if len(frames) == SEQUENCE_LENGTH:
                features.append(frames)
                labels.append(class_index)
                video_files_paths.append(video_file_path)
    features = np.asarray(features)
    labels = np.array(labels)
    return features, labels, video_files_paths

features, labels, video_files_paths = create_dataset()

# Stop here with a clear message when nothing was loaded, rather than failing
# later on the division below or inside the label encoding.
num_samples = len(features)
if num_samples == 0:
    raise ValueError(f'No usable videos were found under {DATASET_DIR!r}')

# Pin the one-hot width to the number of configured classes. Without
# num_classes the width is inferred from the largest label present, which is
# too narrow for the models' output layer if the last class has no usable video.
one_hot_encoded_labels = to_categorical(labels, num_classes=len(CLASSES_LIST))

# Hold out 25% of the samples, but never less than one sample's worth.
test_size = max(0.25, 1 / num_samples)

In [None]:
# Split sequences and one-hot labels into train and test parts; the fixed
# random_state makes the shuffle repeatable for the same input order.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    one_hot_encoded_labels,
    random_state=42,
    test_size=test_size,
)

In [3]:

def create_cnn_model():
    """Assemble the 3D-convolutional classifier and print its summary.

    Three Conv3D -> MaxPooling3D -> Dropout(0.25) stages with 32, 64 and 128
    filters are followed by Flatten, a 64-unit dense layer with Dropout(0.5),
    and a softmax layer with one unit per entry of CLASSES_LIST.

    Returns:
        The uncompiled Sequential model.
    """
    clip_shape = (SEQUENCE_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, 3)

    network = Sequential()
    for stage_index, filter_count in enumerate((32, 64, 128)):
        if stage_index == 0:
            # Only the first layer carries the input shape of one clip.
            conv_layer = Conv3D(filter_count, (3, 3, 3), activation='relu', input_shape=clip_shape)
        else:
            conv_layer = Conv3D(filter_count, (3, 3, 3), activation='relu')
        network.add(conv_layer)
        # Pool window of 1 along the first (frame) axis, 2x2 along the image axes.
        network.add(MaxPooling3D((1, 2, 2)))
        network.add(Dropout(0.25))

    # Classification head.
    network.add(Flatten())
    network.add(Dense(64, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(len(CLASSES_LIST), activation='softmax'))

    network.summary()
    return network

cnn_model = create_cnn_model()

# NOTE: no EarlyStopping callback is passed to fit(), so all 50 epochs are run.
cnn_model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=["accuracy"])
cnn_model_training_history = cnn_model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,  # last 20% of the training data is used for validation
    epochs=50,
    batch_size=4,
    shuffle=True,
)

# Loss / accuracy on the held-out test split.
model_evaluation_history = cnn_model.evaluate(X_test, y_test)

# Convert predicted probabilities and one-hot targets to class indices.
y_pred = cnn_model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Accuracy plus support-weighted precision / recall / F1.
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision, recall, f1 = (
    weighted_metric(y_true_classes, y_pred_classes, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)

# Report the evaluation metrics.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# Persist the trained model next to the notebook.
cnn_model.save('cnn_model.keras')


Extracting Data of Class: Biking
Extracting Data of Class: Punch
Extracting Data of Class: Diving


Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 431ms/step - accuracy: 0.3621 - loss: 1.3965 - val_accuracy: 0.5362 - val_loss: 1.0244
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 423ms/step - accuracy: 0.5378 - loss: 0.8682 - val_accuracy: 0.5507 - val_loss: 0.9481
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 427ms/step - accuracy: 0.6394 - loss: 0.8542 - val_accuracy: 0.6377 - val_loss: 0.9197
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 445ms/step - accuracy: 0.6906 - loss: 0.7535 - val_accuracy: 0.6232 - val_loss: 1.0989
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 430ms/step - accuracy: 0.6330 - loss: 0.8850 - val_accuracy: 0.4928 - val_loss: 0.9622
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 428ms/step - accuracy: 0.4758 - loss: 0.9807 - val_accuracy: 0.4203 - val_loss: 1.0722
Epoch 7/50
[1m69/69[

In [4]:
# Append a trailing singleton axis to both splits before they are fed to the
# LSTM model.
X_train_cnn_reshaped = X_train[..., np.newaxis]
X_test_cnn_reshaped = X_test[..., np.newaxis]

def create_lstm_model(cnn_model):
    """Build the sequence classifier: per-step Flatten -> LSTM(64) -> softmax.

    Args:
        cnn_model: Unused; kept so existing calls keep working. The previous
            version derived an ``input_shape`` from ``cnn_model.output_shape``
            and passed it to the Flatten layer wrapped in TimeDistributed.
            Sequential only reads an input shape from the first layer added
            to it (the wrapper), so that value had no effect, and it did not
            describe the video tensors the model is trained on.

    Returns:
        The uncompiled Sequential model. No input shape is declared, so the
        model takes its input shape from the first batch it is called on.
    """
    model = Sequential()
    # Flatten every time step independently so the LSTM receives one vector per step.
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(64))
    model.add(Dense(len(CLASSES_LIST), activation='softmax'))
    model.summary()
    return model

lstm_model = create_lstm_model(cnn_model)

# NOTE: no EarlyStopping callback is passed to fit(), so all 50 epochs are run.
lstm_model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=["accuracy"])
lstm_model.fit(
    X_train_cnn_reshaped,
    y_train,
    validation_split=0.2,  # last 20% of the training data is used for validation
    epochs=50,
    batch_size=4,
    shuffle=True,
)

# Loss / accuracy on the held-out test split.
model_evaluation_history = lstm_model.evaluate(X_test_cnn_reshaped, y_test)

# Convert predicted probabilities and one-hot targets to class indices.
y_pred = lstm_model.predict(X_test_cnn_reshaped)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Accuracy plus support-weighted precision / recall / F1.
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision, recall, f1 = (
    weighted_metric(y_true_classes, y_pred_classes, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)

# Report the evaluation metrics.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# Persist the trained model next to the notebook.
lstm_model.save('lstm_model.keras')

Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 129ms/step - accuracy: 0.4715 - loss: 1.5996 - val_accuracy: 0.5217 - val_loss: 1.0574
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 118ms/step - accuracy: 0.5223 - loss: 0.9692 - val_accuracy: 0.6232 - val_loss: 0.8748
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 121ms/step - accuracy: 0.6547 - loss: 0.8070 - val_accuracy: 0.6232 - val_loss: 0.8502
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 119ms/step - accuracy: 0.6796 - loss: 0.7816 - val_accuracy: 0.6812 - val_loss: 0.8122
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 119ms/step - accuracy: 0.7120 - loss: 0.7723 - val_accuracy: 0.6957 - val_loss: 0.7617
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 112ms/step - accuracy: 0.7130 - loss: 0.7302 - val_accuracy: 0.6812 - val_loss: 0.8085
Epoch 7/50
[1m69/69[0m [

In [5]:
def predict_on_video(video_file_path, output_file_path, SEQUENCE_LENGTH, model=None):
    '''
    Perform action recognition on a video and write an annotated copy to disk.

    Every frame is pre-processed and pushed into a sliding window of the last
    SEQUENCE_LENGTH frames. Once the window is full, the model is run on it for
    each new frame and the predicted class name is drawn on that frame before
    it is written to the output video.

    Args:
    video_file_path:  The path of the video stored on disk on which the action recognition is to be performed.
    output_file_path: The path where the output video, with the predicted action overlaid, will be stored.
    SEQUENCE_LENGTH:  The fixed number of frames of a video that can be passed to the model as one sequence.
    model:            Optional model exposing ``predict``. Defaults to the notebook-level ``lstm_model``.

    Raises:
    IOError: If the input video cannot be opened.
    '''

    # Fall back to the notebook-level model when the caller does not supply one.
    if model is None:
        model = lstm_model

    # Initialize the VideoCapture object to read from the video file.
    video_reader = cv2.VideoCapture(video_file_path)
    video_writer = None

    try:
        # Fail loudly instead of silently producing no usable output file.
        if not video_reader.isOpened():
            raise IOError(f'Could not open video file: {video_file_path}')

        # Get the width and height of the video.
        original_video_width = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_video_height = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Initialize the VideoWriter object to store the output video on disk,
        # using the input's frame rate and frame size.
        video_writer = cv2.VideoWriter(output_file_path, cv2.VideoWriter_fourcc('M', 'P', '4', 'V'),
                                       video_reader.get(cv2.CAP_PROP_FPS), (original_video_width, original_video_height))

        # Sliding window holding the most recent SEQUENCE_LENGTH pre-processed frames.
        frames_queue = deque(maxlen=SEQUENCE_LENGTH)

        # Name of the most recently predicted class; empty until the window first fills up.
        predicted_class_name = ''

        # Iterate while the video is accessible.
        while video_reader.isOpened():

            # Read the next frame; stop at the end of the video or on a read error.
            ok, frame = video_reader.read()
            if not ok:
                break

            # Resize the frame to the fixed dimensions. cv2.resize takes dsize
            # as (width, height).
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))

            # Normalize the resized frame by dividing it by 255.
            normalized_frame = resized_frame / 255

            # Append the pre-processed frame; the oldest one drops out once the window is full.
            frames_queue.append(normalized_frame)

            # Predict only when a full sequence is available.
            if len(frames_queue) == SEQUENCE_LENGTH:

                # NOTE(review): the training cell feeds tensors with an extra
                # trailing singleton axis, while this batch has none - confirm
                # the model accepts both layouts.
                predicted_labels_probabilities = model.predict(np.expand_dims(frames_queue, axis=0))[0]

                # Map the most probable class index to its name.
                predicted_label = np.argmax(predicted_labels_probabilities)
                predicted_class_name = CLASSES_LIST[predicted_label]

            # Write the predicted class name on top of the original (full-size) frame.
            cv2.putText(frame, predicted_class_name, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Write the frame to disk using the VideoWriter object.
            video_writer.write(frame)

    finally:
        # Release the VideoCapture and VideoWriter objects even if an error occurred.
        video_reader.release()
        if video_writer is not None:
            video_writer.release()

In [7]:
# Paths of the annotated output video and of the input video to classify.
output_video_file_path = r"C:\Users\dines\Downloads\bikingl.mp4"
input_video_file_path = r"C:\Users\dines\Downloads\biking.mp4"
# Run action recognition on the input video; the annotated copy is written to the output path.
predict_on_video(input_video_file_path, output_video_file_path, SEQUENCE_LENGTH)

# Load the annotated video without audio and embed it in the notebook output.
# NOTE(review): target_resolution=(300, None) presumably means 300 px high with the width derived - confirm against the moviepy docs.
VideoFileClip(output_video_file_path, audio=False, target_resolution=(300,None)).ipython_display()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 536ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5

                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4
