**Instituto Tecnológico de Costa Rica**

**Escuela de Ingeniería en Computación**

**Maestría Académica en Ciencias de la Computación**

**Curso: Electiva Deep Learning**

**Segundo Semestre 2024**

**Profesor: Dr. Luis-Alexander Calvo-Valverde**

---

**Proyecto:**

**Datos de la entrega:** Jueves 21 de noviembre 2024

---

**Estudiantes:**
- Andrey Arguedas Espinoza

## • Prerequisites

### - This version is implemented in Anaconda Navigator, if you want to to run it on Google Colab make sure to have the Pro version and mount the instance
### - You need at leat 25GB RAM to run this project
### - Add the dataset to the same path you have the notebook so it loads inmediately

## • Import required libraries

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" #Super important for these type of projects where plotting consumes a lot of resources
import random

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, LSTM, TimeDistributed, Dropout
from tensorflow.keras.layers import BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

### • Define global variables for the project

In [2]:
# Path to UCF-101 dataset (replace with your path)
dataset_path = './UCF101/UCF-101/'
video_file = 'BaseballPitch/v_BaseballPitch_g07_c01.avi'  # example video file

checkpoints_classification = 'checkpoints_classification'

#checkpoint filename to save
checkpoint_filename = 'detection_model{epoch:02d}.h5'
#checkpoint filename to load
checkpoint_filename_load = 'detection_model20.h5'

# Specify frame size (width, height) for resizing
frame_size = (60, 60)
frames_per_video = 64
batch_size = 16

### • Functions to show video frames

In [3]:
# Function to read video frames
def read_video(video_path):
    frames = []
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        frames.append(frame)
    cap.release()
    return frames

# Function to visualize video frames in a multi-row grid
def visualize_video(frames, num_frames=16, num_rows=2):
    num_cols = num_frames // num_rows
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows))
    
    for i in range(num_frames):
        row = i // num_cols
        col = i % num_cols
        if i < len(frames):
            axes[row, col].imshow(frames[i])
        axes[row, col].axis('off')
        
    plt.tight_layout()
    plt.show()

In [4]:
# Load and visualize video
#video_path = os.path.join(dataset_path, video_file)
#frames = read_video(video_path)
#visualize_video(frames, num_frames=64, num_rows=8)  # Display 64 frames

### • Load the UFC-101

In [5]:
def load_ucf101_category(category, frame_size=frame_size):
    """
    Load and resize videos from a specified category in the UCF-101 dataset.
    
    Args:
    - category (str): Name of the category folder (e.g., 'ApplyEyeMakeup')
    - frame_size (tuple): Desired frame size (width, height) for resizing
    
    Returns:
    - videos (dict): A dictionary with video filenames as keys and lists of frames as values.
    """
    category_path = os.path.join(dataset_path, category)
    videos = {}

    for video_file in os.listdir(category_path):
        if video_file.endswith('.avi'):
            video_path = os.path.join(category_path, video_file)
            cap = cv2.VideoCapture(video_path)
            frames = []

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize the frame
                frame = cv2.resize(frame, frame_size)
                # Convert BGR to RGB for consistency with other libraries
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
            
            cap.release()
            videos[video_file] = frames  # Store frames by video filename
    
    return videos

In [6]:
# Example usage
category = 'Basketball'  # Replace with a category you want to load
#videos = load_ucf101_category(category, frame_size=frame_size)

# Checking one example video
#for video_name, frames in videos.items():
    #print(f"Loaded video '{video_name}' with {len(frames)} frames resized to {frame_size}.")
    #visualize_video(frames, num_frames=64, num_rows=8)  # Display 64 frames
    #break  # Just to print one example, remove if you want to list all

In [7]:
# Function to read video and resize frames
def load_video(video_path, frame_size=frame_size, num_frames=frames_per_video):
    cap = cv2.VideoCapture(video_path.numpy().decode('utf-8'))
    frames = []
    try:
        while len(frames) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, frame_size)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    finally:
        cap.release()

    # If video is shorter than num_frames, repeat last frame
    while len(frames) < num_frames:
        frames.append(frames[-1])

    return tf.convert_to_tensor(frames, dtype=tf.uint8)

# Wrapper to make function compatible with tf.data
def load_video_wrapper(video_path):
    return tf.py_function(load_video, [video_path], tf.uint8)

# Function to create dataset
def create_ucf101_dataset(dataset_path, frame_size=frame_size, frames_per_video=frames_per_video, load_ratio = 2):
    # Get list of all video file paths and corresponding labels
    video_paths = []
    labels = []
    class_names = sorted(os.listdir(dataset_path))
    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(dataset_path, class_name)
        for video_file in os.listdir(class_dir):
            if video_file.endswith('.avi') and random.randint(0, load_ratio) == 1:
                video_paths.append(os.path.join(class_dir, video_file))
                labels.append(label)

    # Convert to tf.data.Dataset
    video_paths_ds = tf.data.Dataset.from_tensor_slices(video_paths)
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)

    # Load and process videos
    videos_ds = video_paths_ds.map(
        lambda path: load_video_wrapper(path),
        num_parallel_calls=tf.data.AUTOTUNE
    )

    # Combine videos and labels
    dataset = tf.data.Dataset.zip((videos_ds, labels_ds))

    # Batch, shuffle, and prefetch
    dataset = dataset.shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return dataset, class_names, len(video_paths)

In [8]:
# Plot a batch of videos in a grid format
def plot_video_batch(video_batch, num_frames=frame_size, num_rows=8):
    num_videos = video_batch.shape[0]  # Batch size
    num_cols = num_frames // num_rows

    fig, axes = plt.subplots(num_rows * num_videos, num_cols, figsize=(20, 5 * num_rows * num_videos))
    fig.subplots_adjust(hspace=0.5)

    for v in range(num_videos):
        frames = video_batch[v]
        for i in range(num_frames):
            row = (v * num_rows) + (i // num_cols)
            col = i % num_cols
            ax = axes[row, col] if num_videos > 1 else axes[col]
            ax.imshow(frames[i].numpy())
            ax.axis('off')

    plt.show()

In [9]:
# Create the dataset
dataset, class_names, amount_of_videos = create_ucf101_dataset(dataset_path, frame_size=frame_size, frames_per_video=frames_per_video, load_ratio = 10)

In [10]:
# Example usage: Iterate over the dataset
for video, label in dataset.take(5):
    print("Video batch shape:", video.shape)  # Should be (batch_size, frames_per_video, height, width, channels)
    #print(video)
    #plot_video_batch(video, num_frames=frames_per_video, num_rows=8)
    print("Label batch shape:", label.shape)
    print(label)

Video batch shape: (16, 64, 60, 60, 3)
Label batch shape: (16,)
tf.Tensor([5 4 0 5 6 2 1 0 1 6 4 6 7 2 5 4], shape=(16,), dtype=int32)
Video batch shape: (16, 64, 60, 60, 3)
Label batch shape: (16,)
tf.Tensor([4 5 6 2 3 1 6 0 3 8 1 8 8 2 7 1], shape=(16,), dtype=int32)
Video batch shape: (16, 64, 60, 60, 3)
Label batch shape: (16,)
tf.Tensor([7 8 2 9 2 5 8 9 8 6 4 8 7 9 1 7], shape=(16,), dtype=int32)
Video batch shape: (16, 64, 60, 60, 3)
Label batch shape: (16,)
tf.Tensor([ 8  9  0 10 10  8  2  1  8  7  6  0  0  6  2  0], shape=(16,), dtype=int32)
Video batch shape: (16, 64, 60, 60, 3)
Label batch shape: (16,)
tf.Tensor([ 8  4  5  8  0  0  6 11 11  4 10  5  2  8  7  0], shape=(16,), dtype=int32)


### • Data normalization

#### Normalize very video in the range of [0,1]

In [11]:
# Normalization function
def normalize_video(video, label):
    video = tf.image.convert_image_dtype(video, tf.float32)  # Converts to float32 in range [0, 1]
    return video, label

In [12]:
# Normalize the dataset
dataset = dataset.map(normalize_video, num_parallel_calls=tf.data.AUTOTUNE)

### • Dataset split

In [13]:
# Function to split the dataset
def split_dataset(dataset, train_split=0.7, val_split=0.15, test_split=0.15, shuffle_buffer_size=1000, dataset_amount = 10000):
    # Calculate sizes
    dataset_size = dataset_amount # Get total dataset size
    train_size = int(train_split * dataset_size)
    val_size = int(val_split * dataset_size)
    print("Sizes creados")
    # Shuffle and split
    dataset = dataset.shuffle(shuffle_buffer_size, reshuffle_each_iteration=False)
    print("Shuffle")
    train_dataset = dataset.take(train_size)
    print("Train take", dataset_amount)
    val_dataset = dataset.skip(train_size).take(val_size)
    print("Val take", val_size)
    test_dataset = dataset.skip(train_size + val_size)
    print("Test take")
    return train_dataset, val_dataset, test_dataset

In [14]:
print("Amount of videos", amount_of_videos)
# Split the dataset
train_dataset, val_dataset, test_dataset = split_dataset(dataset, train_split=0.7, val_split=0.15, test_split=0.15, shuffle_buffer_size=1000, dataset_amount = amount_of_videos)

# Batch, shuffle, and prefetch
train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

Amount of videos 1230
Sizes creados
Shuffle
Train take 1230
Val take 184
Test take


### Architectures

In [15]:
def build_action_detection_model(input_shape=(16, 60, 60, 3), num_classes=101):
    model = Sequential()

    # Reduced 3D CNN for spatio-temporal feature extraction
    model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling3D(pool_size=(1, 2, 2)))

    model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling3D(pool_size=(2, 2, 2)))

    # Flatten output and pass through LSTM
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(256, return_sequences=False))

    # Fully connected layers with fewer units
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))

    return model

In [16]:
#First we try to load the saved model, if exists we loaded otherwise we create a new one
was_action_detection_classification_model_loaded_from_disk = False

if os.path.isfile(checkpoints_classification + '/'  + checkpoint_filename_load):
    print("Loading saved classification model!!!")
    action_detection_model = tf.keras.models.load_model(checkpoints_classification + '/'  + checkpoint_filename_load)
    was_action_detection_classification_model_loaded_from_disk = True
else:
    print("Creating classification model!!!")
    action_detection_model = build_action_detection_model(input_shape=(frames_per_video, 60, 60, 3))
    
action_detection_model.summary()

Creating classification model!!!
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d (Conv3D)             (None, 62, 58, 58, 32)    2624      
                                                                 
 batch_normalization (BatchN  (None, 62, 58, 58, 32)   128       
 ormalization)                                                   
                                                                 
 max_pooling3d (MaxPooling3D  (None, 62, 29, 29, 32)   0         
 )                                                               
                                                                 
 conv3d_1 (Conv3D)           (None, 60, 27, 27, 64)    55360     
                                                                 
 batch_normalization_1 (Batc  (None, 60, 27, 27, 64)   256       
 hNormalization)                                                 
                       

# *********************** Training phase *************************

### We use this common function to train all the architectures of our project, it also creates the checkpoints that we will save for each architecture

In [17]:
# Training the model
def train_model(model, train_dataset, val_dataset, batch_size=64, epochs=5, checkpoint_dir=''):
    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'Precision'])

    # Create checkpoints folder
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, checkpoint_filename),
        save_weights_only=False,  # Save the entire model, not just weights
        save_best_only=False,     # Save the model after every epoch, not just the best one
        monitor='loss',
        mode='min'
    )
    
    # Train the model
    history = model.fit(train_dataset, epochs=epochs, batch_size=batch_size, validation_data=val_dataset, callbacks=[checkpoint_cb])
    return history

In [18]:
def visualize_metrics(history, under_lim = -1, upper_lim = 1):
    pd.DataFrame(history.history).plot(figsize=(10, 7))
    plt.grid(True)
    plt.gca().set_ylim(under_lim, upper_lim)
    plt.xlabel("epochs")
    plt.show()

In [19]:
if was_action_detection_classification_model_loaded_from_disk is False:
    history = train_model(action_detection_model, train_dataset, val_dataset, batch_size=batch_size, epochs=5, checkpoint_dir=checkpoints_classification)
    visualize_metrics(history)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'sequential/conv3d/Relu' defined at (most recent call last):
    File "C:\ProgramData\anaconda3\envs\tf\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\ProgramData\anaconda3\envs\tf\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelapp.py", line 701, in start
      self.io_loop.start()
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "C:\ProgramData\anaconda3\envs\tf\lib\asyncio\windows_events.py", line 321, in run_forever
      super().run_forever()
    File "C:\ProgramData\anaconda3\envs\tf\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\ProgramData\anaconda3\envs\tf\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\ProgramData\anaconda3\envs\tf\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue
      await self.process_one()
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one
      await dispatch(*args)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell
      await result
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request
      reply_content = await reply_content
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute
      res = shell.run_cell(
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\AndreyArguedas\AppData\Local\Temp\ipykernel_24840\1672635815.py", line 2, in <module>
      history = train_model(action_detection_model, train_dataset, val_dataset, batch_size=batch_size, epochs=5, checkpoint_dir=checkpoints_classification)
    File "C:\Users\AndreyArguedas\AppData\Local\Temp\ipykernel_24840\22140041.py", line 19, in train_model
      history = model.fit(train_dataset, epochs=epochs, batch_size=batch_size, validation_data=val_dataset, callbacks=[checkpoint_cb])
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\sequential.py", line 410, in call
      return super().call(inputs, training=training, mask=mask)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\layers\convolutional\base_conv.py", line 314, in call
      return self.activation(outputs)
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\activations.py", line 317, in relu
      return backend.relu(
    File "C:\ProgramData\anaconda3\envs\tf\lib\site-packages\keras\backend.py", line 5366, in relu
      x = tf.nn.relu(x)
Node: 'sequential/conv3d/Relu'
input and filter must have the same depth: 60 vs 3
	 [[{{node sequential/conv3d/Relu}}]] [Op:__inference_train_function_4454]