<a href="https://colab.research.google.com/github/AnjanDutta/EEEM068/blob/main/Notebooks/Human_Action_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's download the zipped dataset from Google Drive and extract it.

In [None]:
!pip install gdown torch torchvision datasets evaluate torchmetrics transformers random

In [None]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.notebook import trange, tqdm
import gdown
import zipfile
import random

In [None]:
# Fetch the dataset archive from Google Drive and unpack it into the working directory.
file_id = "1BqMBtsuvb6mTpiZUZ9WKcJA8f1hkI2yX"  # Google Drive file ID of the archive
url = f"https://drive.google.com/uc?id={file_id}"
output = "HMDB_simp.zip"  # local file name for the downloaded archive

# Download with progress output enabled
gdown.download(url, output, quiet=False)

# Extract every archive member into the current directory
with zipfile.ZipFile(output, mode='r') as zip_ref:
    zip_ref.extractall(path=".")

print("Download and extraction complete!")

In [None]:
# Maps each action-category folder name to its integer class label (0-24);
# the label is the position of the name in the tuple below.
CATEGORY_INDEX = {
    name: label
    for label, name in enumerate(
        (
            "brush_hair",
            "cartwheel",
            "catch",
            "chew",
            "climb",
            "climb_stairs",
            "draw_sword",
            "eat",
            "fencing",
            "flic_flac",
            "golf",
            "handstand",
            "kiss",
            "pick",
            "pour",
            "pullup",
            "pushup",
            "ride_bike",
            "shoot_bow",
            "shoot_gun",
            "situp",
            "smile",
            "smoke",
            "throw",
            "wave",
        )
    )
}

In [None]:
# Per-frame preprocessing: resize to 224x224, then convert the PIL image to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(224, 224)), transforms.ToTensor()]
)

def load_sampled_frames(frame_dir, frame_rate=8):
    """
    Load every [frame_rate]-th frame from a directory and apply transformations.

    Args:
        frame_dir: Directory whose entries are the frame image files of one video.
        frame_rate: Sampling step. Entries 0, frame_rate, 2 * frame_rate, ... of the
            name-sorted directory listing are loaded.

    Returns:
        A tuple ``(sampled_frames, frame_metadata)``:
        * ``sampled_frames`` - list with one output of the module-level ``transform``
          per sampled frame.
        * ``frame_metadata`` - list of dicts, aligned with ``sampled_frames``, holding
          'index' (position in the sorted listing), 'used_in_clip' (always False here)
          and 'file_path'.
    """
    frame_files = sorted(os.listdir(frame_dir))  # Ensure frames are in name order
    sampled_frames = []
    frame_metadata = []

    for i in range(0, len(frame_files), frame_rate):
        frame_path = os.path.join(frame_dir, frame_files[i])
        # The context manager releases the image file handle deterministically
        # instead of leaving it to Pillow's lazy loading / garbage collection.
        with Image.open(frame_path) as image:
            frame = transform(image.convert("RGB"))  # RGB conversion, then preprocessing
        sampled_frames.append(frame)
        frame_metadata.append({'index': i, 'used_in_clip': False, 'file_path': frame_path})

    return sampled_frames, frame_metadata

def create_clips(frames, frame_metadata, clip_size=8):
    """
    Group sampled frames into consecutive, non-overlapping clips of [clip_size] frames.

    Leftover frames that do not fill a complete clip are dropped. Returns three
    parallel lists: the stacked clip tensors, the frame file paths of each clip,
    and the original frame indices of each clip.
    """
    clips, clip_paths, clip_indices = [], [], []

    # The range is empty when there are fewer than clip_size frames.
    last_start_exclusive = len(frames) - clip_size + 1
    for start in range(0, last_start_exclusive, clip_size):
        stop = start + clip_size
        span = range(start, stop)
        clips.append(torch.stack(frames[start:stop]))  # adds a leading clip_size axis
        clip_indices.append([frame_metadata[pos]['index'] for pos in span])
        clip_paths.append([frame_metadata[pos]['file_path'] for pos in span])

    return clips, clip_paths, clip_indices


# Root directory of the extracted dataset, passed to split_sources() and process_dataset().
# NOTE(review): this is an absolute path, while the archive is extracted into "." and
# global_matrix() defaults to the relative "HMDB_simp" — they only coincide when the
# working directory is /content; verify when running elsewhere.
DATASET_PATH = "/content/HMDB_simp"

def split_sources(dataset_path, train_ratio=0.8, seed=None):
    """
    Splits source folders into train and val sets before processing clips.
    Ensures that all clips from a source video stay in the same set.

    Args:
        dataset_path: Directory containing one sub-directory per action category.
        train_ratio: Fraction of each category's source folders assigned to training.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible; when None the
            module-level ``random`` state is used, as before.

    Returns:
        ``(train_sources, val_sources)``: two dicts mapping category name to the
        list of source-folder names in that split.
    """
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    train_sources = {}
    val_sources = {}

    # os.listdir() order is arbitrary, so sort first: otherwise the same seed could
    # still yield different splits on different machines.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue

        # Only directories are source videos; stray files must not count toward
        # the split ratio.
        instances = sorted(
            entry for entry in os.listdir(category_path)
            if os.path.isdir(os.path.join(category_path, entry))
        )
        shuffle(instances)

        split_idx = int(len(instances) * train_ratio)
        train_sources[category] = instances[:split_idx]
        val_sources[category] = instances[split_idx:]

    return train_sources, val_sources


def process_dataset(dataset_path, sources_dict):
    """
    Builds clip samples for the source folders listed in sources_dict.

    sources_dict maps a category name to the source-folder names to process.
    Returns (dataset, metadata): dataset is a list of (clip, label) tuples and
    metadata is a parallel list holding the frame file paths of each clip.
    """
    samples = []
    clip_file_paths = []

    for category, instances in tqdm(sources_dict.items()):
        for instance in instances:
            instance_path = os.path.join(dataset_path, category, instance)
            if os.path.isdir(instance_path):  # anything that is not a directory is ignored
                # Sample frames from the folder, then group them into clips
                frames, frame_metadata = load_sampled_frames(instance_path)
                clips, paths_per_clip, _ = create_clips(frames, frame_metadata)

                for clip, paths in zip(clips, paths_per_clip):
                    samples.append((clip, CATEGORY_INDEX[category]))
                    clip_file_paths.append(paths)

    return samples, clip_file_paths

class VideoDataset(Dataset):
    """Map-style dataset over an in-memory sequence of (clip, label) pairs."""

    def __init__(self, dataset):
        # Sequence of (clip, label) pairs, e.g. the first value returned by process_dataset()
        self.dataset = dataset

    def __len__(self):
        """Number of stored (clip, label) pairs."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the clip at idx unchanged, with its label as a long tensor."""
        frames, target = self.dataset[idx]
        target_tensor = torch.tensor(target, dtype=torch.long)
        return frames, target_tensor

from torch.utils.data import default_collate

class VideoDataCollator:
    """
    Batch collator that turns a list of (clip, label) pairs into the dictionary
    {"pixel_values": ..., "labels": ...}.
    """
    def __call__(self, features):
        clip_group, label_group = zip(*features)  # separate clips from labels
        pixel_values = torch.stack(clip_group)  # new leading batch dimension
        labels = torch.tensor(label_group, dtype=torch.long)
        return {"pixel_values": pixel_values, "labels": labels}

from collections import Counter

def count_classes(dataset):
    """
    Print how many clips each class label has and return the counts.

    dataset is an iterable of (clip, label) pairs. The returned dict maps
    label -> count, with keys in ascending label order.
    """
    tally = Counter()
    for _, label in dataset:
        tally[label] += 1

    ordered_counts = {}
    for class_label, count in sorted(tally.items()):
        ordered_counts[class_label] = count
        print(f"Class {class_label}: {count} clips of 8")

    return ordered_counts

# Optional sanity check once train_dataset exists (left disabled):
#count_classes(train_dataset);

# Split source folders into train & val (per category, at the source-folder level)
train_sources, val_sources = split_sources(DATASET_PATH)

# Process train and val sets separately; each call returns (list of (clip, label), per-clip file paths)
train_dataset, train_metadata = process_dataset(DATASET_PATH, train_sources)
val_dataset, val_metadata = process_dataset(DATASET_PATH, val_sources)

# Total number of clips across both splits
dataset_size = len(train_dataset) + len(val_dataset)



In [None]:
# List, per category, the source folders that were assigned to the training split.
for category_name, train_folders in train_sources.items():
    print(f"Category: {category_name}")
    for folder_name in train_folders:
        print(folder_name, end=', ')  # all folders of a category stay on one line
    print()

In [None]:

import torch
import numpy as np
import cv2
import os
from transformers import TimesformerForVideoClassification, AutoImageProcessor
from pytorch_grad_cam.utils.image import show_cam_on_image


In [None]:
# NOTE(review): split_sources is also defined in an earlier cell of this notebook;
# whichever cell runs last wins.
def split_sources(dataset_path, train_ratio=0.8, seed=None):
    """
    Splits source folders into train and val sets before processing clips.
    Ensures that all clips from a source video stay in the same set.

    Args:
        dataset_path: Directory containing one sub-directory per action category.
        train_ratio: Fraction of each category's source folders assigned to training.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible; when None the
            module-level ``random`` state is used, as before.

    Returns:
        ``(train_sources, val_sources)``: two dicts mapping category name to the
        list of source-folder names in that split.
    """
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    train_sources = {}
    val_sources = {}

    # os.listdir() order is arbitrary, so sort first: otherwise the same seed could
    # still yield different splits on different machines.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue

        # Only directories are source videos; stray files must not count toward
        # the split ratio.
        instances = sorted(
            entry for entry in os.listdir(category_path)
            if os.path.isdir(os.path.join(category_path, entry))
        )
        shuffle(instances)

        split_idx = int(len(instances) * train_ratio)
        train_sources[category] = instances[:split_idx]
        val_sources[category] = instances[split_idx:]

    return train_sources, val_sources

In [None]:
# GLOBAL MATRIX
import os
import numpy as np

def global_matrix(dataset_dir="HMDB_simp"):
    """
    Count the files in every video folder of every known category.

    Args:
        dataset_dir: Directory containing one sub-directory per action category.

    Returns:
        A list of lists. There is one inner list per category directory that exists
        under ``dataset_dir`` (categories visited in label order, missing ones
        skipped); each inner list holds the number of entries found in each video
        folder of that category, ordered by folder name.
    """
    # Category folder names in label order (same order as the module-level CATEGORY_INDEX).
    category_order = (
        "brush_hair", "cartwheel", "catch", "chew", "climb", "climb_stairs",
        "draw_sword", "eat", "fencing", "flic_flac", "golf", "handstand",
        "kiss", "pick", "pour", "pullup", "pushup", "ride_bike", "shoot_bow",
        "shoot_gun", "situp", "smile", "smoke", "throw", "wave",
    )

    MATR = []
    for category_name in category_order:
        category_path = os.path.join(dataset_dir, category_name)
        # isdir (rather than exists) so that a stray file named like a category
        # is skipped instead of crashing os.listdir().
        if not os.path.isdir(category_path):
            continue

        frame_counts = []
        # Sorted listing: os.listdir() order is arbitrary, which would make the
        # column order of the matrix differ between runs and machines.
        for folder in sorted(os.listdir(category_path)):
            folder_path = os.path.join(category_path, folder)
            if os.path.isdir(folder_path):
                frame_counts.append(len(os.listdir(folder_path)))

        MATR.append(frame_counts)

    return MATR


def print_matrix(matrix):
    """
    Print a matrix (sequence of rows) with each value right-aligned to width 5.

    Every value is followed by a single space and every row ends with a newline.
    Rows are iterated individually, so an empty matrix prints nothing and rows of
    differing length are printed in full instead of being cut to the width of the
    first row (or raising IndexError).
    """
    for row in matrix:
        for value in row:
            print(f'{value:5}', end=' ')
        print()


# Frame-count lists for the default "HMDB_simp" directory (one inner list per category found).
MATR = global_matrix()
# NOTE(review): Matr1 is not referenced again in the visible cells. np.array(MATR) presumably
# only yields a regular 2-D array when every category has the same number of video folders — verify.
Matr1 = np.array(MATR)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_heatmap(MATR, category_names):
    """
    Draw a heatmap of MATR (rows labelled with category_names) and show it.

    MATR may be a list of lists or an array; it is converted with np.array first.
    Cell values are not annotated.
    """
    frame_counts = np.array(MATR)
    heatmap_options = dict(
        annot=False,
        cmap='YlGnBu',
        cbar=True,
        linewidths=0.5,
        square=True,
        yticklabels=category_names,
    )

    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(frame_counts, **heatmap_options)

    # Title and axis captions
    heatmap_axes.set_title("Heatmap of number of frames values", fontsize=16)
    heatmap_axes.set_xlabel('Videos', fontsize=12)
    heatmap_axes.set_ylabel('Categories', fontsize=12)

    plt.show()

# Row labels for the heatmap, listed in the same order as the CATEGORY_INDEX keys.
category_names = [
    "brush_hair", "cartwheel", "catch", "chew", "climb", "climb_stairs",
    "draw_sword", "eat", "fencing", "flic_flac", "golf", "handstand",
    "kiss", "pick", "pour", "pullup", "pushup", "ride_bike", "shoot_bow",
    "shoot_gun", "situp", "smile", "smoke", "throw", "wave"
]
# Heatmap of the frame counts collected by global_matrix()
plot_heatmap(MATR, category_names)
# Optional text dump of the same matrix (left disabled):
#print_matrix(MATR)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Frame-count matrix from the earlier cell (one row per category, one column per video folder),
# converted to a numpy array so it can be reduced along an axis.
MATR = np.array(MATR)

# Category names
category_names = [
    "brush_hair", "cartwheel", "catch", "chew", "climb", "climb_stairs",
    "draw_sword", "eat", "fencing", "flic_flac", "golf", "handstand",
    "kiss", "pick", "pour", "pullup", "pushup", "ride_bike", "shoot_bow",
    "shoot_gun", "situp", "smile", "smoke", "throw", "wave"
]

# Mean frames per category: sum of each row divided by the number of videos.
# The divisor is taken from the matrix itself (MATR.shape[1]) instead of a hard-coded 50,
# so the result stays correct if the number of videos per category changes.
mean_frames_category = np.sum(MATR, axis=1) / MATR.shape[1]


# Plotting the Mean Frames per Category
plt.figure(figsize=(10, 6))
plt.bar(range(MATR.shape[0]), mean_frames_category, color='thistle')  # Lilac/purple color
plt.title('Mean Frames per Category', fontsize=14)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Mean Frames', fontsize=12)
plt.xticks(range(MATR.shape[0]), category_names, rotation=90, fontsize=10)
plt.grid(True, axis='y', linestyle='--', alpha=0.5, color='grey')  # Adding grey grid to the y-axis
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
from scipy.stats import skew, kurtosis

# Summary statistics of the frame-count matrix.
# axis=0 reduces over the rows, giving one value per column ("per video" below);
# axis=1 reduces over the columns, giving one value per row ("per category").
MATR = np.array(MATR)

# Totals
total_frames_video = MATR.sum(axis=0)
total_frames_category = MATR.sum(axis=1)
total_frames_all = MATR.sum()

# Means
mean_frames_video = MATR.mean(axis=0)
mean_frames_category = MATR.mean(axis=1)

# Spread and extremes, column-wise
var_frames_video = MATR.var(axis=0)
std_frames_video = MATR.std(axis=0)
max_frames_video = MATR.max(axis=0)
min_frames_video = MATR.min(axis=0)

# Shape of the distribution (scipy.stats), column-wise
skew_frames_video = skew(MATR, axis=0)
kurt_frames_video = kurtosis(MATR, axis=0)

# 25th / 50th / 75th percentiles, column-wise
percentiles_video = np.percentile(MATR, [25, 50, 75], axis=0)

# Report every statistic with its caption
for caption, statistic in (
    ("Total frames per video:", total_frames_video),
    ("Total frames per category:", total_frames_category),
    ("Mean frames per video:", mean_frames_video),
    ("Mean frames per category:", mean_frames_category),
    ("Variance frames per video:", var_frames_video),
    ("Standard deviation frames per video:", std_frames_video),
    ("Max frames per video:", max_frames_video),
    ("Min frames per video:", min_frames_video),
    ("Total frames across all videos and categories:", total_frames_all),
    ("Skewness frames per video:", skew_frames_video),
    ("Kurtosis frames per video:", kurt_frames_video),
    ("Percentiles (25th, 50th, 75th) frames per video:", percentiles_video),
):
    print(caption, statistic)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Use the real frame-count matrix computed in the earlier cells. The previous version
# replaced MATR with np.random.randint placeholder data, so every plot below showed
# random numbers instead of the dataset's statistics.
MATR = np.array(MATR)

# Calculating statistics
# axis=0 -> one value per column ("per video"), axis=1 -> one value per row ("per category")
total_frames_video = np.sum(MATR, axis=0)
total_frames_category = np.sum(MATR, axis=1)
mean_frames_video = np.mean(MATR, axis=0)
mean_frames_category = np.mean(MATR, axis=1)
var_frames_video = np.var(MATR, axis=0)
std_frames_video = np.std(MATR, axis=0)
max_frames_video = np.max(MATR, axis=0)
min_frames_video = np.min(MATR, axis=0)
total_frames_all = np.sum(MATR)
skew_frames_video = skew(MATR, axis=0)
kurt_frames_video = kurtosis(MATR, axis=0)
percentiles_video = np.percentile(MATR, [25, 50, 75], axis=0)

# Plotting the statistics: one bar chart per entry, filled row by row into a 4x2 grid.
# Each entry is (values, title, x-axis label, y-axis label).
bar_panels = [
    (total_frames_video, 'Total Frames per Video', 'Video Index', 'Total Frames'),
    (total_frames_category, 'Total Frames per Category', 'Category Index', 'Total Frames'),
    (mean_frames_video, 'Mean Frames per Video', 'Video Index', 'Mean Frames'),
    (mean_frames_category, 'Mean Frames per Category', 'Category Index', 'Mean Frames'),
    (var_frames_video, 'Variance of Frames per Video', 'Video Index', 'Variance'),
    (std_frames_video, 'Standard Deviation of Frames per Video', 'Video Index', 'Standard Deviation'),
    (max_frames_video, 'Max Frames per Video', 'Video Index', 'Max Frames'),
    (min_frames_video, 'Min Frames per Video', 'Video Index', 'Min Frames'),
]

fig, axes = plt.subplots(4, 2, figsize=(15, 15))
for panel_axes, (values, title, x_label, y_label) in zip(axes.flat, bar_panels):
    panel_axes.bar(range(len(values)), values)
    panel_axes.set_title(title)
    panel_axes.set_xlabel(x_label)
    panel_axes.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Skewness and Kurtosis plot (for each video)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(range(MATR.shape[1]), skew_frames_video, marker='o', color='b', label='Skewness')
plt.title('Skewness of Frames per Video')
plt.xlabel('Video Index')
plt.ylabel('Skewness')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(MATR.shape[1]), kurt_frames_video, marker='o', color='r', label='Kurtosis')
plt.title('Kurtosis of Frames per Video')
plt.xlabel('Video Index')
plt.ylabel('Kurtosis')
plt.legend()

plt.tight_layout()
plt.show()

# Percentiles plot (25th, 50th, 75th percentile): one line per percentile row
plt.figure(figsize=(12, 6))
percentile_styles = [
    ('25th Percentile', 'g'),
    ('50th Percentile (Median)', 'b'),
    ('75th Percentile', 'r'),
]
for percentile_row, (line_label, line_color) in zip(percentiles_video, percentile_styles):
    plt.plot(range(MATR.shape[1]), percentile_row, label=line_label, color=line_color)

plt.title('Percentiles of Frames per Video')
plt.xlabel('Video Index')
plt.ylabel('Frames')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded per-category mean values, one per category (25 entries).
# NOTE(review): presumably copied from an earlier run of the statistics cells — verify.
means = np.array([265.0, 74.32, 39.62, 99.02, 162.74, 78.92, 75.26, 89.0, 77.22,
                  67.72, 80.48, 82.44, 180.02, 88.74, 171.02, 79.86, 93.22, 96.36,
                  175.54, 94.16, 92.76, 71.24, 145.0, 99.02, 77.52])

# Turn the vector into a single column (25 rows, 1 column) for the heatmap
means_reshaped = means[:, np.newaxis]

# Column heatmap without cell annotations; rows are labelled with category_names
plt.figure(figsize=(10, 8))
column_heatmap_options = dict(
    annot=False,
    cmap='YlGnBu',
    cbar=True,
    linewidths=0.5,
    square=True,
    xticklabels=False,
    yticklabels=category_names,
)
ax = sns.heatmap(means_reshaped, **column_heatmap_options)

ax.set_title("Heatmap of Means", fontsize=16)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded per-category mean values, one per category (25 entries).
# NOTE(review): presumably copied from an earlier run of the statistics cells — verify.
means = np.array([265.0, 74.32, 39.62, 99.02, 162.74, 78.92, 75.26, 89.0, 77.22,
                  67.72, 80.48, 82.44, 180.02, 88.74, 171.02, 79.86, 93.22, 96.36,
                  175.54, 94.16, 92.76, 71.24, 145.0, 99.02, 77.52])

# Turn the vector into a single row (1 row, 25 columns) for the heatmap
means_reshaped = means[np.newaxis, :]

# Wide, short figure so the single row stays readable
plt.figure(figsize=(12, 2))
row_heatmap_options = dict(
    annot=False,
    cmap='YlGnBu',
    cbar=True,
    linewidths=0.5,
    square=False,
    xticklabels=True,
    yticklabels=False,
)
ax = sns.heatmap(means_reshaped, **row_heatmap_options)

ax.set_title("Heatmap of Row Means", fontsize=16)
plt.show()


In [None]:
import numpy as np

# global_matrix is iterated row by row; each row holds the values of one category.
def category_stats(global_matrix):
    """Print mean, median, standard deviation, minimum and maximum of every row.

    Rows are numbered from 1 in the output. Nothing is returned.
    """
    for number, counts in enumerate(global_matrix, start=1):
        summary = (
            f'Category {number}: Mean={np.mean(counts):.2f}, Median={np.median(counts)}, '
            f'Std={np.std(counts):.2f}, Min={np.min(counts)}, Max={np.max(counts)}'
        )
        print(summary)


In [None]:
import os
import numpy as np


# Collect, for every category in CATEGORY_INDEX, the number of files in each of its
# video folders and the mean of those counts.
dataset_dir = "HMDB_simp"

mean_number_per_categ = []  # mean file count per category, in CATEGORY_INDEX order
names = []                  # one single-element list [category_name] per category
arr = []                    # [category_name, mean] pairs
# NOTE(review): this rebinds the name `global_matrix`, which an earlier cell defines as a
# function; after this cell runs, global_matrix() can no longer be called. Consider renaming.
global_matrix = []          # per-category lists of file counts
for category_name in CATEGORY_INDEX:
    category_path = os.path.join(dataset_dir, category_name)

    frames_number_per_video = []
    if os.path.exists(category_path):
        # subfolders inside the category
        global_array = []
        for folder in os.listdir(category_path):
            folder_path = os.path.join(category_path, folder)

            if os.path.isdir(folder_path):
                files = os.listdir(folder_path)
                number_of_jpgs = len(files)  # counts every directory entry, whatever its type
                global_array.append(number_of_jpgs)
                frames_number_per_video.append(number_of_jpgs)

        mean_jpgs_among_videos = np.mean(frames_number_per_video)
        mean_number_per_categ.append(mean_jpgs_among_videos)
        names.append([category_name])
        arr.append([category_name, mean_jpgs_among_videos])
        global_matrix.append(global_array)
        #print(f"'{category_name}': {mean_jpgs_among_videos}" )

# mean_number_per_categ is deliberately NOT sorted in place: the following cells look it up
# by category index, and an in-place sort would attach the means to the wrong category names.
# Those cells produce their own sorted copies.

In [None]:
# Categories ordered by their mean frame count, smallest first
sorted_data = sorted(arr, key=lambda pair: pair[1])

print("Average number of frames per category")
print("----------------------------------")
for category_name, mean_frames in sorted_data:
    print(f"{category_name:>15}: {mean_frames:>7}")

In [None]:
# (mean_number, category_name)
# reverse CATEGORY_INDEX to map index -> category
index_to_category = {index: name for name, index in CATEGORY_INDEX.items()}

# Paired list, built from `arr`, which stores each mean together with the category it was
# computed for. Looking means up by position in mean_number_per_categ is unreliable: that
# list is sorted in place by the cell that builds it (and skips missing categories), so
# position i no longer corresponds to category index i.
paired = [(mean, name) for name, mean in arr]
paired_sorted = sorted(paired)

# Plain comprehensions instead of zip(*...) so an empty list does not raise
sorted_means = [mean for mean, _ in paired_sorted]
sorted_names = [name for _, name in paired_sorted]

# Print the result
for name, mean in zip(sorted_names, sorted_means):
    print(f"{name:>15}: {mean:>7}")


In [None]:
# Reverse mapping: index to category name
index_to_category = {index: name for name, index in CATEGORY_INDEX.items()}
# Take (category, mean) pairs from `arr`, which keeps every mean next to its own category.
# Indexing mean_number_per_categ by category index is unreliable because that list is
# sorted in place by the cell that builds it, which mislabels the means.
combined = [(name, mean) for name, mean in arr]
sorted_combined = sorted(combined, key=lambda x: x[1])


print("Average number of frames per class")
print("----------------------------------")
for category, frames in sorted_combined:

    print(f"{category:>15} {frames:>7}")
