In [37]:
import torch
import numpy as np
import mediapipe as mp
from torchvision.io import read_video
from torchvision.transforms import ToPILImage, ToTensor, Compose


def preprocess(video_path, n):
    """Turn a video into fixed-size packages of MediaPipe pose landmarks.

    Every frame of the video is run through MediaPipe Pose and reduced to the
    ``[x, y, z]`` coordinates of each landmark. Consecutive frames are then
    grouped into packages of ``n`` frames.

    Args:
        video_path: Path to the video file. The name of the directory that
            directly contains the file is used as the label.
        n: Number of frames per package. Must be a positive integer.

    Returns:
        A ``(coordinates, label)`` tuple where ``coordinates`` is a float
        tensor of shape ``(num_packages, n, num_landmarks, 3)`` and ``label``
        is the parent directory name as a string. Frames in which no pose is
        detected, and the slots that pad the last package up to ``n`` frames,
        are filled with zeros. A video without frames yields zero packages.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    # Local import: this cell's import block does not bring ``os`` into scope.
    import os

    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")

    mp_pose = mp.solutions.pose
    # One row per entry of the PoseLandmark enum (33 for MediaPipe Pose).
    num_landmarks = len(mp_pose.PoseLandmark)

    # The label is the parent directory name; os.path handles both '/' and the
    # platform separator that os.path.join produces.
    label = os.path.basename(os.path.dirname(video_path))

    # Frames come back as a uint8 tensor laid out (T, H, W, C) in RGB order,
    # which is what MediaPipe consumes, so no permutation is needed.
    video, _, _ = read_video(video_path, pts_unit='sec')
    total_frames = video.size(0)

    # Allocate the padded result up front; anything never written stays zero,
    # which covers both missing detections and the tail of the last package.
    num_packages = (total_frames + n - 1) // n
    coordinates = torch.zeros(num_packages * n, num_landmarks, 3)

    # Build the pose model once for the whole video instead of once per package.
    with mp_pose.Pose() as pose:
        for frame_idx, frame in enumerate(video):
            image = np.ascontiguousarray(frame.numpy(), dtype=np.uint8)
            result = pose.process(image)
            # pose_landmarks is None when MediaPipe finds no person in the frame.
            if result.pose_landmarks is None:
                continue
            coordinates[frame_idx] = torch.tensor(
                [[lm.x, lm.y, lm.z] for lm in result.pose_landmarks.landmark]
            )

    return coordinates.view(num_packages, n, num_landmarks, 3), label

# Example usage: run the preprocessing on a single clip.
# preprocess returns the second-to-last '/'-separated path component as the
# label, which for this path is the folder name "barbell biceps curl".
video_path = "data/barbell biceps curl/barbell biceps curl_61.mp4"
# Number of consecutive frames grouped into each package.
n_frames_per_package = 10

tensor_coordinates, label = preprocess(video_path, n_frames_per_package)

# Print the shape of the resulting tensor:
# (packages, frames per package, landmarks per frame, [x, y, z]).
print(f"Shape of the coordinates tensor: {tensor_coordinates.shape}")


Shape of the coordinates tensor: torch.Size([7, 10, 33, 3])


In [None]:
import os

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class VideoDataset(Dataset):
    """Dataset of videos stored as ``data_folder/<class name>/<video file>``.

    Each sample is the landmark tensor that ``preprocess`` builds for one
    video, paired with the integer index of the class folder it lives in.

    Args:
        data_folder: Root folder containing one subfolder per class.
        n_frames_per_batch: Package size forwarded to ``preprocess`` as ``n``.
    """

    def __init__(self, data_folder, n_frames_per_batch):
        self.data_folder = data_folder
        self.n_frames_per_batch = n_frames_per_batch
        # Not used by the dataset itself; kept so existing callers can read it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Only real, non-hidden subfolders are classes. Stray files and
        # folders such as ".ipynb_checkpoints" must not receive an index,
        # otherwise they shift the indices of the actual classes.
        self.classes = [
            name
            for name in self._visible_entries(data_folder)
            if os.path.isdir(os.path.join(data_folder, name))
        ]

        # Map classes to numeric indices (alphabetical order).
        self.class_to_index = {cls: idx for idx, cls in enumerate(self.classes)}

        # (video_path, class_index) pairs in a deterministic order.
        self.data = []
        for cls in self.classes:
            cls_path = os.path.join(data_folder, cls)
            for video in self._visible_entries(cls_path):
                video_path = os.path.join(cls_path, video)
                if os.path.isfile(video_path):
                    self.data.append((video_path, self.class_to_index[cls]))

    @staticmethod
    def _visible_entries(folder):
        """Return the sorted, non-hidden entry names of ``folder``."""
        return sorted(name for name in os.listdir(folder) if not name.startswith("."))

    def __len__(self):
        """Return the number of videos found under ``data_folder``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(coordinates_tensor, class_index)`` for the video at ``index``."""
        video_path, label = self.data[index]
        # preprocess also returns a string label derived from the path; discard
        # it so the numeric class index is what reaches the training loop.
        coordinates_tensor, _ = preprocess(video_path, self.n_frames_per_batch)

        return coordinates_tensor, label

In [None]:
def _pad_collate(batch):
    """Collate ``(coordinates, label)`` samples whose package counts may differ.

    Coordinates are zero-padded along their first axis to the longest sample
    in the batch and stacked into one tensor. Integer labels become a tensor;
    any other label type is returned as a plain list.
    """
    coordinates, labels = zip(*batch)
    padded = pad_sequence(list(coordinates), batch_first=True)
    labels = list(labels)
    if all(isinstance(lbl, int) for lbl in labels):
        labels = torch.tensor(labels)
    return padded, labels


def load_data(data_folder, n_frames_per_batch, batch_size, shuffle=True):
    """Build a ``DataLoader`` over the videos found under ``data_folder``.

    Args:
        data_folder: Root folder with one subfolder per class.
        n_frames_per_batch: Number of frames per package, passed to ``VideoDataset``.
        batch_size: Number of videos per batch.
        shuffle: Whether the loader reshuffles the videos every epoch.

    Returns:
        A ``DataLoader`` yielding ``(coordinates, labels)`` batches. Videos of
        different lengths are zero-padded to the longest one in the batch,
        because the default collate function can only stack equal shapes.
    """
    dataset = VideoDataset(data_folder, n_frames_per_batch)
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_pad_collate,
    )
    return data_loader

# Example usage: build a loader over every class subfolder of the data folder.
data_folder = "data"  # Path to the folder containing subfolders for each class
# Frames per package; VideoDataset forwards this to preprocess for each video.
n_frames_per_batch = 10
# Number of videos the DataLoader groups into one batch.
batch_size = 32

data_loader = load_data(data_folder, n_frames_per_batch, batch_size)