In [1]:
import os
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torch
import pdb
import numpy as np

In [2]:
# Define the path to your processed dataset
data_path = './processed/'

# Define the subjects for each split
subjects = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 3, 5, 6, 7, 10]
val_subjects = [24, 25, 1, 4]
test_subjects = [22, 2, 8, 9]

# Define the background variations
background_variations = ['d1', 'd2', 'd3', 'd4']

In [3]:
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Add more augmentations if needed
])

In [4]:
processed_folder = './processed/'
# Define the number of frames per subsequence
num_frames_per_subsequence = 20
# List all action folders in the processed folder
action_folders = os.listdir(processed_folder)

classes = action_folders
class_to_label = {class_name: idx for idx, class_name in enumerate(classes)}
sequences = []
target_arr = []
print(class_to_label)
# Iterate over each action folder
for action_folder in action_folders:
    target = action_folder
    action_path = os.path.join(processed_folder, action_folder)

    # List all person folders in the action folder
    person_folders_actual = os.listdir(action_path)
    # Filter videos based on subjects and background variations
    person_folders_target = [f'person{subject:02d}_{action_folder}_{bg}' 
                   for subject in subjects 
                   for bg in background_variations]
    person_folders = set(person_folders_actual) & set(person_folders_target) # it can be that some background variations (or smth else) is missing. 
    # Iterate over each person folder
    for person_folder in person_folders:
        person_path = os.path.join(action_path, person_folder)
        # List all image files in the person folder
        image_files = os.listdir(person_path)

        # Split the image files into subsequences
        num_frames = len(image_files)
        num_subsequences = num_frames // num_frames_per_subsequence

        for i in range(num_subsequences):
            start_index = i * num_frames_per_subsequence
            end_index = start_index + num_frames_per_subsequence

            # Load and process the frames in the subsequence
            subsequence_frames = []
            for j in range(start_index, end_index):
                image_path = os.path.join(person_path, image_files[j])
                frame = Image.open(image_path).convert('RGB')
                # Apply any desired spatial augmentations to the frame
                frame = transform(frame)
                subsequence_frames.append(frame)

            # Apply any desired temporal augmentations to the subsequence
            
            # Process the subsequence (e.g., feed it to a model for action classification)
            subsequence_frames = np.array(subsequence_frames)
            # Perform further processing on the subsequence

            target_arr.append(class_to_label[target])
            sequences.append(subsequence_frames)

{'boxing': 0, 'jogging': 1, 'handclapping': 2, 'walking': 3, 'running': 4, 'handwaving': 5}


  subsequence_frames = np.array(subsequence_frames)
  subsequence_frames = np.array(subsequence_frames)


In [7]:
print(np.array(sequences).shape)
print(np.array(target_arr).shape)
print(np.array(sequences)[0][1])

(9844, 20)
(9844,)
tensor([[[0.1725, 0.1843, 0.1882,  ..., 0.2353, 0.2196, 0.2196],
         [0.1608, 0.1725, 0.1804,  ..., 0.2196, 0.2039, 0.1922],
         [0.1647, 0.1725, 0.1804,  ..., 0.2353, 0.2314, 0.2157],
         ...,
         [0.1451, 0.1529, 0.1608,  ..., 0.2196, 0.2157, 0.2078],
         [0.1451, 0.1529, 0.1608,  ..., 0.2275, 0.2157, 0.2039],
         [0.1333, 0.1490, 0.1608,  ..., 0.2353, 0.2275, 0.2078]],

        [[0.1725, 0.1843, 0.1882,  ..., 0.2353, 0.2196, 0.2196],
         [0.1608, 0.1725, 0.1804,  ..., 0.2196, 0.2039, 0.1922],
         [0.1647, 0.1725, 0.1804,  ..., 0.2353, 0.2314, 0.2157],
         ...,
         [0.1451, 0.1529, 0.1608,  ..., 0.2196, 0.2157, 0.2078],
         [0.1451, 0.1529, 0.1608,  ..., 0.2275, 0.2157, 0.2039],
         [0.1333, 0.1490, 0.1608,  ..., 0.2353, 0.2275, 0.2078]],

        [[0.1725, 0.1843, 0.1882,  ..., 0.2353, 0.2196, 0.2196],
         [0.1608, 0.1725, 0.1804,  ..., 0.2196, 0.2039, 0.1922],
         [0.1647, 0.1725, 0.1804,  ...,

In [3]:
# # Define your custom dataset class
# class KTHDataset(Dataset):
#     def __init__(self, data_path, subjects, background_variations, transform=None, num_frames=20):
#         self.data_path = data_path
#         self.transform = transform
#         self.num_frames = num_frames
#         self.classes = os.listdir(data_path)

#         # Filter videos based on subjects and background variations
#         self.videos = [f'{cls}/person{subject:02d}_{cls}_{bg}' 
#                        for subject in subjects 
#                        for cls in self.classes 
#                        for bg in background_variations]
        
        

#     def __len__(self):
#         return len(self.videos)

#     def __getitem__(self, idx):
#         video_name = self.videos[idx]
#         print(idx)
#         video_path = os.path.join(self.data_path, video_name)
#         sorted_images = sorted(os.listdir(video_path))
#         images = []
#         for frame_init_id in range(0, len(sorted_images), self.num_frames):
#             if frame_init_id + 20 > len(sorted_images):
#                 break
#             else:
#                 for frame_id in range(frame_init_id, frame_init_id+self.num_frames):
#                     frame = sorted_images[frame_id]
#                     frame_path = os.path.join(video_path, frame)
#                     image = Image.open(frame_path).convert('RGB')

#                     if self.transform:
#                         image = self.transform(image)

#                     images.append(image)

#         # Stack images along the time dimension to form a sequence
#         sequence = torch.stack(images, dim=0)
#         # Extract the action class from the video name
#         class_name = video_name.split('_')[1]
#         class_idx = self.classes.index(class_name)

#         return sequence, class_idx

In [4]:
# # Define data transformations and augmentations
# transform = transforms.Compose([
#     transforms.Resize((256, 256)),
#     transforms.ToTensor(),
#     # Add more augmentations if needed
# ])

# # Initialize datasets for training, validation, and test
# train_dataset = KTHDataset(data_path, train_subjects, background_variations, transform=transform)
# val_dataset = KTHDataset(data_path, val_subjects, background_variations, transform=transform)
# test_dataset = KTHDataset(data_path, test_subjects, background_variations, transform=transform)

# # Define the corresponding DataLoaders
# batch_size = 32
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [5]:
# train_dataset

<__main__.KTHDataset at 0x7fc8005e0d30>

In [6]:
# # Now, you can iterate over the training, validation, and test dataloaders
# for epoch in range(20):
#     for batch in train_dataloader:
#         inputs, labels = batch
#         # Your model training code goes here

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


RuntimeError: stack expects each tensor to be equal size, but got [340, 3, 256, 256] at entry 0 and [540, 3, 256, 256] at entry 1