In [43]:
import os
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from PIL import Image
import torch
import pdb
import numpy as np
import yaml
from tqdm import tqdm

In [2]:
# Define the subjects for each split
train_subjects = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 3, 5, 6, 7, 10]
val_subjects = [24, 25, 1, 4]
test_subjects = [22, 2, 8, 9]

# Define the background variations
background_variations = ['d1', 'd2', 'd3', 'd4']

processed_folder = './processed/'

In [3]:
#load config file and hyperparams
config = yaml.safe_load(open("config.yaml"))
LR = float(config["LR"])
batch_size = int(config["BATCH_SIZE"])
num_epochs = int(config["NUM_EPOCHS"])


In [4]:
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Add more augmentations if needed
])

In [5]:
class SequencesExtractor:
    def __init__(self, path, num_frames_per_subsequence=20):
        # Define the number of frames per subsequence
        self.num_frames_per_subsequence = num_frames_per_subsequence
        # List all action folders in the processed folder
        self.classes = os.listdir(path) # folders correspond to classes/labels
        self.class_to_label = {class_name: idx for idx, class_name in enumerate(self.classes)}
    
    def get_classes(self):
        return self.classes
    def create_sequences(self, subjects, background_variations):
        sequences = []
        target_arr = []
        # Iterate over each action folder
        for action_folder in self.classes:
            target = action_folder
            action_path = os.path.join(processed_folder, action_folder)

            # List all person folders in the action folder
            person_folders_actual = os.listdir(action_path)
            # Filter videos based on subjects and background variations
            person_folders_target = [f'person{subject:02d}_{action_folder}_{bg}' 
                           for subject in subjects 
                           for bg in background_variations]
            person_folders = set(person_folders_actual) & set(person_folders_target) # it can be that some background variations (or smth else) is missing. 
            # Iterate over each person folder
            for person_folder in person_folders:
                person_path = os.path.join(action_path, person_folder)
                # List all image files in the person folder
                image_files = sorted(os.listdir(person_path))

                # Split the image files into subsequences
                num_frames = len(image_files)
                num_subsequences = num_frames // self.num_frames_per_subsequence

                for i in range(num_subsequences):
                    start_index = i * self.num_frames_per_subsequence
                    end_index = start_index + self.num_frames_per_subsequence

                    # Load and process the frames in the subsequence
                    subsequence_frames = []
                    for j in range(start_index, end_index):
                        try:
                            image_path = os.path.join(person_path, image_files[j])
                            frame = Image.open(image_path).convert('RGB')
                            # Apply any desired spatial augmentations to the frame
                            frame = transform(frame)
                            subsequence_frames.append(frame)
                        except:
                            print("Tried to read wrong file. Continuing")
                            continue
                    # Apply any desired temporal augmentations to the subsequence

                    # Process the subsequence (e.g., feed it to a model for action classification)
                    subsequence_frames = torch.stack(subsequence_frames, dim=0)
                    # Perform further processing on the subsequence

                    target_arr.append(self.class_to_label[target])
                    sequences.append(subsequence_frames)
        return np.array(sequences), np.array(target_arr)

In [6]:
sequencesExtractor = SequencesExtractor(path=processed_folder, num_frames_per_subsequence=20)
train_sequences, train_target_arr = sequencesExtractor.create_sequences(train_subjects, background_variations)
test_sequences, test_target_arr = sequencesExtractor.create_sequences(test_subjects, background_variations)
val_sequences, val_target_arr = sequencesExtractor.create_sequences(val_subjects, background_variations)

Tried to read wrong file. Continuing
Tried to read wrong file. Continuing
Tried to read wrong file. Continuing


  return np.array(sequences), np.array(target_arr)
  return np.array(sequences), np.array(target_arr)


Tried to read wrong file. Continuing
Tried to read wrong file. Continuing
Tried to read wrong file. Continuing
Tried to read wrong file. Continuing
Tried to read wrong file. Continuing
Tried to read wrong file. Continuing


In [8]:
print(f"Training sequences length: {len(train_sequences)}")
print(f"Validation sequences length: {len(val_sequences)}")
print(f"Test sequences length: {len(test_sequences)}")

Training sequences length: 9844
Validation sequences length: 2146
Test sequences length: 2252


In [9]:
from torch.utils.data import Dataset, DataLoader

In [10]:
class KTHDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]

        return sequence, label

In [11]:
train_dataset = KTHDataset(train_sequences, train_target_arr)
test_dataset = KTHDataset(test_sequences, test_target_arr)
val_dataset = KTHDataset(val_sequences, val_target_arr)

In [12]:
train_dataset

<__main__.KTHDataset at 0x7f9a2def10d0>

In [13]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [28]:
class ConvBlock(nn.Module):
    """
    Encapuslation of a convolutional block (conv + activation + pooling)
    """
    def __init__(self, in_ch, out_ch, k_size, pool=False):
        super(ConvBlock, self).__init__()
        layers = []
        layers.append(nn.Conv2d(in_ch, out_ch, k_size))
        layers.append(nn.ReLU())
        if(pool):
            layers.append(nn.MaxUnpool2d(kernel_size=2))
        self.module = nn.Sequential(*layers)
    def forward(self, x):
        return(self.module(x))


In [71]:
class ConvRecurrentClassifier(nn.Module):
    def __init__(self, input_channels, hidden_size, num_classes,num_layers = 1, mode="zeros"):
        assert mode in ["zeros", "random"]
        super(ConvRecurrentClassifier, self).__init__()
        self.mode = mode
        self.num_layers = 1
        self.hidden_dim = hidden_size
        #Convolutional Encoder
        # self.conv_encoder = nn.Sequential(
        #     ConvBlock(3, 16, 3, pool=False),
        #     ConvBlock(16, 32, 3, pool=True),
        #     ConvBlock(32, 64, 3, pool=False),
        #     ConvBlock(64, 128, 3, pool=True)
        # )
        
        self.conv_encoder = nn.Sequential(
            nn.Conv2d(input_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Add more convolutional layers if needed
        )
        
        #Recurrent Module
        self.lstm = nn.LSTM(input_size=128 * 16 * 16, hidden_size=hidden_size, batch_first=True)
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )
        
    def forward(self, x):
        # Reshape input to (batch_size * sequence_length, channels, height, width)
        x = x.view(-1, x.size(2), x.size(3), x.size(4))
        
        h, c = self.init_state(b_size=batch_size, device=device)
         # Convolutional Encoder
        x = self.conv_encoder(x)
        
        # Reshape for LSTM
        x = x.view(x.size(0), -1, 128 * 16 * 16)
        
        # Recurrent Module
        out, (h_out, c_out) = self.lstm(x, (h,c))
        
        # Take the output from the last time step
        out = out[:, -1, :]
        
        # Classifier
        out = self.classifier(out)
        
        return out
    def init_state(self, b_size, device):
        """ Initializing hidden and cell state """
        if(self.mode == "zeros"):
            h = [torch.zeros(b_size, self.hidden_dim).to(device) for _ in range(self.num_layers)]
            c = [torch.zeros(b_size, self.hidden_dim).to(device) for _ in range(self.num_layers)]
        elif(self.mode == "random"):
            h = [torch.zeros(b_size, self.hidden_dim).to(device) for _ in range(self.num_layers)]
            c = [torch.zeros(b_size, self.hidden_dim).to(device) for _ in range(self.num_layers)]
        return h, c

In [72]:
# batch_size, sequence_length, input_channels, height, width = 32, 20, 3, 64, 64
num_classes = len(sequencesExtractor.get_classes())

In [73]:
model = ConvRecurrentClassifier(input_channels=3, hidden_size=128, num_classes=num_classes)

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()  # Note, that this already includes a Softmax!
optimizer = torch.optim.AdamW(model.parameters(), lr=LR) #adamW was used in the paper
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_dataloader), epochs=num_epochs)

In [75]:
@torch.no_grad()
def eval_model(model):
    """ Computing model accuracy """
    correct = 0
    total = 0
    loss_list = []
    
    for sequences, labels in val_dataloader:
        sequences = sequences.to(device)
        labels = labels.to(device)
        
        # Forward pass only to get logits/output
        outputs = model(sequences)
                 
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())
            
        # Get predictions from the maximum value
        preds = torch.argmax(outputs, dim=1)
        correct += len( torch.where(preds==labels)[0] )
        total += len(labels)
                 
    # Total correct predictions and loss
    accuracy = correct / total * 100
    loss = np.mean(loss_list)
    return accuracy, loss

In [76]:
#SAMPLE Training
loss_hist = []
train_acc_hist = []
valid_loss_hist = []
valid_acc_hist = []
for epoch in range(num_epochs):
    loss_list = []
    acc_list = []
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for i, (sequences, labels) in progress_bar:
        sequences = sequences.to(device)
        sequences = sequences.to(device)
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        outputs = model(images)
         
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())
        
        with torch.no_grad():
            predicted = outputs.argmax(dim=-1)
            correct = (predicted == labels).sum().item()
            accuracy = correct/labels.shape[0] * 100
        acc_list.append(accuracy)
        # Getting gradients w.r.t. parameters
        loss.backward()
       # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2.0)
        # Updating parameters
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_description(f"Epoch {epoch+1} Iter {i+1}: loss {loss.item():.5f}. ")
    
    loss_hist.append(np.mean(loss_list))
    train_acc_hist.append(np.mean(acc_list))
    val_accuracy, valid_loss = eval_model(model)
    print(f"Val accuracy at epoch {epoch}: {round(val_accuracy, 2)}%")
    valid_loss_hist.append(valid_loss)
    valid_acc_hist.append(val_accuracy)

  0%|          | 0/308 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'dim'