In [1]:
import os
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from PIL import Image
import torch
import pdb
import numpy as np
import yaml
from tqdm import tqdm

In [2]:
# Define the subjects for each split
train_subjects = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 3, 5, 6, 7, 10]
val_subjects = [24, 25, 1, 4]
test_subjects = [22, 2, 8, 9]

# Define the background variations
background_variations = ['d1', 'd2', 'd3', 'd4']

processed_folder = '/home/nfs/inf6/data/datasets/kth_actions/processed/'

In [3]:
#load config file and hyperparams
config = yaml.safe_load(open("config.yaml"))
LR = float(config["LR"])
batch_size = int(config["BATCH_SIZE"])
num_epochs = int(config["NUM_EPOCHS"])


In [4]:
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Add more augmentations if needed
])

In [5]:
class SequencesExtractor:
    def __init__(self, path, num_frames_per_subsequence=20):
        # Define the number of frames per subsequence
        self.num_frames_per_subsequence = num_frames_per_subsequence
        # List all action folders in the processed folder
        self.classes = os.listdir(path) # folders correspond to classes/labels
        self.class_to_label = {class_name: idx for idx, class_name in enumerate(self.classes)}
    
    def get_classes(self):
        return self.classes
    def create_sequences(self, subjects, background_variations):
        sequences = []
        target_arr = []
        # Iterate over each action folder
        for action_folder in self.classes:
            target = action_folder
            action_path = os.path.join(processed_folder, action_folder)

            # List all person folders in the action folder
            person_folders_actual = os.listdir(action_path)
            # Filter videos based on subjects and background variations
            person_folders_target = [f'person{subject:02d}_{action_folder}_{bg}' 
                           for subject in subjects 
                           for bg in background_variations]
            person_folders = set(person_folders_actual) & set(person_folders_target) # it can be that some background variations (or smth else) is missing. 
            # Iterate over each person folder
            for person_folder in person_folders:
                person_path = os.path.join(action_path, person_folder)
                # List all image files in the person folder
                image_files = sorted(os.listdir(person_path))

                # Split the image files into subsequences
                num_frames = len(image_files)
                num_subsequences = num_frames // self.num_frames_per_subsequence

                for i in range(num_subsequences):
                    start_index = i * self.num_frames_per_subsequence
                    end_index = start_index + self.num_frames_per_subsequence

                    # Load and process the frames in the subsequence
                    subsequence_frames = []
                    for j in range(start_index, end_index):
                        try:
                            image_path = os.path.join(person_path, image_files[j])
                            frame = Image.open(image_path).convert('RGB')
                            # Apply any desired spatial augmentations to the frame
                            frame = transform(frame)
                            subsequence_frames.append(frame)
                        except:
                            print("Tried to read wrong file. Continuing")
                            continue
                    # Apply any desired temporal augmentations to the subsequence

                    # Process the subsequence (e.g., feed it to a model for action classification)
                    subsequence_frames = torch.stack(subsequence_frames, dim=0)
                    # Perform further processing on the subsequence

                    target_arr.append(self.class_to_label[target])
                    sequences.append(subsequence_frames)
        return np.array(sequences), np.array(target_arr)

In [6]:
processed_folder = '/home/nfs/inf6/data/datasets/kth_actions/processed/'
sequencesExtractor = SequencesExtractor(path=processed_folder, num_frames_per_subsequence=20)
train_sequences, train_target_arr = sequencesExtractor.create_sequences(train_subjects, background_variations)
test_sequences, test_target_arr = sequencesExtractor.create_sequences(test_subjects, background_variations)
val_sequences, val_target_arr = sequencesExtractor.create_sequences(val_subjects, background_variations)

In [7]:
print(f"Training sequences length: {len(train_sequences)}")
print(f"Validation sequences length: {len(val_sequences)}")
print(f"Test sequences length: {len(test_sequences)}")

Training sequences length: 9844
Validation sequences length: 2146
Test sequences length: 2252


In [8]:
train_sequences.shape

(9844, 20, 3, 64, 64)

In [9]:
from torch.utils.data import Dataset, DataLoader

In [10]:
class KTHDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]

        return sequence, label

In [11]:
train_dataset = KTHDataset(train_sequences, train_target_arr)
test_dataset = KTHDataset(test_sequences, test_target_arr)
val_dataset = KTHDataset(val_sequences, val_target_arr)

In [12]:
train_dataset

<__main__.KTHDataset at 0x7efe60ac4520>

In [13]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [14]:
import torch
import torch.nn as nn

class OurLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(OurLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Input gate
        self.W_ii = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ii = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hi = nn.Parameter(torch.Tensor(hidden_size))

        # Forget gate
        self.W_if = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_if = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hf = nn.Parameter(torch.Tensor(hidden_size))

        # Cell gate
        self.W_ig = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hg = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ig = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hg = nn.Parameter(torch.Tensor(hidden_size))

        # Output gate
        self.W_io = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_io = nn.Parameter(torch.Tensor(hidden_size))
        self.b_ho = nn.Parameter(torch.Tensor(hidden_size))

        self.init_weights()
    
    def init_weights(self):
        for p in self.parameters():
            # dimension greater equal to 2 is typically associated with weight tensors
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)
            else:
            # bias tensors are commonly initialized to 0
                nn.init.zeros_(p.data)
    
    
    def forward(self, x, init_states=None):
        # bs, _ = x.size()

        # Access prior hidden state if existent, else initialize to zero (i.e. future is unknown)
        h_t, c_t = (torch.zeros(self.hidden_size).to(x.device),
                    torch.zeros(self.hidden_size).to(x.device)) if init_states is None else init_states

        # We note: In a perfect world, this should be parallelized
        # Input gate
        i_t = torch.sigmoid(x @ self.W_ii.t() + self.b_ii + h_t @ self.W_hi.t() + self.b_hi)
        
        # Forget gate
        f_t = torch.sigmoid(x @ self.W_if.t() + self.b_if + h_t @ self.W_hf.t() + self.b_hf)
        
        # Cell gate
        g_t = torch.tanh(x @ self.W_ig.t() + self.b_ig + h_t @ self.W_hg.t() + self.b_hg)
        
        # Output gate
        o_t = torch.sigmoid(x @ self.W_io.t() + self.b_io + h_t @ self.W_ho.t() + self.b_ho)
        
        # Note: * is the Hadamard product
        # Update cell state
        c_t = f_t * c_t + i_t * g_t

        # Update hidden state
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t

In [15]:
torchcell = nn.LSTMCell(200, 100)
# torchcell(input)

In [17]:
cell = OurLSTMCell(200, 100)
input = torch.ones(10, 200)

 #cell(input)

We note, that we omitted the hadamard products from the paper to stay more similar to our LSTM implementation from above
https://arxiv.org/pdf/1506.04214v2.pdf

In [20]:
class ConvLSTMCell(nn.Module):
    def __init__(self, channels, input_size, kernel_size = 3):
        '''
        Params:

        channels: number of channels, which is used as input and output
        input_size: The input size of the image
        kernel_size: size of the convolutional kernel
        '''
        super(ConvLSTMCell, self).__init__()

        self.input_size = input_size
        self.kernel_size = kernel_size

        # Input gate
        self.W_ii = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.W_hi = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.b_i = nn.Parameter(torch.zeros(input_size))

        # Forget gate
        self.W_if = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.W_hf = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.b_f = nn.Parameter(torch.zeros(input_size))

        # Cell gate
        self.W_ig = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.W_hg = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.b_g = nn.Parameter(torch.zeros(input_size))

        # Output gate
        self.W_io = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.W_ho = nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=3, stride=1, padding=1)
        self.b_o = nn.Parameter(torch.zeros(input_size))

        self.init_weights()

    # Same as above
    def init_weights(self):
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)
            else:
                nn.init.zeros_(p.data)

    def forward(self, x, init_states=None):
        bs, channels, rows, cols = x.size()

        h_t, c_t = (torch.zeros(self.input_size).to(x.device),
                    torch.zeros(self.input_size)) if init_states is None else init_states

        
        # Input gate
        i_t = torch.sigmoid(self.W_ii(x) + self.b_i + self.W_hi(h_t))
        
        # Forget gate
        f_t = torch.sigmoid(self.W_if(x) + self.W_hf(h_t) + self.b_f)
        
        # Cell gate
        g_t = torch.tanh(self.W_ig(x) + self.b_g + self.W_hg(h_t))
        
        # Output gate
        o_t = torch.sigmoid(self.W_io(x) + self.b_o + self.W_ho(h_t))
        
        # Update cell state
        c_t = f_t * c_t + i_t * g_t

        # Update hidden state
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t



input_seq = torch.randn(128, 32, 64, 64)


conv_lstm_cell = ConvLSTMCell(input_size = (32, 64, 64), channels=32)

# Random input sequence

# Forward pass
output_seq, final_states = conv_lstm_cell(input_seq)



AttributeError: 'ConvLSTMCell' object has no attribute 'hidden_size'

In [18]:
torch.zeros((64, 64))

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [23]:
# Construct LSTM from scratch
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm_cells = nn.ModuleList([LSTMCell(input_size, hidden_size) for _ in range(num_layers)])

    def forward(self, x, init_states=None):
        bs, seq_len, _ = x.size()

        h_t, c_t = (torch.zeros(self.num_layers, self.hidden_size).to(x.device),
                    torch.zeros(self.num_layers, self.hidden_size).to(x.device)) if init_states is None else init_states

        h_t, c_t = h_t.clone(), c_t.clone()

        output = []

        for t in range(seq_len):
            x_t = x[:, t, :]
            for layer in range(self.num_layers):
                h_t[layer], c_t[layer] = self.lstm_cells[layer](x_t, (h_t[layer], c_t[layer]))
                x_t = h_t[layer]

            output.append(x_t)

        output = torch.stack(output, dim=1)

        return output, (h_t, c_t)


# Example usage:
input_size = 10
hidden_size = 20
num_layers = 2
seq_len = 5
batch_size = 3

lstm = LSTM(input_size, hidden_size, num_layers)

# Random input sequence
input_seq = torch.randn(batch_size, seq_len, input_size)

# Forward pass
output_seq, final_states = lstm(input_seq)

print("Input sequence shape:", input_seq.shape)
print("Output sequence shape:", output_seq.shape)
print("Final hidden state shape:", final_states[0].shape)
print("Final cell state shape:", final_states[1].shape)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam


In [None]:
class LSTMbyHand(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

In [14]:
class ConvBlock(nn.Module):
    """
    Encapuslation of a convolutional block (conv + activation + pooling)
    """
    def __init__(self, in_ch, out_ch, k_size, pool=False):
        super(ConvBlock, self).__init__()
        layers = []
        layers.append(nn.Conv2d(in_ch, out_ch, k_size))
        layers.append(nn.ReLU())
        if(pool):
            layers.append(nn.MaxPool2d(kernel_size=2))
        self.module = nn.Sequential(*layers)
    def forward(self, x):
        return(self.module(x))


In [27]:
class ConvRecurrentClassifier(nn.Module):
    def __init__(self, input_channels, hidden_size, num_classes,num_layers = 1, mode="zeros"):
        assert mode in ["zeros", "random"]
        super(ConvRecurrentClassifier, self).__init__()
        self.mode = mode
        self.num_layers = 1
        self.hidden_dim = hidden_size
        #Convolutional Encoder
        # self.conv_encoder = nn.Sequential(
        #     ConvBlock(3, 16, 3, pool=False),
        #     ConvBlock(16, 32, 3, pool=True),
        #     ConvBlock(32, 64, 3, pool=False),
        #     ConvBlock(64, 128, 3, pool=True)
        # )
        
        self.conv_encoder = nn.Sequential(
            # nn.Conv2d(input_channels, 64, kernel_size=3, padding=1),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2, stride=2),
            # nn.Conv2d(64, 128, kernel_size=3, padding=1),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2, stride=2),
            # Add more convolutional layers if needed
            ConvBlock(3, 16, 3, pool=False),
            ConvBlock(16, 32, 3, pool=True),
            ConvBlock(32, 64, 3, pool=False),
            ConvBlock(64, 128, 3, pool=True),
            ConvBlock(128, 256, 3, pool=False)
        )
        
        #Recurrent Module
        self.lstm = nn.LSTM(input_size=256 * 315 * 11, hidden_size=hidden_size, batch_first=True)
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )
        
    def forward(self, x):
        # Reshape input to (batch_size * sequence_length, channels, height, width)
        b_size, seq_length, n_channels, width, height = x.shape
        
        x = x.view(b_size, n_channels, seq_length*width, height)
        
        h, c = self.init_state(b_size=batch_size, device=device)
         # Convolutional Encoder
        x = self.conv_encoder(x)
        
        # Reshape for LSTM
        x = x.view(x.size(0), -1, x.size(1) * x.size(2) * x.size(3))
        
        # Recurrent Module
        out, (h_out, c_out) = self.lstm(x, (h,c))
        
        # Take the output from the last time step
        out = out[:, -1, :]
        
        # Classifier
        out = self.classifier(out)
        
        return out
    def init_state(self, b_size, device):
        """ Initializing hidden and cell state """
        if(self.mode == "zeros"):
            h = torch.zeros(self.num_layers, b_size, self.hidden_dim)
            c = torch.zeros(self.num_layers, b_size, self.hidden_dim)
        elif(self.mode == "random"):
            h = torch.randn(self.num_layers, b_size, self.hidden_dim)
            c = torch.randn(self.num_layers, b_size, self.hidden_dim)
        elif(self.mode == "learned"):
            h = self.learned_h.repeat(1, b_size, 1)
            c = self.learned_c.repeat(1, b_size, 1)
        h = h.to(device)
        c = c.to(device)
        return h, c

In [28]:
# batch_size, sequence_length, input_channels, height, width = 32, 20, 3, 64, 64
num_classes = len(sequencesExtractor.get_classes())

In [29]:
model = ConvRecurrentClassifier(input_channels=3, hidden_size=128, num_classes=num_classes)

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()  # Note, that this already includes a Softmax!
optimizer = torch.optim.AdamW(model.parameters(), lr=LR) #adamW was used in the paper
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_dataloader), epochs=num_epochs)

In [31]:
@torch.no_grad()
def eval_model(model):
    """ Computing model accuracy """
    correct = 0
    total = 0
    loss_list = []
    
    for sequences, labels in val_dataloader:
        sequences = sequences.to(device)
        labels = labels.to(device)
        
        # Forward pass only to get logits/output
        outputs = model(sequences)
                 
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())
            
        # Get predictions from the maximum value
        preds = torch.argmax(outputs, dim=1)
        correct += len( torch.where(preds==labels)[0] )
        total += len(labels)
                 
    # Total correct predictions and loss
    accuracy = correct / total * 100
    loss = np.mean(loss_list)
    return accuracy, loss

In [None]:
#SAMPLE Training
loss_hist = []
valid_acc_hist = []
for epoch in range(num_epochs):
    loss_list = []
    acc_list = []
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for i, (sequences, labels) in progress_bar:
        sequences = sequences.to(device)
        sequences = sequences.to(device)
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        outputs = model(sequences)
         
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())
        
        with torch.no_grad():
            predicted = outputs.argmax(dim=-1)
            correct = (predicted == labels).sum().item()
            accuracy = correct/labels.shape[0] * 100
        acc_list.append(accuracy)
        # Getting gradients w.r.t. parameters
        loss.backward()
       # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2.0)
        # Updating parameters
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_description(f"Epoch {epoch+1} Iter {i+1}: loss {loss.item():.5f}. ")
    
    loss_hist.append(np.mean(loss_list))
    train_acc_hist.append(np.mean(acc_list))
    val_accuracy, valid_loss = eval_model(model)
    print(f"Val accuracy at epoch {epoch}: {round(val_accuracy, 2)}%")
    valid_loss_hist.append(valid_loss)
    valid_acc_hist.append(val_accuracy)

Epoch 1 Iter 94: loss 1.50806. :  31%|███       | 94/308 [10:21<23:34,  6.61s/it]


KeyboardInterrupt: 