In [31]:
# Disabled install line, kept for reference:
# %pip install opencv-python
# imageio with the ffmpeg extra provides the 'ffmpeg' reader that this
# notebook uses to decode the .mp4 clips (see imageio.get_reader(..., 'ffmpeg')).
%pip install imageio[ffmpeg]

Note: you may need to restart the kernel to use updated packages.


In [32]:

# Directory holding the downloaded clips; the download log lives inside it.
video_dir = 'short'
sequence_path = f'{video_dir}/downloaded_log.txt'

# Annotation table (read later for its 'sequence_id' and 'l3_pose' columns).
csv_path = 'data/3DYoga90.csv'

# Pose classes to keep; a pose's position in this list is its class index.
pose_list = [
    'mountain',
    'half-way-lift',
    'standing-forward-bend',
    'downward-dog',
]
NUM_CLASSES = len(pose_list)

In [33]:
# Constants
# Frames are resized to a square of this side length (VGG16 input size).
FRAME_HEIGHT = FRAME_WIDTH = 224
# Number of frames sampled from each video.
SEQUENCE_LENGTH = 16
# Mini-batch size.
BATCH_SIZE = 8

# Dataset

Work left (TODO):
1. Add data augmentation.
2. Expand to more pose classes.

In [34]:
import pandas as pd
import os
import torch
import numpy as np
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import imageio

class YogaVideoDataset(Dataset):
    """Dataset of yoga video clips, each returned as a fixed-length frame stack.

    Rows of the annotation CSV are kept only if their ``sequence_id`` is listed
    in the download log and their ``l3_pose`` is one of ``pose_list``. Each item
    is a ``(frames, label)`` pair where ``frames`` stacks ``SEQUENCE_LENGTH``
    transformed frames and ``label`` is a one-hot float vector with one slot
    per entry of ``pose_list``.

    Args:
        csv_path: Path to the annotation CSV; must contain the columns
            ``sequence_id`` and ``l3_pose``.
        sequence_path: Text file with one integer sequence id per line.
        pose_list: Pose names to keep; list position is the class index.
        video_dir: Directory containing ``<sequence_id>.mp4`` files.
    """

    def __init__(self, csv_path, sequence_path, pose_list, video_dir):
        with open(sequence_path) as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            # on int('').
            sequence_list = [int(line) for line in f.read().splitlines() if line.strip()]

        self.df = pd.read_csv(csv_path)
        # Keep only downloaded sequences
        self.df = self.df[self.df['sequence_id'].isin(sequence_list)]
        # Keep only required classes
        self.df = self.df[self.df['l3_pose'].isin(pose_list)]

        self.pose_to_idx = {pose: idx for idx, pose in enumerate(pose_list)}

        self.length_of_dataset = len(self.df)

        self.video_dir = video_dir

        # Resize to the configured frame size, convert to a float tensor and
        # normalize each channel with the fixed mean/std below.
        self.transforms = transforms.Compose([
            transforms.Resize((FRAME_HEIGHT, FRAME_WIDTH)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of rows left after filtering."""
        return self.length_of_dataset

    def print(self):
        """Print the row count, the pose-to-index mapping and ``len(self)``."""
        print(len(self.df))
        print(self.pose_to_idx)
        print(len(self))

    def __getitem__(self, i):
        """Return ``(frames, label)`` for the ``i``-th filtered row.

        ``label`` is a one-hot float tensor sized from this dataset's own pose
        list, so it stays correct even when the dataset is built with a pose
        list that differs from the notebook-level ``NUM_CLASSES``.
        """
        row = self.df.iloc[i]
        sequence_id = row['sequence_id']
        video_path = os.path.join(self.video_dir, f"{sequence_id}.mp4")

        label = torch.zeros(len(self.pose_to_idx))
        label[self.pose_to_idx[row['l3_pose']]] = 1

        frames = self._get_frames(video_path)
        return frames, label

    @staticmethod
    def _select_frame_indices(total_frames, sequence_length):
        """Return ``sequence_length`` evenly spaced frame indices in ``[0, total_frames - 1]``.

        Indices repeat when the video has fewer frames than ``sequence_length``.
        """
        return np.linspace(0, total_frames - 1, sequence_length, dtype=int)

    def _get_frames(self, video_path):
        """Decode ``video_path`` and return exactly ``SEQUENCE_LENGTH`` transformed frames.

        Returns:
            A tensor stacking the sampled frames along a new first dimension.

        Raises:
            ValueError: If the video reports or yields no frames.
        """
        reader = imageio.get_reader(video_path, 'ffmpeg')
        try:
            total_frames = reader.count_frames()
            if total_frames <= 0:
                raise ValueError(f"No frames found in video: {video_path}")

            indices = self._select_frame_indices(total_frames, SEQUENCE_LENGTH).tolist()
            wanted = set(indices)  # O(1) membership instead of scanning an array

            decoded = {}
            for i, frame in enumerate(reader):
                if i in wanted:
                    decoded[i] = self.transforms(Image.fromarray(frame))
        finally:
            # Always release the ffmpeg subprocess, even if decoding fails.
            reader.close()

        if not decoded:
            raise ValueError(f"Could not decode any frame from video: {video_path}")

        # Emit one frame per requested index so short videos (duplicate
        # indices) still produce SEQUENCE_LENGTH frames. If the stream ended
        # before a requested index, reuse the most recent decoded frame.
        frames = []
        current = decoded[min(decoded)]
        for idx in indices:
            current = decoded.get(idx, current)
            frames.append(current)

        # The transforms already return tensors, so stack them directly.
        return torch.stack(frames)

# Model
VGG16 extracts a feature map from each frame, and an LSTM processes the resulting sequence of frame features.

Work left (TODO):
1. Instead of using only the last LSTM time-step output, try the mean or max over all time steps, or an attention mechanism.
2. Try other ImageNet-pretrained models for extracting the feature map.

In [30]:
import torch.nn as nn 
import torchvision.models as models


class CNNLSTM(nn.Module):
    """Frame-sequence classifier: frozen VGG16 conv features -> LSTM -> MLP head.

    Args:
        num_classes: Number of output logits.

    Input to ``forward`` is a 5-D tensor ``(batch, seq, channels, height, width)``;
    the output is ``(batch, num_classes)``.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Load ImageNet-pretrained VGG16. Newer torchvision exposes a weights
        # enum and deprecates `pretrained=True`; fall back to the old keyword
        # when the enum is not available.
        weights_enum = getattr(models, "VGG16_Weights", None)
        if weights_enum is not None:
            vgg = models.vgg16(weights=weights_enum.IMAGENET1K_V1)
        else:
            vgg = models.vgg16(pretrained=True)

        # Keep only the convolutional feature extractor (the VGG avgpool and
        # the whole classifier head are dropped).
        self.features = nn.Sequential(*list(vgg.features.children()))

        # Freeze VGG16 parameters
        for param in self.features.parameters():
            param.requires_grad = False

        # Pool every feature map to 7x7 so the LSTM input size below holds for
        # any frame size. For 224x224 frames the VGG16 features are already
        # 7x7, so this is an identity. The layer has no parameters, so
        # existing state_dicts still load.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        # LSTM configuration
        self.lstm = nn.LSTM(
            input_size=512 * 7 * 7,  # flattened 512-channel 7x7 feature map
            hidden_size=512,
            num_layers=2,
            batch_first=True
        )

        # Final classifier
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: Tensor of shape ``(batch, seq, channels, height, width)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        batch_size, seq_length, c, h, w = x.size()

        # Fold the sequence into the batch so the CNN sees individual frames.
        # reshape (unlike view) also accepts non-contiguous input.
        x = x.reshape(batch_size * seq_length, c, h, w)

        # Extract CNN features and pool them to a fixed 7x7 grid
        x = self.features(x)
        x = self.avgpool(x)

        # Flatten each frame's features and restore the sequence dimension
        x = x.reshape(batch_size, seq_length, -1)

        # Pass through LSTM
        lstm_out, _ = self.lstm(x)

        # Use the last time step output
        x = lstm_out[:, -1, :]

        # Classify
        return self.classifier(x)

In [35]:
# Build the dataset from the notebook-level configuration.
dataset = YogaVideoDataset(
    csv_path=csv_path,
    sequence_path=sequence_path,
    pose_list=pose_list,
    video_dir=video_dir,
)
# Display the first (frames, label) pair as a sanity check.
dataset[0]


(tensor([[[[-0.2171, -0.0801,  0.0912,  ..., -0.0116, -0.1486, -0.1999],
           [-0.1999, -0.0801,  0.0912,  ...,  0.0569, -0.0972, -0.0287],
           [-0.1657, -0.0801,  0.0741,  ...,  0.2453, -0.0287, -0.0458],
           ...,
           [ 1.0159,  0.9988,  0.9646,  ...,  0.0569,  0.0398,  0.0398],
           [ 0.9988,  0.9303,  0.9303,  ...,  0.1083,  0.0741,  0.0569],
           [ 0.9817,  0.9303,  0.9988,  ...,  0.1254,  0.0912,  0.0569]],
 
          [[-0.1975, -0.1099,  0.0126,  ..., -0.0924, -0.2325, -0.3025],
           [-0.1975, -0.1099, -0.0049,  ..., -0.0574, -0.2150, -0.1099],
           [-0.1800, -0.1099, -0.0049,  ...,  0.1176, -0.1450, -0.1275],
           ...,
           [ 0.8004,  0.7829,  0.7479,  ..., -0.1625, -0.1625, -0.1800],
           [ 0.7829,  0.7304,  0.7129,  ..., -0.1099, -0.1625, -0.1975],
           [ 0.7654,  0.7304,  0.7829,  ..., -0.1099, -0.1625, -0.1975]],
 
          [[-0.7238, -0.5670, -0.4101,  ...,  0.1999,  0.0605,  0.0431],
           [-