In [77]:
# %pip install opencv-python
%pip install imageio[ffmpeg]
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [42]:

# Where the clips live, plus the metadata CSV and the log of downloaded ids.
video_dir = 'short'
csv_path = 'data/3DYoga90.csv'
sequence_path = 'short/downloaded_log.txt'

# Pose classes to keep; a pose's position in this list is its class index.
pose_list = [
    'mountain',
    'half-way-lift',
    'standing-forward-bend',
    'downward-dog',
]
NUM_CLASSES = len(pose_list)

In [43]:
# Constants

# Frame geometry fed to the CNN (224x224 is the VGG16 input size).
FRAME_HEIGHT = FRAME_WIDTH = 224
# Number of frames sampled from every video.
SEQUENCE_LENGTH = 16

# Optimisation settings.
NUM_EPOCHS = 30
BATCH_SIZE = 8
LEARNING_RATE = 0.001

# Fractions of the dataset held out for validation and for testing.
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.1

# Dataset

Work Left
1. Data Augmentation
2. Expanding to more classes

In [44]:
import pandas as pd
import os
import torch
import numpy as np
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import imageio

class YogaVideoDataset(Dataset):
    """Video dataset yielding (frames, one-hot label) pairs for yoga poses.

    Rows of the metadata CSV are kept only when their ``sequence_id`` appears
    in the downloaded-sequence log and their ``l3_pose`` is in ``pose_list``.
    Each item is read from ``<video_dir>/<sequence_id>.mp4``.

    Args:
        csv_path: Path to the metadata CSV; must contain the columns
            ``sequence_id`` and ``l3_pose``.
        sequence_path: Text file with one integer sequence id per line.
        pose_list: Pose names to keep; the position of a pose in this list
            is its class index.
        video_dir: Directory containing the ``<sequence_id>.mp4`` files.
    """

    def __init__(self, csv_path, sequence_path, pose_list, video_dir):
        with open(sequence_path) as f:
            # Ignore blank lines so a trailing newline does not crash int().
            sequence_list = [int(line) for line in f.read().splitlines() if line.strip()]

        self.df = pd.read_csv(csv_path)
        # Keep only downloaded sequences
        self.df = self.df[self.df['sequence_id'].isin(sequence_list)]
        # Keep only required classes
        self.df = self.df[self.df['l3_pose'].isin(pose_list)]

        self.pose_to_idx = {pose: idx for idx, pose in enumerate(pose_list)}

        self.length_of_dataset = len(self.df)

        self.video_dir = video_dir

        # Resize to the module-level frame size, convert to a float tensor and
        # normalise per channel (these are the usual ImageNet statistics).
        self.transforms = transforms.Compose([
            transforms.Resize((FRAME_HEIGHT, FRAME_WIDTH)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of rows that survived filtering."""
        return self.length_of_dataset

    def print(self):
        """Print the row count, the pose-to-index mapping and ``len(self)``."""
        print(len(self.df))
        print(self.pose_to_idx)
        print(len(self))

    def __getitem__(self, i):
        """Return ``(frames, label)`` for the i-th row.

        ``frames`` is whatever ``_get_frames`` returns for the row's video;
        ``label`` is a float one-hot vector with one entry per pose given to
        the constructor.
        """
        row = self.df.iloc[i]
        sequence_id = row['sequence_id']
        video_path = os.path.join(self.video_dir, f"{sequence_id}.mp4")

        # Size the label from this dataset's own pose mapping rather than a
        # notebook global, so a dataset built with a different pose_list
        # still produces correctly sized labels.
        label = torch.zeros(len(self.pose_to_idx))
        label[self.pose_to_idx[row['l3_pose']]] = 1

        frames = self._get_frames(video_path)

        return frames, label

    def _get_frames(self, video_path):
        """Decode ``SEQUENCE_LENGTH`` evenly spaced frames from a video.

        Always returns exactly ``SEQUENCE_LENGTH`` transformed frames stacked
        along a new first dimension: when the clip has fewer frames than
        requested, sampled indices repeat and the corresponding frame is
        reused, so every item has the same length and can be batched.

        Raises:
            ValueError: If the video reports or yields no frames.
        """
        reader = imageio.get_reader(video_path, 'ffmpeg')
        try:
            total_frames = reader.count_frames()
            if total_frames < 1:
                raise ValueError(f"No frames found in video: {video_path}")

            indices = np.linspace(0, total_frames - 1, SEQUENCE_LENGTH, dtype=int)
            wanted = set(indices.tolist())  # O(1) membership per decoded frame
            last_wanted = int(indices[-1])

            decoded = {}
            for i, frame in enumerate(reader):
                if i in wanted:
                    decoded[i] = self.transforms(Image.fromarray(frame))
                if i >= last_wanted:
                    break  # nothing after the last sampled index is needed
        finally:
            # Release the ffmpeg process even if decoding fails.
            reader.close()

        if not decoded:
            raise ValueError(f"Could not decode any frame from video: {video_path}")

        # If the stream ended before the reported frame count, pad with the
        # last frame that was actually decoded.
        fallback = decoded[max(decoded)]
        return torch.stack([decoded.get(int(i), fallback) for i in indices])

# Model
VGG16 extracts a feature map from each frame, and an LSTM processes the resulting sequence of frame features.

Work Left
1. Instead of using only the last LSTM time-step output, try the average over time steps, the maximum over time steps, or an attention mechanism.
2. Try other ImageNet-pretrained models for extracting the feature map.

In [45]:
import torch.nn as nn 
import torchvision.models as models
import torch.nn.functional as F


class CNNLSTM(nn.Module):
    """Frozen VGG16 convolutional features followed by an LSTM classifier.

    Every frame is passed through the VGG16 convolutional stack, the
    per-frame feature maps are flattened and fed to a 2-layer LSTM, and the
    LSTM output at the last time step is classified.

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes):
        super(CNNLSTM, self).__init__()
        # Load pretrained VGG16
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True` — confirm against the installed version.
        vgg = models.vgg16(pretrained=True)
        # Keep only the convolutional feature extractor; VGG16's avgpool and
        # fully connected classifier are not used.
        self.features = nn.Sequential(*list(vgg.features.children()))

        # Freeze VGG16 parameters
        for param in self.features.parameters():
            param.requires_grad = False

        # LSTM over the flattened per-frame feature maps
        self.lstm = nn.LSTM(
            input_size=512*7*7,  # VGG16 output size
            hidden_size=512,
            num_layers=2,
            batch_first=True,
            dropout=0.5
        )

        # Final classifier
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: 5-D tensor unpacked as (batch, sequence, channels, height,
                width). It does not have to be contiguous.

        Returns:
            Log-probabilities with one row per sequence (log-softmax over
            dim 1).
        """
        batch_size, seq_length, c, h, w = x.size()

        # Combine batch and sequence dimensions. reshape (unlike view) also
        # accepts non-contiguous input such as a transposed tensor.
        x = x.reshape(batch_size * seq_length, c, h, w)

        # Extract CNN features
        x = self.features(x)

        # Flatten each frame's feature map into one vector per time step
        x = x.reshape(batch_size, seq_length, -1)

        # Pass through LSTM
        lstm_out, _ = self.lstm(x)

        # Use the last time step output
        x = lstm_out[:, -1, :]

        # Classify
        x = self.classifier(x)
        return F.log_softmax(x, dim=1)  # for multi-class classification

In [46]:
from torch.utils.data import random_split

def train_val_test_split(dataset, val_split=None, test_split=None, seed=42):
    """Randomly partition ``dataset`` into train, validation and test subsets.

    Args:
        dataset: Any object supporting ``len()`` and indexing.
        val_split: Fraction of items for validation. Defaults to the
            module-level ``VALIDATION_SPLIT`` when ``None``.
        test_split: Fraction of items for testing. Defaults to the
            module-level ``TEST_SPLIT`` when ``None``.
        seed: Seed for the generator that drives the shuffle, so the same
            seed always yields the same partition.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``. Validation and
        test sizes are truncated to integers; the train subset receives
        every remaining item.
    """
    if val_split is None:
        val_split = VALIDATION_SPLIT
    if test_split is None:
        test_split = TEST_SPLIT

    total_size = len(dataset)
    test_size = int(test_split * total_size)
    val_size = int(val_split * total_size)
    train_size = total_size - val_size - test_size

    train_dataset, val_dataset, test_dataset = random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(seed)  # For reproducibility
    )

    return train_dataset, val_dataset, test_dataset

def create_data_loaders(train_dataset, val_dataset, test_dataset,
                        batch_size=None, num_workers=2):
    """Wrap the three dataset splits in ``DataLoader`` objects.

    Args:
        train_dataset: Training split; its loader shuffles.
        val_dataset: Validation split; iterated in order.
        test_dataset: Test split; iterated in order.
        batch_size: Samples per batch. Defaults to the module-level
            ``BATCH_SIZE`` when ``None``.
        num_workers: Worker processes per loader. Pass ``0`` to load in the
            main process.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def _make_loader(split, shuffle):
        # One construction path so all three loaders share their settings.
        return DataLoader(
            split,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers
        )

    train_loader = _make_loader(train_dataset, True)
    val_loader = _make_loader(val_dataset, False)
    test_loader = _make_loader(test_dataset, False)

    return train_loader, val_loader, test_loader

In [78]:
import matplotlib.pyplot as plt

def plot_training_curves(train_losses, val_losses, train_accs, val_accs, save_dir, timestamp):
    """Draw loss and accuracy curves side by side and save them as a PNG.

    The image is written to ``<save_dir>/training_curves_<timestamp>.png``
    and the figure is closed afterwards.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # (axes, training series, validation series, title, y-label, legend prefix)
    panels = (
        (loss_ax, train_losses, val_losses, 'Loss over Epochs', 'Loss', 'Loss'),
        (acc_ax, train_accs, val_accs, 'Accuracy over Epochs', 'Accuracy (%)', 'Accuracy'),
    )
    for ax, train_series, val_series, title, y_label, metric in panels:
        ax.plot(train_series, label=f'Training {metric}')
        ax.plot(val_series, label=f'Validation {metric}')
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    fig.tight_layout()
    out_path = os.path.join(save_dir, f'training_curves_{timestamp}.png')
    fig.savefig(out_path)
    plt.close(fig)

In [None]:
from torch.optim import lr_scheduler
from tqdm import tqdm
import os
from datetime import datetime

def _run_epoch(model, loader, criterion, device, optimizer=None, desc=''):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_pct)``.

    Trains (backward pass, gradient clipping, optimizer step) when
    ``optimizer`` is given; otherwise evaluates without tracking gradients.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    pbar = tqdm(loader, desc=desc)
    with torch.set_grad_enabled(is_training):
        for inputs, labels in pbar:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            batch_count = inputs.size(0)
            # Assumes the criterion returns a per-batch mean (the default
            # reduction): weight it by batch size so that dividing by the
            # sample count below gives a true per-sample average.
            loss_sum += loss.item() * batch_count

            # For one-hot encoded labels
            _, predicted = torch.max(outputs, 1)
            _, labels_idx = torch.max(labels, 1)
            correct += torch.sum(predicted == labels_idx).item()
            seen += batch_count

            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100.0 * correct / seen:.2f}%'
            })

    if seen == 0:
        raise ValueError(f'{desc or "Data"} loader produced no samples')

    return loss_sum / seen, 100.0 * correct / seen


def train_model(model, train_loader, val_loader, criterion, optimizer, 
                num_epochs=50, save_dir='checkpoints'):
    """Train ``model``, validating and checkpointing after every epoch.

    After each epoch the function steps a ``ReduceLROnPlateau`` scheduler on
    the validation loss, saves the model whenever validation accuracy
    improves (``best_model_<timestamp>.pt``), and rewrites the statistics
    file (``training_stats_<timestamp>.npy``) and the training-curve plot in
    ``save_dir``.

    Args:
        model: Network to train; moved to CUDA when available, else CPU.
        train_loader: Iterable of ``(inputs, one_hot_labels)`` batches.
        val_loader: Iterable of ``(inputs, one_hot_labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of epochs to run.
        save_dir: Directory for checkpoints, statistics and plots; created
            if missing.

    Returns:
        ``(model, stats)`` where ``stats`` holds the per-epoch loss and
        accuracy lists plus ``best_epoch`` and ``best_val_acc``.

    Raises:
        ValueError: If either loader yields no samples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Create save directory
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Initialize scheduler
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, 
        mode='min',
        factor=0.1,
        patience=5,
        verbose=True
    )
    
    # Initialize tracking variables
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    best_val_acc = 0
    best_epoch = 0
    
    # Main training loop
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        
        # Training phase
        epoch_train_loss, epoch_train_acc = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, desc='Training'
        )
        
        # Validation phase
        epoch_val_loss, epoch_val_acc = _run_epoch(
            model, val_loader, criterion, device, desc='Validation'
        )
        
        # Store statistics
        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
        train_accuracies.append(epoch_train_acc)
        val_accuracies.append(epoch_val_acc)
        
        # Print epoch statistics
        print(f'\nTraining Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.2f}%')
        print(f'Validation Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.2f}%')
        
        # Learning rate scheduling
        scheduler.step(epoch_val_loss)
        
        # Save best model
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            best_epoch = epoch
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': epoch_val_acc,
                'val_loss': epoch_val_loss,
            }, os.path.join(save_dir, f'best_model_{timestamp}.pt'))
        
        # Save training stats (rewritten every epoch so a crash loses nothing)
        np.save(os.path.join(save_dir, f'training_stats_{timestamp}.npy'), {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'train_accuracies': train_accuracies,
            'val_accuracies': val_accuracies,
            'best_epoch': best_epoch,
            'best_val_acc': best_val_acc
        })
        
        # Plot and save training curves
        plot_training_curves(
            train_losses, val_losses, 
            train_accuracies, val_accuracies,
            save_dir, timestamp
        )
    
    return model, {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accuracies': train_accuracies,
        'val_accuracies': val_accuracies,
        'best_epoch': best_epoch,
        'best_val_acc': best_val_acc
    }

In [None]:
class EarlyStopping:
    """Flag when validation loss has gone ``patience`` calls without improving.

    A call counts as an improvement unless ``val_loss`` is strictly greater
    than ``best_loss - min_delta``; an improvement resets the counter. Once
    ``counter`` reaches ``patience``, ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        # The very first observation just becomes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        stalled = val_loss > self.best_loss - self.min_delta
        if not stalled:
            # New best: remember it and start counting afresh.
            self.best_loss = val_loss
            self.counter = 0
            return

        # No improvement; with the default patience, 7 such calls in a row stop training.
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

In [None]:
import glob

# Build the dataset, split it, and wrap each split in a DataLoader.
# train_val_test_split returns datasets, not loaders, so they must go through
# create_data_loaders before training; otherwise train_model would iterate
# over single unbatched samples.
dataset = YogaVideoDataset(csv_path, sequence_path, pose_list, video_dir)
train_dataset, val_dataset, test_dataset = train_val_test_split(dataset)
train_loader, val_loader, test_loader = create_data_loaders(
    train_dataset, val_dataset, test_dataset
)

model = CNNLSTM(num_classes=NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the model
# NOTE(review): NUM_EPOCHS (30) is defined in the constants cell but 50 is
# passed here — confirm which value is intended.
trained_model, training_stats = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=50,
    save_dir='checkpoints'
)

# Load best model if needed. The checkpoint file name embeds a timestamp
# chosen inside train_model, so pick the most recently written checkpoint
# instead of a hard-coded placeholder name.
checkpoint_paths = glob.glob(os.path.join('checkpoints', 'best_model_*.pt'))
if checkpoint_paths:
    latest_checkpoint = max(checkpoint_paths, key=os.path.getmtime)
    checkpoint = torch.load(latest_checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    print('No best_model_*.pt checkpoint found in checkpoints/')

In [85]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def evaluate_model(model, test_loader, criterion, class_names):
    """Evaluate model on test set.

    Prints the test loss and accuracy and shows a confusion-matrix heatmap.

    Args:
        model: Trained network returning one score per class for each
            sample; the predicted class is the arg-max over dim 1.
        test_loader: Iterable of ``(inputs, one_hot_labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        class_names: Class names in class-index order; used for the heatmap
            tick labels and to fix the confusion-matrix size.

    Returns:
        Dict with ``test_loss`` (per-sample mean), ``accuracy`` (fraction in
        [0, 1]) and ``confusion_matrix``.

    Raises:
        ValueError: If the loader yields no samples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to `device` below, so the model has to be there too.
    model = model.to(device)
    model.eval()

    loss_sum = 0.0
    sample_count = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            batch_count = inputs.size(0)
            # Assumes a per-batch mean loss (the default reduction); weight
            # by batch size so a smaller last batch is not over-counted.
            loss_sum += loss.item() * batch_count
            sample_count += batch_count

            # Single-label multi-class problem: take the arg-max class.
            # Thresholding sigmoid(outputs) at 0.5 is wrong for
            # log-probabilities: they are <= 0, so the sigmoid never
            # exceeds 0.5 and every prediction would come out empty.
            all_predictions.extend(outputs.argmax(dim=1).cpu().tolist())
            all_labels.extend(labels.argmax(dim=1).cpu().tolist())

    if sample_count == 0:
        raise ValueError('test_loader produced no samples')

    # Calculate metrics
    test_loss = loss_sum / sample_count
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    # Create confusion matrix; pass the full label set so the matrix matches
    # class_names even when a class is absent from the test split.
    cm = confusion_matrix(
        all_labels, all_predictions,
        labels=list(range(len(class_names)))
    )

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    # Calculate accuracy
    accuracy = float((all_predictions == all_labels).mean())
    print(f'Test Loss: {test_loss:.4f}')
    print(f'Test Accuracy: {accuracy*100:.2f}%')

    return {
        'test_loss': test_loss,
        'accuracy': accuracy,
        'confusion_matrix': cm,
    }


In [65]:
# Scratch check: torch.max(t, 1) returns (values, indices); the indices are
# the per-row arg-max, which is how predicted classes are obtained.
a = torch.rand(3,3)
print(a)
_, pred = torch.max(a, 1)
pred

tensor([[0.5056, 0.7598, 0.0992],
        [0.2523, 0.2669, 0.5835],
        [0.5865, 0.1670, 0.2410]])


tensor([1, 2, 0])

In [67]:
# Scratch check: applying torch.max along dim 1 to one-hot rows recovers the
# integer class index of each row.
label = torch.Tensor([[1,0,0],[0,1,0],[1,0,0]])
_, li = torch.max(label, dim =1)
li

tensor([0, 1, 0])

In [56]:
# Scratch check: build the dataset, split it, and inspect one sample.
dataset = YogaVideoDataset(csv_path, sequence_path, pose_list, video_dir)
# Give the train split its own name; previously `td` was bound twice in the
# unpacking, so the train split was silently overwritten by the test split.
# `vd` (validation) and `td` (test) keep the values they ended up with before.
train_ds, vd, td = train_val_test_split(dataset)
dataset[333]


(tensor([[[[ 2.2489,  2.2489,  2.2489,  ...,  0.1426,  0.4851,  0.6049],
           [ 2.2489,  2.2489,  2.2489,  ...,  0.1254,  0.4851,  0.6049],
           [ 2.2489,  2.2489,  2.2489,  ...,  0.0056,  0.4337,  0.6049],
           ...,
           [-0.0458, -0.0287, -0.0116,  ..., -0.7993, -0.8335, -0.8678],
           [-0.0801, -0.0629, -0.0458,  ..., -0.8164, -0.8507, -0.8849],
           [-0.0972, -0.0801, -0.0629,  ..., -0.8164, -0.8507, -0.8849]],
 
          [[ 2.4286,  2.4286,  2.4286,  ...,  0.1001,  0.4678,  0.5903],
           [ 2.4286,  2.4286,  2.4286,  ...,  0.0826,  0.4678,  0.5903],
           [ 2.4286,  2.4286,  2.4286,  ..., -0.0574,  0.4153,  0.5903],
           ...,
           [-0.1800, -0.1625, -0.1450,  ..., -1.1429, -1.1779, -1.2129],
           [-0.2150, -0.1975, -0.1800,  ..., -1.1604, -1.1954, -1.2304],
           [-0.2325, -0.2150, -0.1975,  ..., -1.1604, -1.1954, -1.2304]],
 
          [[ 2.6400,  2.6400,  2.6400,  ..., -0.0615,  0.3219,  0.4439],
           [ 