In [1]:
# Install the notebook's extra dependencies into the kernel's environment.
# Restart the kernel afterwards if any package was installed or updated.
# NOTE(review): versions are unpinned, and tqdm is not used in the visible cells — confirm it is needed.
# %pip install opencv-python
%pip install imageio[ffmpeg]
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Root directory that the data/video paths are joined onto.
# The empty string makes those paths relative to the current working directory.
base_path = ""

In [3]:
import os

# CSV with one row per sequence (read for its 'sequence_id' and 'l3_pose' columns).
csv_path = os.path.join(base_path,'data/3DYoga90.csv')
# Text file listing one integer sequence id per line (the sequences that were downloaded).
sequence_path = os.path.join(base_path, 'short/downloaded_log.txt')
# Pose classes to keep; the position in this list is the class index.
pose_list = ['mountain', 'half-way-lift', 'standing-forward-bend', 'downward-dog']
NUM_CLASSES = len(pose_list)
# Directory holding the videos, named "<sequence_id>.mp4".
video_dir = os.path.join(base_path, 'short')

In [4]:
# Constants
# Every sampled frame is resized to FRAME_HEIGHT x FRAME_WIDTH before entering the CNN.
FRAME_HEIGHT = 224  # VGG16 input size
FRAME_WIDTH = 224
# Number of frames sampled (evenly spaced) from each video.
SEQUENCE_LENGTH = 16 

# Samples per batch for the train/val/test loaders.
BATCH_SIZE = 8
# Initial learning rate handed to the optimizer.
LEARNING_RATE = 0.001
# Fractions of the dataset held out for validation and testing; the rest is used for training.
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.1
# Upper bound on training epochs (early stopping may end training sooner).
NUM_EPOCHS = 30

# Dataset

Work Left
1. Data Augmentation
2. Expanding to more classes

In [5]:
import pandas as pd
import os
import torch
import numpy as np
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import imageio

class YogaVideoDataset(Dataset):
    """Video-clip dataset restricted to downloaded sequences of selected poses.

    Each item is a ``(frames, label)`` pair:

    * ``frames`` -- float tensor of shape
      ``(SEQUENCE_LENGTH, C, FRAME_HEIGHT, FRAME_WIDTH)`` built from frames
      sampled evenly across ``<video_dir>/<sequence_id>.mp4``.
    * ``label`` -- one-hot float tensor with one entry per pose in ``pose_list``.

    Args:
        csv_path: CSV file with at least ``sequence_id`` and ``l3_pose`` columns.
        sequence_path: Text file with one integer sequence id per line.
        pose_list: Pose names to keep; list position is the class index.
        video_dir: Directory containing the ``<sequence_id>.mp4`` files.
    """

    def __init__(self, csv_path, sequence_path, pose_list, video_dir):
        with open(sequence_path) as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing in int().
            sequence_list = [int(line) for line in f if line.strip()]

        self.df = pd.read_csv(csv_path)
        # Keep only downloaded sequences
        self.df = self.df[self.df['sequence_id'].isin(sequence_list)]
        # Keep only required classes
        self.df = self.df[self.df['l3_pose'].isin(pose_list)]

        self.pose_to_idx = {pose: idx for idx, pose in enumerate(pose_list)}

        self.length_of_dataset = len(self.df)

        self.video_dir = video_dir

        # Resize + ImageNet mean/std normalisation.
        self.transforms = transforms.Compose([
            transforms.Resize((FRAME_HEIGHT, FRAME_WIDTH)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Number of (sequence, pose) rows that survived filtering."""
        return self.length_of_dataset

    def print(self):
        """Print the row count, the pose-to-index mapping and the dataset length."""
        print(len(self.df))
        print(self.pose_to_idx)
        print(len(self))

    def __getitem__(self, i):
        """Return ``(frames, one_hot_label)`` for the ``i``-th filtered row."""
        row = self.df.iloc[i]
        video_path = os.path.join(self.video_dir, f"{row['sequence_id']}.mp4")

        # Size the label from the poses this dataset was built with, so it
        # always agrees with ``pose_to_idx`` rather than a notebook global.
        label = torch.zeros(len(self.pose_to_idx))
        label[self.pose_to_idx[row['l3_pose']]] = 1

        frames = self._get_frames(video_path)

        return frames, label

    @staticmethod
    def _frame_repeat_counts(total_frames, sequence_length):
        """Map frame index -> how many times it appears in the even sampling.

        Videos shorter than ``sequence_length`` yield repeated indices; the
        counts let the caller duplicate those frames so every clip has the
        same length.
        """
        indices = np.linspace(0, total_frames - 1, sequence_length, dtype=int)
        unique, counts = np.unique(indices, return_counts=True)
        return dict(zip(unique.tolist(), counts.tolist()))

    def _get_frames(self, video_path):
        """Decode ``video_path`` and return exactly ``SEQUENCE_LENGTH`` frames.

        Raises:
            ValueError: If the video contains no decodable frames.
        """
        reader = imageio.get_reader(video_path, 'ffmpeg')
        try:
            total_frames = reader.count_frames()
            if total_frames < 1:
                raise ValueError(f"No frames found in video: {video_path}")

            repeats = self._frame_repeat_counts(total_frames, SEQUENCE_LENGTH)
            last_needed = max(repeats)

            frames = []
            for i, frame in enumerate(reader):
                repeat = repeats.get(i, 0)
                if repeat:
                    tensor = self.transforms(Image.fromarray(frame))
                    frames.extend([tensor] * repeat)
                if i >= last_needed:
                    # Nothing after the last sampled index is needed.
                    break
        finally:
            # Release the ffmpeg subprocess even when decoding fails.
            reader.close()

        if not frames:
            raise ValueError(f"Could not decode any frame from video: {video_path}")
        # The reported frame count can exceed what is actually decodable;
        # pad with the last frame so all clips stack into equal-size batches.
        if len(frames) < SEQUENCE_LENGTH:
            frames.extend([frames[-1]] * (SEQUENCE_LENGTH - len(frames)))

        return torch.stack(frames)

# Model
VGG16 extracts a feature map from each frame, and an LSTM then processes the resulting frame sequence.

Work Left
1. Replace the current use of only the last LSTM time-step output with alternatives: average pooling, max pooling, or an attention mechanism over the time steps
2. Try other ImageNet-pretrained models for extracting the feature map

In [6]:
import torch.nn as nn 
import torchvision.models as models
import torch.nn.functional as F


class CNNLSTM(nn.Module):
    """Frozen VGG16 feature extractor followed by an LSTM and a small classifier.

    Args:
        num_classes: Number of output classes.

    ``forward`` takes a tensor of shape ``(batch, seq_len, C, H, W)`` and
    returns log-probabilities of shape ``(batch, num_classes)``.
    """

    def __init__(self, num_classes):
        super(CNNLSTM, self).__init__()
        # Load ImageNet-pretrained VGG16. Newer torchvision releases replace
        # the deprecated ``pretrained=True`` flag with ``weights=``; fall back
        # to the old flag when the weights enum is not available.
        weights_enum = getattr(models, "VGG16_Weights", None)
        if weights_enum is not None:
            vgg = models.vgg16(weights=weights_enum.IMAGENET1K_V1)
        else:
            vgg = models.vgg16(pretrained=True)
        # Keep only the convolutional feature stack (no avgpool / classifier).
        self.features = nn.Sequential(*list(vgg.features.children()))

        # Freeze VGG16 parameters
        for param in self.features.parameters():
            param.requires_grad = False

        # LSTM over the per-frame feature vectors
        self.lstm = nn.LSTM(
            input_size=512*7*7,  # VGG16 feature size for 224x224 input
            hidden_size=512,
            num_layers=2,
            batch_first=True,
            dropout=0.5
        )

        # Final classifier
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Classify a batch of frame sequences; returns log-probabilities."""
        batch_size, seq_length, c, h, w = x.size()

        # Combine batch and sequence dimensions. ``reshape`` (unlike ``view``)
        # also accepts non-contiguous inputs such as transposed tensors.
        x = x.reshape(batch_size * seq_length, c, h, w)

        # Extract CNN features
        x = self.features(x)

        # Flatten each frame's feature map into one vector per time step
        x = x.reshape(batch_size, seq_length, -1)

        # Pass through LSTM
        lstm_out, _ = self.lstm(x)

        # Use the last time step output
        x = lstm_out[:, -1, :]

        # Classify
        x = self.classifier(x)
        return F.log_softmax(x, dim=1)  # for multi-class classification

In [7]:
from torch.utils.data import random_split

def train_val_test_split(dataset, val_split=None, test_split=None, seed=42):
    """Randomly split ``dataset`` into train / validation / test subsets.

    Args:
        dataset: Any sized dataset accepted by ``random_split``.
        val_split: Fraction reserved for validation; defaults to the
            notebook-level ``VALIDATION_SPLIT``.
        test_split: Fraction reserved for testing; defaults to the
            notebook-level ``TEST_SPLIT``.
        seed: Seed for the split generator, so the split is reproducible.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``. Validation and test
        sizes are rounded down; the training set receives the remainder.

    Raises:
        ValueError: If a fraction is negative or the two sum to more than 1.
    """
    if val_split is None:
        val_split = VALIDATION_SPLIT
    if test_split is None:
        test_split = TEST_SPLIT
    if val_split < 0 or test_split < 0 or val_split + test_split > 1:
        raise ValueError(
            f"Invalid split fractions: val_split={val_split}, test_split={test_split}"
        )

    total_size = len(dataset)
    test_size = int(test_split * total_size)
    val_size = int(val_split * total_size)
    train_size = total_size - val_size - test_size

    train_dataset, val_dataset, test_dataset = random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(seed)  # For reproducibility
    )

    return train_dataset, val_dataset, test_dataset

def create_data_loaders(train_dataset, val_dataset, test_dataset,
                        batch_size=None, num_workers=2):
    """Wrap the three dataset splits in ``DataLoader`` objects.

    Only the training loader shuffles; validation and test keep dataset order
    so their metrics are comparable between runs.

    Args:
        train_dataset, val_dataset, test_dataset: The dataset splits.
        batch_size: Samples per batch; defaults to the notebook-level
            ``BATCH_SIZE``.
        num_workers: Worker processes per loader (``0`` loads in-process).

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def build_loader(split, shuffle):
        # Single construction point keeps all three loaders configured alike.
        return DataLoader(
            split,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers
        )

    train_loader = build_loader(train_dataset, True)
    val_loader = build_loader(val_dataset, False)
    test_loader = build_loader(test_dataset, False)

    return train_loader, val_loader, test_loader

In [8]:
import matplotlib.pyplot as plt

def plot_training_curves(history):
    """Plot loss, accuracy and learning-rate curves side by side.

    Args:
        history: Dict with per-epoch lists under the keys ``'train_loss'``,
            ``'val_loss'``, ``'train_acc'``, ``'val_acc'`` and
            ``'learning_rates'``.
    """
    # The bundled seaborn style sheets were renamed to 'seaborn-v0_8' and the
    # old 'seaborn' name was later removed, so use whichever is installed.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

    # Plot loss curves
    ax1.plot(history['train_loss'], label='Training Loss', marker='o')
    ax1.plot(history['val_loss'], label='Validation Loss', marker='o')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot accuracy curves
    ax2.plot(history['train_acc'], label='Training Accuracy', marker='o')
    ax2.plot(history['val_acc'], label='Validation Accuracy', marker='o')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title('Training and Validation Accuracy')
    ax2.legend()
    ax2.grid(True)

    # Plot learning rate (log scale, since the scheduler changes it by factors)
    ax3.plot(history['learning_rates'], label='Learning Rate', marker='o')
    ax3.set_xlabel('Epoch')
    ax3.set_ylabel('Learning Rate')
    ax3.set_title('Learning Rate over Time')
    ax3.set_yscale('log')
    ax3.grid(True)

    plt.tight_layout()
    plt.show()

In [9]:
class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. ``early_stop``
    becomes ``True`` after ``patience`` consecutive calls in which the loss
    stayed above ``best_loss - min_delta``.
    """

    def __init__(self, patience=7, min_delta=0):
        self.best_loss = None      # lowest accepted loss so far
        self.counter = 0           # consecutive epochs without improvement
        self.early_stop = False    # set once patience is exhausted
        self.min_delta = min_delta
        self.patience = patience

    def __call__(self, val_loss):
        # First observation only establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        threshold = self.best_loss - self.min_delta
        no_improvement = val_loss > threshold

        if not no_improvement:
            # Improvement: record it and restart the patience window.
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

In [10]:

from torch.optim import lr_scheduler

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train when ``optimizer`` is given, else evaluate.

    Returns ``(mean_batch_loss, accuracy_percent)``.
    """
    training = optimizer is not None
    model.train(training)

    running_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                # Clip gradients to keep the recurrent layers stable.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            running_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            _, labels_idx = torch.max(labels, 1)  # For one-hot encoded labels
            correct += (predicted == labels_idx).sum().item()
            total += labels.size(0)

    return running_loss / len(loader), 100.0 * correct / total


def train_model(model, train_loader, val_loader, criterion, optimizer, 
                num_epochs=50, patience=7):
    """Train ``model`` with LR scheduling on plateau and early stopping.

    Args:
        model: Network to train; moved to CUDA when available, else CPU.
        train_loader: Loader yielding ``(inputs, one_hot_labels)`` batches.
        val_loader: Loader used for the per-epoch validation pass.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Maximum number of epochs.
        patience: Epochs without validation-loss improvement before stopping.

    Returns:
        ``(model, history)`` where ``history`` holds per-epoch lists under
        ``'train_loss'``, ``'val_loss'``, ``'train_acc'``, ``'val_acc'`` and
        ``'learning_rates'``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail before training starts rather than with a ZeroDivisionError after
    # a full epoch (small datasets can produce an empty validation split).
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty")
    if len(val_loader) == 0:
        raise ValueError("val_loader is empty")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    print("Using device:", device)
    
    # Initialize scheduler
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
    
    # Initialize early stopping
    early_stopping = EarlyStopping(patience=patience)
    
    # Initialize history
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': [],
        'learning_rates': []
    }
    
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        
        # Store the learning rate in effect for this epoch
        current_lr = optimizer.param_groups[0]['lr']
        history['learning_rates'].append(current_lr)
        
        train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)
        
        # Update history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        
        # Print metrics
        print(f'Train Loss: {train_loss:.4f} Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss:.4f} Acc: {val_acc:.2f}%')
        print(f'Learning Rate: {current_lr}')
        
        # Learning rate scheduling
        scheduler.step(val_loss)
        
        # Early stopping check
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break
    
    return model, history

In [11]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def evaluate_model(model, test_loader, criterion, class_names):
    """Evaluate model on test set

    Prints the mean batch loss and the accuracy, and shows a confusion matrix.

    Args:
        model: Trained classifier returning one score per class.
        test_loader: Loader yielding ``(inputs, one_hot_labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        class_names: Class names in class-index order, used as axis labels.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    if len(test_loader) == 0:
        raise ValueError("test_loader is empty")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to ``device`` below, so the model must live there too.
    model = model.to(device)
    model.eval()
    
    test_loss = 0.0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            test_loss += loss.item()
            
            # Single-label classification: the prediction is the highest
            # scoring class. Thresholding sigmoid(outputs) at 0.5 is wrong
            # here: log-probabilities are never positive, so it is always False.
            all_predictions.append(outputs.argmax(dim=1).cpu())
            all_labels.append(labels.argmax(dim=1).cpu())
    
    # Calculate metrics
    test_loss = test_loss / len(test_loader)
    predicted_idx = torch.cat(all_predictions).numpy()
    true_idx = torch.cat(all_labels).numpy()
    
    # Create confusion matrix; passing ``labels`` keeps it aligned with
    # ``class_names`` even when a class is absent from the test split.
    cm = confusion_matrix(true_idx, predicted_idx,
                          labels=list(range(len(class_names))))
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    
    # Calculate accuracy
    accuracy = (predicted_idx == true_idx).mean()
    print(f'Test Loss: {test_loss:.4f}')
    print(f'Test Accuracy: {accuracy*100:.2f}%')


In [1]:
# NOTE(review): looks like a leftover kernel sanity check (it ran as In [1], out of
# order with the surrounding cells) — consider deleting this cell.
print("HELLO")

HELLO


In [12]:
# End-to-end run: build the dataset and loaders, train, save, then report.
print("Loading Data")
dataset = YogaVideoDataset(csv_path, sequence_path, pose_list, video_dir)
train_dataset, val_dataset, test_dataset = train_val_test_split(dataset)
train_loader, val_loader, test_loader = create_data_loaders(train_dataset, val_dataset, test_dataset)
print("Finished Loading Data")

model = CNNLSTM(num_classes=NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

print("Training the model")
model, history = train_model(
    model, 
    train_loader, 
    val_loader, 
    criterion, 
    optimizer,
    NUM_EPOCHS
)
# Save the weights before any plotting or evaluation, so a failure in those
# steps cannot throw away the result of a long training run.
torch.save(model.state_dict(), 'my_model.pth')
# Plot the training curves
plot_training_curves(history)
evaluate_model(model, test_loader, criterion, pose_list)

Loading Data
Finished Loading Data




Training the model
Using device: cpu

Epoch 1/30
