# Import libraries, set up paths, and device

This cell imports the necessary libraries and defines the file paths, the compute device, and the image transforms used for training and validation.


In [1]:
import os
import cv2
import json
import torch
import optuna
import datetime
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm 
from pathlib import Path
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import pandas as pd
import pickle
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torch.amp import GradScaler, autocast
import matplotlib.pyplot as plt

# Define paths
# The project root can be overridden with the LEARNER_ENGAGEMENT_BASE_DIR
# environment variable; the original location stays the default so existing
# setups keep working unchanged.
BASE_DIR = Path(os.environ.get(
    "LEARNER_ENGAGEMENT_BASE_DIR",
    "C:/Users/abhis/Downloads/Documents/Learner Engagement Project",
))
DATA_DIR = BASE_DIR / "data" / "DAiSEE"
FRAMES_DIR = DATA_DIR / "ExtractedFrames"
LABELS_DIR = DATA_DIR / "Labels"
MODEL_DIR = BASE_DIR / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Precomputed directory for caching best frames
CACHE_DIR = BASE_DIR / "cache"
# parents=True so this line does not depend on MODEL_DIR having created BASE_DIR.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Set device and CUDA configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Per-channel normalisation statistics shared by both pipelines
# (the standard ImageNet values, matching the ImageNet-pretrained backbone).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Define data transforms for training and validation.
# Training adds a random horizontal flip; validation is deterministic.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD)
])
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD)
])

Using device: cuda


# Define helper function for selecting 30 best frames using face detection and sharpness

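This cell defines three helpers: `get_csv_clip_id`, which maps a clip ID from the CSV to its frame-folder name; `select_impactful_frames`, which, given a folder of extracted frames, divides them into 30 equal temporal segments and selects the best frame from each segment based on face detection and sharpness; and `precompute_best_frames`, which runs that selection for every video listed in a labels CSV and caches the result to disk.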


In [2]:
# Mapping CSV clip IDs.
def get_csv_clip_id(video_stem: str) -> str:
    """Return the frame-folder ID for a clip ID.

    Surrounding whitespace is removed. An ID that begins with ``110001`` has
    that leading prefix swapped for ``202614``; every other ID is returned
    as-is (after stripping).
    """
    old_prefix, new_prefix = "110001", "202614"
    stem = video_stem.strip()
    if not stem.startswith(old_prefix):
        return stem
    # Only the leading occurrence is rewritten; later occurrences are kept.
    return new_prefix + stem[len(old_prefix):]

# Select best frames using face detection and Laplacian variance.
def select_impactful_frames(video_folder: Path, num_frames=30):
    """Pick up to ``num_frames`` representative frames from ``video_folder``.

    Frames matching ``frame_*.jpg`` are ordered by the number in their file
    name (so ``frame_2`` precedes ``frame_10`` even without zero padding) and
    split into ``num_frames`` consecutive segments; the last segment absorbs
    any remainder. Within each segment the frame with the highest Laplacian
    variance (a sharpness measure) is kept. When a frontal face is detected,
    sharpness is measured on the largest detected face region only;
    otherwise on the whole grayscale frame.

    Args:
        video_folder: Directory containing the extracted ``frame_*.jpg`` files.
        num_frames: Number of temporal segments / frames to select.

    Returns:
        A list of ``Path`` objects in temporal order. It is empty when the
        folder has no frames, holds every frame when there are at most
        ``num_frames`` of them, and may hold fewer than ``num_frames`` entries
        if every image in some segment fails to load.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1 while there are frames
            to segment.
    """
    import re  # local import keeps this helper self-contained within the cell

    def _frame_order(path):
        # Sort on the trailing integer of the stem; fall back to the name so
        # files without a number still get a deterministic position (last).
        numbers = re.findall(r"\d+", path.stem)
        return (int(numbers[-1]) if numbers else float("inf"), path.name)

    frame_files = sorted(video_folder.glob("frame_*.jpg"), key=_frame_order)
    total_frames = len(frame_files)
    if total_frames == 0:
        return []
    if total_frames <= num_frames:
        return frame_files
    if num_frames < 1:
        # Previously this fell through to a division by zero (or nonsensical
        # negative segment sizes).
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    segment_size = total_frames // num_frames
    cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    face_cascade = cv2.CascadeClassifier(cascade_path)
    selected_frames = []
    for i in range(num_frames):
        start_idx = i * segment_size
        # The final segment runs to the end so no trailing frames are dropped.
        end_idx = (i + 1) * segment_size if i < num_frames - 1 else total_frames
        best_score = -1  # variance is non-negative, so any readable frame wins
        best_frame = None
        for fp in frame_files[start_idx:end_idx]:
            img = cv2.imread(str(fp))
            if img is None:
                # Unreadable/corrupt image: ignore it.
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
            if len(faces) > 0:
                # Largest detection by area (w * h).
                face = max(faces, key=lambda r: r[2]*r[3])
                x, y, w, h = face
                region = gray[y:y+h, x:x+w]
                quality = cv2.Laplacian(region, cv2.CV_64F).var()
            else:
                quality = cv2.Laplacian(gray, cv2.CV_64F).var()
            if quality > best_score:
                best_score = quality
                best_frame = fp
        if best_frame is not None:
            selected_frames.append(best_frame)
    return selected_frames

def precompute_best_frames(csv_file: Path, video_root: Path, num_frames=30):
    """
    Select the best frames for every video listed in ``csv_file`` and cache them.

    The split folder name is the CSV stem with "Labels" removed. A video is
    kept only when its folder exists under ``video_root / split`` and holds at
    least ``num_frames`` ``frame_*.jpg`` files; all others are counted as
    skipped. The result, a dict with ``valid_indices`` (CSV row indices of the
    kept videos) and ``precomputed_frames`` (one list of frame paths per kept
    video), is pickled into ``CACHE_DIR`` and also returned.
    """
    csv_stem = Path(csv_file).stem
    labels = pd.read_csv(csv_file, dtype=str)
    labels.columns = labels.columns.str.strip()
    split_name = csv_stem.replace("Labels", "").strip()

    kept_indices = []       # CSV row indices of usable videos
    frames_per_video = []   # selected frame paths, aligned with kept_indices
    n_skipped = 0

    progress = tqdm(
        labels.iterrows(),
        total=len(labels),
        desc="Precomputing best frames",
        dynamic_ncols=True,
    )
    for row_idx, record in progress:
        clip = str(record["ClipID"]).strip()
        if clip.endswith(('.avi', '.mp4')):
            # Drop the video extension to get the folder name.
            clip = clip.rsplit('.', 1)[0]
        folder = video_root / split_name / get_csv_clip_id(clip)

        usable = folder.exists() and len(list(folder.glob("frame_*.jpg"))) >= num_frames
        if not usable:
            n_skipped += 1
            continue

        frames_per_video.append(select_impactful_frames(folder, num_frames))
        kept_indices.append(row_idx)

    print(f"Precomputation: Skipped {n_skipped} videos out of {len(labels)}.")

    cache_data = {"valid_indices": kept_indices, "precomputed_frames": frames_per_video}
    cache_file = CACHE_DIR / f"precomputed_{csv_stem}_frame_{num_frames}.pkl"
    with open(cache_file, "wb") as handle:
        pickle.dump(cache_data, handle)
    print(f"Precomputed results saved to {cache_file}")
    return cache_data

# Define the custom Dataset for video classification

This cell defines a PyTorch Dataset that reads a CSV file with video IDs and 4 labels. For each video, it loads the 30 best frames using the above function, applies transforms, and returns a tensor and its label.


In [3]:
class VideoDataset(torch.utils.data.Dataset):
    """Dataset yielding a stack of selected frames and four labels per video.

    Rows come from a labels CSV. Frame paths are taken from the pickle cache in
    ``CACHE_DIR`` when one exists for this CSV / ``num_frames`` combination;
    otherwise they are selected on the fly (without writing a cache). Each item
    is ``(frames_tensor, labels)`` where ``labels`` holds the Engagement,
    Boredom, Confusion and Frustration values as a long tensor.
    """

    _LABEL_COLUMNS = ("Engagement", "Boredom", "Confusion", "Frustration")

    def __init__(self, csv_file, video_root, transform=None, num_frames=30):
        self.csv_file = Path(csv_file)
        self.video_root = Path(video_root)
        self.transform = transform
        self.num_frames = num_frames
        # "TrainLabels" -> "Train": the split sub-folder under video_root.
        self.split = self.csv_file.stem.replace("Labels", "").strip()

        self.data = pd.read_csv(self.csv_file, dtype=str)
        self.data.columns = self.data.columns.str.strip()

        cache_file = CACHE_DIR / f"precomputed_{Path(csv_file).stem}_frame_{num_frames}.pkl"
        if cache_file.exists():
            self._load_from_cache(cache_file)
        else:
            self._scan_videos()

    def _load_from_cache(self, cache_file):
        """Restore frame paths and keep only the CSV rows recorded as valid."""
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # load cache files this project produced itself.
        with open(cache_file, "rb") as handle:
            cached = pickle.load(handle)
        kept = cached["valid_indices"]
        self.precomputed_frames = cached["precomputed_frames"]
        self.data = self.data.iloc[kept].reset_index(drop=True)
        print(f"Loaded precomputed frames for {len(self.data)} videos from cache.")

    def _scan_videos(self):
        """Select frames for every usable video; skip missing or short ones."""
        kept_rows = []
        self.precomputed_frames = []
        n_skipped = 0
        for _, record in self.data.iterrows():
            clip = str(record["ClipID"]).strip()
            if clip.endswith(('.avi', '.mp4')):
                clip = clip.rsplit('.', 1)[0]
            folder = self.video_root / self.split / get_csv_clip_id(clip)

            usable = folder.exists() and len(list(folder.glob("frame_*.jpg"))) >= self.num_frames
            if not usable:
                n_skipped += 1
                continue

            chosen = select_impactful_frames(folder, self.num_frames)
            kept_rows.append(record)
            self.precomputed_frames.append(chosen)
        self.data = pd.DataFrame(kept_rows)
        print(f"Computed frames on the fly: Skipped {n_skipped} videos.")

    def _apply_transform(self, image):
        """Run the configured transform, or pass the image through untouched."""
        return self.transform(image) if self.transform else image

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        frames = []
        for path in self.precomputed_frames[idx]:
            try:
                # Attempt to load frame using PIL
                image = Image.open(path).convert("RGB")
            except (FileNotFoundError, OSError):
                # Unreadable frame: substitute a black placeholder image.
                image = Image.new('RGB', (224, 224))
            frames.append(self._apply_transform(image))

        # Pad with black placeholders so every item has exactly num_frames frames.
        for _ in range(max(self.num_frames - len(frames), 0)):
            frames.append(self._apply_transform(Image.new('RGB', (224, 224))))

        frames_tensor = torch.stack(frames)
        labels = torch.tensor(
            [int(record[column]) for column in self._LABEL_COLUMNS],
            dtype=torch.long,
        )
        return frames_tensor, labels

# Define the MobileNetV2-TCN model

This cell defines the MobileNetV2-TCN model. It processes a sequence of frames by applying MobileNetV2 on each frame, stacking the features, and feeding them to a temporal convolution network (TCN).


In [4]:
class MobileNetTCN(nn.Module):
    """MobileNetV2 per-frame encoder followed by a small temporal conv head.

    Args:
        hidden_ch: Channel width of the dilated temporal convolution.
        freeze_block: Number of leading ``mobilenet.features`` blocks whose
            parameters are frozen (``requires_grad = False``).
        out_ch: Number of output logits. Defaults to 16, which the training
            and evaluation code in this notebook reshapes to 4 x 4.

    ``forward`` takes a tensor of shape ``(batch, frames, C, H, W)`` and
    returns ``(batch, out_ch)``.
    """

    def __init__(self, hidden_ch=512, freeze_block=0, out_ch=16):
        super().__init__()
        self.mobilenet = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)
        self.freeze_blocks(freeze_block)
        # Drop the ImageNet classifier so the backbone emits its pooled
        # feature vector (fed to the 1280-channel Conv1d below).
        self.mobilenet.classifier = nn.Identity()

        # Adjust TCN layers using hyperparameter hidden_ch.
        # kernel_size=3 with dilation=2 and padding=2 preserves sequence length.
        self.tcn = nn.Sequential(
            nn.Conv1d(1280, hidden_ch, kernel_size=3, dilation=2, padding=2),
            nn.ReLU(),
            nn.Conv1d(hidden_ch, out_ch, kernel_size=1)
        )

    def freeze_blocks(self, freeze_block):
        """Freeze the first ``freeze_block`` blocks in MobileNetV2 features."""
        if freeze_block > 0:
            # Slicing clamps to the number of available blocks.
            for block in self.mobilenet.features[:freeze_block]:
                for param in block.parameters():
                    param.requires_grad = False

    def forward(self, x):
        batch_size, num_frames, C, H, W = x.size()
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x_reshaped = x.reshape(-1, C, H, W)
        features_reshaped = self.mobilenet(x_reshaped)
        # (batch * frames, feat) -> (batch, feat, frames) as Conv1d expects.
        features = features_reshaped.reshape(batch_size, num_frames, -1).permute(0, 2, 1)
        out = self.tcn(features)
        # NOTE(review): only the last time step is used. With a single
        # kernel-3 / dilation-2 layer that position reads just frames T-3 and
        # T-1 (the third tap is zero padding), so the remaining frames do not
        # influence the prediction. Left unchanged to stay compatible with
        # existing checkpoints; consider temporal pooling - confirm intent.
        out = out[:, :, -1]
        return out

# Define training, checkpointing, and evaluation functions

This cell defines the checkpoint helpers and the training loop. The loop uses mixed precision (`torch.amp`), shows progress with `tqdm`, applies early stopping on the validation loss, and saves checkpoints so that training can be resumed.


In [5]:
def save_checkpoint(model, optimizer, epoch, best_val_loss, checkpoint_path):
    """Write the training state needed to resume to ``checkpoint_path``.

    The saved dict holds the epoch counter, the best validation loss so far,
    and the state dicts of the model and the optimizer.
    """
    torch.save(
        dict(
            epoch=epoch,
            best_val_loss=best_val_loss,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
        ),
        checkpoint_path,
    )

def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state from ``checkpoint_path`` if present.

    Returns ``(epoch, best_val_loss)`` as stored in the checkpoint, or
    ``(0, inf)`` when no file exists at that path (nothing is modified then).
    """
    if not os.path.exists(checkpoint_path):
        return 0, float("inf")

    # Tensors are mapped onto the notebook's active device while loading.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint["epoch"], checkpoint["best_val_loss"]


def train_model(model, train_loader, val_loader, epochs, lr, checkpoint_path, patience=5, gradient_accum_steps=1):
    """Train ``model`` with early stopping, resuming from ``checkpoint_path``.

    The model's 16 outputs are reshaped to 4 heads x 4 classes and the loss is
    the mean cross-entropy over the four heads. Mixed precision (float16
    autocast plus gradient scaling) is used only when the active device is
    CUDA; on CPU training runs in full precision.

    Args:
        model: Network returning ``(batch, 16)`` logits.
        train_loader / val_loader: Loaders yielding ``(frames, labels)`` with
            ``labels`` of shape ``(batch, 4)``.
        epochs: Total number of epochs (training resumes after the epoch
            stored in the checkpoint, if any).
        lr: Adam learning rate.
        checkpoint_path: File used both to resume from and to store the best
            model so far.
        patience: Number of consecutive epochs without validation improvement
            tolerated before stopping.
        gradient_accum_steps: Batches to accumulate before each optimizer
            step; values below 1 are treated as 1.

    Returns:
        The best (lowest) validation loss seen, including the value restored
        from an existing checkpoint.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # AMP only makes sense on CUDA; disabling it elsewhere avoids warnings and
    # keeps the scaler a transparent pass-through.
    use_amp = device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    start_epoch, best_val_loss = load_checkpoint(model, optimizer, checkpoint_path)
    loss_fn = nn.CrossEntropyLoss()
    early_stop_counter = 0
    accum_steps = max(1, int(gradient_accum_steps))
    num_batches = len(train_loader)

    def multi_head_loss(outputs, labels):
        # [batch, 16] -> [batch, 4 dimensions, 4 classes]; average the four
        # per-dimension cross-entropy losses.
        heads = outputs.view(outputs.size(0), 4, 4)
        return sum(loss_fn(heads[:, d], labels[:, d]) for d in range(4)) / 4.0

    # Outer tqdm for epochs
    epoch_pbar = tqdm(
        range(start_epoch, epochs),
        desc="Epochs",
        total=epochs - start_epoch,
        position=0,
        leave=True,
        dynamic_ncols=True
    )

    for epoch in epoch_pbar:
        model.train()
        running_loss = 0.0
        optimizer.zero_grad()

        # Inner tqdm for batch-level progress within each epoch
        train_iter = tqdm(
            enumerate(train_loader),
            total=num_batches,
            desc=f"Epoch {epoch+1} [Train]",
            position=1,
            leave=False,
            dynamic_ncols=True
        )

        for i, (frames, labels) in train_iter:
            frames, labels = frames.to(device), labels.to(device)

            with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
                outputs = model(frames)
                loss = multi_head_loss(outputs, labels)

            # Gradient scaling / accumulation
            scaler.scale(loss / accum_steps).backward()
            # Step on every full accumulation group, and also on the final
            # batch so a trailing partial group is applied instead of leaking
            # its gradients into the next epoch.
            if (i + 1) % accum_steps == 0 or (i + 1) == num_batches:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            running_loss += loss.item() * frames.size(0)

        # Compute average training loss over the entire train set
        train_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for frames, labels in val_loader:
                frames, labels = frames.to(device), labels.to(device)
                with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
                    outputs = model(frames)
                    loss = multi_head_loss(outputs, labels)
                val_loss += loss.item() * frames.size(0)
        val_loss /= len(val_loader.dataset)

        # Update the epoch-level progress bar with train & val losses
        epoch_pbar.set_postfix({"train_loss": f"{train_loss:.4f}", "val_loss": f"{val_loss:.4f}"})

        # Print to console as well if desired
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Early stopping: checkpoint only on improvement, so the file always
        # holds the best weights seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_checkpoint(model, optimizer, epoch + 1, best_val_loss, checkpoint_path)
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best loss: {best_val_loss:.4f}")
            break

    return best_val_loss

# Define the Optuna objective for hyperparameter tuning (using SQLite storage)

This cell defines an Optuna objective that trains the MobileNetV2-TCN model for a few epochs using hyperparameters suggested by the trial. The study itself is created in the Main Execution cell, where it is configured to use an SQLite database (`tuning.db`) so that tuning progress is saved and can be resumed.


In [6]:

def objective(trial):
    """Optuna objective: train one configuration and return its best val loss.

    Samples batch size, learning rate, epoch count, TCN width and the number
    of frozen backbone blocks, trains a fresh ``MobileNetTCN`` and returns the
    lowest validation loss reached.

    Raises:
        Exception: Any training error is logged and re-raised so Optuna
            records the trial as FAILED (the study keeps going when
            ``study.optimize`` is called with ``catch=(Exception,)``).
            Returning ``inf`` instead would store the trial as COMPLETE and
            let a broken configuration be picked as ``best_trial``.
    """
    from torch.utils.data import DataLoader

    num_frames = trial.suggest_categorical("num_frames", [30])
    batch_size = trial.suggest_categorical("batch_size", [8, 16])
    lr = trial.suggest_float("lr", 1e-5, 5e-4, log=True)
    epochs = trial.suggest_int("epochs", 3, 5)
    hidden_ch = trial.suggest_categorical("hidden_ch", [64, 128, 256])
    freeze_block = trial.suggest_int("freeze_block", 0, 4)

    train_dataset = VideoDataset(LABELS_DIR / "TrainLabels.csv", FRAMES_DIR, transform=train_transform, num_frames=num_frames)
    val_dataset = VideoDataset(LABELS_DIR / "ValidationLabels.csv", FRAMES_DIR, transform=val_transform, num_frames=num_frames)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

    model = MobileNetTCN(hidden_ch=hidden_ch, freeze_block=freeze_block)
    trial_checkpoint = MODEL_DIR / f"checkpoint_trial_{trial.number}.pth"
    # Every trial starts from scratch. A leftover file with this trial number
    # (e.g. from an earlier study) would otherwise be resumed by train_model,
    # returning its stale best loss or failing on mismatched layer shapes.
    if trial_checkpoint.exists():
        trial_checkpoint.unlink()

    try:
        return train_model(model, train_loader, val_loader, epochs, lr, trial_checkpoint, patience=3)
    except Exception as e:
        print(f"Trial {trial.number} failed: {e}")
        raise


# Evaluate and visualize results

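This cell defines `evaluate_model`, which runs a trained model on a data loader and, for each of the four affective dimensions, prints a classification report and plots a confusion matrix. It is called on the test set in the Main Execution cell.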


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(model, test_loader):
    """Print a classification report and plot a confusion matrix per dimension.

    The model's 16 logits are reshaped to 4 dimensions x 4 classes and the
    arg-max class is taken for each dimension (Engagement, Boredom, Confusion,
    Frustration).

    Args:
        model: Network returning ``(batch, 16)`` logits.
        test_loader: Loader yielding ``(frames, labels)`` with ``labels`` of
            shape ``(batch, 4)``.
    """
    # Make sure the model lives on the same device the frames are moved to.
    model.to(device)
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for frames, labels in tqdm(test_loader, desc="Evaluating", dynamic_ncols=True):
            frames = frames.to(device)
            outputs = model(frames)
            outputs_reshaped = outputs.view(outputs.size(0), 4, 4)
            preds = torch.argmax(outputs_reshaped, dim=2)
            all_preds.append(preds.cpu())
            all_labels.append(labels)

    if not all_preds:
        # torch.cat on an empty list raises; report the situation instead.
        print("No samples to evaluate.")
        return

    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    dims = ["Engagement", "Boredom", "Confusion", "Frustration"]
    # Fix the class set explicitly: without it the confusion matrix shrinks
    # when a class is absent and the 0..k-1 tick labels no longer match the
    # real class ids.
    class_ids = list(range(4))
    for i, dim in enumerate(dims):
        print(f"Classification report for {dim}:")
        print(classification_report(
            all_labels[:, i], all_preds[:, i],
            labels=class_ids, digits=3, zero_division=0,
        ))

        cm = confusion_matrix(all_labels[:, i], all_preds[:, i], labels=class_ids)
        fig, ax = plt.subplots(figsize=(6, 5))
        image = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
        ax.set_title(f"Confusion Matrix for {dim}")
        fig.colorbar(image, ax=ax)
        ax.set_xticks(class_ids)
        ax.set_xticklabels(class_ids)
        ax.set_yticks(class_ids)
        ax.set_yticklabels(class_ids)
        ax.set_xlabel("Predicted label")
        ax.set_ylabel("True label")
        # Switch the text colour on dark cells so counts stay readable.
        thresh = cm.max() / 2.
        for j in range(cm.shape[0]):
            for k in range(cm.shape[1]):
                ax.text(k, j, format(cm[j, k], 'd'),
                        horizontalalignment="center",
                        color="white" if cm[j, k] > thresh else "black")
        fig.tight_layout()
        plt.show()
        # Release the figure; four figures per call otherwise accumulate.
        plt.close(fig)

## Main Execution


In [8]:
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    # Step 1: (Optional) Precompute and cache best frames
    train_csv = LABELS_DIR / "TrainLabels.csv"
    val_csv = LABELS_DIR / "ValidationLabels.csv"

    cache_file_train = CACHE_DIR / f"precomputed_{Path(train_csv).stem}_frame_30.pkl"
    if not cache_file_train.exists():
        print("Precomputing best frames for training data...")
        precompute_best_frames(train_csv, FRAMES_DIR, num_frames=30)

    cache_file_val = CACHE_DIR / f"precomputed_{Path(val_csv).stem}_frame_30.pkl"
    if not cache_file_val.exists():
        print("Precomputing best frames for validation data...")
        precompute_best_frames(val_csv, FRAMES_DIR, num_frames=30)

    # ----------------------
    # Step 2: Run Optuna tuning with early stopping
    # ----------------------
    # NOTE(review): RetryFailedTrialCallback is documented as acting on stale
    # trials detected via heartbeats; no heartbeat_interval is set here, so it
    # presumably never fires - confirm whether that is intended.
    # NOTE(review): the recorded run of this cell stopped with a LookupError
    # about the 'trialvaluetype' enum while reading tuning.db. That looks like
    # a database written with a different Optuna schema - verify, and consider
    # `optuna storage upgrade --storage sqlite:///tuning.db` or a fresh file.
    storage = optuna.storages.RDBStorage(
        url="sqlite:///tuning.db",
        failed_trial_callback=optuna.storages.RetryFailedTrialCallback(max_retry=6)
    )
    n_trials = 10  # total trial budget for the study, across resumed runs
    study = optuna.create_study(
        direction="minimize",
        study_name="mobilev2_tcn_study",
        storage=storage,
        load_if_exists=True
    )
    print("Starting Optuna hyperparameter tuning...")

    # Count how many trials are already done (completed/failed/pruned)
    finished_states = {
        optuna.trial.TrialState.COMPLETE,
        optuna.trial.TrialState.FAIL,
        optuna.trial.TrialState.PRUNED,
    }
    completed_trials = sum(t.state in finished_states for t in study.trials)
    # study.optimize(n_trials=...) runs that many *additional* trials, so only
    # request what is left of the budget when resuming an existing study.
    remaining_trials = max(0, n_trials - completed_trials)

    # Initialize the progress bar with 'initial' = already completed
    pbar = tqdm(
        total=n_trials,
        desc="Optuna Trials",
        unit="trial",
        dynamic_ncols=True,
        initial=min(completed_trials, n_trials),
    )

    def update(study, trial):
        pbar.update()

    if remaining_trials > 0:
        study.optimize(objective, n_trials=remaining_trials, catch=(Exception,), callbacks=[update])
    pbar.close()
    print(f"Optuna tuning complete.\nBest trial: {study.best_trial}")

    # ----------------------
    # Step 3: Final training with best hyperparameters and early stopping.
    # ----------------------
    best_trial = study.best_trial
    num_frames = best_trial.params["num_frames"]
    batch_size = best_trial.params["batch_size"]
    lr = best_trial.params["lr"]
    epochs = best_trial.params["epochs"]
    hidden_ch = best_trial.params["hidden_ch"]
    freeze_block = best_trial.params["freeze_block"]

    # Use num_workers=0 to avoid Windows spawn delays.
    train_dataset = VideoDataset(train_csv, FRAMES_DIR, transform=train_transform, num_frames=num_frames)
    val_dataset = VideoDataset(val_csv, FRAMES_DIR, transform=val_transform, num_frames=num_frames)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

    final_model = MobileNetTCN(hidden_ch=hidden_ch, freeze_block=freeze_block)
    final_checkpoint = MODEL_DIR / "final_model_checkpoint.pth"
    patience = 5  # Set patience for final training
    print("Starting final training with best hyperparameters...")
    train_model(final_model, train_loader, val_loader, epochs, lr, final_checkpoint, patience=patience)

    # train_model leaves the last-epoch weights in memory, while the
    # checkpoint holds the weights with the best validation loss. Evaluate
    # the latter.
    if final_checkpoint.exists():
        best_state = torch.load(final_checkpoint, map_location=device)
        final_model.load_state_dict(best_state["model_state_dict"])

    # ----------------------
    # Step 4: Evaluate final model on test data.
    # ----------------------
    test_csv = LABELS_DIR / "TestLabels.csv"  # Update if needed
    test_dataset = VideoDataset(test_csv, FRAMES_DIR, transform=val_transform, num_frames=num_frames)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    print("Evaluating final model...")
    evaluate_model(final_model, test_loader)

  failed_trial_callback=optuna.storages.RetryFailedTrialCallback(max_retry=6)
[I 2025-02-17 22:35:06,097] Using an existing study with name 'mobilev2_tcn_study' instead of creating a new one.


Starting Optuna hyperparameter tuning...


LookupError: 'float' is not among the defined enum values. Enum name: trialvaluetype. Possible values: FINITE, INF_POS, INF_NEG