### **Import Required Libraries**


In [1]:
import os
import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import logging
import pandas as pd
from pathlib import Path
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights
from sklearn.metrics import classification_report
from PIL import Image
from torch.amp import GradScaler, autocast

# 1) Logging
# Log to a file and to the notebook output. Handlers are attached only when the
# logger has none yet, so re-running this cell in a live kernel does not stack
# duplicate handlers (which would repeat every log line once per re-run).
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.FileHandler('training_classification.log')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # The stream handler keeps the default (unformatted) output, as before.
    logger.addHandler(logging.StreamHandler())

print("Torch version:", torch.__version__)

Torch version: 2.6.0+cu126


In [2]:
# paths
# The defaults are the original local DAiSEE locations. Set the environment
# variables DAISEE_DATASET_ROOT / DAISEE_FRAMES_ROOT to run the notebook on
# another machine without editing this cell.
DATASET_ROOT = Path(os.environ.get(
    "DAISEE_DATASET_ROOT",
    "C:/Users/abhis/Downloads/Documents/Learner Engagement Project/data/DAiSEE/DataSet",
)).resolve()
FRAMES_ROOT = Path(os.environ.get(
    "DAISEE_FRAMES_ROOT",
    "C:/Users/abhis/Downloads/Documents/Learner Engagement Project/data/DAiSEE/ExtractedFrames",
)).resolve()

#Helper function
def get_csv_clip_id(video_stem: str) -> str:
    """
    Translate a clip stem from the label CSV into the name used on disk.

    Surrounding whitespace is removed. A stem that begins with the old prefix
    "110001" has that leading prefix swapped for "202614"; any other stem is
    returned unchanged (apart from the stripping).
    """
    old_prefix, new_prefix = "110001", "202614"
    stem = video_stem.strip()
    if not stem.startswith(old_prefix):
        return stem
    # Only the leading occurrence is rewritten; later repeats stay as they are.
    return new_prefix + stem[len(old_prefix):]

# numerical sort key for frames
import re
def numeric_sort_key(path):
    """
    Sort key that orders frame files by the integer in their file name.

    `path` must expose a `.name` attribute (e.g. a pathlib.Path). Names
    containing `frame_<digits>.jpg` map to that integer; anything else maps to
    999999 so it sorts after the numbered frames.
    """
    found = re.search(r'frame_(\d+)\.jpg', path.name)
    return int(found.group(1)) if found else 999999

### Define the DAiSEEDataset Class

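This class loads a fixed-length sequence of extracted frames for each video clip and pairs it with that clip's four labels (Boredom, Engagement, Confusion, Frustration).

- **Features**: Reads pre-extracted JPEG frames (`frame_*.jpg`) from disk instead of decoding video during training.
- **Error Handling**: Skips clips whose frame directory is missing or contains fewer than `seq_len` frames, and reports label rows that fail to parse.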


In [3]:
class DAiSEEDataset(Dataset):
    """
    Frame-sequence dataset built from a DAiSEE label CSV and a tree of
    extracted frames laid out as ``root/<split>/<clip_id>/frame_*.jpg``.

    Each item is a tuple ``(sequence, labels)``:
      * ``sequence``: tensor of the first ``seq_len`` frames of the clip in
        numeric frame order, stacked along dim 0.
      * ``labels``: ``torch.long`` tensor of length 4 holding the CSV values
        for Boredom, Engagement, Confusion and Frustration (in that order).

    NOTE(review): ``transform`` is applied to every frame separately, so a
    random transform draws its randomness independently per frame rather than
    once per clip.
    """

    def __init__(self, root, csv_path, transform=None, seq_len=15):
        """
        Args:
            root: directory that contains one sub-directory per split.
            csv_path: label CSV; the split name is the file stem with
                "Labels" removed (e.g. "TrainLabels.csv" -> "Train").
            transform: optional callable applied to each PIL frame; when it is
                None, frames are converted with ``transforms.ToTensor()``.
            seq_len: number of frames taken from each clip. Clips with fewer
                frames are skipped.

        Raises:
            ValueError: if no clip from the CSV is usable.
        """
        self.root = Path(root)
        self.transform = transform
        self.seq_len = seq_len
        self.video_paths = []
        self.labels = []
        self.missing_videos = 0   # clips skipped: no folder or too few frames
        self.total_videos = 0     # rows seen in the CSV
        # Built once here instead of once per frame in __getitem__.
        self._to_tensor = transforms.ToTensor()

        log = logging.getLogger(__name__)

        df = pd.read_csv(csv_path, dtype=str)
        # Header names may carry stray whitespace; normalise before lookup.
        df.columns = df.columns.str.strip()
        split = Path(csv_path).stem.replace("Labels", "").strip()

        for idx, row in df.iterrows():
            self.total_videos += 1
            try:
                clip_id = row['ClipID'].strip()
                filename = clip_id.split('/')[-1] if '/' in clip_id else clip_id
                video_stem = filename.rsplit('.', 1)[0]
                mapped_id = get_csv_clip_id(video_stem)

                video_dir = self.root / split / mapped_id
                if not video_dir.exists():
                    self.missing_videos += 1
                    continue

                frames = list(video_dir.glob('frame_*.jpg'))
                if len(frames) < self.seq_len:
                    self.missing_videos += 1
                    continue

                # Labels are read as strings (dtype=str); convert to ints.
                boredom = int(row['Boredom'])
                engage = int(row['Engagement'])
                confuse = int(row['Confusion'])
                frustrate = int(row['Frustration'])

                self.video_paths.append(video_dir)
                self.labels.append([boredom, engage, confuse, frustrate])
            except Exception as e:
                # A malformed row must not abort dataset construction; record
                # it through the logger so it also reaches the log file.
                log.warning(f"Error processing row {idx}: {e}")

        if self.missing_videos:
            log.info(
                f"{split}: skipped {self.missing_videos} of {self.total_videos} clips "
                f"(missing folder or fewer than {self.seq_len} frames)."
            )

        if not self.video_paths:
            raise ValueError("No valid videos found for classification dataset.")

    def __len__(self):
        """Number of usable clips."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Load the frames and labels of clip ``idx`` (see class docstring)."""
        video_dir = self.video_paths[idx]
        label_list = self.labels[idx]

        # numeric sort frames, then slice up to seq_len
        frames_list = sorted(video_dir.glob('frame_*.jpg'), key=numeric_sort_key)[:self.seq_len]

        to_tensor = self.transform if self.transform else self._to_tensor
        frame_tensors = []
        for path in frames_list:
            # The context manager closes the underlying file right away;
            # convert() returns a fully loaded copy that outlives it.
            with Image.open(path) as handle:
                img = handle.convert("RGB")
            frame_tensors.append(to_tensor(img))

        sequence = torch.stack(frame_tensors)
        label_tensor = torch.tensor(label_list, dtype=torch.long)

        return sequence, label_tensor

### **Define the CNN-LSTM Model**

**CNN-LSTM Model**:

1.  ResNet50 extracts features from each frame.
2.  LSTM processes temporal dependencies in sequences.
3.  Outputs 16 classification logits: four classes for each of the four affective states (boredom, engagement, confusion, frustration).


In [4]:
class CNN_LSTM_Classification(nn.Module):
    """
    ResNet-50 frame encoder followed by a single-layer LSTM and a linear head.

    The head emits 16 logits per clip, meant to be read as 4 states x 4
    classes (callers reshape them to ``[batch, 4, 4]``).
    """

    def __init__(self, freeze_until='layer3'):
        """
        Args:
            freeze_until: name of the first ResNet child module to make
                trainable. Every child registered before it stays frozen; the
                named child and every child registered after it are unfrozen.
                If the name matches no child, the whole backbone stays frozen
                and a warning is logged.
        """
        super().__init__()

        # Use official weights to avoid deprecation warnings
        self.resnet = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

        # Freeze the whole backbone first ...
        for param in self.resnet.parameters():
            param.requires_grad = False

        # ... then re-enable gradients from `freeze_until` onward.
        unfreeze = False
        for name, child in self.resnet.named_children():
            if name == freeze_until:
                unfreeze = True
            if unfreeze:
                for param in child.parameters():
                    param.requires_grad = True

        if not unfreeze:
            # Previously a typo in the layer name went unnoticed and silently
            # disabled fine-tuning of the backbone.
            logging.getLogger(__name__).warning(
                f"freeze_until={freeze_until!r} matches no ResNet child module; "
                "the whole backbone stays frozen."
            )

        # The ResNet fc layer is skipped in forward(); the pooled 2048-d
        # features go to the LSTM and the custom head instead.
        self.lstm_hidden = 512
        self.lstm = nn.LSTM(2048, self.lstm_hidden, batch_first=True)
        # 16 logits: 4 states × 4 classes each
        self.fc = nn.Linear(self.lstm_hidden, 16)

    def forward(self, x):
        """
        Args:
            x: tensor of shape ``[batch_size, seq_len, C, H, W]``.

        Returns:
            Logits of shape ``[batch_size, 16]``.
        """
        bsz, seq_len, c, h, w = x.shape
        # Fold time into the batch so the CNN sees plain images. reshape()
        # (unlike view()) also accepts non-contiguous inputs.
        x = x.reshape(bsz * seq_len, c, h, w)

        # forward pass up to avgpool (the ResNet fc layer is deliberately skipped)
        x = self.resnet.conv1(x)
        x = self.resnet.bn1(x)
        x = self.resnet.relu(x)
        x = self.resnet.maxpool(x)

        x = self.resnet.layer1(x)
        x = self.resnet.layer2(x)
        x = self.resnet.layer3(x)
        x = self.resnet.layer4(x)

        x = self.resnet.avgpool(x)   # [bsz*seq_len, 2048, 1, 1]
        x = torch.flatten(x, 1)      # [bsz*seq_len, 2048]

        # restore the time axis for the LSTM
        x = x.reshape(bsz, seq_len, -1)  # [bsz, seq_len, 2048]
        lstm_out, _ = self.lstm(x)
        # the last time step summarises the clip
        last_step = lstm_out[:, -1, :]   # [bsz, hidden]
        logits = self.fc(last_step)      # [bsz, 16]
        return logits

In [5]:
# Multi-CrossEntropy for 4 states
def multi_ce_loss(logits, labels):
    """
    Mean cross-entropy over the four affective states.

    logits: [batch_size, 16], read as 4 consecutive groups of 4 class scores
    labels: [batch_size, 4], one class index (0..3) per state

    A separate CrossEntropyLoss is evaluated per state and the four results
    are averaged into one scalar.
    """
    criterion = nn.CrossEntropyLoss()
    # [bsz, 16] -> [bsz, state, class]
    per_state = logits.view(logits.size(0), 4, 4)

    state_losses = [
        criterion(per_state[:, state, :], labels[:, state])
        for state in range(4)
    ]
    return sum(state_losses, 0.0) / 4.0

### **Data Loaders**

**Configures the train, validation, and test loaders: frames are resized to 224×224 and normalized, and random horizontal flips are applied to the training split only.**


In [6]:
def get_classification_dataloaders(batch_size=8, seq_len=15, num_workers=0):
    """
    Build the train / validation / test DataLoaders.

    Args:
        batch_size: clips per batch for all three loaders.
        seq_len: number of frames loaded from each clip folder.
        num_workers: DataLoader worker processes (default 0 = load in the
            main process, the original behaviour).
            NOTE(review): worker processes may not be able to import classes
            defined inside a notebook on some platforms — confirm before
            raising this.

    Returns:
        (train_loader, val_loader, test_loader). Only the training loader
        shuffles and applies random horizontal flips.
    """
    # Preprocessing shared by every split: fixed 224x224 input and the channel
    # mean/std commonly used with ImageNet-pretrained torchvision models.
    resize = transforms.Resize((224, 224))
    normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])

    train_transform = transforms.Compose([
        resize,
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        resize,
        transforms.ToTensor(),
        normalize,
    ])

    labels_path = DATASET_ROOT / "Labels"

    train_ds = DAiSEEDataset(FRAMES_ROOT, labels_path / "TrainLabels.csv", transform=train_transform, seq_len=seq_len)
    val_ds   = DAiSEEDataset(FRAMES_ROOT, labels_path / "ValidationLabels.csv", transform=val_transform, seq_len=seq_len)
    test_ds  = DAiSEEDataset(FRAMES_ROOT, labels_path / "TestLabels.csv", transform=val_transform, seq_len=seq_len)

    # Pinned host memory only helps host->GPU copies; requesting it without
    # CUDA just triggers a warning.
    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )
    train_loader = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
    val_loader   = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
    test_loader  = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

    return train_loader, val_loader, test_loader


### Training Loop with Optimizations

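- **Mixed Precision**: Uses automatic mixed precision (autocast with gradient scaling) for faster training.
- **Checkpointing**: Saves a checkpoint whenever the validation loss improves, and stops early once it has not improved for `early_stopping_patience` consecutive epochs.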


In [7]:
#checkpointing
def save_checkpoint(model, optimizer, epoch, best_val_loss, directory="models_class"):
    """
    Write a training checkpoint into a fresh timestamped sub-folder.

    The file lands at
    ``<directory>/<YYYYmmdd-HHMMSS>/ResNet50_CNNLSTM_classification.pth`` and
    stores the epoch, the best validation loss so far, and the model and
    optimizer state dicts. Missing folders are created.
    """
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    checkpoint_path = Path(directory) / run_stamp / "ResNet50_CNNLSTM_classification.pth"
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        'epoch': epoch,
        'best_val_loss': best_val_loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(payload, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

def load_latest_checkpoint(model, optimizer, model_dir="models_class", filename="ResNet50_CNNLSTM_classification.pth",
                           device=None):
    """
    Restore model and optimizer from the most recently modified checkpoint.

    `model_dir` is searched recursively for files called `filename`; the one
    with the newest modification time is loaded onto `device` (CUDA when
    available if `device` is None). Model weights are loaded non-strictly.

    Returns:
        (start_epoch, best_val_loss): the stored epoch + 1 and the stored best
        validation loss, or (0, inf) when no checkpoint exists.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def modified_at(candidate):
        return candidate.stat().st_mtime

    candidates = list(Path(model_dir).rglob(filename))
    if len(candidates) == 0:
        print("No checkpoint found. Starting from scratch.")
        return 0, float('inf')

    newest = max(candidates, key=modified_at)
    print(f"Loading checkpoint from {newest}")
    state = torch.load(newest, map_location=device)

    model.load_state_dict(state['model_state_dict'], strict=False)
    optimizer.load_state_dict(state['optimizer_state_dict'])

    # Resume with the epoch after the one that was saved.
    resume_epoch = state.get('epoch', 0) + 1
    best_so_far = state.get('best_val_loss', float('inf'))

    print(f"Resuming training from epoch {resume_epoch}")
    return resume_epoch, best_so_far


# Training Loop (Classification)
def train_classification_model(model, train_loader, val_loader,
                               epochs=10, lr=1e-4, early_stopping_patience=2):
    """
    Train `model` with Adam, ReduceLROnPlateau, checkpointing and early stopping.

    Args:
        model: network returning [batch, 16] logits.
        train_loader / val_loader: loaders yielding (inputs, labels) batches.
        epochs: maximum number of epochs.
        lr: initial Adam learning rate.
        early_stopping_patience: stop after this many consecutive epochs
            without a new best validation loss.

    Side effects:
        Moves `model` to the selected device, logs one line per epoch, and
        saves a checkpoint under "models_class" whenever the validation loss
        improves.

    Mixed precision (autocast + gradient scaling) is used only when running on
    CUDA; on CPU both are disabled instead of being requested for a device
    that is not present.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    use_amp = device.type == 'cuda'
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scaler = GradScaler(device.type, enabled=use_amp)

    start_epoch = 0
    best_val_loss = float('inf')
    patience_counter = 0

    model_save_dir = "models_class"
    # Reduce the learning rate after one epoch without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1)

    for epoch in range(start_epoch, epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} Train"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            with autocast(device_type=device.type, enabled=use_amp):
                logits = model(inputs)                  # [batch, 16]
                loss = multi_ce_loss(logits, labels)    # mean CE over the 4 states

            # With the scaler disabled these calls reduce to plain
            # backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # Weight by batch size so a smaller final batch is averaged correctly.
            running_loss += loss.item() * inputs.size(0)

        train_loss = running_loss / len(train_loader.dataset)

        # validation
        model.eval()
        val_running_loss = 0.0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} Val"):
                inputs, labels = inputs.to(device), labels.to(device)
                with autocast(device_type=device.type, enabled=use_amp):
                    logits = model(inputs)
                    loss = multi_ce_loss(logits, labels)
                val_running_loss += loss.item() * inputs.size(0)

        val_loss = val_running_loss / len(val_loader.dataset)
        logger.info(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        scheduler.step(val_loss)

        # checkpoint / early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            save_checkpoint(model, optimizer, epoch, best_val_loss, model_save_dir)
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                logger.info("Early stopping triggered.")
                print("Early stopping triggered; training stopped.")
                break

    print("Training finished.")

**Model Evaluation:** After training, use the evaluation function to compute additional metrics:


In [8]:
def evaluate_classification(model, test_loader):
    """
    Predict a class per state for every clip and print one
    classification_report per state.

    Args:
        model: network returning [batch, 16] logits (4 states x 4 classes).
        test_loader: loader yielding (inputs, labels) with labels [batch, 4].

    Returns:
        (all_preds, all_labels): integer NumPy arrays of shape [N, 4] with
        columns ordered Boredom, Engagement, Confusion, Frustration.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Same precision policy as the validation pass of the training loop:
    # autocast on CUDA, full precision otherwise.
    use_amp = device.type == 'cuda'
    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating"):
            inputs = inputs.to(device)
            with autocast(device_type=device.type, enabled=use_amp):
                logits = model(inputs)  # [batch, 16]
            # [batch, 16] -> [batch, state, class]
            logits_reshaped = logits.reshape(-1, 4, 4)

            # arg-max over the class axis gives one prediction per state
            dimension_preds = torch.argmax(logits_reshaped, dim=2)  # [batch, 4]
            all_preds.append(dimension_preds.cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds, dim=0).numpy()    # shape [N, 4]
    all_labels = torch.cat(all_labels, dim=0).numpy()  # shape [N, 4]

    dimension_names = ["Boredom", "Engagement", "Confusion", "Frustration"]

    for d in range(4):
        print(f"\nDimension: {dimension_names[d]}")
        # zero_division=0 reports never-predicted classes as 0.0 (the same
        # numbers as before) without an UndefinedMetricWarning per class.
        print(classification_report(all_labels[:, d], all_preds[:, d],
              labels=[0,1,2,3],
              digits=3,
              zero_division=0))

    return all_preds, all_labels


#### Main Execution for CNN_LSTM Model


In [9]:
# 1. Get Dataloaders
train_loader, val_loader, test_loader = get_classification_dataloaders(
    batch_size=16,  # lower this if the GPU runs out of memory
    seq_len=15      # number of frames loaded per clip
)

# 2. Instantiate Model
# freeze_until names the first ResNet block that is fine-tuned
# ('layer1' / 'layer2' / 'layer3'): an earlier block means more trainable layers.
model = CNN_LSTM_Classification(freeze_until='layer2')

# 3. Train
train_classification_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=15,
    lr=5e-5,              # adjust as needed
    early_stopping_patience=2
)

# 4. Evaluate Best Checkpoint
# Re-instantiate the same architecture with the same configuration as training.
model_eval = CNN_LSTM_Classification(freeze_until='layer2')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_eval.to(device)

# Checkpoints are only written when the validation loss improves, so the most
# recently modified file is the best model of the latest run.
model_dir = Path("models_class")
cpts = list(model_dir.rglob("ResNet50_CNNLSTM_classification.pth"))
if not cpts:
    # Without this guard the report below would silently describe a model
    # whose LSTM and head were never trained.
    raise FileNotFoundError(f"No checkpoint found under {model_dir.resolve()}; nothing to evaluate.")

best_ckpt = max(cpts, key=lambda p: p.stat().st_mtime)
print(f"Loading best checkpoint for evaluation: {best_ckpt}")
cpoint = torch.load(best_ckpt, map_location=device)
# Strict loading: a missing or unexpected key is an error, not something to ignore.
model_eval.load_state_dict(cpoint['model_state_dict'])

# classification evaluation: prints one classification_report per dimension
all_preds, all_labels = evaluate_classification(model_eval, test_loader)


Epoch 1 Train: 100%|██████████| 304/304 [16:37<00:00,  3.28s/it]
Epoch 1 Val: 100%|██████████| 90/90 [01:07<00:00,  1.33it/s]
Epoch 1/15 | Train Loss: 0.8531 | Val Loss: 0.9780


Checkpoint saved to models_class\20250213-140249\ResNet50_CNNLSTM_classification.pth


Epoch 2 Train: 100%|██████████| 304/304 [13:54<00:00,  2.74s/it]
Epoch 2 Val: 100%|██████████| 90/90 [00:49<00:00,  1.83it/s]
Epoch 2/15 | Train Loss: 0.8068 | Val Loss: 0.9567


Checkpoint saved to models_class\20250213-141733\ResNet50_CNNLSTM_classification.pth


Epoch 3 Train: 100%|██████████| 304/304 [13:16<00:00,  2.62s/it]
Epoch 3 Val: 100%|██████████| 90/90 [00:57<00:00,  1.56it/s]
Epoch 3/15 | Train Loss: 0.7904 | Val Loss: 0.9583
Epoch 4 Train: 100%|██████████| 304/304 [15:53<00:00,  3.14s/it]
Epoch 4 Val: 100%|██████████| 90/90 [00:57<00:00,  1.55it/s]
Epoch 4/15 | Train Loss: 0.7716 | Val Loss: 0.9696
Early stopping triggered.


Early stopping triggered; training stopped.
Training finished.
Loading best checkpoint for evaluation: models_class\20250213-141733\ResNet50_CNNLSTM_classification.pth


Evaluating: 100%|██████████| 103/103 [10:16<00:00,  5.98s/it]


Dimension: Boredom
              precision    recall  f1-score   support

           0      0.570     0.560     0.565       747
           1      0.395     0.495     0.440       519
           2      0.263     0.200     0.227       335
           3      0.000     0.000     0.000        37

    accuracy                          0.453      1638
   macro avg      0.307     0.314     0.308      1638
weighted avg      0.439     0.453     0.443      1638


Dimension: Engagement
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         4
           1      0.000     0.000     0.000        81
           2      0.548     0.678     0.606       849
           3      0.500     0.416     0.454       704

    accuracy                          0.531      1638
   macro avg      0.262     0.274     0.265      1638
weighted avg      0.499     0.531     0.509      1638


Dimension: Confusion
              precision    recall  f1-score   support

          


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
