In [None]:
!unzip video.zip

Archive:  video.zip
   creating: video/
  inflating: video/CMU_MOSI_Opinion_Labels.csd  
  inflating: video/CMU_MOSI_Visual_Facet_42.csd  


In [None]:
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

# ============================= GPU SETUP =============================
# Pick the compute device once; the rest of the cell refers to this global.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Report the name of CUDA device 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# =========================== LOAD + CHUNK VISUAL FEATURES (FINAL FIXED) ===========================
def _segment_label(raw):
    """Collapse one raw label array to a single float score.

    Arrays whose last axis has 7 entries are treated as weights over the
    scores -3..3 and reduced with a dot product; anything else is read as a
    plain scalar. In both cases only the first row/entry is used.
    """
    raw = np.asarray(raw)
    if raw.ndim > 0 and raw.shape[-1] == 7:
        # reshape(-1, 7)[0] keeps the single-row case unchanged and avoids a
        # shape error in np.dot when several rows are present.
        return float(np.dot(raw.reshape(-1, 7)[0], np.arange(-3, 4)))
    return float(raw.flatten()[0])


def load_and_chunk_visual(visual_path, label_path, chunk_len=200, stride=100, verbose=True):
    """Load per-segment visual features and cut them into labelled windows.

    Every key under ``FACET_4.2/data`` in ``visual_path`` is read, NaNs are
    replaced by 0 and the array is cast to float32. Each segment is then
    sliced into windows of up to ``chunk_len`` frames whose start indices
    advance by ``stride``; a segment shorter than ``chunk_len`` yields one
    window holding all of its frames. Every window receives the label read
    from the same key under ``Opinion Segment Labels/data`` in ``label_path``.

    Chunks and labels are appended together, one pair at a time, so the two
    lists cannot drift out of alignment.

    NOTE(review): only the first label row of each key is used and it is
    applied to every window of that key. The printed shapes suggest a key may
    hold thousands of frames; if the label file stores several labelled
    spans per key, all but the first are ignored here -- confirm against the
    dataset layout.

    Args:
        visual_path: Path to the HDF5 file with the visual features.
        label_path: Path to the HDF5 file with the labels.
        chunk_len: Maximum number of frames per window (must be > 0).
        stride: Step between consecutive window starts (must be > 0).
        verbose: When True (default) print the shape of every segment.

    Returns:
        ``(chunks, labels)``: a list of float32 arrays and a list of floats
        of equal length.

    Raises:
        ValueError: If ``chunk_len`` or ``stride`` is not positive, or if no
            window could be produced.
        KeyError: If an expected group, or the label entry for a segment, is
            missing from the files.
    """
    if chunk_len <= 0 or stride <= 0:
        raise ValueError(
            f"chunk_len and stride must be positive, got {chunk_len} and {stride}"
        )

    chunks = []
    labels = []

    with h5py.File(visual_path, 'r') as visual_file, h5py.File(label_path, 'r') as label_file:
        print("Visual top-level keys:", list(visual_file.keys()))
        data_group = visual_file['FACET_4.2']['data']
        label_group = label_file['Opinion Segment Labels']['data']
        segment_ids = sorted(data_group.keys())

        for seg_id in segment_ids:
            feats = data_group[seg_id]['features'][:]
            feats = np.nan_to_num(feats, nan=0.0).astype(np.float32)
            if verbose:
                print(f"Segment {seg_id}: shape {feats.shape}")

            n_frames = len(feats)
            if n_frames == 0:
                # An empty window would reach the model as a zero-length
                # sequence, which packed-sequence utilities reject.
                print(f"Skipping segment {seg_id}: no frames")
                continue

            label = _segment_label(label_group[seg_id]['features'][:])

            # max(1, ...) guarantees one (short) window for segments that are
            # shorter than chunk_len.
            for start in range(0, max(1, n_frames - chunk_len + 1), stride):
                chunks.append(feats[start:start + chunk_len])
                labels.append(label)

    print(f"Generated {len(chunks)} visual chunks from {len(segment_ids)} segments")

    if not chunks:
        raise ValueError(f"No visual chunks could be generated from {visual_path!r}")

    assert len(chunks) == len(labels)
    print(f"Final visual dataset: {len(chunks)} chunks, labels [{min(labels):.2f}, {max(labels):.2f}]")
    print("-" * 70)
    return chunks, labels


# =========================== DATASET (35-dim FACET 4.2) ===========================
class VisualDataset(Dataset):
    """Map-style dataset that pairs each feature chunk with its label."""

    def __init__(self, chunks, labels):
        # Both sequences are stored as given; item i of one belongs to item i
        # of the other.
        self.chunks = chunks
        self.labels = labels

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, i):
        """Return ``(features, target)`` for sample ``i`` as tensors.

        The features are wrapped with ``torch.from_numpy`` (no copy); the
        target becomes a 0-dim float32 tensor.
        """
        features = torch.from_numpy(self.chunks[i])
        target = torch.tensor(self.labels[i], dtype=torch.float32)
        return features, target

def collate_fn(batch):
    """Zero-pad a list of ``(sequence, label)`` pairs into batch tensors.

    Returns:
        padded:  float32 tensor ``(batch, longest_sequence, feature_dim)``
                 with zeros after the end of each shorter sequence.
        labels:  the stacked labels with a trailing axis of size 1.
        lengths: integer tensor holding each sequence's original length.
    """
    sequences, targets = zip(*batch)
    lengths = torch.tensor([seq.shape[0] for seq in sequences])

    # The feature width is taken from the first sequence of the batch.
    padded = torch.zeros(
        len(sequences), int(lengths.max()), sequences[0].shape[1], dtype=torch.float32
    )
    # Each `row` is a view into `padded`, so the assignment fills it in place.
    for row, seq in zip(padded, sequences):
        row[: seq.shape[0]] = seq

    return padded, torch.stack(targets).unsqueeze(1), lengths


# =========================== MODEL (35 → 128 → bi-LSTM) ===========================
class VisualSentimentLSTM(nn.Module):
    """Two-layer bidirectional LSTM that regresses one score per sequence.

    The final hidden states of the last two LSTM slots are concatenated,
    layer-normalised and passed through a small MLP with dropout that ends in
    a single output unit.
    """

    def __init__(self, input_dim=35):
        super().__init__()
        hidden = 128
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=2,
            dropout=0.4,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward states are concatenated, hence 2 * hidden.
        self.norm = nn.LayerNorm(2 * hidden)
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(2 * hidden, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x, lengths):
        """Score a padded batch.

        Args:
            x: Padded batch laid out batch-first.
            lengths: True length of every sequence in ``x``; moved to CPU
                before packing.

        Returns:
            Tensor with one row per sequence and a single column.
        """
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # The last two entries of h_n belong to the top layer (one per direction).
        summary = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.head(self.norm(summary))


# =========================== TRAINING ===========================
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-sample loss.

    With an ``optimizer`` the model is put in training mode and updated
    (gradients clipped to norm 1.0); without one it is evaluated with
    gradients disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0.0
    n_samples = 0
    with torch.set_grad_enabled(training):
        for x, y, lengths in loader:
            # `lengths` stays on the CPU: the model only needs it there for
            # packing, so a device round-trip would be wasted work.
            x, y = x.to(device), y.to(device)
            if training:
                optimizer.zero_grad()
            pred = model(x, lengths)
            loss = criterion(pred, y)
            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            # Weight by batch size so a short final batch does not count as
            # much as a full one.
            batch_size = y.size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size
    return total_loss / max(n_samples, 1)


def train_visual():
    """Train the vision-only sentiment regressor and checkpoint the best epoch.

    Loads and chunks the visual features, makes an 80/20 split stratified on
    the sign of the label, then trains ``VisualSentimentLSTM`` with Adam and
    MSE loss for at most 100 epochs. The learning rate is halved when the
    validation loss plateaus, and training stops after 15 epochs without
    improvement. The weights of the best validation epoch are written to
    ``best_mosi_visual_facet42.pth``.

    NOTE(review): with chunk_len=200 and stride=100 neighbouring chunks share
    half of their frames, and the split below is done per chunk. Overlapping
    windows can therefore end up on both sides of the split, so the reported
    validation MSE is probably optimistic. A group-wise split would need a
    per-chunk segment id, which ``load_and_chunk_visual`` does not return.
    """
    # The split is already seeded; seed the remaining RNGs so that weight
    # initialisation and batch shuffling are repeatable as well.
    torch.manual_seed(42)
    np.random.seed(42)

    chunks, labels = load_and_chunk_visual(
        'video/CMU_MOSI_Visual_Facet_42.csd',
        'video/CMU_MOSI_Opinion_Labels.csd',
        chunk_len=200, stride=100
    )

    train_c, val_c, train_y, val_y = train_test_split(
        chunks, labels, test_size=0.2, random_state=42,
        stratify=[int(l > 0) for l in labels]
    )

    # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
    pin = device.type == "cuda"
    train_loader = DataLoader(VisualDataset(train_c, train_y), batch_size=64,
                              shuffle=True, collate_fn=collate_fn, num_workers=2, pin_memory=pin)
    val_loader   = DataLoader(VisualDataset(val_c, val_y), batch_size=64,
                              shuffle=False, collate_fn=collate_fn, num_workers=2, pin_memory=pin)

    model = VisualSentimentLSTM(input_dim=35).to(device)
    optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5)
    criterion = nn.MSELoss()

    best_val = float('inf')
    patience = 15
    no_improve = 0

    print(f"Starting vision-only training: {len(train_c)} train / {len(val_c)} val chunks\n")

    for epoch in range(1, 101):
        train_loss = _run_epoch(model, train_loader, criterion, optimizer)
        val_loss = _run_epoch(model, val_loader, criterion)
        scheduler.step(val_loss)

        print(f"Epoch {epoch:02d} | Train {train_loss:.4f} | Val {val_loss:.4f}")

        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), "best_mosi_visual_facet42.pth")
            print(f"  → New best! Val MSE = {best_val:.4f}")
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping")
                break

    print(f"\nVision-only training finished! Best Val MSE: {best_val:.4f}")
    if best_val < float('inf'):
        print("Model saved as 'best_mosi_visual_facet42.pth'")
    else:
        # Reached only when no epoch produced a finite validation loss.
        print("No checkpoint was saved: validation loss was never finite")


# =========================== RUN ===========================
# Entry point: start training when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    train_visual()

Using device: cuda
GPU: Tesla T4
Visual top-level keys: ['FACET_4.2']
Segment 03bSnISJMiM: shape (5402, 35)
Segment 0h-zjBukYpk: shape (5399, 35)
Segment 1DmNV9C1hbY: shape (2078, 35)
Segment 1iG0909rllw: shape (4718, 35)
Segment 2WGyTLYerpo: shape (5402, 35)
Segment 2iD-tVS8NPw: shape (5400, 35)
Segment 5W7Z1C_fDaE: shape (2282, 35)
Segment 6Egk_28TtTM: shape (4744, 35)
Segment 6_0THN4chvY: shape (3060, 35)
Segment 73jzhE8R1TQ: shape (5402, 35)
Segment 7JsX8y1ysxY: shape (5400, 35)
Segment 8OtFthrtaJM: shape (5402, 35)
Segment 8d-gEyoeBzc: shape (5400, 35)
Segment 8qrpnFRGt2A: shape (5401, 35)
Segment 9J25DZhivz8: shape (5400, 35)
Segment 9T9Hf74oK10: shape (5403, 35)
Segment 9c67fiY0wGQ: shape (2834, 35)
Segment 9qR7uwkblbs: shape (5400, 35)
Segment Af8D0E4ZXaw: shape (5402, 35)
Segment BI97DNYfe5I: shape (3419, 35)
Segment BXuRRbG0Ugk: shape (5402, 35)
Segment Bfr499ggo-0: shape (2566, 35)
Segment BioHAh1qJAQ: shape (5400, 35)
Segment BvYR0L6f2Ig: shape (5400, 35)
Segment Ci-AH39fi3