In [None]:
!unzip audio.zip

Archive:  audio.zip
   creating: audio/
  inflating: audio/CMU_MOSI_COVAREP.csd  
  inflating: audio/CMU_MOSI_Opinion_Labels.csd  


In [None]:
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

# ============================= GPU SETUP =============================
# Select the compute device once at module level; the rest of the file reads `device`.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device}")
    # Report which physical GPU (index 0) is being used.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print(f"Using device: {device}")

# =========================== DATA LOADER (Works with real MOSI files) ===========================
def load_and_chunk_mosi(audio_path, label_path, chunk_len=200, stride=100):
    """Load audio features and labels from two HDF5 files and cut them into windows.

    Features are read from ``<root>/data/<key>/features`` where ``<root>`` is the
    ``'COVAREP'`` group if present, otherwise the file root. Labels are read the
    same way from ``'Opinion Segment Labels'`` (or the file root).

    Each feature array is sliced along its first axis into windows of
    ``chunk_len`` frames whose start offsets advance by ``stride``. Only start
    offsets where a full window fits are used, so trailing frames past the last
    full window are dropped; an array shorter than ``chunk_len`` yields a single
    shorter chunk. Every chunk inherits the label of the key it was cut from.

    Keys that have features but no label entry are dropped together with their
    chunks, so ``chunks[i]`` and ``labels[i]`` always belong to the same key.

    Args:
        audio_path: Path to the HDF5 file holding the audio features.
        label_path: Path to the HDF5 file holding the labels.
        chunk_len: Maximum number of frames per chunk.
        stride: Step (in frames) between consecutive chunk starts.

    Returns:
        ``(chunks, labels)``: a list of float32 NumPy arrays and a parallel list
        of Python floats.

    Raises:
        ValueError: If no labelled chunk could be produced.
    """
    # --- Load audio (COVAREP) ---
    chunks_by_segment = {}
    with h5py.File(audio_path, 'r') as f:
        print("Audio top-level keys:", list(f.keys()))
        if 'COVAREP' in f:
            data_group = f['COVAREP']['data']
        else:
            data_group = f['data']                  # newer format
        segment_ids = sorted(data_group.keys())

        for seg_id in segment_ids:
            feats = data_group[seg_id]['features'][:]
            # Zero out NaN *and* +/-inf: by default nan_to_num turns infinities into
            # the largest finite float, which would swamp every other feature.
            feats = np.nan_to_num(feats, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
            n_frames = len(feats)
            if n_frames == 0:
                # A zero-length chunk cannot be packed by the LSTM later on.
                continue
            # max(1, ...) guarantees one (short) chunk when n_frames < chunk_len.
            stop = max(1, n_frames - chunk_len + 1)
            chunks_by_segment[seg_id] = [feats[i:i + chunk_len] for i in range(0, stop, stride)]

    n_generated = sum(len(seg_chunks) for seg_chunks in chunks_by_segment.values())
    print(f"Generated {n_generated} audio chunks")

    # --- Load labels ---
    chunks, labels = [], []
    n_unlabelled = 0
    with h5py.File(label_path, 'r') as f:
        print("Label top-level keys:", list(f.keys()))
        if 'Opinion Segment Labels' in f:
            label_group = f['Opinion Segment Labels']['data']
        else:
            label_group = f['data']

        # Chunks and labels are appended together per key, so a key without a
        # label can never shift the labels of the keys that follow it.
        for seg_id, seg_chunks in chunks_by_segment.items():
            if seg_id not in label_group:
                n_unlabelled += 1
                continue

            raw = np.asarray(label_group[seg_id]['features'][:])
            # NOTE(review): only the first label entry of each key is used for all of
            # its chunks. If a key holds several labelled intervals, chunks should be
            # matched to labels by time interval instead -- TODO confirm file layout.
            if raw.ndim > 0 and raw.shape[-1] == 7:  # one-hot
                label = float(np.dot(raw.reshape(-1, 7)[0], np.arange(-3, 4)))
            else:
                label = float(raw.flatten()[0])

            chunks.extend(seg_chunks)
            labels.extend([label] * len(seg_chunks))

    if n_unlabelled:
        print(f"Skipped {n_unlabelled} segments without labels")
    if not labels:
        raise ValueError("No labelled audio chunks could be built from the given files")

    print(f"Final dataset: {len(chunks)} samples, labels [{min(labels):.2f}, {max(labels):.2f}]")
    print("-" * 70)
    return chunks, labels


# =========================== DATASET ===========================
class MOSIChunkDataset(Dataset):
    """Map-style dataset pairing each feature chunk with its scalar label.

    ``chunks`` is an indexable collection of NumPy arrays and ``labels`` an
    indexable collection of numbers of the same length.
    """

    def __init__(self, chunks, labels):
        self.chunks = chunks
        self.labels = labels

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, i):
        # from_numpy wraps the array without copying it.
        features = torch.from_numpy(self.chunks[i])
        target = torch.tensor(self.labels[i], dtype=torch.float32)
        return features, target

def collate_fn(batch):
    """Zero-pad a batch of variable-length sequences to a common length.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs where ``sequence`` is a
            2-D tensor ``(frames, features)`` and ``label`` a 0-D tensor. All
            sequences must share the same feature width.

    Returns:
        ``(padded, labels, lengths)`` where ``padded`` is a float32 tensor of
        shape ``(batch, max_frames, features)``, ``labels`` has shape
        ``(batch, 1)`` and ``lengths`` holds the unpadded frame counts.
    """
    seqs, labs = zip(*batch)
    lengths = torch.tensor([s.shape[0] for s in seqs])
    max_len = int(lengths.max())
    # Take the feature width from the data instead of hard-coding 74 so the same
    # collate function works for any feature set.
    feat_dim = seqs[0].shape[1]
    padded = torch.zeros(len(seqs), max_len, feat_dim, dtype=torch.float32)
    for i, s in enumerate(seqs):
        padded[i, :s.shape[0]] = s
    labels = torch.stack(labs).unsqueeze(1)
    return padded, labels, lengths


# =========================== MODEL ===========================
class AudioSentimentLSTM(nn.Module):
    """Bidirectional LSTM regressor producing one score per input sequence.

    The final hidden states of the last LSTM layer (forward and backward
    direction) are concatenated, layer-normalised and passed through a small
    MLP head that outputs a single value.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        lstm_dropout: Dropout applied between LSTM layers (ignored when
            ``num_layers`` is 1, where PyTorch would otherwise warn).

    The defaults reproduce the original fixed architecture (74 -> 2x128 -> 64 -> 1),
    and submodule names are unchanged, so existing checkpoints remain loadable.
    """

    def __init__(self, input_dim=74, hidden_dim=128, num_layers=2, lstm_dropout=0.4):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True,
                            bidirectional=True,
                            dropout=lstm_dropout if num_layers > 1 else 0.0)
        # Both directions are concatenated, hence twice the hidden size.
        self.norm = nn.LayerNorm(2 * hidden_dim)
        self.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(2 * hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x, lengths):
        """Return predictions of shape ``(batch, 1)``.

        Args:
            x: Padded input of shape ``(batch, max_frames, input_dim)``.
            lengths: 1-D tensor with the true (unpadded) length of each sequence.
        """
        # Packing makes the LSTM stop at each sequence's true length so padding
        # never influences the final hidden state; lengths must live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        # hn[-2] / hn[-1] are the last layer's forward / backward final states.
        h = torch.cat((hn[-2], hn[-1]), dim=1)   # (B, 2 * hidden_dim)
        h = self.norm(h)
        return self.head(h)


# =========================== TRAINING ===========================
def train():
    """Train ``AudioSentimentLSTM`` on chunked MOSI audio features.

    Loads and chunks the data, holds out 20% of the chunks for validation
    (stratified by label sign), and trains with Adam + MSE for up to 100 epochs.
    The learning rate is halved when validation loss plateaus, and training
    stops early after 15 epochs without improvement. The weights with the
    lowest validation MSE are written to ``best_mosi_audio_chunked.pth``.

    Reported losses are true per-sample means over each split.
    """
    chunks, labels = load_and_chunk_mosi(
        'audio/CMU_MOSI_COVAREP.csd',        # or .h5
        'audio/CMU_MOSI_Opinion_Labels.csd'  # or .h5
    )

    # NOTE(review): the split is done per chunk. With the loader's default
    # chunk_len=200 / stride=100, neighbouring chunks overlap, so windows cut from
    # the same recording can end up in both train and val and make the validation
    # loss optimistic. A group-wise split needs the source key of every chunk,
    # which the loader does not return yet -- TODO.
    train_c, val_c, train_y, val_y = train_test_split(
        chunks, labels, test_size=0.2, random_state=42,
        stratify=[int(l > 0) for l in labels]   # rough balance
    )

    # Pinned host memory only helps host->GPU copies; requesting it on a
    # CPU-only machine just triggers a warning.
    loader_args = dict(batch_size=64, collate_fn=collate_fn, num_workers=2,
                       pin_memory=(device.type == "cuda"))
    train_loader = DataLoader(MOSIChunkDataset(train_c, train_y), shuffle=True, **loader_args)
    val_loader   = DataLoader(MOSIChunkDataset(val_c, val_y), shuffle=False, **loader_args)

    model = AudioSentimentLSTM().to(device)
    optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()

    best_val = float('inf')
    patience = 15
    no_improve = 0

    print(f"Start training: {len(train_c)} train / {len(val_c)} val chunks\n")

    for epoch in range(1, 101):
        # Train
        model.train()
        train_loss = 0.0
        for x, y, lengths in train_loader:
            # lengths stay on the CPU: the model needs them there for packing,
            # so moving them to the GPU would only force a copy back.
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            pred = model(x, lengths)
            loss = criterion(pred, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Weight by batch size so a smaller final batch is not over-counted.
            train_loss += loss.item() * y.size(0)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x, y, lengths in val_loader:
                x, y = x.to(device), y.to(device)
                pred = model(x, lengths)
                val_loss += criterion(pred, y).item() * y.size(0)

        train_loss /= len(train_loader.dataset)
        val_loss   /= len(val_loader.dataset)
        scheduler.step(val_loss)

        print(f"Epoch {epoch:02d} | Train {train_loss:.4f} | Val {val_loss:.4f} | LR {optimizer.param_groups[0]['lr']:.2e}")

        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), "best_mosi_audio_chunked.pth")
            print(f"  → New best! Val MSE = {best_val:.4f}")
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping")
                break

    print(f"\nTraining finished! Best Val MSE: {best_val:.4f}")
    print("Model saved as 'best_mosi_audio_chunked.pth'")


# =========================== RUN ===========================
# Start training only when this code is executed as the top-level program
# (as in a notebook cell or a direct script run), not when it is imported.
if __name__ == "__main__":
    train()

Using device: cuda
GPU: Tesla T4
Audio top-level keys: ['COVAREP']
Generated 14384 audio chunks
Label top-level keys: ['Opinion Segment Labels']
Final dataset: 14384 samples, labels [-3.00, 3.00]
----------------------------------------------------------------------
Start training: 11507 train / 2877 val chunks

Epoch 01 | Train 2.7958 | Val 2.7693 | LR 3.00e-04
  → New best! Val MSE = 2.7693
Epoch 02 | Train 2.6462 | Val 2.5405 | LR 3.00e-04
  → New best! Val MSE = 2.5405
Epoch 03 | Train 2.5310 | Val 2.5553 | LR 3.00e-04
Epoch 04 | Train 2.4144 | Val 2.3911 | LR 3.00e-04
  → New best! Val MSE = 2.3911
Epoch 05 | Train 2.3077 | Val 2.3107 | LR 3.00e-04
  → New best! Val MSE = 2.3107
Epoch 06 | Train 2.2449 | Val 2.2078 | LR 3.00e-04
  → New best! Val MSE = 2.2078
Epoch 07 | Train 2.1609 | Val 2.0869 | LR 3.00e-04
  → New best! Val MSE = 2.0869
Epoch 08 | Train 2.0828 | Val 2.0279 | LR 3.00e-04
  → New best! Val MSE = 2.0279
Epoch 09 | Train 2.0236 | Val 2.0083 | LR 3.00e-04
  → New be