In [1]:
import os, pickle, random, math, glob
import numpy as np, pandas as pd, tqdm, torchaudio, torch
from transformers import AutoProcessor, HubertModel
from tqdm import tqdm

2025-05-21 22:52:38.426690: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-21 22:52:38.434466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747857158.443443   83937 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747857158.446144   83937 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747857158.454240   83937 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [19]:
BASE = "/media/dtsarev/SatSSD/data"
AUDIO_DIR  = f"{BASE}/audio"
TRAIN_CSV  = f"{BASE}/train_split.csv"
VAL_CSV    = f"{BASE}/valid_split.csv"
EMB_CACHE  = f"audio_hubert_embeds"
EPOCHS     = 1000
os.makedirs(EMB_CACHE, exist_ok=True)

In [3]:
SR        = 16_000
MAX_SEC   = 12
DEVICE    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMOS = ["Admiration","Amusement","Determination","Empathic Pain","Excitement","Joy"]
NUM_EMOS = len(EMOS)

In [4]:
proc  = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(DEVICE)
model.eval()

@torch.no_grad()
def extract_embed(path:str)->np.ndarray:
    """Return (T, H) HuBERT hidden states for first MAX_SEC seconds of *path* MP3."""
    wav, sr = torchaudio.load(path)
    if sr != SR:
        wav = torchaudio.functional.resample(wav, sr, SR)
    wav = wav[:, : SR*MAX_SEC]
    wav = (wav - wav.mean()) / (wav.std() + 1e-6)  # CMVN
    inp = proc(wav.squeeze(), sampling_rate=SR, return_tensors='pt', padding=True)
    hid = model(**{k: v.to(DEVICE) for k, v in inp.items()}).last_hidden_state[0]  # (T, 768)
    return hid.cpu().numpy()

In [6]:
def cache_all(csv_path:str, tag:str):
    df = pd.read_csv(csv_path, dtype={'Filename': str})
    df['Filename'] = df['Filename'].str.zfill(5)
    for fn in tqdm(df['Filename'], desc=f"{tag} embeds"):
        out = f"{EMB_CACHE}/{tag}_{fn}.npy"
        if os.path.exists(out):
            continue
        mp3 = f"{AUDIO_DIR}/{fn}.mp3"
        try:
            np.save(out, extract_embed(mp3))
        except Exception as e:
            print("skip", fn, e)

cache_all(TRAIN_CSV, "train")
cache_all(VAL_CSV,   "val")

train embeds: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8072/8072 [03:55<00:00, 34.25it/s]
val embeds: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4588/4588 [02:22<00:00, 32.12it/s]


In [7]:
def build_xy(csv_path:str, tag:str, use_all_stats=True):
    df = pd.read_csv(csv_path, dtype={'Filename': str})
    df['Filename'] = df['Filename'].str.zfill(5)
    X, y = [], []
    for _, row in df.iterrows():
        fn = row['Filename']
        fpath = f"{EMB_CACHE}/{tag}_{fn}.npy"
        if not os.path.exists(fpath):
            continue
        feat = np.load(fpath)  # (T, H)
        if use_all_stats:
            vec = np.concatenate([feat.mean(0), feat.std(0), feat.min(0), feat.max(0)])
        else:
            vec = feat.mean(0)
        X.append(vec)
        y.append(row[EMOS].values.astype(np.float32))
    return np.stack(X), np.stack(y)

X_train, y_train = build_xy(TRAIN_CSV, "train", use_all_stats=True)
X_val,   y_val   = build_xy(VAL_CSV,   "val",   use_all_stats=True)
print("Train/Val shapes:", X_train.shape, y_train.shape, X_val.shape)

IN_DIM = X_train.shape[1]  # <- used by classifier

Train/Val shapes: (8072, 4096) (8072, 6) (4588, 4096)


In [11]:
def pearson_loss(preds, targets):
    vx, vy = preds - preds.mean(0), targets - targets.mean(0)
    corr   = (vx*vy).sum(0) / (torch.sqrt((vx**2).sum(0)*(vy**2).sum(0))+1e-8)
    return 1 - corr.mean()

In [20]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

class MLP(nn.Module):
    """2‑layer MLP without output squashing — lets loss drive range."""
    def __init__(self, in_dim:int, out_dim:int=NUM_EMOS):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, 1024), nn.GELU(), nn.Dropout(0.5),
            nn.Linear(1024, 512),  nn.GELU(), nn.Dropout(0.4),
            nn.Linear(512, out_dim)  # **no activation**
        )
    def forward(self, x):
        return self.net(x)

train_ds = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_ds   = TensorDataset(torch.from_numpy(X_val).float(),   torch.from_numpy(y_val).float())
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
val_dl   = DataLoader(val_ds,   batch_size=256, shuffle=False)

model_t = MLP(IN_DIM).to(DEVICE)
opt     = torch.optim.AdamW(model_t.parameters(), lr=5e-4, weight_decay=1e-3)
# === Pearson correlation loss (optimize metric directly)

def pearson_loss(preds, targets):
    vx, vy = preds - preds.mean(0), targets - targets.mean(0)
    corr = (vx * vy).sum(0) / (torch.sqrt((vx**2).sum(0) * (vy**2).sum(0)) + 1e-8)
    return 1 - corr.mean()

crit = pearson_loss

best, patience, waited = float('inf'), 7, 0
for epoch in range(1, EPOCHS):
    # --- train
    model_t.train(); total=0
    for xb, yb in train_dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad()
        pred = model_t(xb)
        loss = crit(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model_t.parameters(), 1.0)
        opt.step()
        total += loss.item()
    tloss = total / len(train_dl)

    # --- val
    model_t.eval(); vtotal=0
    with torch.no_grad():
        for xb, yb in val_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            vtotal += crit(model_t(xb), yb).item()
    vloss = vtotal / len(val_dl)
    print(f"Epoch {epoch:2d}: train {tloss:.4f} | val {vloss:.4f}")

    if vloss < best:
        best, waited = vloss, 0
        torch.save(model_t.state_dict(), 'best_audio_mlp.pt')
        print("  saved best")
    else:
        waited += 1
        if waited >= patience:
            print("Early stop")
            break

Epoch  1: train 0.9983 | val 0.9727
  saved best
Epoch  2: train 0.9894 | val 0.9714
  saved best
Epoch  3: train 0.9896 | val 0.9683
  saved best
Epoch  4: train 0.9917 | val 0.9721
Epoch  5: train 0.9746 | val 0.9690
Epoch  6: train 0.9664 | val 0.9624
  saved best
Epoch  7: train 0.9543 | val 0.9575
  saved best
Epoch  8: train 0.9457 | val 0.9501
  saved best
Epoch  9: train 0.9318 | val 0.9351
  saved best
Epoch 10: train 0.9174 | val 0.9181
  saved best
Epoch 11: train 0.8979 | val 0.8942
  saved best
Epoch 12: train 0.8811 | val 0.8679
  saved best
Epoch 13: train 0.8574 | val 0.8409
  saved best
Epoch 14: train 0.8429 | val 0.8359
  saved best
Epoch 15: train 0.8353 | val 0.8315
  saved best
Epoch 16: train 0.8443 | val 0.8418
Epoch 17: train 0.8323 | val 0.8064
  saved best
Epoch 18: train 0.8216 | val 0.7759
  saved best
Epoch 19: train 0.8061 | val 0.7856
Epoch 20: train 0.8088 | val 0.7733
  saved best
Epoch 21: train 0.8358 | val 0.7881
Epoch 22: train 0.8081 | val 0.7565


In [21]:
model_t.load_state_dict(torch.load('best_audio_mlp.pt'))
model_t.eval(); preds=[]
with torch.no_grad():
    for xb,_ in val_dl:
        preds.append(model_t(xb.to(DEVICE)).cpu())

preds = torch.cat(preds).numpy()
mean_r = np.mean([np.corrcoef(preds[:,i], y_val[:,i])[0,1] for i in range(NUM_EMOS)])
print("Macro Pearson:", mean_r)

Macro Pearson: 0.30535813237021026
