# Multimodal Emotion Recognition: Training + Streamlit Inference

This notebook follows your project's structure to train a lightweight multimodal model on pre-extracted features (face AUs via OpenFace and audio features via openSMILE), evaluate it, save artifacts, and create a Streamlit app for testing.

- Project spec source: `Emotion_recognition.md`
- Inputs: `data/features/face_feats.npy`, `data/features/audio_feats.npy`, `data/features/labels.npy`
- Output: `artifacts/ckpt.pt` with class names

Dataset/resources links:
- OpenFace: https://github.com/TadasBaltrusaitis/OpenFace
- openSMILE: https://github.com/audeering/opensmile

If you need a ready dataset to start, consider RAVDESS/TESS/CREMA-D for audio and AFEW/FER+ for video; you'll still extract features first as per the spec.

In [None]:
# 1) Setup: Dependencies
# Installs are safe to re-run; Kaggle may have many preinstalled.

import sys, subprocess

def ensure(pkg, pip_name=None):
    """Import ``pkg``; if it is not installed, pip-install it for this interpreter.

    Args:
        pkg: Importable module name (e.g. ``'sklearn'``).
        pip_name: Distribution name handed to pip when it differs from the
            module name (e.g. ``'scikit-learn'``). Defaults to ``pkg``.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    import importlib

    try:
        importlib.import_module(pkg)
    except ImportError:
        # Only a missing module can be fixed by installing; any other
        # import-time failure should surface unchanged instead of being
        # masked by a pointless pip run.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', pip_name or pkg])
        # Make the freshly installed package visible to later imports in
        # this same process.
        importlib.invalidate_caches()

# (import name, pip distribution name) pairs. The first element must be the
# module name that is actually importable: scikit-learn is imported as
# `sklearn`, so probing `scikit_learn` would fail and reinstall on every run.
for py, pipn in [
    ('numpy', 'numpy'), ('pandas', 'pandas'), ('sklearn', 'scikit-learn'),
    ('torch', 'torch'), ('tqdm', 'tqdm'), ('joblib', 'joblib'),
    ('matplotlib', 'matplotlib'), ('seaborn', 'seaborn'),
    ('streamlit', 'streamlit'),
]:
    # ensure() attempts the import itself before installing, so no extra
    # try/except wrapper is needed here.
    ensure(py, pipn)

import os, json, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# Print the torch and numpy versions in use for this run.
print('Versions: torch', torch.__version__, '| numpy', np.__version__)

## 2) Feature preparation (from spec)
- Face features: `data/features/face_feats.npy` (e.g., OpenFace AU stats, shape [N, Ff])
- Audio features: `data/features/audio_feats.npy` (e.g., openSMILE emobase, shape [N, Fa])
- Labels: `data/features/labels.npy` (strings or ints, length N)

If these files are missing, run your extraction scripts from `Emotion_recognition.md` or place the arrays in `data/features/`.

In [None]:
# 3) Load features & quick EDA

FACE_PATH = 'data/features/face_feats.npy'
AUDIO_PATH = 'data/features/audio_feats.npy'
LABELS_PATH = 'data/features/labels.npy'

# Fail fast with a real exception: `assert` statements are stripped when
# Python runs with -O, which would defer the failure to a confusing np.load error.
for _path in (FACE_PATH, AUDIO_PATH, LABELS_PATH):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"Missing {_path}")

Xf = np.load(FACE_PATH)
Xa = np.load(AUDIO_PATH)
# NOTE(security): allow_pickle=True unpickles arbitrary objects from the file.
# It is kept because string labels may be stored as an object array, but only
# load label files you produced yourself or otherwise trust.
labels = np.load(LABELS_PATH, allow_pickle=True)

# The model and the shape reads below need one feature row per sample.
if Xf.ndim != 2 or Xa.ndim != 2:
    raise ValueError(
        f'Expected 2-D feature arrays, got face {Xf.shape} and audio {Xa.shape}'
    )
if not (len(Xf) == len(Xa) == len(labels)):
    raise ValueError('Mismatched lengths')

N, Ff, Fa = len(labels), Xf.shape[1], Xa.shape[1]
print('N=', N, 'face_dim=', Ff, 'audio_dim=', Fa)

# Class distribution
vals, counts = np.unique(labels, return_counts=True)
print('Classes:', vals)
print('Counts:', dict(zip(vals, counts)))

# Basic stats (mean/std taken over the whole array, not per feature column)
print('Face feats mean/std per-dim sample:', float(Xf.mean()), float(Xf.std()))
print('Audio feats mean/std per-dim sample:', float(Xa.mean()), float(Xa.std()))

In [None]:
# 4) Dataset and Model (from spec, simplified for single-GPU/CPU)

class FeaturePairDataset(Dataset):
    """Row-aligned (face features, audio features, label index) samples.

    Features are stored as float32 copies. Labels are encoded as int64
    indices into ``self.classes``, the sorted unique label values returned
    by ``np.unique``.
    """

    def __init__(self, Xf: np.ndarray, Xa: np.ndarray, labels: np.ndarray):
        n_face, n_audio, n_labels = len(Xf), len(Xa), len(labels)
        assert n_face == n_audio == n_labels
        self.Xf = Xf.astype(np.float32)
        self.Xa = Xa.astype(np.float32)
        uniques, inverse = np.unique(labels, return_inverse=True)
        self.classes = uniques
        self.y = inverse.astype(np.int64)

    def __len__(self):
        # One sample per encoded label.
        return len(self.y)

    def __getitem__(self, idx):
        # from_numpy shares memory with the stored float32 arrays.
        face_row = torch.from_numpy(self.Xf[idx])
        audio_row = torch.from_numpy(self.Xa[idx])
        target = torch.tensor(self.y[idx])
        return face_row, audio_row, target

class Branch(nn.Module):
    """Single-modality encoder: LayerNorm -> Linear -> ReLU -> Dropout -> Linear -> ReLU.

    Args:
        in_dim: Size of the input feature vector.
        hidden: Width of the hidden linear layer.
        out_dim: Size of the embedding returned by ``forward``.
        p: Dropout probability applied after the first activation.
    """

    def __init__(self, in_dim, hidden=256, out_dim=128, p=0.2):
        super().__init__()
        # The position of each layer determines its state_dict key
        # (net.0, net.1, net.4), so the order must stay fixed for
        # checkpoints to remain loadable.
        stages = [
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden, out_dim),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        embedding = self.net(x)
        return embedding

class FusionHead(nn.Module):
    """Classifier over the fused embedding: LayerNorm -> Linear -> ReLU -> Linear.

    Args:
        feat_dim: Size of the fused input vector.
        n_classes: Number of output logits.
    """

    def __init__(self, feat_dim, n_classes):
        super().__init__()
        bottleneck = feat_dim // 2  # hidden width is half the input width
        # Layer positions define the state_dict keys (cls.0, cls.1, cls.3).
        layers = [
            nn.LayerNorm(feat_dim),
            nn.Linear(feat_dim, bottleneck),
            nn.ReLU(),
            nn.Linear(bottleneck, n_classes),
        ]
        self.cls = nn.Sequential(*layers)

    def forward(self, z):
        logits = self.cls(z)
        return logits

class AVFusion(nn.Module):
    """Late-fusion classifier: encode each modality, concatenate, classify.

    Args:
        in_face: Dimensionality of the face feature vector.
        in_audio: Dimensionality of the audio feature vector.
        n_classes: Number of output classes (logits).
        emb_dim: Embedding size produced by each modality branch. The fusion
            head input is ``2 * emb_dim``. The default of 128 reproduces the
            original fixed 128+128 layout, so existing checkpoints still load.
    """

    def __init__(self, in_face, in_audio, n_classes, emb_dim=128):
        super().__init__()
        # Pass emb_dim to both branches explicitly so the head width can never
        # drift out of sync with the branch output size.
        self.face = Branch(in_face, out_dim=emb_dim)
        self.audio = Branch(in_audio, out_dim=emb_dim)
        self.fuse = FusionHead(2 * emb_dim, n_classes)

    def forward(self, xf, xa):
        """Return class logits for face features ``xf`` and audio features ``xa``."""
        zf = self.face(xf)
        za = self.audio(xa)
        # Concatenate along the last (feature) axis.
        z = torch.cat([zf, za], dim=-1)
        logits = self.fuse(z)
        return logits

In [None]:
# 5) Train/Val split, training, evaluation

ds = FeaturePairDataset(Xf, Xa, labels)
num_classes = len(ds.classes)
in_face = Xf.shape[1]
in_audio = Xa.shape[1]

val_ratio = 0.2
# Always hold out at least one sample for validation.
val_size = max(1, int(len(ds)*val_ratio))
train_size = len(ds) - val_size
if train_size < 1:
    # With fewer than two samples the training split would be empty.
    raise ValueError(f'Need at least 2 samples for a train/val split, got {len(ds)}')
# Seeded generator -> the same split on every run.
train_ds, val_ds = random_split(ds, [train_size, val_size], generator=torch.Generator().manual_seed(42))

BATCH = 128
LR = 1e-3
EPOCHS = 20
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_ld = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=2)
val_ld   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=2)

model = AVFusion(in_face, in_audio, num_classes).to(DEVICE)
opt = torch.optim.AdamW(model.parameters(), lr=LR)

# Track the best validation accuracy and a snapshot of the weights that produced it.
best_acc = 0.0
best_state = None

for ep in range(1, EPOCHS+1):
    # Train for one epoch
    model.train()
    pbar = tqdm(train_ld, desc=f'ep{ep}')
    for xf, xa, y in pbar:
        xf, xa, y = xf.to(DEVICE), xa.to(DEVICE), y.to(DEVICE)
        logits = model(xf, xa)
        loss = F.cross_entropy(logits, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
        pbar.set_postfix(loss=float(loss.item()))
    # Eval
    model.eval()
    tot = correct = 0
    with torch.no_grad():
        for xf, xa, y in val_ld:
            xf, xa, y = xf.to(DEVICE), xa.to(DEVICE), y.to(DEVICE)
            logits = model(xf, xa)
            pred = logits.argmax(1)
            correct += (pred==y).sum().item()
            tot += y.numel()
    acc = correct / max(1, tot)
    print(f'val_acc={acc:.4f}')
    if acc > best_acc:
        best_acc = acc
        # Snapshot by value. Tensor.cpu() returns the very same tensor when it
        # already lives on the CPU, so without clone() the saved "best" weights
        # would keep changing as later epochs update the model in place.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

print('Best val_acc:', best_acc)

In [None]:
# 6) Save artifacts

ARTIFACTS = 'artifacts'
os.makedirs(ARTIFACTS, exist_ok=True)
CKPT = os.path.join(ARTIFACTS, 'ckpt.pt')
META = os.path.join(ARTIFACTS, 'meta.json')

if best_state is None:
    # No epoch improved on the initial best_acc of 0.0: fall back to the final
    # weights, moved to CPU so the checkpoint does not depend on a GPU.
    best_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}

# Store class names as a plain list of str rather than a NumPy array. Recent
# PyTorch releases load checkpoints with weights_only=True by default, and that
# mode is expected to reject pickled NumPy arrays unless they are allow-listed.
# The Streamlit app converts the names to str anyway.
class_names = [str(c) for c in ds.classes]

torch.save({'model_state': best_state, 'classes': class_names}, CKPT)
with open(META, 'w') as f:
    # Input dimensions are needed to rebuild the model before loading weights.
    json.dump({'in_face': int(in_face), 'in_audio': int(in_audio)}, f)

print('Saved:', CKPT, META)

In [None]:
# 7) Generate Streamlit test app (app.py)

# The app source is kept as a raw string so that the page-icon escape below is
# written to app.py verbatim and only becomes the emoji when app.py itself runs.
# The generated file is pure ASCII, which avoids encoding damage on round-trips.
app_code = r"""
import os, json
import numpy as np
import torch
import torch.nn as nn
import streamlit as st

st.set_page_config(page_title='Multimodal Emotion Inference', page_icon='\U0001F3AD', layout='centered')

ARTIFACTS = 'artifacts'
CKPT = os.path.join(ARTIFACTS, 'ckpt.pt')
META = os.path.join(ARTIFACTS, 'meta.json')

blob = torch.load(CKPT, map_location='cpu')
classes = [str(x) for x in blob['classes']]
with open(META, 'r') as f:
    meta = json.load(f)
IN_FACE = int(meta['in_face'])
IN_AUDIO = int(meta['in_audio'])

# Define the same model structure
class Branch(nn.Module):
    def __init__(self, in_dim, hidden=256, out_dim=128, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, out_dim), nn.ReLU()
        )
    def forward(self, x):
        return self.net(x)
class FusionHead(nn.Module):
    def __init__(self, feat_dim, n_classes):
        super().__init__()
        self.cls = nn.Sequential(
            nn.LayerNorm(feat_dim),
            nn.Linear(feat_dim, feat_dim//2), nn.ReLU(),
            nn.Linear(feat_dim//2, n_classes)
        )
    def forward(self, z):
        return self.cls(z)
class AVFusion(nn.Module):
    def __init__(self, in_face, in_audio, n_classes):
        super().__init__()
        self.face = Branch(in_face)
        self.audio = Branch(in_audio)
        self.fuse = FusionHead(256, n_classes)
    def forward(self, xf, xa):
        zf = self.face(xf)
        za = self.audio(xa)
        z = torch.cat([zf, za], dim=-1)
        return self.fuse(z)

model = AVFusion(IN_FACE, IN_AUDIO, len(classes))
model.load_state_dict(blob['model_state'])
model.eval()


def predict_and_show(xf, xa):
    # Run one forward pass and render the top class plus per-class probabilities.
    with torch.no_grad():
        logits = model(xf, xa)
        probs = torch.softmax(logits, dim=1).numpy()[0]
    idx = int(np.argmax(probs))
    st.metric('Predicted', classes[idx])
    st.json({c: float(p) for c, p in zip(classes, probs)})


def load_vector(upload, expected_dim, name):
    # Return a [1, expected_dim] float32 tensor, or None after reporting a size mismatch.
    arr = np.asarray(np.load(upload), dtype=np.float32).reshape(-1)
    if arr.size != expected_dim:
        st.error(f'{name} vector has {arr.size} values, expected {expected_dim}')
        return None
    return torch.tensor(arr, dtype=torch.float32).view(1, -1)


st.title('Multimodal Emotion Predictor')
st.write('Enter or upload feature vectors to test the trained model.')

mode = st.radio('Input mode', ['Manual sliders', 'Upload .npy vectors'])

if mode == 'Manual sliders':
    st.subheader('Face features')
    cols_f = st.columns(2)
    face_vals = []
    for i in range(IN_FACE):
        with cols_f[i % 2]:
            face_vals.append(st.number_input(f'face[{i}]', value=0.0))
    st.subheader('Audio features')
    cols_a = st.columns(2)
    audio_vals = []
    for i in range(IN_AUDIO):
        with cols_a[i % 2]:
            audio_vals.append(st.number_input(f'audio[{i}]', value=0.0))
    if st.button('Predict'):
        xf = torch.tensor(face_vals, dtype=torch.float32).unsqueeze(0)
        xa = torch.tensor(audio_vals, dtype=torch.float32).unsqueeze(0)
        predict_and_show(xf, xa)
else:
    up_face = st.file_uploader('Upload face_feats.npy (1D vector or shape [F])', type=['npy'])
    up_audio = st.file_uploader('Upload audio_feats.npy (1D vector or shape [F])', type=['npy'])
    if st.button('Predict'):
        if up_face is None or up_audio is None:
            st.error('Please upload both vectors')
        else:
            xf = load_vector(up_face, IN_FACE, 'Face')
            xa = load_vector(up_audio, IN_AUDIO, 'Audio')
            if xf is not None and xa is not None:
                predict_and_show(xf, xa)
"""

# Write the app next to the notebook so `streamlit run app.py` finds artifacts/.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)
print('Wrote app.py')

## 8) Run Streamlit app
- After training cells have run and `artifacts/` exists with `ckpt.pt` and `meta.json`, launch:

```
streamlit run app.py
```

Notes:
- If features are large, prefer the file upload mode in the app.
- Feature extraction tools:
  - OpenFace: https://github.com/TadasBaltrusaitis/OpenFace
  - openSMILE: https://github.com/audeering/opensmile