<a href="https://colab.research.google.com/github/ArtDowdy/deep-learning-engagement/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# Multimodal Content Understanding (PyTorch, Colab-ready)
# Text (synopsis) + Tabular (genre, duration, maturity) -> Engagement Score
# - BiGRU + Additive Attention for text
# - MLP for tabular metadata
# - Fusion head with dropout + layernorm
# - AMP training, early stopping, cosine schedule
# - Metrics: AUC, Accuracy, Brier Score, ECE (calibration), per-genre AUC
# - Exports: TorchScript + ONNX for deployment
# ============================================================

import os, math, random, json, itertools
from typing import List
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import roc_auc_score, accuracy_score

# ------------- Repro & Config -------------
def seed_everything(seed=42):
    """Seed every RNG this script draws from with the same value.

    Covers Python's ``random``, NumPy's global generator, and PyTorch
    (CPU generator plus all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# Hyperparameters read by the vocabulary, model, optimiser and training loop.
CFG = dict(
    max_vocab=8000,        # vocabulary cap, including <pad> and <unk>
    max_len=64,            # token ids per synopsis after truncation/padding
    embed_dim=128,         # token embedding width
    rnn_hidden=128,        # GRU hidden size per direction
    rnn_layers=1,          # stacked GRU layers
    tab_hidden=64,         # tabular MLP width
    fusion_hidden=128,     # fusion head width
    dropout=0.2,           # dropout probability shared by all sub-modules
    batch_size=128,        # DataLoader batch size (train and validation)
    lr=2e-3,               # AdamW learning rate
    weight_decay=1e-4,     # AdamW weight decay
    epochs=12,             # maximum epochs; also the cosine schedule's T_max
    early_patience=3,      # epochs without val_loss improvement before stopping
    train_val_split=0.85,  # fraction of rows assigned to the training split
)

# ------------- Synthetic Data Generator -------------
# Categorical metadata values; list order defines the one-hot index of each value.
GENRES = [
    "drama",
    "comedy",
    "thriller",
    "documentary",
    "action",
    "romance",
    "scifi",
]
MATURITY = [
    "G",
    "PG",
    "PG-13",
    "R",
    "TV-MA",
]

# Word pools that make_synopsis samples from when building synthetic text.
sentiment_pos = [
    "heartwarming",
    "uplifting",
    "award-winning",
    "beloved",
    "captivating",
    "hilarious",
    "feel-good",
]
sentiment_neg = [
    "grim",
    "bleak",
    "slow",
    "confusing",
    "derivative",
    "predictable",
    "gory",
]
hooks = [
    "edge-of-your-seat",
    "bingeable",
    "critically-acclaimed",
    "fan-favorite",
    "character-driven",
]
topics = [
    "family",
    "space",
    "heist",
    "chef",
    "music",
    "athlete",
    "detective",
    "robot",
    "time-travel",
    "politics",
    "school",
]

def make_synopsis(genre, maturity, good=True):
    """Build a synthetic, space-separated synopsis string.

    The text holds, in order: two topic words, one hook, two sentiment words
    (positive when ``good`` is truthy, negative otherwise), ten filler words,
    then the genre and the lower-cased maturity rating.
    """
    sentiment_pool = sentiment_pos if good else sentiment_neg
    filler_pool = ["story", "journey", "friends", "discover", "secret", "mission", "season", "episode", "world", "small-town"]

    # Draw order is kept fixed so a seeded run reproduces the same text.
    words = [random.choice(topics), random.choice(topics), random.choice(hooks)]
    words.extend(random.choice(sentiment_pool) for _ in range(2))
    words.extend(random.choice(filler_pool) for _ in range(10))
    words.extend((genre, maturity.lower()))
    return " ".join(words)

def simulate_dataset(N=5000):
    """Generate ``N`` synthetic content rows with a binary engagement label.

    Each row is a dict with keys ``synopsis``, ``genre``, ``maturity``,
    ``duration`` (minutes, at least 50) and ``label`` (0 or 1). The label
    probability starts at 0.35, is raised for certain genres, maturity
    ratings and durations, shifted by synopsis sentiment, perturbed with
    Gaussian noise, and clipped to [0.02, 0.98].
    """
    favoured_genres = ("comedy", "thriller", "documentary")
    favoured_ratings = ("PG-13", "TV-MA")

    rows = []
    for _ in range(N):
        genre = random.choice(GENRES)
        maturity = random.choice(MATURITY)
        duration = max(50, int(np.random.normal(105, 25)))  # minutes

        # Metadata-driven prior; increments are applied one after another.
        base = 0.35
        if genre in favoured_genres:
            base += 0.1
        if maturity in favoured_ratings:
            base += 0.05
        if 85 <= duration <= 120:
            base += 0.05

        # The same prior decides whether the synopsis gets positive wording.
        good_text = random.random() < base
        synopsis = make_synopsis(genre, maturity, good=good_text)

        sentiment_shift = 0.10 if good_text else -0.05
        p = base + sentiment_shift + np.random.normal(0, 0.04)
        p = float(np.clip(p, 0.02, 0.98))
        y = int(random.random() < p)

        rows.append(dict(synopsis=synopsis, genre=genre, maturity=maturity, duration=duration, label=y))
    return rows


data = simulate_dataset(5000)

# ------------- Vocab / Tokenization -------------
def tokenize(text: str) -> List[str]:
    """Split ``text`` on whitespace into lower-cased, punctuation-trimmed tokens.

    Leading/trailing characters from ``.,!?:;"'()[]`` are removed from each
    whitespace-delimited piece. Pieces that are empty after trimming (for
    example a stand-alone ``","``) are dropped, so the result never contains
    empty strings.

    Returns:
        The list of non-empty tokens, in their original order.
    """
    tokens = []
    for piece in text.split():
        token = piece.strip(".,!?:;\"'()[]").lower()
        # Filter AFTER trimming: str.split() never yields blank pieces, so the
        # only way to get an empty token is a punctuation-only piece.
        if token:
            tokens.append(token)
    return tokens

# Flatten every synopsis into one token stream and count occurrences.
all_tokens = [tok for row in data for tok in tokenize(row["synopsis"])]
from collections import Counter
freqs = Counter(all_tokens)

# Index 0 is padding and index 1 is the unknown token; the remaining slots go
# to the most frequent tokens, capped at CFG["max_vocab"] entries in total.
itos = ["<pad>", "<unk>"]
itos.extend(word for word, _count in freqs.most_common(CFG["max_vocab"] - 2))
stoi = {word: index for index, word in enumerate(itos)}

def encode(text, max_len=CFG["max_len"]):
    """Convert ``text`` into an int64 array of vocabulary ids.

    Tokens missing from ``stoi`` map to id 1. The sequence is truncated to
    ``max_len`` tokens and right-padded with id 0 up to ``max_len``.
    """
    ids = [stoi.get(tok, 1) for tok in tokenize(text)[:max_len]]
    # A non-positive multiplier yields an empty list, so this is a no-op
    # when the sequence is already long enough.
    ids.extend([0] * (max_len - len(ids)))
    return np.asarray(ids, dtype=np.int64)

# ------------- Categorical Encoders -------------
# Category name -> position in its list; the position is the one-hot index.
genre2ix = {name: position for position, name in enumerate(GENRES)}
mat2ix = {name: position for position, name in enumerate(MATURITY)}

def one_hot(idx, num):
    """Return a float32 vector of length ``num`` that is 1.0 at ``idx`` and 0.0 elsewhere."""
    encoded = np.zeros((num,), dtype=np.float32)
    encoded[idx] = 1.0
    return encoded

# ------------- Dataset / Dataloaders -------------
class ContentDS(Dataset):
    """Map-style dataset over row dicts with synopsis, genre, maturity, duration and label.

    Each item is a 4-tuple of tensors:
    token ids (from ``encode``), tabular features (one-hot genre, one-hot
    maturity, duration / 180 as float32), the label as a float32 scalar, and
    the genre index as a long scalar.
    """

    def __init__(self, rows):
        self.rows = rows

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        row = self.rows[i]
        token_ids = encode(row["synopsis"])
        g_ix = genre2ix[row["genre"]]

        # Tabular vector layout: [genre one-hot | maturity one-hot | scaled duration].
        tab_parts = [
            one_hot(g_ix, len(GENRES)),
            one_hot(mat2ix[row["maturity"]], len(MATURITY)),
            np.array([row["duration"]], dtype=np.float32)/180.0,
        ]
        tab_features = np.concatenate(tab_parts).astype(np.float32)
        label = np.float32(row["label"])

        return (
            torch.from_numpy(token_ids),
            torch.from_numpy(tab_features),
            torch.tensor(label),
            torch.tensor(g_ix, dtype=torch.long),
        )

# Deterministic train/validation split over the full synthetic dataset.
ds = ContentDS(data)
n_train = int(len(ds)*CFG["train_val_split"])
n_val = len(ds)-n_train
train_ds, val_ds = random_split(ds, [n_train, n_val], generator=torch.Generator().manual_seed(42))

# Colab tip: num_workers=0 is most portable in hosted envs
# Page-locked host memory only helps host->GPU copies; on a CPU-only runtime
# requesting it brings no benefit and makes DataLoader emit a warning.
pin = torch.cuda.is_available()
train_loader = DataLoader(train_ds, batch_size=CFG["batch_size"], shuffle=True, num_workers=0, pin_memory=pin)
val_loader   = DataLoader(val_ds,   batch_size=CFG["batch_size"], shuffle=False, num_workers=0, pin_memory=pin)

# ------------- Model: TextEncoder (BiGRU + Additive Attention) -------------
class AdditiveAttention(nn.Module):
    """Additive attention pooling over a sequence of hidden states.

    Scores each time step with ``v^T tanh(W h_t)``, normalises the scores with
    a softmax over the sequence axis, and returns the weighted sum of states.
    """

    def __init__(self, dim, hidden=64):
        super().__init__()
        self.W = nn.Linear(dim, hidden)
        self.v = nn.Linear(hidden, 1, bias=False)

    def forward(self, H, mask=None):
        """Pool ``H`` into one vector per sequence.

        Args:
            H: hidden states whose last axis has size ``dim``; the softmax and
                the weighted sum run over axis 1 (the caller in this file
                passes batch-first GRU outputs).
            mask: optional boolean tensor matching the score shape; ``False``
                positions are excluded from the softmax.

        Returns:
            ``(ctx, w)``: the attention-weighted sum of ``H`` over axis 1 and
            the attention weights.
        """
        scores = self.v(torch.tanh(self.W(H))).squeeze(-1)
        if mask is not None:
            # Use the most negative finite value of the score dtype instead of
            # a hard-coded -1e9: that constant is not representable in float16
            # (max magnitude ~65504), which is what autocast can produce here.
            # A finite fill (rather than -inf) keeps fully-masked rows at a
            # uniform distribution instead of NaN.
            scores = scores.masked_fill(~mask, torch.finfo(scores.dtype).min)
        w = torch.softmax(scores, dim=-1).unsqueeze(-1)
        ctx = (H * w).sum(dim=1)
        return ctx, w.squeeze(-1)

class TextEncoder(nn.Module):
    """Embed token ids, run a bidirectional GRU, and attention-pool the states.

    Token id 0 is treated as padding: its embedding is the padding row and its
    positions are masked out of the attention softmax. The pooled vector has
    ``2 * hidden`` features and passes through LayerNorm and dropout.
    """

    def __init__(self, vocab, embed_dim, hidden, layers, dropout):
        super().__init__()
        bi_width = hidden*2  # forward + backward GRU states concatenated
        self.embed = nn.Embedding(num_embeddings=vocab, embedding_dim=embed_dim, padding_idx=0)
        self.rnn = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            bidirectional=True,
        )
        self.attn = AdditiveAttention(bi_width, hidden)
        self.norm = nn.LayerNorm(bi_width)
        self.drop = nn.Dropout(p=dropout)

    def forward(self, x):
        not_pad = x.ne(0)
        states, _ = self.rnn(self.embed(x))
        pooled, _ = self.attn(states, not_pad)
        return self.drop(self.norm(pooled))

# ------------- Tabular Encoder -------------
class TabularEncoder(nn.Module):
    """Two-layer ReLU MLP over tabular features, followed by LayerNorm.

    Dropout is applied once, between the two linear layers.
    """

    def __init__(self, in_dim, hidden, dropout):
        super().__init__()
        stack = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*stack)
        self.norm = nn.LayerNorm(hidden)

    def forward(self, x):
        features = self.net(x)
        return self.norm(features)

# ------------- Fusion Head -------------
class MultimodalCTR(nn.Module):
    """Late-fusion model producing one engagement logit per example.

    The text branch (``TextEncoder``) and tabular branch (``TabularEncoder``)
    are encoded independently, concatenated along the feature axis, and fed
    to a small MLP head. The output is a raw logit, not a probability.
    """

    def __init__(self, vocab, text_dim, rnn_hidden, rnn_layers, tab_in, tab_hidden, fusion_hidden, dropout):
        super().__init__()
        self.txt = TextEncoder(vocab, text_dim, rnn_hidden, rnn_layers, dropout)
        self.tab = TabularEncoder(tab_in, tab_hidden, dropout)

        # The text branch emits 2 * rnn_hidden features (bidirectional GRU).
        fused_in = rnn_hidden*2 + tab_hidden
        head_layers = [
            nn.Linear(fused_in, fusion_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.LayerNorm(fusion_hidden),
            nn.Linear(fusion_hidden, 1),
        ]
        self.fuse = nn.Sequential(*head_layers)

    def forward(self, x_txt, x_tab):
        fused = torch.cat((self.txt(x_txt), self.tab(x_tab)), dim=1)
        # Drop the trailing singleton so the result has one logit per row.
        return self.fuse(fused).squeeze(1)

# Tabular width: one-hot genre + one-hot maturity + one scaled duration value.
tab_in_dim = len(GENRES) + len(MATURITY) + 1
model = MultimodalCTR(
    vocab=len(itos),
    text_dim=CFG["embed_dim"],
    rnn_hidden=CFG["rnn_hidden"],
    rnn_layers=CFG["rnn_layers"],
    tab_in=tab_in_dim,
    tab_hidden=CFG["tab_hidden"],
    fusion_hidden=CFG["fusion_hidden"],
    dropout=CFG["dropout"],
)
model = model.to(device)

# ------------- Optim, Schedule, Early Stop -------------
opt = torch.optim.AdamW(
    params=model.parameters(),
    lr=CFG["lr"],
    weight_decay=CFG["weight_decay"],
)
# One cosine half-period spans the maximum number of epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, T_max=CFG["epochs"])
# Loss scaling is only enabled when CUDA is available.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
# The model emits raw logits, so the sigmoid lives inside the loss.
bce = nn.BCEWithLogitsLoss()

def brier_score(probs, targets):
    """Return the mean squared difference between probabilities and targets.

    Probabilities are clipped to [1e-6, 1 - 1e-6] before the comparison.
    """
    clipped = np.clip(probs, 1e-6, 1-1e-6)
    squared_error = (clipped - targets)**2
    return float(squared_error.mean())

def ece_score(probs, targets, bins=10):
    """Expected Calibration Error for binary positive-class probabilities.

    Predictions are grouped into ``bins`` equal-width probability bins on
    [0, 1]. For each non-empty bin, the mean predicted probability is
    compared with the observed fraction of positive targets; the absolute
    gaps are averaged, weighted by the share of samples in each bin.

    Args:
        probs: predicted P(y = 1) values in [0, 1].
        targets: binary labels (0/1), same length as ``probs``.
        bins: number of equal-width bins.

    Returns:
        The ECE as a float; 0.0 for empty input.
    """
    probs = np.asarray(probs, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    if probs.size == 0:
        return 0.0

    edges = np.linspace(0, 1, bins+1)
    ece = 0.0
    for i in range(bins):
        lo, hi = edges[i], edges[i+1]
        # Bins are half-open [lo, hi) except the last, which is closed so a
        # prediction of exactly 1.0 is not silently dropped.
        upper_ok = (probs <= hi) if i == bins - 1 else (probs < hi)
        in_bin = (probs >= lo) & upper_ok
        if in_bin.any():
            conf = probs[in_bin].mean()
            # probs are P(y=1), so the matching empirical quantity is the
            # positive rate, not the accuracy of the thresholded prediction
            # (which would penalise well-calibrated low probabilities).
            pos_rate = targets[in_bin].mean()
            ece += in_bin.mean() * abs(conf - pos_rate)
    return float(ece)

# ------------- Train Loop -------------
# Early-stopping bookkeeping: lowest validation loss seen so far, the weights
# that produced it, that epoch's validation predictions, and the number of
# consecutive epochs without improvement.
best_val = float("inf")
best_state = None
best_eval = None  # (probs, targets, gixs) from the best-val_loss epoch
pat = 0

for epoch in range(1, CFG["epochs"]+1):
    model.train()
    running = 0.0
    for x_txt, x_tab, y, _gix in train_loader:
        x_txt = x_txt.to(device); x_tab = x_tab.to(device); y = y.to(device)
        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(x_txt, x_tab)
            loss = bce(logits, y)
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        # loss is a batch mean; weight by batch size for a per-sample average.
        running += loss.item() * x_txt.size(0)
    scheduler.step()

    # ---- Validation ----
    model.eval()
    vloss = 0.0
    all_logits, all_y, all_gix = [], [], []
    with torch.no_grad():
        for x_txt, x_tab, y, gix in val_loader:
            x_txt = x_txt.to(device); x_tab = x_tab.to(device); y = y.to(device)
            logits = model(x_txt, x_tab)
            loss = bce(logits, y)
            vloss += loss.item() * x_txt.size(0)
            all_logits.append(logits.cpu()); all_y.append(y.cpu()); all_gix.append(gix.cpu())
    vloss /= len(val_ds)
    val_logits = torch.cat(all_logits).float()
    logits = val_logits.numpy()
    targets = torch.cat(all_y).numpy()
    gixs = torch.cat(all_gix).numpy()
    # torch.sigmoid is numerically stable for large-magnitude logits, unlike
    # the hand-written 1/(1+exp(-x)), which can overflow in exp().
    probs = torch.sigmoid(val_logits).numpy()
    auc = roc_auc_score(targets, probs)
    acc = accuracy_score(targets, probs>=0.5)
    brier = brier_score(probs, targets)
    ece = ece_score(probs, targets, bins=15)

    print(f"Epoch {epoch:02d} | train_loss={running/len(train_ds):.4f} "
          f"| val_loss={vloss:.4f} | AUC={auc:.3f} | ACC={acc:.3f} | Brier={brier:.3f} | ECE={ece:.3f}")

    # Early stopping on val_loss
    if vloss < best_val - 1e-4:
        best_val = vloss
        best_state = {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}
        # Keep this epoch's predictions so later reporting describes the
        # checkpoint that is actually restored, not the final epoch.
        best_eval = (probs, targets, gixs)
        pat = 0
    else:
        pat += 1
        if pat >= CFG["early_patience"]:
            print("Early stopping.")
            break

# Restore best
if best_state is not None:
    model.load_state_dict({k: v.to(device) for k,v in best_state.items()})
    # Code after this point reads probs/targets/gixs (per-genre AUC, model
    # card); point them at the restored checkpoint's validation predictions.
    probs, targets, gixs = best_eval

# ------------- Per-Genre AUC (slice performance) -------------
# AUC per genre slice of the validation predictions. A slice needs more than
# one sample and both classes present; otherwise it is recorded as NaN.
genre_auc = {}
for gi, g in enumerate(GENRES):
    m = (gixs == gi)
    slice_targets = targets[m]
    scorable = m.sum() > 1 and len(np.unique(slice_targets)) == 2
    genre_auc[g] = roc_auc_score(slice_targets, probs[m]) if scorable else float("nan")

print("\nPer-genre AUC:")
for g, a in genre_auc.items():
    if math.isnan(a):
        print(f"  {g:12s}: n/a")
    else:
        print(f"  {g:12s}: {a:.3f}")

# ------------- Tiny “Quality Judge” (offline stand-in) -------------
# Word sets the stand-in judge treats as positive / negative evidence.
judge_good = set(sentiment_pos) | set(hooks)
judge_bad  = set(sentiment_neg)
def quality_judge(texts: List[str]) -> np.ndarray:
    """Score each text by its share of distinct 'good' versus 'bad' judge words.

    Returns a float32 array with one Laplace-smoothed score per text:
    ``(pos + 1) / (pos + neg + 2)``, where ``pos`` and ``neg`` count the
    distinct tokens found in ``judge_good`` and ``judge_bad``.
    """
    def _score(text):
        distinct = set(tokenize(text))
        pos = len(distinct & judge_good)
        neg = len(distinct & judge_bad)
        return (pos + 1) / (pos + neg + 2)  # Laplace-smooth

    return np.array([_score(text) for text in texts], dtype=np.float32)

# Probe: correlate model probabilities with the stand-in judge on 256 random
# synopses, with the tabular input held at the dataset-average feature vector.
sample_syn = [data[i]["synopsis"] for i in random.sample(range(len(data)), 256)]
encoded_samples = np.stack([encode(s) for s in sample_syn])
sample_enc = torch.tensor(encoded_samples).to(device)

# Same feature layout as the dataset: genre one-hot, maturity one-hot, duration / 180.
tab_rows = [
    np.concatenate([
        one_hot(genre2ix[r["genre"]], len(GENRES)),
        one_hot(mat2ix[r["maturity"]], len(MATURITY)),
        np.array([r["duration"]], dtype=np.float32)/180.0,
    ])
    for r in data
]
avg_tab = torch.tensor(np.mean(tab_rows, axis=0), dtype=torch.float32)
sample_tab = avg_tab.unsqueeze(0).repeat(len(sample_syn),1).to(device)

with torch.no_grad():
    probe_logits = model(sample_enc, sample_tab)
    p_probe = torch.sigmoid(probe_logits).cpu().numpy()
q_probe = quality_judge(sample_syn)
corr = np.corrcoef(p_probe.flatten(), q_probe.flatten())[0,1]
print(f"\nCorrelation(model_prob, quality_judge) ≈ {corr:.3f} (illustrative)")

# ------------- Inference Helper -------------
def predict_example(synopsis: str, genre: str, maturity: str, duration_min: int):
    """Return the model's engagement probability for a single item.

    Args:
        synopsis: free-text synopsis; encoded with ``encode``.
        genre: one of the keys of ``genre2ix``.
        maturity: one of the keys of ``mat2ix``.
        duration_min: runtime in minutes (divided by 180 before use, matching
            the dataset's feature scaling).

    Returns:
        The sigmoid of the model's logit, as a Python float.

    Raises:
        KeyError: if ``genre`` or ``maturity`` is not a known category.
    """
    x_txt = torch.tensor(encode(synopsis)).unsqueeze(0).to(device)
    x_tab = torch.from_numpy(np.concatenate([
        one_hot(genre2ix[genre], len(GENRES)),
        one_hot(mat2ix[maturity], len(MATURITY)),
        np.array([duration_min], dtype=np.float32)/180.0
    ])).unsqueeze(0).to(device)

    # Inference must not apply dropout. Force eval mode for the forward pass
    # and then put the model back in whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prob = torch.sigmoid(model(x_txt, x_tab)).item()
    finally:
        model.train(was_training)
    return prob

demo_prob = predict_example(
    synopsis="character-driven heist story with bingeable pacing and award-winning moments set in a small-town",
    genre="thriller", maturity="PG-13", duration_min=110
)
print(f"\nDemo inference prob (thriller, PG-13, 110m): {demo_prob:.3f}")

# ------------- Export: TorchScript + ONNX -------------
# Both exporters run the model once on example inputs; eval mode keeps
# dropout inactive during that pass.
model.eval()
example_txt = torch.randint(low=0, high=len(itos), size=(1, CFG["max_len"])).to(device)
example_tab = torch.randn(1, tab_in_dim).to(device)
export_inputs = (example_txt, example_tab)

# TorchScript
scripted = torch.jit.trace(model, export_inputs)
os.makedirs("artifacts", exist_ok=True)
script_path = "artifacts/multimodal_ctr_scripted.pt"
scripted.save(script_path)

# ONNX
onnx_path = "artifacts/multimodal_ctr.onnx"
onnx_inputs = ["synopsis_ids", "tabular_features"]
onnx_outputs = ["engagement_logit"]
torch.onnx.export(
    model, export_inputs, onnx_path,
    input_names=onnx_inputs,
    output_names=onnx_outputs,
    # Axis 0 of every input and output is exported as a dynamic batch axis.
    dynamic_axes={tensor_name: {0: "batch"} for tensor_name in onnx_inputs + onnx_outputs},
    opset_version=13,
)
print(f"\nSaved: {script_path} and {onnx_path}")

# ------------- Model Card (light) -------------
def brier_score_np(p, t):  # reuse values computed earlier
    """Return the mean squared error between clipped probabilities ``p`` and labels ``t``.

    ``p`` is clipped to [1e-6, 1 - 1e-6] before the comparison.
    """
    clipped = np.clip(p, 1e-6, 1-1e-6)
    residual = clipped - t
    return float(np.mean(residual**2))

# Validation metrics recorded in the card, computed from the module-level
# probs / targets arrays and the per-genre AUC table.
card_metrics = {
    "val_loss": round(float(best_val), 4),
    "AUC": round(float(roc_auc_score(targets, probs)), 3),
    "ACC": round(float(accuracy_score(targets, probs>=0.5)), 3),
    "Brier": round(float(brier_score_np(probs, targets)), 3),
    "ECE": round(float(ece_score(probs, targets, bins=15)), 3),
    # NaN is stored as None so it serialises as JSON null.
    "PerGenreAUC": {
        genre_name: (None if math.isnan(score) else round(float(score),3))
        for genre_name, score in genre_auc.items()
    },
}
card_notes = [
    "Runs fully offline; synthetic data encodes realistic signals.",
    "AMP enabled, cosine LR, early stopping; calibrated via ECE/Brier.",
    "‘quality_judge’ shows how a judge model could be integrated; swap with a learned judge model in production.",
]

card = {
    "task": "Engagement likelihood from synopsis + metadata",
    "architecture": "BiGRU + AdditiveAttention (text) + MLP (tabular) with fusion",
    "metrics": card_metrics,
    "export": {"torchscript": script_path, "onnx": onnx_path},
    "notes": card_notes,
}
with open("artifacts/model_card.json", "w") as f:
    json.dump(card, f, indent=2)
print("\nModel card written to artifacts/model_card.json")
