In [None]:
# Environment setup: quietly (-q) install everything the notebook imports.
# PyTorch and its companion packages are pulled from the PyTorch CPU wheel index.
# NOTE(review): no versions are pinned, so a fresh run picks up whatever is current.
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# Gradient boosting, SHAP explanations and the general scientific-Python stack.
# NOTE(review): xgboost is installed here but never imported in the cells below.
!pip -q install lightgbm xgboost shap scikit-learn matplotlib numpy pandas tqdm
# Captum provides the IntegratedGradients attribution used in the CNN saliency section.
!pip -q install captum

**Load data (features + raw series)**

In [None]:
import numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit
from typing import Dict, Any, List, Tuple

# Locations of the three parquet inputs read by this notebook.
VET_PARQUET, CAND_PARQUET, FEAT_PARQUET = (
    "/content/vetting_kepler.parquet",
    "/content/candidates_bls.parquet",
    "/content/candidate_features.parquet",
)

# Load every table up front, in the same order as the constants above.
vet_df, cand_df, feat_df = (
    pd.read_parquet(path) for path in (VET_PARQUET, CAND_PARQUET, FEAT_PARQUET)
)

# Echo the three shapes as a quick sanity check on what was loaded.
print(*(table.shape for table in (vet_df, cand_df, feat_df)))

In [None]:
def _to_arr(x):
    if isinstance(x, np.ndarray): return x
    if isinstance(x, list): return np.asarray(x)
    if isinstance(x, pd.Series):
        v = next((v for v in x if isinstance(v, (list, np.ndarray))), None)
        return np.asarray(v) if v is not None else np.asarray(x)
    return np.asarray(x)

def phase_fold(time, period, epoch):
    """Fold timestamps on ``period`` so that ``epoch`` maps to phase 0.

    Parameters
    ----------
    time : array-like
        Timestamps; converted with ``_to_arr`` and cast to float.
    period, epoch : float-convertible
        Folding period and reference time, in the same units as ``time``.

    Returns
    -------
    np.ndarray
        Phase as a fraction of the period; for a positive period the values
        lie in [-0.5, 0.5).
    """
    times = _to_arr(time).astype(float)
    period_f = float(period)
    epoch_f = float(epoch)
    # Shift by half a period before the modulo so the result is centred on zero.
    shifted = times - epoch_f + 0.5 * period_f
    return (shifted % period_f) / period_f - 0.5

def bin_folded(phase, flux, nbins=200, robust=True):
    """Bin a phase-folded series into ``nbins`` equal-width bins over [-0.5, 0.5].

    Parameters
    ----------
    phase, flux : np.ndarray
        Matching 1-D arrays of phase and flux values.
    nbins : int
        Number of output bins.
    robust : bool
        Aggregate each bin with ``np.nanmedian`` when True, ``np.nanmean`` otherwise.

    Returns
    -------
    np.ndarray
        Length-``nbins`` float array. Bins without a value are linearly
        interpolated from the populated ones (edge bins take the nearest
        populated value); if no bin is populated the result is all zeros.
    """
    reducer = np.nanmedian if robust else np.nanmean
    edges = np.linspace(-0.5, 0.5, nbins + 1)
    # Out-of-range phases (including exactly +0.5) are folded into the end bins.
    bin_of = np.clip(np.digitize(phase, edges) - 1, 0, nbins - 1)

    binned = np.full(nbins, np.nan)
    for k in range(nbins):
        hit = bin_of == k
        if hit.any():
            binned[k] = reducer(flux[hit])

    empty = np.isnan(binned)
    if empty.any():
        filled = np.isfinite(binned)
        if filled.any():
            binned[empty] = np.interp(
                np.flatnonzero(empty), np.flatnonzero(filled), binned[filled]
            )
        else:
            binned[:] = 0.0
    return binned

def make_cnn_views(time, flux, period, epoch, duration, nbins_global=200, nbins_local=100, local_half_width_factor=3.0):
    """Build the standardised global and local phase-folded views of a light curve.

    Parameters
    ----------
    time, flux : array-like
        Raw series; samples where either value is non-finite are discarded.
    period, epoch, duration : float
        Ephemeris used for folding and for sizing the local window.
    nbins_global, nbins_local : int
        Number of bins in the global and local views.
    local_half_width_factor : float
        The local window half-width is this factor times ``duration / period``,
        clipped to [1e-3, 0.25] in phase units.

    Returns
    -------
    (np.ndarray, np.ndarray) or (None, None)
        float32 global and local views, each standardised to zero mean and unit
        standard deviation. ``(None, None)`` when fewer than 100 finite samples
        remain.
    """
    def _standardise(view):
        # Constant (or all-NaN) views fall back to a unit divisor.
        center, spread = np.nanmean(view), np.nanstd(view)
        if not spread > 0:
            spread = 1.0
        return (view - center) / spread

    times = _to_arr(time).astype(float)
    fluxes = _to_arr(flux).astype(float)
    finite = np.isfinite(times) & np.isfinite(fluxes)
    times, fluxes = times[finite], fluxes[finite]
    if times.size < 100:
        return None, None

    chrono = np.argsort(times)
    times, fluxes = times[chrono], fluxes[chrono]
    fluxes = fluxes - np.nanmedian(fluxes)

    phases = phase_fold(times, period, epoch)
    global_view = bin_folded(phases, fluxes, nbins=nbins_global, robust=True)

    half = float(np.clip(local_half_width_factor * (duration / period), 1e-3, 0.25))
    window = (phases >= -half) & (phases <= half)
    if not np.any(window):
        # Nothing inside the window: fall back to the samples closest to phase 0.
        window = np.argsort(np.abs(phases))[:max(50, nbins_local)]
    # Rescale the window so that it spans [-0.5, 0.5) before binning.
    local_view = bin_folded(phases[window] / (2 * half), fluxes[window], nbins=nbins_local, robust=True)

    return _standardise(global_view).astype(np.float32), _standardise(local_view).astype(np.float32)

In [None]:
# Index the raw series table by observation block so each candidate row can look up its light curve.
vet_idx = vet_df.set_index("obs_block_id", drop=False)

Xg, Xl, y, groups, ids = [], [], [], [], []
drop_count = 0

for row in tqdm(feat_df.itertuples(index=False), total=len(feat_df), desc="Prep CNN inputs"):
    obid = row.obs_block_id
    if obid in vet_idx.index:
        block = vet_idx.loc[obid]
        # A duplicated obs_block_id yields a DataFrame; keep its first row.
        if isinstance(block, pd.DataFrame):
            block = block.iloc[0]
        g, l = make_cnn_views(
            block["time"], block["flux"], row.period, row.epoch, row.duration,
            nbins_global=200, nbins_local=100, local_half_width_factor=3.0,
        )
    else:
        g = l = None

    # Candidates without a matching block or without usable views are counted and skipped.
    if g is None:
        drop_count += 1
        continue

    Xg.append(g)
    Xl.append(l)
    # Missing labels are treated as class 0.
    label = row.label
    y.append(0 if pd.isna(label) else int(label))
    groups.append(block.get("target_id"))
    ids.append(obid)

Xg, Xl = np.stack(Xg), np.stack(Xl)
y, groups, ids = (np.asarray(seq) for seq in (y, groups, ids))

print("CNN usable:", Xg.shape, Xl.shape, y.shape, "dropped:", drop_count)

**Train/val split**

In [None]:
# Single grouped 80/20 split: every sample sharing a `groups` value lands on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(Xg, y, groups=groups))

Xg_tr, Xl_tr, y_tr = Xg[train_idx], Xl[train_idx], y[train_idx]
Xg_va, Xl_va, y_va = Xg[val_idx], Xl[val_idx], y[val_idx]

# Shapes plus the positive-class rate on each side of the split.
print(Xg_tr.shape, Xg_va.shape, y_tr.mean(), y_va.mean())

**CNN model (PyTorch) + training**

In [None]:
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import math

# Use a CUDA device when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Dual1DCNN(nn.Module):
    """Two-branch 1-D CNN that scores a (global view, local view) pair.

    Each view passes through its own convolutional tower (``self.g`` for the
    global view, ``self.l`` for the local one) ending in adaptive average
    pooling, so each tower emits a 64-dimensional vector. The two vectors are
    concatenated and mapped by ``self.head`` to one logit per sample.

    ``in_g`` and ``in_l`` are accepted but not used by any layer.
    """

    def __init__(self, in_g=200, in_l=100):
        super().__init__()
        # Towers are built in this order so parameter initialisation order is g, l, head.
        self.g = self._conv_tower()
        self.l = self._conv_tower()
        self.head = nn.Sequential(
            nn.Linear(64 + 64, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),  # logits
        )

    @staticmethod
    def _conv_tower():
        """Return one conv tower: 1 -> 16 -> 32 -> (max-pool) -> 64 channels -> global average."""
        return nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )

    def forward(self, xg, xl):
        """Map ``xg`` [B, Lg] and ``xl`` [B, Ll] to a logit tensor of shape [B]."""
        # Add the channel axis ([B, L] -> [B, 1, L]) and drop the pooled length axis afterwards.
        pooled = [
            tower(view.unsqueeze(1)).squeeze(-1)  # [B, 64]
            for tower, view in ((self.g, xg), (self.l, xl))
        ]
        return self.head(torch.cat(pooled, dim=1)).squeeze(-1)

# Seed PyTorch before the model is created and before any batches are drawn, so that
# weight initialisation and DataLoader shuffling repeat on a fresh "Run All".
# 42 matches the random_state used for the train/val split.
torch.manual_seed(42)

tr_ds = TensorDataset(torch.tensor(Xg_tr), torch.tensor(Xl_tr), torch.tensor(y_tr, dtype=torch.float32))
va_ds = TensorDataset(torch.tensor(Xg_va), torch.tensor(Xl_va), torch.tensor(y_va, dtype=torch.float32))
tr_dl = DataLoader(tr_ds, batch_size=128, shuffle=True)
va_dl = DataLoader(va_ds, batch_size=256, shuffle=False)

model = Dual1DCNN().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-4)

# Up-weight positives by the negative/positive ratio of the training labels, never below 1.
# The value is converted to a plain float and the tensor dtype is fixed to float32 so the
# loss does not depend on whether NumPy handed back a float64 scalar.
pos_frac = float(y_tr.mean())
pos_weight = max(1.0, (1.0 - pos_frac) / (pos_frac + 1e-6))
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor([pos_weight], dtype=torch.float32, device=device)
)

def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is expected to be ``(xg, xl, y)``; inputs are moved to the
    module-level ``device`` and logits are turned into probabilities with a
    sigmoid.

    Returns
    -------
    (float, float, np.ndarray)
        ROC AUC, average precision, and the predicted probabilities in loader
        order.
    """
    # Imported locally: these metrics are not imported at notebook level before this cell.
    from sklearn.metrics import roc_auc_score, average_precision_score

    model.eval()
    labels, probs = [], []
    with torch.no_grad():
        for xg, xl, yb in loader:
            logits = model(xg.to(device), xl.to(device))
            probs.append(torch.sigmoid(logits).cpu().numpy())
            labels.append(yb.numpy())

    y_true = np.concatenate(labels)
    y_prob = np.concatenate(probs)
    auc = roc_auc_score(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    return auc, ap, y_prob

# Train for a fixed number of epochs and keep the weights from the epoch with the
# highest validation ROC AUC.
best_auc, best_state = float("-inf"), None
for epoch in range(25):
    model.train()
    for xg, xl, yb in tr_dl:
        xg, xl, yb = xg.to(device), xl.to(device), yb.to(device)
        logit = model(xg, xl)
        loss = criterion(logit, yb)
        opt.zero_grad(); loss.backward(); opt.step()
    auc, ap, yprob_va = evaluate(model, va_dl)
    if auc > best_auc:
        best_auc = auc
        # state_dict() tensors share storage with the live parameters, and .cpu() is a
        # no-op for tensors that already live on the CPU. Without clone() the "snapshot"
        # would keep changing as the optimizer updates the weights in place, so the
        # restored model would really be the last epoch's.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    print(f"Epoch {epoch+1:02d}  val AUC={auc:.4f}  PR-AUC={ap:.4f}")

# Restore the best snapshot and recompute validation probabilities from those weights.
if best_state is not None:
    model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
cnn_val_prob = evaluate(model, va_dl)[2]
print("Best CNN val AUC:", best_auc)

**GBM on tabular features**

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler

# Columns that identify a row or carry the target rather than describe the candidate.
# "label" must be listed here: it is numeric, so it would otherwise be handed to the
# model as a feature (direct target leakage).
non_feature_cols = {
    "obs_block_id", "target_id", "mission", "sector", "quarter", "campaign", "epoch", "label",
}
# Only genuinely numeric (int/float/bool) columns are used; LightGBM rejects
# object-dtype pandas columns.
numeric_cols = [
    c for c in feat_df.columns
    if c not in non_feature_cols and pd.api.types.is_numeric_dtype(feat_df[c])
]

X_full = feat_df[numeric_cols].copy()
y_full = feat_df["label"].fillna(0).astype(int).values
groups_full = feat_df["target_id"].values

# Reuse the CNN split: validation rows are those whose obs_block_id is in the CNN
# validation set, so both models are scored on the same candidates.
ids_full = feat_df["obs_block_id"].values
val_mask = np.isin(ids_full, ids[val_idx])
# Training rows are everything else, minus any remaining row of a validation target
# (e.g. blocks that were dropped while preparing CNN inputs). This keeps the split
# grouped by target for the GBM as well.
train_mask = ~val_mask & ~np.isin(groups_full, groups[val_idx])

X_tr, X_va = X_full[train_mask], X_full[val_mask]
y_tr2, y_va2 = y_full[train_mask], y_full[val_mask]

lgb_tr = lgb.Dataset(X_tr, label=y_tr2)
lgb_va = lgb.Dataset(X_va, label=y_va2, reference=lgb_tr)

params = dict(
    objective="binary",
    metric=["auc","average_precision"],
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=30,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbose=-1
)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train() in LightGBM 4.
gbm = lgb.train(
    params, lgb_tr, num_boost_round=200,
    valid_sets=[lgb_tr, lgb_va], valid_names=["train", "valid"],
    callbacks=[lgb.early_stopping(stopping_rounds=30), lgb.log_evaluation(period=25)],
)

gbm_val_prob = gbm.predict(X_va, num_iteration=gbm.best_iteration)
print("GBM val AUC:", roc_auc_score(y_va2, gbm_val_prob), "PR-AUC:", average_precision_score(y_va2, gbm_val_prob))

**Late-fusion**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# The two probability vectors are paired row by row, so they must cover the same
# candidates in the same order. A membership check alone would miss a reordering.
assert len(cnn_val_prob) == len(gbm_val_prob), "CNN and GBM validation sets differ in size"
assert np.array_equal(ids[val_idx], ids_full[val_mask]), "CNN and GBM validation rows are not aligned"
assert np.array_equal(y_va, y_va2), "CNN and GBM validation labels disagree"
y_val = y_va2

stack_X_tr = np.c_[cnn_val_prob, gbm_val_prob]

# Honest estimate of the fused model: out-of-fold predictions on the validation rows.
# The fold count is capped by the rarest class so every training fold sees both classes.
class_counts = np.unique(y_val, return_counts=True)[1]
n_splits = int(min(5, class_counts.min())) if class_counts.size > 1 else 0
if n_splits >= 2:
    oof_prob = cross_val_predict(
        LogisticRegression(max_iter=1000), stack_X_tr, y_val,
        cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42),
        method="predict_proba",
    )[:, 1]
    print("Fusion out-of-fold val AUC:", roc_auc_score(y_val, oof_prob),
          "PR-AUC:", average_precision_score(y_val, oof_prob))

# Final stacker, fitted on all validation rows. The metrics printed below are
# in-sample (same rows used for fitting) and therefore optimistic.
stacker = LogisticRegression(max_iter=1000)
stacker.fit(stack_X_tr, y_val)
final_val_prob = stacker.predict_proba(stack_X_tr)[:,1]

print("Fusion val AUC:", roc_auc_score(y_val, final_val_prob), "PR-AUC:", average_precision_score(y_val, final_val_prob))
print("Fusion weights [cnn, gbm], bias:", np.r_[stacker.coef_[0], stacker.intercept_])


# Interpretability
**CNN saliency**

In [None]:
import torch
from captum.attr import IntegratedGradients
import matplotlib.pyplot as plt

# Attribution is computed with the network in eval mode.
model.eval()

def forward_local(xl_tensor):
    """Return sigmoid probabilities for a batch of local views.

    The global input is held fixed at the mean training global view, repeated
    once per row of ``xl_tensor``, so only the local view varies.
    """
    mean_global = np.mean(Xg_tr, axis=0)
    xg_template = torch.tensor(mean_global, dtype=torch.float32, device=device)
    xg_template = xg_template.unsqueeze(0).repeat(xl_tensor.size(0), 1)
    return torch.sigmoid(model(xg_template, xl_tensor))

ig = IntegratedGradients(forward_local)

def plot_local_saliency(sample_idx_in_val):
    """Plot one validation local view together with its integrated-gradients attribution.

    The attribution baseline is an all-zero local view. The input is drawn on
    the left axis and the attribution on a twin right axis.
    """
    sample = Xl_va[sample_idx_in_val]
    xl = torch.tensor(sample, dtype=torch.float32, device=device).unsqueeze(0)
    attributions, _delta = ig.attribute(
        xl, baselines=torch.zeros_like(xl), target=None, return_convergence_delta=True
    )
    att = attributions.squeeze(0).detach().cpu().numpy()

    fig, ax_input = plt.subplots(figsize=(10, 3))
    ax_input.plot(sample, label="local input")
    ax_att = ax_input.twinx()
    ax_att.plot(att, alpha=0.7, label="saliency")
    ax_att.set_title(f"Local saliency (val idx {sample_idx_in_val})")
    plt.show()

plot_local_saliency(0)

**GBM SHAP**

In [None]:
import shap
explainer = shap.TreeExplainer(gbm, feature_perturbation="tree_path_dependent")
shap_vals = explainer.shap_values(X_va, check_additivity=False)
expected_value = explainer.expected_value

# Depending on the SHAP version, a binary LightGBM model yields either a single
# (n_samples, n_features) array or one array per class. Normalise to the
# positive-class matrix so that shap_vals[i] is always one sample's attributions.
if isinstance(shap_vals, list):
    shap_vals = shap_vals[-1]
shap_vals = np.asarray(shap_vals)
if shap_vals.ndim == 3:
    shap_vals = shap_vals[..., -1]
# The base value may likewise be a scalar or a per-class array; keep the positive class.
expected_value = float(np.ravel(expected_value)[-1])

# Global view: feature importance across the validation set.
shap.summary_plot(shap_vals, X_va, feature_names=X_full.columns.tolist(), max_display=20)

# Local view: contribution breakdown for a single validation row.
i = 0
shap.plots._waterfall.waterfall_legacy(expected_value, shap_vals[i], feature_names=X_full.columns.tolist(), max_display=20)

In [None]:
import joblib, os, json, torch

# Persist the three fitted models plus a small metadata file describing their inputs.
os.makedirs("/content/models", exist_ok=True)
joblib.dump(gbm, "/content/models/gbm_lgb.pkl")
joblib.dump(stacker, "/content/models/stacker_logreg.pkl")
torch.save(model.state_dict(), "/content/models/cnn_dual.pt")
meta = {
    "cnn_input_shapes": {"global": int(Xg.shape[1]), "local": int(Xl.shape[1])},
    "stack_features": ["cnn_prob", "gbm_prob"]
}
# Write through a context manager so the file is flushed and closed deterministically.
with open("/content/models/meta.json", "w") as fh:
    json.dump(meta, fh)
print("Saved models to /content/models")