In [1]:
# -*- coding: utf-8 -*-
"""
Step 3: Aggregate word-level PCA(50) features into TR-level features (Pie Man).
- Loads Step 1 word table for word timing (start_sec/end_sec)
- Loads Step 2 word PCA features: (W, L, 50)
- Builds TR bins for Pie Man (default: TR=1.5s, N_TR=300, total=450s)
- Duration-weighted aggregation by overlap between word time interval and TR bin
- Saves:
  1) pieman_tr_features_pca50.npy: TR features, shape (N_TR, L, 50)
  2) pieman_tr_layer_features_pca50.csv.gz: long table (N_TR * L rows, 50 dims + metadata)
  3) pieman_tr_summary.csv: per-TR summary (words per TR, coverage per TR)
- Plots:
  - Words per TR line plot
  - Heatmap of TR feature L2 norms (TR x layer) with robust color range

"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# -----------------------------
# Configuration (adjust for your setup)
# -----------------------------
# Directories: Step 1 word table, Step 2 word-level PCA features, Step 3 outputs.
STEP1_DIR = r"E:\Nastase\encoding_features\pieman_step1"
STEP2_DIR = r"E:\Nastase\encoding_features\pieman_step2"
OUT_DIR = r"E:\Nastase\encoding_features\pieman_step3"

# TR grid: N_TR consecutive bins, each TR_SEC seconds long.
N_TR = 300
TR_SEC = 1.5
TOTAL_SEC = N_TR * TR_SEC  # 450 s for Pie Man

# Offset (seconds) subtracted from every word onset/offset before binning.
# Set it to make story onset time 0; for Pie Man the story starts 15 s in,
# after the intro music + silence. The recommended value for now is 0.0,
# which preserves the actual audio timeline.
TIME_ORIGIN_SEC = 0.0

# Resolution (dots per inch) used when saving figures.
FIG_DPI = 220


def compute_overlap(a0: float, a1: float, b0: float, b1: float) -> float:
    """Length of the intersection of [a0, a1] and [b0, b1].

    Returns 0.0 when the intervals are disjoint or merely touch.
    """
    shared = min(a1, b1) - max(a0, b0)
    return shared if shared > 0.0 else 0.0


def _aggregate_words_to_tr(word_pca, start, end, has_timing, tr_starts, tr_ends):
    """Duration-weighted mean of word features inside each TR bin.

    Parameters
    ----------
    word_pca : np.ndarray, shape (W, L, D)
        Word-level features.
    start, end : np.ndarray, shape (W,)
        Word onset/offset in seconds (may contain NaN for untimed words).
    has_timing : np.ndarray of bool, shape (W,)
        True for words with a finite, positive-length interval.
    tr_starts, tr_ends : np.ndarray, shape (N_TR,)
        Bin edges in seconds.

    Returns
    -------
    X_tr : np.ndarray, float32, shape (N_TR, L, D)
        Weighted-mean features; bins with no overlapping word stay all-zero.
    words_per_tr : np.ndarray, int32, shape (N_TR,)
        Number of words overlapping each bin.
    coverage_sec_per_tr : np.ndarray, float32, shape (N_TR,)
        Summed overlap (seconds) of all contributing words per bin.
    """
    n_tr = len(tr_starts)
    _, n_layers, n_dims = word_pca.shape

    X_tr = np.zeros((n_tr, n_layers, n_dims), dtype=np.float32)
    words_per_tr = np.zeros((n_tr,), dtype=np.int32)
    coverage_sec_per_tr = np.zeros((n_tr,), dtype=np.float32)

    # Neutralise untimed words up front so NaN/inf never enter the arithmetic.
    safe_start = np.where(has_timing, start, 0.0)
    safe_end = np.where(has_timing, end, 0.0)

    for t in range(n_tr):
        # Overlap of every word with this bin in one vectorized step:
        # min(word_end, bin_end) - max(word_start, bin_start), floored at 0.
        overlap = np.minimum(safe_end, tr_ends[t]) - np.maximum(safe_start, tr_starts[t])
        weights = np.where(has_timing & (overlap > 0), overlap, 0.0).astype(np.float32)

        idx = np.nonzero(weights > 0)[0]
        if idx.size == 0:
            continue  # silent TR: leave zeros

        wsum = float(weights[idx].sum())
        words_per_tr[t] = int(idx.size)
        coverage_sec_per_tr[t] = wsum

        # Normalised weights -> weighted mean: sum_i wn_i * word_pca_i, per layer.
        wn = (weights[idx] / wsum).astype(np.float32)
        X_tr[t] = np.einsum("i,ild->ld", wn, word_pca[idx]).astype(np.float32)

    return X_tr, words_per_tr, coverage_sec_per_tr


def _build_tr_layer_table(X_tr, tr_starts, tr_ends, words_per_tr, coverage_sec_per_tr):
    """Build the long-form (TR x layer) table: one row per (tr_idx, layer) pair.

    Rows are ordered TR-major (all layers of TR 0, then TR 1, ...). Columns are
    the TR/layer metadata followed by PC01..PC{D}. Feature and coverage values
    are widened to float64 so they are written to CSV at full precision.
    """
    n_tr, n_layers, n_dims = X_tr.shape

    columns = {
        "tr_idx": np.repeat(np.arange(n_tr, dtype=np.int64), n_layers),
        "tr_start_sec": np.repeat(tr_starts, n_layers),
        "tr_end_sec": np.repeat(tr_ends, n_layers),
        "layer": np.tile(np.arange(n_layers, dtype=np.int64), n_tr),
        "n_words": np.repeat(words_per_tr.astype(np.int64), n_layers),
        "coverage_sec": np.repeat(coverage_sec_per_tr.astype(np.float64), n_layers),
    }

    # C-order reshape puts (t, layer) at row t * n_layers + layer, matching the metadata.
    flat = X_tr.reshape(n_tr * n_layers, n_dims).astype(np.float64)
    for k in range(n_dims):
        columns[f"PC{k+1:02d}"] = flat[:, k]

    return pd.DataFrame(columns)


def main():
    """Aggregate word-level PCA features into TR-level features and save outputs.

    Reads the Step 1 word table (timing) and the Step 2 word PCA array,
    computes a duration-weighted mean per TR bin, then writes to OUT_DIR:
    the TR feature array (.npy), a per-TR summary CSV, a long-form
    TR x layer CSV (gzip), and two diagnostic figures.

    Raises
    ------
    KeyError
        If the word table lacks the ``start_sec`` / ``end_sec`` columns.
    ValueError
        If the PCA array is not 3-D, its last dimension is not 50, or its
        word count does not match the word table.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # -----------------------------
    # 1) Load word table (timing)
    # -----------------------------
    word_csv = os.path.join(STEP1_DIR, "pieman_word_table.csv")
    word_df = pd.read_csv(word_csv)

    missing_cols = [c for c in ("start_sec", "end_sec") if c not in word_df.columns]
    if missing_cols:
        raise KeyError(f"{word_csv} is missing required column(s): {missing_cols}")

    # Timing arrays (seconds), shifted by the configured time origin.
    # Unparseable values become NaN and are excluded via has_timing.
    start = pd.to_numeric(word_df["start_sec"], errors="coerce").to_numpy(dtype=float)
    end = pd.to_numeric(word_df["end_sec"], errors="coerce").to_numpy(dtype=float)
    start = start - TIME_ORIGIN_SEC
    end = end - TIME_ORIGIN_SEC

    has_timing = np.isfinite(start) & np.isfinite(end) & (end > start)

    # -----------------------------
    # 2) Load word PCA features (W, L, 50)
    # -----------------------------
    pca_path = os.path.join(STEP2_DIR, "pieman_word_emb_all_layers_pca50.npy")
    word_pca = np.load(pca_path)  # shape (W, L, 50)

    # Explicit raises instead of assert: these checks must survive `python -O`.
    if word_pca.ndim != 3:
        raise ValueError(f"Expected word PCA array of shape (W, L, 50), got {word_pca.shape}")
    W, L, D = word_pca.shape
    if D != 50:
        raise ValueError(f"Expected PCA dim=50, got {D}")
    if len(word_df) != W:
        raise ValueError(f"Word count mismatch: CSV={len(word_df)} vs PCA={W}")

    # -----------------------------
    # 3) Build TR bins
    # -----------------------------
    tr_starts = np.arange(N_TR, dtype=float) * TR_SEC
    tr_ends = tr_starts + TR_SEC

    # -----------------------------
    # 4) Duration-weighted TR aggregation
    # -----------------------------
    X_tr, words_per_tr, coverage_sec_per_tr = _aggregate_words_to_tr(
        word_pca, start, end, has_timing, tr_starts, tr_ends
    )

    # -----------------------------
    # 5) Save TR features
    # -----------------------------
    np.save(os.path.join(OUT_DIR, "pieman_tr_features_pca50.npy"), X_tr)

    # Compact summary CSV, one row per TR
    tr_summary = pd.DataFrame({
        "tr_idx": np.arange(N_TR, dtype=int),
        "tr_start_sec": tr_starts,
        "tr_end_sec": tr_ends,
        "n_words": words_per_tr,
        "coverage_sec": coverage_sec_per_tr,
    })
    tr_summary_path = os.path.join(OUT_DIR, "pieman_tr_summary.csv")
    tr_summary.to_csv(tr_summary_path, index=False, encoding="utf-8-sig")

    # Long-form CSV: N_TR * L rows, each with D feature columns + metadata (gzip)
    tr_layer_df = _build_tr_layer_table(
        X_tr, tr_starts, tr_ends, words_per_tr, coverage_sec_per_tr
    )
    tr_layer_csv_gz = os.path.join(OUT_DIR, "pieman_tr_layer_features_pca50.csv.gz")
    tr_layer_df.to_csv(tr_layer_csv_gz, index=False, compression="gzip")

    # -----------------------------
    # 6) Plot 1: words per TR timecourse
    # -----------------------------
    plt.figure(figsize=(12, 4))
    plt.plot(np.arange(N_TR), words_per_tr)
    plt.xlabel("TR index")
    plt.ylabel("Number of contributing words")
    plt.title("Pie Man: word coverage per TR (duration-weighted overlap)")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "fig_words_per_tr.png"), dpi=FIG_DPI)
    plt.close()

    # -----------------------------
    # 7) Plot 2: heatmap of TR feature norms (TR x layer)
    # -----------------------------
    # L2 norm across PCA dims -> (N_TR, L)
    norms = np.linalg.norm(X_tr, axis=2)

    # Clip the color range to the 1st-99th percentile so outliers don't dominate
    vmin = float(np.quantile(norms, 0.01))
    vmax = float(np.quantile(norms, 0.99))

    plt.figure(figsize=(12, 6))
    plt.imshow(norms.T, aspect="auto", origin="lower", vmin=vmin, vmax=vmax)
    plt.colorbar(label="L2 norm of TR feature (PCA50)")
    plt.xlabel("TR index")
    # Top-layer index follows the loaded array instead of being hard-coded.
    plt.ylabel(f"Layer (0=embeddings, {L - 1}=top)")
    plt.title("TR-level feature strength heatmap (robust scaled)")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "fig_tr_feature_norm_heatmap.png"), dpi=FIG_DPI)
    plt.close()

    # -----------------------------
    # Print a compact summary
    # -----------------------------
    nonempty_tr = int((words_per_tr > 0).sum())
    print("\n[STEP3 DONE]")
    print(f"TR: {TR_SEC}s | N_TR: {N_TR} | Total: {TOTAL_SEC:.1f}s | Time origin shift: {TIME_ORIGIN_SEC}s")
    print(f"Words: {W} | Layers: {L} | PCA dim: {D}")
    print(f"Non-empty TR bins: {nonempty_tr}/{N_TR}")
    print("Saved TR features: pieman_tr_features_pca50.npy")
    print(f"Saved TR summary:  {tr_summary_path}")
    print(f"Saved TRxLayer CSV (gz): {tr_layer_csv_gz}")
    print("Saved figures: fig_words_per_tr.png, fig_tr_feature_norm_heatmap.png")


# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


[STEP3 DONE]
TR: 1.5s | N_TR: 300 | Total: 450.0s | Time origin shift: 0.0s
Words: 957 | Layers: 49 | PCA dim: 50
Non-empty TR bins: 259/300
Saved TR features: pieman_tr_features_pca50.npy
Saved TR summary:  E:\Nastase\encoding_features\pieman_step3\pieman_tr_summary.csv
Saved TRxLayer CSV (gz): E:\Nastase\encoding_features\pieman_step3\pieman_tr_layer_features_pca50.csv.gz
Saved figures: fig_words_per_tr.png, fig_tr_feature_norm_heatmap.png
