In [1]:
# -*- coding: utf-8 -*-
"""
Step 2: Extract GPT2-XL embeddings for ALL layers and apply PCA(50) per layer.
- Loads the Step 1 word table and rebuilds word character spans and the token-word mapping from its `transcript_word` column (timing columns are not read here)
- Re-tokenizes transcript deterministically
- Extracts token embeddings for all layers using sliding window
- Pools token embeddings into word embeddings (mean pooling)
- Fits PCA to 50 dims PER LAYER (recommended for layer-wise analyses)
- Saves outputs (NPY/NPZ/CSV) and plots with unified ranges

All comments are in English as requested.
"""

import os
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA


# -----------------------------
# User config (EDIT THESE)
# -----------------------------
# Input/output locations. main() reads "pieman_word_table.csv" from STEP1_DIR
# and writes every NPY/NPZ/CSV/PNG artifact into OUT_DIR (created if missing).
STEP1_DIR = r"E:\Nastase\encoding_features\pieman_step1"     # contains pieman_word_table.csv from Step 1
OUT_DIR   = r"E:\Nastase\encoding_features\pieman_step2"     # output dir for Step 2
# Checkpoint name passed to both AutoTokenizer.from_pretrained and AutoModel.from_pretrained.
MODEL_NAME = "gpt2-xl"

# Sliding-window extraction: each forward pass ends at the end of a STRIDE-token
# chunk and reaches back at most MAX_CTX_TOKENS tokens; only the hidden states of
# the chunk itself are stored. 1024 is the maximum sequence length the tokenizer
# reports for this model (see the warning in the cell output).
MAX_CTX_TOKENS = 1024
STRIDE = 256
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Number of principal components kept per layer.
# NOTE: output file names and the summary column ("...pca50...", "cum_explained_first50")
# hard-code 50, so they do not follow this value if it is changed.
PCA_DIM = 50

# Plot settings (unified ranges)
# Resolution (dots per inch) used for every saved figure.
FIG_DPI = 220


# -----------------------------
# Helpers (copied/compatible with Step 1)
# -----------------------------
def build_transcript_text(words: List[str]) -> Tuple[str, List[Tuple[int, int]]]:
    """
    Join words with single spaces and record where each word sits in the result.

    Args:
      words: transcript words, in order.

    Returns:
      (text, spans) where text is the space-joined transcript and spans[i] is the
      half-open character range (start, end) of words[i] inside text.
    """
    spans: List[Tuple[int, int]] = []
    offset = 0
    for word in words:
        # Every word except the first is preceded by exactly one separator space.
        if spans:
            offset += 1
        word_end = offset + len(word)
        spans.append((offset, word_end))
        offset = word_end
    return " ".join(words), spans


def map_tokens_to_words(offsets: List[Tuple[int, int]], word_spans: List[Tuple[int, int]]) -> List[List[int]]:
    """
    Assign each token to every word whose character span it overlaps.

    Args:
      offsets: per-token (start, end) character offsets, scanned in order.
      word_spans: per-word (start, end) character spans.

    Returns:
      A list with one entry per word holding the indices of the tokens that
      overlap that word. Zero-length tokens are ignored; a token overlapping
      several words is listed under each of them.
    """
    n_words = len(word_spans)
    mapping: List[List[int]] = [[] for _ in range(n_words)]

    # Index of the first word that has not yet ended before the current token.
    first_open = 0
    for tok_idx, (tok_lo, tok_hi) in enumerate(offsets):
        if tok_hi <= tok_lo:
            continue  # empty token span carries no characters

        # Drop words that finish at or before this token starts.
        while first_open < n_words and word_spans[first_open][1] <= tok_lo:
            first_open += 1
        if first_open == n_words:
            break  # no words left to receive tokens

        # Walk forward over words until one starts at or after the token end.
        for cand in range(first_open, n_words):
            word_lo, word_hi = word_spans[cand]
            if word_lo >= tok_hi:
                break
            if tok_lo < word_hi and word_lo < tok_hi:
                mapping[cand].append(tok_idx)

    return mapping


@torch.no_grad()
def extract_token_hiddenstates_all_layers_sliding_window(
    model: AutoModel,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    max_ctx: int,
    stride: int,
    device: str,
) -> np.ndarray:
    """
    Extract token embeddings for all layers using a sliding window.

    The token sequence is cut into consecutive chunks of `stride` tokens. For each
    chunk the model is run on a window that ends at the chunk end and reaches back
    at most `max_ctx` tokens; only the hidden states of the chunk itself are kept.
    With full windows, the first token of a chunk therefore has `max_ctx - stride`
    preceding tokens in its window and the last one has `max_ctx - 1`.

    Args:
      model: model whose forward call accepts `output_hidden_states=True` and
        returns an object exposing `hidden_states`.
      input_ids: 1-D tensor of token ids, shape (T,).
      attention_mask: 1-D tensor aligned with `input_ids`, shape (T,).
      max_ctx: maximum window length in tokens (must be >= stride).
      stride: number of tokens stored per forward pass (must be > 0).
      device: torch device string the inputs are moved to.

    Returns:
      token_h: (L, T, H) float32
        L = number of hidden-state layers returned by the model (embedding + each block)
        T = number of tokens
        H = hidden size (taken from the returned hidden states)

    Raises:
      ValueError: if `stride` is not positive, `max_ctx` is smaller than `stride`,
        or `input_ids` is empty.

    Note:
      This stores all layers in RAM; for Pie Man (T~1057, H=1600, L=49),
      it's manageable (~330MB float32). Longer stories may need per-layer streaming.
    """
    # Fail fast on window settings that would otherwise yield negative relative
    # offsets (stride > max_ctx) or an empty/invalid iteration (stride <= 0).
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}.")
    if max_ctx < stride:
        raise ValueError(
            f"max_ctx ({max_ctx}) must be >= stride ({stride}); otherwise a window "
            "cannot contain the chunk it is supposed to produce."
        )

    T = int(input_ids.shape[0])
    if T == 0:
        raise ValueError("input_ids is empty; there are no tokens to extract.")

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Allocated from the first real window, so no extra probe forward pass is needed.
    token_h = None

    for tgt_start in tqdm(range(0, T, stride), desc="Extracting ALL-layer token embeddings (sliding window)"):
        tgt_end = min(tgt_start + stride, T)

        # The window always ends where the target chunk ends.
        win_start = max(0, tgt_end - max_ctx)

        out = model(
            input_ids=input_ids[win_start:tgt_end].unsqueeze(0),
            attention_mask=attention_mask[win_start:tgt_end].unsqueeze(0),
            output_hidden_states=True,
        )
        hs = out.hidden_states  # tuple length L, each (1, win_len, H)

        if token_h is None:
            token_h = np.zeros((len(hs), T, int(hs[0].shape[-1])), dtype=np.float32)

        # Position of the target chunk inside the window.
        rel_start = tgt_start - win_start
        rel_end = tgt_end - win_start

        for l, layer_h in enumerate(hs):
            # Cast to float32 before leaving torch so half-precision models work too.
            token_h[l, tgt_start:tgt_end, :] = layer_h[0, rel_start:rel_end, :].float().cpu().numpy()

    return token_h


def plot_layerwise_curves(explained_cum: np.ndarray, out_dir: str) -> None:
    """
    Plot cumulative explained variance of first PCA_DIM components per layer.

    y-axis is fixed to [0, 1] for comparability. The title and the x-axis label
    are derived from PCA_DIM and from the number of layers in `explained_cum`
    so they stay correct if either changes.

    Args:
      explained_cum: 1-D array with one cumulative explained-variance value per layer.
      out_dir: directory (created if missing) that receives
        "fig_pca_explained_variance_layerwise.png".
    """
    os.makedirs(out_dir, exist_ok=True)
    n_layers = len(explained_cum)
    top_layer = max(n_layers - 1, 0)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(np.arange(n_layers), explained_cum)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlabel(f"Layer index (0=embeddings, {top_layer}=top)")
    ax.set_ylabel(f"Cumulative explained variance (first {PCA_DIM} PCs)")
    ax.set_title(f"PCA({PCA_DIM}) cumulative explained variance by layer")
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, "fig_pca_explained_variance_layerwise.png"), dpi=FIG_DPI)
    # Close explicitly so repeated runs in one kernel do not accumulate figures.
    plt.close(fig)


def _pool_tokens_to_words(token_h: np.ndarray, word_to_tokens: List[List[int]]) -> np.ndarray:
    """Mean-pool token hidden states (L, T, H) into word vectors (L, W, H), float32."""
    n_layers, _, hidden = token_h.shape
    word_h = np.zeros((n_layers, len(word_to_tokens), hidden), dtype=np.float32)
    for w_idx, toks in enumerate(word_to_tokens):
        # Mean over this word's tokens, for every layer at once.
        word_h[:, w_idx, :] = token_h[:, np.asarray(toks, dtype=int), :].mean(axis=1)
    return word_h


def _fit_pca_per_layer(
    word_h: np.ndarray, n_components: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Fit an independent PCA on each layer's (W, H) word matrix.

    Returns (word_pca, explained_cum, means, components, explained_ratio) with shapes
    (W, L, D), (L,), (L, H), (L, D, H) and (L, D), all float32.
    """
    n_layers, n_words, hidden = word_h.shape
    word_pca = np.zeros((n_words, n_layers, n_components), dtype=np.float32)
    explained_cum = np.zeros((n_layers,), dtype=np.float32)
    means = np.zeros((n_layers, hidden), dtype=np.float32)
    components = np.zeros((n_layers, n_components, hidden), dtype=np.float32)
    explained_ratio = np.zeros((n_layers, n_components), dtype=np.float32)

    for l in tqdm(range(n_layers), desc="Fitting PCA per layer"):
        pca = PCA(n_components=n_components, svd_solver="auto", random_state=0)
        word_pca[:, l, :] = pca.fit_transform(word_h[l, :, :])  # (W, D)
        means[l, :] = pca.mean_
        components[l, :, :] = pca.components_
        explained_ratio[l, :] = pca.explained_variance_ratio_
        explained_cum[l] = float(pca.explained_variance_ratio_.sum())

    return word_pca, explained_cum, means, components, explained_ratio


def _plot_word_norms_by_layer(word_h: np.ndarray, out_dir: str) -> None:
    """Save a per-layer boxplot of raw word-embedding L2 norms with a shared y-range."""
    norms = np.linalg.norm(word_h, axis=2)  # (L, W)
    n_layers = norms.shape[0]
    # Use a robust max for consistent plotting range
    y_max = float(np.quantile(norms, 0.99))

    fig, ax = plt.subplots(figsize=(10, 5))
    # 0-based positions so the x-axis agrees with the "layer" column of the summary
    # CSV and with the explained-variance figure (0 = embedding output).
    ax.boxplot(
        [norms[l, :] for l in range(n_layers)],
        positions=np.arange(n_layers),
        showfliers=False,
    )
    ax.set_ylim(0.0, y_max)
    ax.set_xlabel("Layer index (0=embeddings)")
    ax.set_ylabel("L2 norm (raw word embeddings)")
    ax.set_title("Distribution of word embedding norms by layer (raw)")
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, "fig_word_emb_norm_layerwise.png"), dpi=FIG_DPI)
    plt.close(fig)


def main():
    """
    Run Step 2 end to end.

    Reads the Step 1 word table from STEP1_DIR, tokenizes the rebuilt transcript
    with MODEL_NAME, extracts hidden states for all layers, mean-pools tokens into
    words, fits PCA(PCA_DIM) per layer, and writes arrays, a CSV summary and two
    figures into OUT_DIR.

    Raises:
      RuntimeError: if any transcript word ends up with no token mapped to it.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # -----------------------------
    # 1) Load Step 1 word table
    # -----------------------------
    step1_csv = os.path.join(STEP1_DIR, "pieman_word_table.csv")
    # pandas' default NA list contains ordinary words ("None", "NA", "null", "nan", ...);
    # with the defaults such transcript words would be read as NaN and then turned
    # into the literal string "nan" by astype(str). Only treat empty cells as missing.
    word_df = pd.read_csv(
        step1_csv,
        dtype={"transcript_word": str},
        keep_default_na=False,
        na_values=[""],
    )

    words = word_df["transcript_word"].astype(str).tolist()
    transcript_text, word_spans = build_transcript_text(words)

    # -----------------------------
    # 2) Tokenize (offset mapping) and map tokens -> words
    # -----------------------------
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    enc = tokenizer(
        transcript_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = enc["input_ids"][0]
    attention_mask = enc["attention_mask"][0]
    offsets = enc["offset_mapping"][0].tolist()

    word_to_tokens = map_tokens_to_words(offsets, word_spans)
    n_tokens = int(input_ids.shape[0])

    # Sanity: ensure no word is unmapped (mean pooling over zero tokens would give NaN)
    unmapped = [i for i, toks in enumerate(word_to_tokens) if len(toks) == 0]
    if unmapped:
        raise RuntimeError(
            f"{len(unmapped)} words have no tokens mapped. This should be 0 for Pie Man. "
            f"First unmapped word indices: {unmapped[:10]}"
        )

    # -----------------------------
    # 3) Load model and extract ALL-layer token hidden states
    # -----------------------------
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.eval()
    model.to(DEVICE)

    token_h = extract_token_hiddenstates_all_layers_sliding_window(
        model=model,
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_ctx=MAX_CTX_TOKENS,
        stride=STRIDE,
        device=DEVICE,
    )
    L, T, H = token_h.shape
    assert T == n_tokens

    # The model is not needed past this point; release it before the PCA stage.
    del model
    if str(DEVICE).startswith("cuda"):
        torch.cuda.empty_cache()

    # -----------------------------
    # 4) Token -> word mean pooling for ALL layers
    # -----------------------------
    W = len(word_to_tokens)
    word_h = _pool_tokens_to_words(token_h, word_to_tokens)
    del token_h  # token-level states are no longer used

    # Save raw all-layer word embeddings (optional but useful)
    np.save(os.path.join(OUT_DIR, "pieman_word_emb_all_layers_raw.npy"), word_h)

    # -----------------------------
    # 5) PCA(50) per layer
    # -----------------------------
    # PCA parameters are kept per layer for reproducibility.
    word_pca, explained_cum, pca_means, pca_components, pca_explained = _fit_pca_per_layer(word_h, PCA_DIM)

    np.save(os.path.join(OUT_DIR, "pieman_word_emb_all_layers_pca50.npy"), word_pca)

    np.savez_compressed(
        os.path.join(OUT_DIR, "pieman_pca_models.npz"),
        mean=pca_means,
        components=pca_components,
        explained_ratio=pca_explained,
    )

    # -----------------------------
    # 6) Save a compact CSV summary
    # -----------------------------
    summary = pd.DataFrame({
        "layer": np.arange(L, dtype=int),
        "cum_explained_first50": explained_cum,
    })
    summary.to_csv(os.path.join(OUT_DIR, "pieman_pca_summary_by_layer.csv"), index=False, encoding="utf-8-sig")

    # -----------------------------
    # 7) Plot unified-range figures
    # -----------------------------
    plot_layerwise_curves(explained_cum, OUT_DIR)

    # Also plot layerwise word embedding norms (raw) with unified y-range
    _plot_word_norms_by_layer(word_h, OUT_DIR)

    print("\n[STEP2 DONE]")
    print(f"Words: {W} | Tokens: {T} | Layers: {L} | Hidden size: {H}")
    print("Saved raw all-layer word embeddings: pieman_word_emb_all_layers_raw.npy")
    print("Saved PCA(50) word embeddings:        pieman_word_emb_all_layers_pca50.npy")
    print("Saved PCA models:                    pieman_pca_models.npz")
    print("Saved PCA summary:                   pieman_pca_summary_by_layer.csv")
    print("Saved figures: fig_pca_explained_variance_layerwise.png, fig_word_emb_norm_layerwise.png")


# Entry point: run the full Step 2 pipeline when this code is executed directly.
# Executing the notebook cell satisfies this guard (the cell output shows main() ran).
if __name__ == "__main__":
    main()

Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 1024). Running this sequence through the model will result in indexing errors
  warn(
Extracting ALL-layer token embeddings (sliding window): 100%|██████████| 5/5 [00:23<00:00,  4.67s/it]
Fitting PCA per layer: 100%|██████████| 49/49 [00:02<00:00, 24.11it/s]



[STEP2 DONE]
Words: 957 | Tokens: 1057 | Layers: 49 | Hidden size: 1600
Saved raw all-layer word embeddings: pieman_word_emb_all_layers_raw.npy
Saved PCA(50) word embeddings:        pieman_word_emb_all_layers_pca50.npy
Saved PCA models:                    pieman_pca_models.npz
Saved PCA summary:                   pieman_pca_summary_by_layer.csv
Saved figures: fig_pca_explained_variance_layerwise.png, fig_word_emb_norm_layerwise.png


Sanity check: reload the saved Step 2 outputs and verify array shapes, explained variance, and embedding norms.

In [2]:
import os
import numpy as np
import pandas as pd

# Directory the Step 2 cell wrote its outputs to (same path as OUT_DIR there).
STEP2_DIR = r"E:\Nastase\encoding_features\pieman_step2"

summary_csv = os.path.join(STEP2_DIR, "pieman_pca_summary_by_layer.csv")
raw_path = os.path.join(STEP2_DIR, "pieman_word_emb_all_layers_raw.npy")
pca_path = os.path.join(STEP2_DIR, "pieman_word_emb_all_layers_pca50.npy")

summary = pd.read_csv(summary_csv)
raw = np.load(raw_path)   # (L, W, H)
pca = np.load(pca_path)   # (W, L, 50) in our code

L, W, H = raw.shape
Wp, Lp, D = pca.shape

print("[SHAPES]")
print("raw:", raw.shape, " (L,W,H)")
print("pca:", pca.shape, " (W,L,50)")
print("summary rows:", len(summary))

# The three artifacts must describe the same words and layers. Note the axis
# order differs on purpose: raw is (L, W, H) while pca is (W, L, D).
assert (Wp, Lp) == (W, L), f"raw has (L={L}, W={W}) but pca has (W={Wp}, L={Lp})"
assert len(summary) == L, f"summary has {len(summary)} rows, expected one per layer ({L})"
assert np.isfinite(pca).all(), "PCA embeddings contain non-finite values"

print("\n[EXPLAINED VAR CHECK]")
print("min:", summary["cum_explained_first50"].min())
print("max:", summary["cum_explained_first50"].max())
print("mean:", summary["cum_explained_first50"].mean())
# A cumulative variance ratio must lie in [0, 1] (small slack for float32 rounding).
assert summary["cum_explained_first50"].between(0.0, 1.0 + 1e-6).all(), \
    "cumulative explained variance outside [0, 1]"

print("\n[NORM CHECK]")
norms = np.linalg.norm(raw, axis=2)  # (L,W)
print("norm p01:", np.quantile(norms, 0.01))
print("norm p50:", np.quantile(norms, 0.50))
print("norm p99:", np.quantile(norms, 0.99))

# Layerwise red-flag scan
bad_layers = []
for l in range(L):
    if not np.isfinite(raw[l]).all():
        bad_layers.append((l, "non-finite"))
        continue
    # Reuse the per-word norms computed above instead of recomputing them per layer.
    layer_norm = norms[l]
    median_norm = np.median(layer_norm)
    # A 99th percentile more than 10x the median marks a layer dominated by a few huge vectors.
    if np.quantile(layer_norm, 0.99) > 10 * median_norm:
        bad_layers.append((l, "heavy-tail"))
    # A median this small means the layer's word vectors are essentially all zero.
    if median_norm < 1e-6:
        bad_layers.append((l, "near-zero"))

print("\n[RED FLAGS]")
print("none" if len(bad_layers)==0 else bad_layers)

[SHAPES]
raw: (49, 957, 1600)  (L,W,H)
pca: (957, 49, 50)  (W,L,50)
summary rows: 49

[EXPLAINED VAR CHECK]
min: 0.534538
max: 0.86465245
mean: 0.7680522302040815

[NORM CHECK]
norm p01: 1.2927607583999634
norm p50: 140.78179931640625
norm p99: 1063.14240234375

[RED FLAGS]
none
