In [1]:
# -*- coding: utf-8 -*-
"""
Step 1: Build word-level table from Gentle align file and extract GPT2-XL embeddings.
- Reads align.csv or align.xlsx (Gentle word-level output)
- Builds a clean word table with onset/offset timing
- Extracts token-level contextual embeddings from GPT2-XL using sliding window (max context 1024)
- Pools token embeddings into word embeddings by mean pooling
- Saves CSV + NPY outputs + sanity-check plots

"""

import os
import re
import math
import json
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# -----------------------------
# User config (EDIT THESE)
# -----------------------------
# Input: Gentle word-level alignment file (.csv, or .xlsx/.xls which is read with pandas' Excel reader).
ALIGN_PATH = r"E:\Nastase\align\gentle\pieman\align.csv"
# Output directory for the CSV, NPY and PNG files; main() creates it if it does not exist.
OUT_DIR = r"E:\Nastase\encoding_features\pieman_step1"
# Model identifier passed to AutoTokenizer.from_pretrained / AutoModel.from_pretrained.
MODEL_NAME = "gpt2-xl"

# Sliding window settings for GPT-2 (max positions 1024)
# MAX_CTX_TOKENS: upper bound on the number of tokens in one forward-pass window.
MAX_CTX_TOKENS = 1024
# STRIDE: number of new tokens whose embeddings are taken from each window.
# Each window ends at the last token of its block, so the first token of a block
# sees at least MAX_CTX_TOKENS - STRIDE tokens of left context (or every preceding
# token when fewer exist). Smaller = more forward passes (slower) but more context.
STRIDE = 256
# Device the model and input tensors are moved to: GPU when available, otherwise CPU.
BATCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Plot settings: number of histogram bins for each sanity-check figure.
# Only the bin COUNT is fixed here; no explicit value range is passed to plt.hist.
DURATION_BINS = 50
TOKENCOUNT_BINS = 30


# -----------------------------
# Helpers
# -----------------------------
def read_gentle_align(path: str) -> pd.DataFrame:
    """
    Read a Gentle word-level alignment table.

    Gentle CSV has 4 columns with no header:
      0 = transcript word
      1 = alignedWord (or <unk>)
      2 = start (sec)
      3 = end (sec)
    Some rows may have missing start/end if the word was not found in audio.

    Args:
        path: Path to the alignment file. Files ending in ``.xlsx``/``.xls``
            are read with ``pd.read_excel``; anything else is read as CSV.

    Returns:
        DataFrame with exactly four columns:
        ``transcript_word`` and ``aligned_word`` as ``str`` (a missing cell
        becomes the empty string), and ``start_sec`` / ``end_sec`` as numeric
        columns where missing or unparsable values are NaN.

    Raises:
        ValueError: If the file has fewer than two columns.
    """
    # Read every cell as text and switch off pandas' default NA detection.
    # With the defaults, genuine transcript words such as "None", "null",
    # "NA" or "nan" are parsed as missing values and then stringified to the
    # literal "nan", silently corrupting the text handed to the tokenizer.
    read_kwargs = dict(header=None, dtype=str, keep_default_na=False)

    ext = os.path.splitext(path)[1].lower()
    if ext in [".xlsx", ".xls"]:
        df = pd.read_excel(path, **read_kwargs)
    else:
        df = pd.read_csv(path, **read_kwargs)

    if df.shape[1] < 2:
        raise ValueError(f"Align file has too few columns: {df.shape[1]}")

    # Ensure exactly 4 columns (pad missing timing columns with NaN).
    # With header=None the existing labels are 0..n-1, so the next free
    # label is the current column count.
    while df.shape[1] < 4:
        df[df.shape[1]] = np.nan
    df = df.iloc[:, :4].copy()
    df.columns = ["transcript_word", "aligned_word", "start_sec", "end_sec"]

    # Normalize text columns; a missing cell becomes "" rather than "nan".
    for col in ("transcript_word", "aligned_word"):
        df[col] = df[col].fillna("").astype(str)

    # Coerce timing to numeric (empty / unparsable cells -> NaN)
    for col in ("start_sec", "end_sec"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df


def build_transcript_text(words: List[str]) -> Tuple[str, List[Tuple[int, int]]]:
    """
    Join the words with single spaces and report where each word sits.

    Returns a pair ``(text, spans)`` where ``text`` is the space-joined
    transcript and ``spans[i]`` is the half-open character range
    ``(start_char, end_char)`` occupied by ``words[i]`` inside ``text``.

    The spans are what lets tokenizer offset mappings be matched back to words.
    """
    text = " ".join(words)

    spans: List[Tuple[int, int]] = []
    pos = 0
    for word in words:
        stop = pos + len(word)
        spans.append((pos, stop))
        # Step over the single separating space that follows this word.
        pos = stop + 1

    return text, spans


def map_tokens_to_words(offsets: List[Tuple[int, int]], word_spans: List[Tuple[int, int]]) -> List[List[int]]:
    """
    For every word, collect the indices of the tokens whose character range
    overlaps that word's character span.

    ``offsets`` is the per-token ``(start_char, end_char)`` list produced by
    tokenizer(..., return_offsets_mapping=True); ``word_spans`` holds the
    per-word ``(start_char, end_char)`` ranges in the same text.
    The result has one list of token indices per word, in token order.
    """
    n_words = len(word_spans)
    assignments: List[List[int]] = [[] for _ in range(n_words)]

    # Index of the first word that has not yet been passed entirely by the
    # tokens seen so far (single forward sweep over both sequences).
    first_open = 0
    for tok_i, tok_span in enumerate(offsets):
        tok_lo, tok_hi = tok_span

        # Zero-width (special/empty) tokens cannot overlap anything.
        if tok_hi <= tok_lo:
            continue

        # Drop words that end at or before this token's start.
        while first_open < n_words and word_spans[first_open][1] <= tok_lo:
            first_open += 1
        if first_open == n_words:
            break

        # A token can straddle several words; scan forward until a word
        # begins at or after the token's end.
        for word_i in range(first_open, n_words):
            word_lo, word_hi = word_spans[word_i]
            if word_lo >= tok_hi:
                break
            if word_lo < tok_hi and tok_lo < word_hi:
                assignments[word_i].append(tok_i)

    return assignments


@torch.no_grad()
def extract_token_embeddings_sliding_window(
    model: AutoModel,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    max_ctx: int = 1024,
    stride: int = 256,
    device: str = "cpu",
) -> np.ndarray:
    """
    Extract token-level contextual embeddings using a sliding window.

    Strategy:
      The sequence is cut into consecutive target blocks of up to ``stride``
      tokens. For each block the model is run on a window of at most
      ``max_ctx`` tokens that ENDS at the block's last token, so the block
      gets as much left context as the window allows. Only the rows of
      ``last_hidden_state`` that belong to the target block are kept.

    Args:
        model: Model whose forward output exposes ``last_hidden_state`` and
            whose config exposes ``hidden_size``. It is called as-is; this
            function does not move it to ``device``.
        input_ids: 1-D tensor of token ids, length T.
        attention_mask: 1-D tensor aligned with ``input_ids``.
        max_ctx: Maximum number of tokens per forward pass. Must be positive.
        stride: Number of target tokens per forward pass. Must be positive;
            values larger than ``max_ctx`` are clamped to ``max_ctx`` because
            a block can never be longer than the window that contains it.
        device: Device the input tensors are moved to.

    Returns:
      token_emb: (T, H) float32, embeddings from the last hidden layer.

    Raises:
        ValueError: If ``max_ctx`` or ``stride`` is not positive. A negative
            stride would otherwise skip the loop and return an all-zero array.
    """
    if max_ctx <= 0:
        raise ValueError(f"max_ctx must be a positive integer, got {max_ctx}")
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    # A target block longer than the window would make the block start fall
    # before the window start (negative relative index), so cap the step.
    step = min(stride, max_ctx)

    T = input_ids.shape[0]
    H = model.config.hidden_size
    token_emb = np.zeros((T, H), dtype=np.float32)

    # Ensure tensors on device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Each iteration fills token_emb for the target block [tgt_start, tgt_end)
    for tgt_start in tqdm(range(0, T, step), desc="Extracting token embeddings (sliding window)"):
        tgt_end = min(tgt_start + step, T)

        # Window end aligns with tgt_end to maximize left context for the target tokens
        win_start = max(0, tgt_end - max_ctx)

        ids_win = input_ids[win_start:tgt_end].unsqueeze(0)          # (1, L)
        mask_win = attention_mask[win_start:tgt_end].unsqueeze(0)    # (1, L)

        out = model(input_ids=ids_win, attention_mask=mask_win, output_hidden_states=False)
        last = out.last_hidden_state[0]  # (L, H)

        # Rows of the window output that correspond to the target block
        rel_start = tgt_start - win_start
        rel_end = tgt_end - win_start
        block = last[rel_start:rel_end, :]

        # Cast on the torch side first: NumPy has no bfloat16, so calling
        # .numpy() on a reduced-precision tensor can fail outright.
        token_emb[tgt_start:tgt_end, :] = block.to(torch.float32).cpu().numpy()

    return token_emb


def plot_histograms(word_df: pd.DataFrame, out_dir: str) -> None:
    """
    Write the two sanity-check histograms for the word table into ``out_dir``:

    - ``fig_word_duration_hist.png``: distribution of ``duration_sec``
      (NaNs dropped; skipped entirely when no duration is available)
    - ``fig_word_token_count_hist.png``: distribution of ``n_tokens``
      (missing counts treated as 0)

    Bin counts come from DURATION_BINS and TOKENCOUNT_BINS.
    ``out_dir`` is created when it does not exist.
    """

    def _save_hist(values, n_bins, title, x_label, file_name):
        # One figure per histogram, closed right after saving so figures
        # do not accumulate in memory.
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.hist(values, bins=n_bins)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel("Count")
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, file_name), dpi=200)
        plt.close(fig)

    os.makedirs(out_dir, exist_ok=True)

    # Word durations: only rows that actually have a duration
    valid_durations = word_df["duration_sec"].dropna().values
    if valid_durations.size > 0:
        _save_hist(
            valid_durations,
            DURATION_BINS,
            "Word duration distribution (sec)",
            "Duration (sec)",
            "fig_word_duration_hist.png",
        )

    # Tokens per word: always drawn
    counts_per_word = word_df["n_tokens"].fillna(0).astype(int).values
    _save_hist(
        counts_per_word,
        TOKENCOUNT_BINS,
        "Token count per word (GPT2 BPE)",
        "Number of tokens",
        "fig_word_token_count_hist.png",
    )


def main():
    """
    Run Step 1 end to end.

    Reads the alignment file at ALIGN_PATH, builds the word table, tokenizes
    the space-joined transcript with the MODEL_NAME tokenizer, maps tokens to
    words by character overlap, extracts last-layer token embeddings with a
    sliding window, mean-pools them per word, and writes into OUT_DIR:

    - pieman_word_table.csv
    - pieman_token_emb_lastlayer.npy  (one row per token)
    - pieman_word_emb_lastlayer.npy   (one row per word; NaN row if no tokens)
    - the two sanity-check histogram PNGs

    Finally prints a short summary.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # --- 1) Alignment table -> word table -------------------------------
    table = read_gentle_align(ALIGN_PATH).copy()
    table.insert(0, "word_id", np.arange(len(table), dtype=int))

    table["is_unk"] = table["aligned_word"].str.strip().eq("<unk>")
    table["has_timing"] = table["start_sec"].notna() & table["end_sec"].notna()
    table["duration_sec"] = table["end_sec"] - table["start_sec"]

    # --- 2) Transcript string + per-word character spans ----------------
    # transcript_word is the canonical spelling (punctuation is kept).
    transcript_text, char_spans = build_transcript_text(table["transcript_word"].tolist())

    # --- 3) Tokenize (fast tokenizer, needed for offset mapping) --------
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    # A single un-padded sequence, so GPT-2's missing pad token is irrelevant.
    encoding = tokenizer(
        transcript_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    ids = encoding["input_ids"][0]                       # (T,)
    mask = encoding["attention_mask"][0]                 # (T,)
    token_offsets = encoding["offset_mapping"][0].tolist()  # per-token (start_char, end_char)
    n_tokens_total = int(ids.shape[0])

    # --- 4) Tokens -> words by character overlap ------------------------
    tokens_per_word = map_tokens_to_words(token_offsets, char_spans)
    table["n_tokens"] = list(map(len, tokens_per_word))
    table["has_tokens"] = table["n_tokens"] > 0

    # --- 5) Model + sliding-window token embeddings (last layer) --------
    gpt2 = AutoModel.from_pretrained(MODEL_NAME)
    gpt2.eval()
    gpt2.to(BATCH_DEVICE)

    token_emb = extract_token_embeddings_sliding_window(
        model=gpt2,
        input_ids=ids,
        attention_mask=mask,
        max_ctx=MAX_CTX_TOKENS,
        stride=STRIDE,
        device=BATCH_DEVICE,
    )  # (T, H)

    # --- 6) Mean-pool token embeddings into word embeddings -------------
    hidden = token_emb.shape[1]
    word_emb = np.full((len(tokens_per_word), hidden), np.nan, dtype=np.float32)
    for row, tok_ids in enumerate(tokens_per_word):
        # Words without any token keep their NaN row.
        if tok_ids:
            word_emb[row] = token_emb[tok_ids].mean(axis=0).astype(np.float32)

    # Quick diagnostic: L2 norm per word (NaN rows count as zero vectors).
    zero_filled = np.nan_to_num(word_emb, nan=0.0)
    table["emb_l2norm_lastlayer"] = np.linalg.norm(zero_filled, axis=1)

    # --- 7) Persist outputs ---------------------------------------------
    csv_path = os.path.join(OUT_DIR, "pieman_word_table.csv")
    table.to_csv(csv_path, index=False, encoding="utf-8-sig")

    arrays_to_save = (
        ("pieman_token_emb_lastlayer.npy", token_emb),
        ("pieman_word_emb_lastlayer.npy", word_emb),
    )
    for file_name, array in arrays_to_save:
        np.save(os.path.join(OUT_DIR, file_name), array)

    # --- 8) Sanity-check figures ----------------------------------------
    plot_histograms(table, OUT_DIR)

    # --- Summary ----------------------------------------------------------
    rows_without_timing = int((~table["has_timing"]).sum())
    words_without_tokens = int((~table["has_tokens"]).sum())
    summary_lines = [
        "\n[STEP1 DONE]",
        f"Words: {len(table)} | Tokens: {n_tokens_total} | Hidden size: {hidden}",
        f"Missing timing rows: {rows_without_timing}",
        f"Words with no tokens mapped: {words_without_tokens}",
        f"Saved CSV: {csv_path}",
        "Saved token embeddings: pieman_token_emb_lastlayer.npy",
        "Saved word embeddings:  pieman_word_emb_lastlayer.npy",
        "Saved figures: fig_word_duration_hist.png, fig_word_token_count_hist.png",
    ]
    for line in summary_lines:
        print(line)


# Entry point: run the whole Step 1 pipeline when this code is executed directly.
# The guard is also true when the code runs as a notebook cell, so executing the
# cell triggers main() immediately (model download, embedding extraction, file writes).
if __name__ == "__main__":
    main()

Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 1024). Running this sequence through the model will result in indexing errors
  warn(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  21%|##1       | 1.35G/6.43G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Extracting token embeddings (sliding window): 100%|██████████| 5/5 [00:22<00:00,  4.59s/it]



[STEP1 DONE]
Words: 957 | Tokens: 1057 | Hidden size: 1600
Missing timing rows: 3
Words with no tokens mapped: 0
Saved CSV: E:\Nastase\encoding_features\pieman_step1\pieman_word_table.csv
Saved token embeddings: pieman_token_emb_lastlayer.npy
Saved word embeddings:  pieman_word_emb_lastlayer.npy
Saved figures: fig_word_duration_hist.png, fig_word_token_count_hist.png


In [4]:
pip install -U "protobuf==3.20.3"

Defaulting to user installation because normal site-packages is not writeable
Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.5
    Uninstalling protobuf-6.33.5:
      Successfully uninstalled protobuf-6.33.5
Successfully installed protobuf-3.20.3
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.39.1 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
streamlit 1.30.0 requires rich<14,>=10.14.0, but you have rich 14.3.3 which is incompatible.
