In [None]:
# --- Load My Functions ---
# Place functions.py (and GPT_function.py) in the same folder as this notebook so the imports below resolve
import functions
from functions import *

import GPT_function
from GPT_function import *

In [None]:
# Library

import torch
import math
import re
import numpy as np
import pandas as pd
from torch.distributions import Laplace

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForCausalLM
from transformers import GPT2LMHeadModel

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from scipy.stats import vonmises_fisher
import torch.nn.functional as F

from typing import Dict, List, Optional

from openai import OpenAI

from collections import Counter

In [None]:
# OpenAI client used by repair_coherent_gpt4 in this notebook.
# Per the original note, GPT_function has its own key setting that must be changed too.
# The key is read from the OPENAI_API_KEY environment variable so it does not have to
# be written into the notebook; the literal placeholder is only a fallback.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "Your_API_Key"))


In [None]:
# --- Load the Fantasy-SQUAD_10 dataset via datasets.load_dataset ---
# No `split` argument is passed here; the evaluation cells further down call
# load_dataset again with the specific split they need.
squad = load_dataset("Setpember/Fantasy-SQUAD_10")


In [None]:
# --- Base GPT-2: tokenizer first, then the language model ---
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
    # No pad token configured: reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)
model.eval()

# --- Static input-embedding matrix (detached from autograd) ---
embedding_table = model.get_input_embeddings().weight.detach()
# Row-wise L2-normalized copy of the table, kept for similarity search.
norm_embedding_table = torch.nn.functional.normalize(embedding_table, dim=1)

# --- Lightweight distilgpt2 model, moved to the GPU when one is available ---
gpt2_tok = AutoTokenizer.from_pretrained("distilgpt2")
if gpt2_tok.pad_token is None:
    gpt2_tok.pad_token = gpt2_tok.eos_token
gpt2_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
gpt2_model = gpt2_model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

In [None]:
# Heuristic privacy flags at the word level (_privacy_flag_for_word)
# A word is marked private = True if any of these fire:

# 1. Looks like email or URL ([A-Z0-9._%+-]+@..., http(s):// or www.).

# 2. Contains a digit (covers dates, addresses, ages, years, etc.).

# 3. Alphanumeric mix like A12B or user123.

# 4. Is a month name/abbr. (e.g., jan, feb, september, …).

# 5. Ends with common location suffixes (e.g., -ville, -town, -city, -grad, …).

# 6. Proper-noun-ish capitalization or CamelCase chunk.

# 7. Starts with @ or # (handles/hashtags).

# 8. Long ID-ish tokens (≥6) with underscores/hyphens allowed.

# Every subtoken of that word gets the same private value.

In [None]:
# Important? (token-level cosine to the question)

# Question centroid

# 1. Take the (static) GPT-2 embedding for each question token, average them, then L2-normalize the mean → q (this is what get_task_vector does).

# Token–question similarity

# 2. For each context token, take its (static) embedding, L2-normalize, compute cos = dot(token_emb, q).

# 3. Threshold by τ: important = (cos ≥ τ), with τ taken from the notebook-level `tau` variable (default 0.5).

# Note: this is not a contextual encoder; it’s using the GPT-2 embedding table only, so it’s a simple, fast proxy for task relevance.

In [None]:
# STAMP importance threshold.
# Tunable: partition_tokens_2x2 marks a context token as "important" when its cosine
# similarity to the question vector is >= tau. Default 0.5.
tau = 0.50

@torch.no_grad()
def get_task_vector(question: str, tokenizer, embedding_table: torch.Tensor) -> torch.Tensor:
    """Return a unit-length vector summarizing the question.

    The question is tokenized without special tokens, the corresponding rows of
    ``embedding_table`` are averaged, and the mean is L2-normalized. When the
    question yields no tokens, a random unit vector of the embedding width is
    returned instead.
    """
    device = embedding_table.device
    token_ids = tokenizer.encode(question, add_special_tokens=False)

    if len(token_ids) > 0:
        index = torch.tensor(token_ids, dtype=torch.long, device=device)
        centroid = embedding_table[index].mean(dim=0)
    else:
        # Nothing to average: fall back to a random direction.
        centroid = torch.randn(embedding_table.shape[1], device=device)

    return F.normalize(centroid, dim=0)

# ---- Light NER-ish fallback: regex heuristics applied to single subword pieces
_CAP  = re.compile(r"^[A-Z][a-z]{2,}$")                                   # Capitalized word, 3+ letters
_EMAIL= re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")     # e-mail address anywhere in the piece
_NUM  = re.compile(r"^\d{2,}$")                                           # piece made only of 2+ digits


def _clean_piece(p: str) -> str:
    """Remove tokenizer boundary markers from the front of a subword piece.

    Leading ``Ġ`` / ``▁`` characters are stripped, then a single leading ``##``
    prefix is removed. ``removeprefix`` is used instead of ``lstrip("##")``
    because ``lstrip`` treats its argument as a character set and would strip
    every leading ``#`` (turning ``"#Tag"`` into ``"Tag"``).
    """
    return p.lstrip("Ġ▁").removeprefix("##")


def fallback_ner(tokens):
    """Return one boolean per piece: True when the cleaned piece looks private.

    A piece is flagged when it contains an e-mail address, consists only of two
    or more digits, or is a capitalized word of at least three letters.
    """
    cleaned = (_clean_piece(t) for t in tokens)
    return [bool(_EMAIL.search(t) or _NUM.match(t) or _CAP.match(t)) for t in cleaned]

# ==== canonical partitioner ====
@torch.no_grad()
def partition_tokens_2x2(context: str,
                         question: str,
                         tokenizer,
                         embedding_table: torch.Tensor,
                         tau: float = tau,
                         ner_fn=None):
    """
    Split the context tokens into the four privacy × importance groups.

    Importance: cosine similarity between each token's row of ``embedding_table``
    and the question vector from ``get_task_vector`` is compared against ``tau``.
    Privacy: ``ner_fn(pieces)`` if given, otherwise ``fallback_ner(pieces)``.

    Returns: token_ids (List[int]), tokens (List[str]), groups (List[int] in {1,2,3,4})
      1 = High-privacy × High-importance
      2 = High-privacy × Low-importance
      3 = Low-privacy  × High-importance
      4 = Low-privacy  × Low-importance

    Also prints a one-line summary with the per-group token counts.
    """
    token_ids = tokenizer.encode(context, add_special_tokens=False)
    pieces = tokenizer.convert_ids_to_tokens(token_ids)

    # Importance flags: unit-normalized token rows dotted with the unit question vector.
    task_vec = get_task_vector(question, tokenizer, embedding_table)
    index = torch.tensor(token_ids, dtype=torch.long, device=embedding_table.device)
    unit_rows = F.normalize(embedding_table[index], dim=1)
    is_important = ((unit_rows @ task_vec) >= tau).tolist()

    # Privacy flags: caller-supplied tagger, or the regex heuristic by default.
    privacy_fn = ner_fn if ner_fn else fallback_ner
    is_private = privacy_fn(pieces)

    # (private, important) -> group id
    group_of = {
        (True, True): 1,
        (True, False): 2,
        (False, True): 3,
        (False, False): 4,
    }
    groups = [group_of[(bool(priv), bool(imp))] for priv, imp in zip(is_private, is_important)]

    # Per-group summary line
    tally = Counter(groups)
    print(f"τ={tau}  |  tokens={len(groups)}  |  G1={tally.get(1,0)}  G2={tally.get(2,0)}  G3={tally.get(3,0)}  G4={tally.get(4,0)}")

    return token_ids, pieces, groups


# ---- Back-compat alias: code that still calls partition_tokens_paper gets the
#      same function object as partition_tokens_2x2 (keep this binding).
partition_tokens_paper = partition_tokens_2x2

# ---- Helper for apply_stamp callers: returns the group labels AND the token ids
def groups_2x2(context: str,
               question: str,
               tokenizer,
               embedding_table: torch.Tensor,
               **kw) -> tuple[list[int], list[int]]:
    """Run partition_tokens_2x2 and return ``(groups, token_ids)``.

    Extra keyword arguments are forwarded unchanged to partition_tokens_2x2.
    Note the order: groups first, token ids second; the token strings are dropped.
    """
    tok_ids, _, groups = partition_tokens_2x2(context, question, tokenizer, embedding_table, **kw)
    return groups, tok_ids

In [None]:
# Unit test on partition

In [None]:
# ---- Print tokens by 2×2 group (uses current STAMP.ipynb code) ----
def print_tokens_by_group(
    context: str,
    question: str,
    tau: float = None,
    *,
    labels: dict | None = None,
    show_counts: bool = True,
    return_buckets: bool = False,
):
    """
    Prints the BPE tokens grouped into:
      G1: important + private
      G2: !important + private
      G3: important + !private
      G4: !important + !private

    Uses the notebook's partitioning function:
      - partition_tokens_2x2(context, question, tokenizer, embedding_table, tau=...)
        -> (token_ids, pieces, groups)
    Falls back to:
      - groups_2x2(...) -> (groups, token_ids) + tokenizer.convert_ids_to_tokens(...)
    """
    # default to the notebook's global tau if not provided
    if tau is None:
        try:
            _ = tau  # local arg
        except NameError:
            pass
        tau = globals().get("tau", 0.50)

    # Try the main API: partition_tokens_2x2 returns pieces directly
    pieces = None
    try:
        token_ids, pieces, groups = partition_tokens_2x2(
            context=context,
            question=question,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=tau,
        )
    except NameError:
        # Back-compat path if only groups_2x2 is defined
        groups, token_ids = groups_2x2(
            context=context,
            question=question,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=tau,
        )
    # If pieces not provided, derive from ids
    if pieces is None:
        pieces = tokenizer.convert_ids_to_tokens(token_ids)

    # Bucket tokens
    buckets = {1: [], 2: [], 3: [], 4: []}
    for tok, g in zip(pieces, groups):
        buckets[g].append(tok)

    lab = labels or {
        1: "G1 (imp+priv)",
        2: "G2 (!imp+priv)",
        3: "G3 (imp+!priv)",
        4: "G4 (!imp+!priv)",
    }

    for g in (1, 2, 3, 4):
        header = f"\n{lab[g]}"
        if show_counts:
            header += f" — {len(buckets[g])} tokens"
        print(header + ":")
        print(" ".join(buckets[g]) if buckets[g] else "—")

    if return_buckets:
        return buckets


In [None]:
# Sanity check 1: a hand-written context/question pair (prints the four token buckets)
ctx = "Barack Obama was born in Hawaii on August 4, 1961."
q   = "Where was Obama born?"
print_tokens_by_group(ctx, q, tau=0.50)

# Sanity check 2: the first example of the Fantasy-SQUAD_10 validation split
ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:1]")


for ex in ds:
    # Note: this rebinds the notebook-level ctx / q used above.
    ctx = ex["context"]
    q   = ex["question"]

    # print_tokens_by_group runs the partitioner and prints the tokens per group
    print_tokens_by_group(ctx, q, tau=0.50)


In [None]:
# Unit test on Squad

In [2]:
# English as a second language
def repair_coherent_gpt4(
    text: str,
    *,
    answer_span: str | None = None,   # keep this exact text unchanged if provided
    model: str = "gpt-4o-mini",
    temperature: float = 0.2,
    max_tokens: int = 800,
    keep_weird_tokens: bool = True,   # keep nonce tokens (e.g., "thewoodivum") as-is
) -> str:
    """
    Rewrites 'text' into coherent English, preserving meaning and not adding facts.
    If answer_span is provided, it's tagged and MUST remain verbatim.
    """

    # Optionally protect odd tokens so the model doesn’t normalize them away.
    protected = set()
    if keep_weird_tokens:
        # crude heuristic: tokens with digits OR mixed case OR long runs of consonants OR non-ascii
        for tok in set(re.findall(r"[A-Za-z0-9\u00C0-\u024F\u0100-\u017F\-']{3,}", text)):
            if (re.search(r"\d", tok)
                or (re.search(r"[A-Z]", tok) and re.search(r"[a-z]", tok))       # CamelCase-ish
                or re.search(r"[^\x00-\x7F]", tok)                               # non-ascii
                or re.search(r"[bcdfghjklmnpqrstvwxyz]{4,}", tok.lower())):      # consonant run
                protected.add(tok)
    # Tag protected tokens
    tagged = text
    for tok in sorted(protected, key=len, reverse=True):
        tagged = re.sub(rf"\b{re.escape(tok)}\b", f"<ENT>{tok}</ENT>", tagged)

    # Tag answer span (first occurrence) if provided
    if answer_span:
        idx = tagged.find(answer_span)
        if idx != -1:
            tagged = tagged[:idx] + "<ANS>" + answer_span + "</ANS>" + tagged[idx+len(answer_span):]

    sys = (
        "You are a careful editor. Rewrite the passage into coherent, grammatical English, "
        "keeping the original meaning and tone. Do NOT add external facts, "
        # "do NOT invent names, dates, locations, or entities, and do NOT expand abbreviations. "
        "Keep unusual or unknown tokens as-is if wrapped in <ENT>...</ENT>. "
        "If <ANS>...</ANS> appears, keep its contents exactly unchanged and keep it in place. "
        "Preserve paragraphing; only fix grammar/fluency and minimal function words."
    )

    user = f"Rewrite the passage below. Return ONLY the rewritten passage (no commentary).\n\n{tagged}"

    out = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": sys},
                  {"role": "user", "content": user}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    text_out = (out.choices[0].message.content or "").strip()

    # Untag
    text_out = text_out.replace("<ENT>", "").replace("</ENT>", "")
    if answer_span:
        text_out = text_out.replace("<ANS>", "").replace("</ANS>", "")

    return text_out

# Smoke test on garbage input: sends one deliberately noisy sentence through
# repair_coherent_gpt4 (this makes a real API call via `client`) and prints the rewrite.
noisy = ("Beneath a vaulted ceiling of driftwood ribs, theroomivum Marest convened to read thescriptthe.")
clean = repair_coherent_gpt4(noisy, answer_span=None)
print(clean)

NameError: name 're' is not defined

In [None]:
# STAMP (polar) + GPT repair on the first 3 validation examples; results go to a CSV.
results = []

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
# One value per token group (1..4), passed to apply_stamp as eps_dir_by_group.
eps_map = {1:200, 2:100, 3:500, 4:400}

for ex in ds:
    ctx = ex["context"]
    q   = ex["question"]
    answers = ex["answers"]["text"] # list of reference answer strings for this example

    # Use the canonical partitioner: token ids + one group label per token
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,                 # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Element 0 of apply_stamp's return value is used as the privatized context text.
    privatized_ctx = apply_stamp(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,                       # per-group budget map
    )[0]

    # Rewrite the privatized text into fluent English (one API call per example)
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    results.append({
        "question": q,
        "answers": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx,
        "repaired_context": repaired_ctx
    })

# One row per example; written without the index column
df_results = pd.DataFrame(results)
df_results.to_csv("squad_stamp_polar.csv", index=False)

print("Processing complete. Results saved to squad_stamp_polar.csv")

In [None]:
# STAMP (polar) + GPT repair for several importance thresholds; one CSV per threshold.

tau_values = [0.3, 0.4, 0.5, 0.6, 0.7]
eps_map = {1: 200, 2: 100, 3: 500, 4: 400}

# Loaded once: the dataset does not depend on the threshold being swept.
ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")

# The loop variable is deliberately NOT called `tau`: rebinding the notebook-level
# `tau` here would change the default threshold seen by later cells.
for tau_value in tau_values:
    print(f"Processing with tau = {tau_value}")
    results = []

    for ex in ds:
        ctx = ex["context"]
        q = ex["question"]
        answers = ex["answers"]["text"]  # reference answer strings

        # Use the canonical partitioner with the threshold under test
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=tau_value,
        )

        # Element 0 of apply_stamp's return value is used as the privatized context text.
        privatized_ctx = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,
        )[0]

        # Apply repair_coherent_gpt4 (one API call per example)
        repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

        results.append(
            {
                "question": q,
                "answers": answers,
                "original_context": ctx,
                "privatized_context": privatized_ctx,
                "repaired_context": repaired_ctx,
            }
        )

    # Save this threshold's results; the tau value is part of the file name
    df_results = pd.DataFrame(results)
    df_results.to_csv(f"squad_stamp_polar_tau_{tau_value:.1f}.csv", index=False)

    print(f"Processing complete for tau = {tau_value}. Results saved to squad_stamp_polar_tau_{tau_value:.1f}.csv")

print("All processing complete.")

In [None]:
# Apply repair_coherent_gpt4 after apply_stamp for different privacy maps

def generate_eps_maps_with_step(start_map, end_map, step):
    """Generate the list of eps_map dictionaries for a sweep from start_map to end_map.

    Args:
        start_map: first map of the sweep, ``{key: value}``.
        end_map: upper bound per key; its keys drive the stepping.
        step: increment per key; a ``'default'`` entry is used for keys it
            does not list (0 when absent).

    Returns:
        A list of dicts starting with a copy of ``start_map``. Each following
        map adds the step to every finite value, clamped at the end value;
        infinite values are carried through unchanged. ``end_map`` is appended
        if the last generated map differs from it. Returns ``[]`` when some
        start value already exceeds its end value.

    Raises:
        ValueError: if a finite value would be stepped with a negative step, or
            with a positive step toward an infinite end value. Both cases can
            never reach the end map and would otherwise loop forever.
    """
    inf = float('inf')
    default_step = step.get('default', 0)

    maps = []
    current_map = start_map.copy()
    while all(current_map.get(k, inf) <= end_map[k] for k in end_map):
        maps.append(current_map.copy())
        next_map = {}
        for key, end_val in end_map.items():
            current_val = current_map.get(key, inf)
            if current_val == inf:
                next_map[key] = inf
                continue
            increment = step.get(key, default_step)
            if increment < 0 or (increment > 0 and end_val == inf):
                raise ValueError(
                    f"step {increment!r} for key {key!r} can never reach end value {end_val!r}"
                )
            # min() clamps the stepped value at the end value.
            next_map[key] = min(current_val + increment, end_val)
        current_map = next_map
        # Stop once a step changes nothing (everything is at its end value, or step is 0)
        if current_map == maps[-1]:
            break

    # Ensure the end_map is included if it wasn't reached exactly
    if maps and maps[-1] != end_map:
        maps.append(end_map.copy())

    return maps

# Sweep bounds: every group's value moves from its start to its end in steps of 50.
start_eps_map = {1: 150, 2: 50, 3: 450, 4: 350}
end_eps_map = {1: 350, 2: 250, 3: 650, 4: 550}
step_eps_map = {1: 50, 2: 50, 3: 50, 4: 50} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")

for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    # Running totals used to report the average per-token value for this map
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        ctx = ex["context"]
        q = ex["question"]
        answers = ex["answers"]["text"]  # reference answer strings

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Sum of the per-token map values for this example, plus its token count
        sentence_epsilon_sum = sum(eps_map[group] for group in groups)
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len(tok_ids)

        # Polar variant (apply_stamp); element 0 of the result is used as the privatized text
        privatized_ctx_polar = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,  # per-group map for this sweep step
        )[0]

        # Apply repair_coherent_gpt4 (one API call per example)
        repaired_ctx_polar = repair_coherent_gpt4(privatized_ctx_polar, answer_span=None)

        results_polar.append(
            {
                "question": q,
                "answers": answers,
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
                "repaired_context": repaired_ctx_polar,
            }
        )

    # Average map value per token over the whole split (0 when there were no tokens)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Save this sweep step; the average (2 decimals) is part of the file name
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"squad_stamp_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to squad_stamp_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

In [None]:
# STAMP (Laplace) + GPT repair on the first 3 validation examples; results go to a CSV.
results_laplace = []

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
eps_map = {1:200, 2:100, 3:500, 4:400}

# Single source of truth for the threshold: used for partitioning AND for the
# output file name, so the name can no longer drift from the value actually
# applied (it previously read whatever the notebook-level `tau` happened to be).
laplace_tau = 0.50

for ex in ds:
    ctx = ex["context"]
    q   = ex["question"]
    answers = ex["answers"]["text"] # reference answer strings

    # Use the canonical partitioner
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=laplace_tau,          # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Element 0 of apply_stamp_laplace's return value is used as the privatized text.
    privatized_ctx_laplace = apply_stamp_laplace(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,
    )[0]

    # Apply repair_coherent_gpt4 (one API call per example)
    repaired_ctx_laplace = repair_coherent_gpt4(privatized_ctx_laplace, answer_span=None)

    results_laplace.append({
        "question": q,
        "answers": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx_laplace,
        "repaired_context": repaired_ctx_laplace
    })

# Create a pandas DataFrame and save it to CSV
df_results_laplace = pd.DataFrame(results_laplace)
laplace_csv_path = f"squad_stamp_Laplace_tau_{laplace_tau:.1f}.csv"
df_results_laplace.to_csv(laplace_csv_path, index=False)

# Report the path that was actually written.
print(f"Processing complete. Results saved to {laplace_csv_path}")

In [None]:
# Uniform baseline (polar): every token gets the sentence's average budget, then GPT repair.
results = []

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")
# Reference (non-uniform) per-group map; only used to derive the average budget.
eps_map = {1:200, 2:100, 3:500, 4:400}

for ex in ds:
    ctx = ex["context"]
    q   = ex["question"]
    answers = ex["answers"]["text"] # reference answer strings

    # Use the canonical partitioner
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,                 # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Total budget the non-uniform map would spend on this example
    total_sentence_budget = sum(eps_map[group] for group in groups)

    # Number of tokens in this example
    total_tokens = len(tok_ids)

    # Average per-token budget (0 when the example has no tokens)
    average_token_budget = total_sentence_budget / total_tokens if total_tokens > 0 else 0

    print(f"Average token privacy budget for sentence: {average_token_budget:.2f}")

    # Same value for all four groups = uniform allocation
    new_eps_map = {1:average_token_budget, 2:average_token_budget, 3:average_token_budget, 4:average_token_budget}

    # Pass the UNIFORM map. The original cell passed eps_map here, which made
    # this "uniform" run identical to the non-uniform STAMP run.
    privatized_ctx = apply_stamp(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=new_eps_map,
    )[0]

    # Apply repair_coherent_gpt4 (one API call per example)
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    results.append({
        "question": q,
        "answers": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx,
        "repaired_context": repaired_ctx
    })

# Create a pandas DataFrame and save it to CSV
df_results = pd.DataFrame(results)
df_results.to_csv("squad_Uniform_polar.csv", index=False)

print("Processing complete. Results saved to squad_Uniform_polar.csv")

In [3]:
# Apply repair_coherent_gpt4 after apply_stamp for different privacy maps

def generate_eps_maps_with_step(start_map, end_map, step):
    """Generate the list of eps_map dictionaries for a sweep from start_map to end_map.

    Args:
        start_map: first map of the sweep, ``{key: value}``.
        end_map: upper bound per key; its keys drive the stepping.
        step: increment per key; a ``'default'`` entry is used for keys it
            does not list (0 when absent).

    Returns:
        A list of dicts starting with a copy of ``start_map``. Each following
        map adds the step to every finite value, clamped at the end value;
        infinite values are carried through unchanged. ``end_map`` is appended
        if the last generated map differs from it. Returns ``[]`` when some
        start value already exceeds its end value.

    Raises:
        ValueError: if a finite value would be stepped with a negative step, or
            with a positive step toward an infinite end value. Both cases can
            never reach the end map and would otherwise loop forever.
    """
    inf = float('inf')
    default_step = step.get('default', 0)

    maps = []
    current_map = start_map.copy()
    while all(current_map.get(k, inf) <= end_map[k] for k in end_map):
        maps.append(current_map.copy())
        next_map = {}
        for key, end_val in end_map.items():
            current_val = current_map.get(key, inf)
            if current_val == inf:
                next_map[key] = inf
                continue
            increment = step.get(key, default_step)
            if increment < 0 or (increment > 0 and end_val == inf):
                raise ValueError(
                    f"step {increment!r} for key {key!r} can never reach end value {end_val!r}"
                )
            # min() clamps the stepped value at the end value.
            next_map[key] = min(current_val + increment, end_val)
        current_map = next_map
        # Stop once a step changes nothing (everything is at its end value, or step is 0)
        if current_map == maps[-1]:
            break

    # Ensure the end_map is included if it wasn't reached exactly
    if maps and maps[-1] != end_map:
        maps.append(end_map.copy())

    return maps

# Sweep bounds: every group's value moves from its start to its end in steps of 50.
start_eps_map = {1: 150, 2: 50, 3: 450, 4: 350}
end_eps_map = {1: 350, 2: 250, 3: 650, 4: 550}
step_eps_map = {1: 50, 2: 50, 3: 50, 4: 50} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")

for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    # Running totals used to report the average per-token value for this map
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        ctx = ex["context"]
        q = ex["question"]
        answers = ex["answers"]["text"]  # reference answer strings

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Sum of the per-token map values for this example, plus its token count
        sentence_epsilon_sum = sum(eps_map[group] for group in groups)
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len(tok_ids)

        # Per-token average budget for this example (0 when it has no tokens)
        pre_token_average_budget = sentence_epsilon_sum / len(tok_ids) if len(tok_ids) > 0 else 0
        print(f"Pre-token average budget for this sentence: {pre_token_average_budget:.2f}")

        # Uniform allocation: the same average value for all four groups
        uniform_eps_map = {1: pre_token_average_budget, 2: pre_token_average_budget, 3: pre_token_average_budget, 4: pre_token_average_budget}


        # The uniform map (not eps_map) is what gets applied here
        privatized_ctx_polar = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=uniform_eps_map,  # uniform per-example budget
        )[0]

        # Apply repair_coherent_gpt4 (one API call per example)
        repaired_ctx_polar = repair_coherent_gpt4(privatized_ctx_polar, answer_span=None)

        results_polar.append(
            {
                "question": q,
                "answers": answers,
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
                "repaired_context": repaired_ctx_polar,
            }
        )

    # Average map value per token over the whole split (0 when there were no tokens)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Save this sweep step; the average (2 decimals) is part of the file name
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"squad_uniform_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to squad_uniform_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

NameError: name 'load_dataset' is not defined

In [None]:
# Uniform baseline (Laplace): every token gets the sentence's average budget, then GPT repair.
results = []

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")
# Reference (non-uniform) per-group map; only used to derive the average budget.
eps_map = {1:200, 2:100, 3:500, 4:400}

for ex in ds:
    ctx = ex["context"]
    q   = ex["question"]
    answers = ex["answers"]["text"] # reference answer strings

    # Use the canonical partitioner
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,                 # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Total budget the non-uniform map would spend on this example
    total_sentence_budget = sum(eps_map[group] for group in groups)

    # Number of tokens in this example
    total_tokens = len(tok_ids)

    # Average per-token budget (0 when the example has no tokens)
    average_token_budget = total_sentence_budget / total_tokens if total_tokens > 0 else 0

    print(f"Average token privacy budget for sentence: {average_token_budget:.2f}")

    # Same value for all four groups = uniform allocation
    new_eps_map = {1:average_token_budget, 2:average_token_budget, 3:average_token_budget, 4:average_token_budget}

    # Pass the UNIFORM map. The original cell passed eps_map here, which made
    # this "uniform" run identical to the non-uniform Laplace run.
    privatized_ctx = apply_stamp_laplace(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=new_eps_map,
    )[0]

    # Apply repair_coherent_gpt4 (one API call per example)
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    results.append({
        "question": q,
        "answers": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx,
        "repaired_context": repaired_ctx
    })

# Create a pandas DataFrame and save it to CSV
df_results = pd.DataFrame(results)
df_results.to_csv("squad_Uniform_Laplace.csv", index=False)

print("Processing complete. Results saved to squad_Uniform_Laplace.csv")

In [None]:
# Simpler cases

In [None]:
# apply_stamp_allow_inf_polar on the first 3 validation examples:
# groups 1 and 2 get finite values, groups 3 and 4 are set to infinity.

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
eps_map = {1:200.0, 2:100.0, 3:float("inf"), 4:float("inf")}

for ex in ds:
    ctx = ex["context"]
    q   = ex["question"]

    # Use the canonical partitioner
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,                 # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Element 0 of the return value is used as the privatized context text.
    privatized_ctx = apply_stamp_allow_inf_polar(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,                       # per-group map, including the inf entries
    )[0]

    # Apply repair_coherent_gpt4 (one API call per example)
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)


    # Nothing is saved here; the results are only printed for inspection.
    print("Q:", q)
    print("Privatized context (Polar):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [4]:
# apply_stamp_allow_inf_polar sweep: load the data once for all sweep steps

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation") # Using full validation set

def generate_eps_maps_with_step(start_map, end_map, step):
    """Build the list of epsilon maps visited when sweeping from start_map to end_map.

    Starting from ``start_map``, every key of ``end_map`` is advanced by its step
    on each iteration and clamped to its end value. Iteration stops once no
    value changes any more. Keys whose current value is ``inf`` stay ``inf``.

    Args:
        start_map: dict mapping key -> starting value. Keys of ``end_map`` that
            are missing here are treated as ``inf``.
        end_map: dict mapping key -> final value. Only its keys are carried
            into the second and later maps of the result.
        step: dict mapping key -> increment. A key missing here falls back to
            ``step.get('default', 0)``.

    Returns:
        A list of dicts. The first entry is a copy of ``start_map``; if the last
        generated entry differs from ``end_map``, a copy of ``end_map`` is
        appended. The list is empty when some starting value already exceeds
        its end value.

    Raises:
        ValueError: if a key with a finite current value has a negative step,
            or a positive step together with an infinite end value. Such a
            sweep can never reach a fixed point and previously looped forever.
    """
    inf = float('inf')
    maps = []
    current_map = start_map.copy()
    while all(current_map.get(k, inf) <= end_map.get(k, inf) for k in end_map):
        maps.append(current_map.copy())
        next_map = {}
        for key in end_map:
            end_val = end_map[key]
            current_val = current_map.get(key, inf)
            if current_val == inf:
                # Infinite budgets are never stepped.
                next_map[key] = inf
                continue

            increment = step[key] if key in step else step.get('default', 0)
            if increment < 0 or (increment > 0 and end_val == inf):
                raise ValueError(
                    f"Sweep for key {key!r} cannot terminate: "
                    f"current={current_val}, end={end_val}, step={increment}"
                )
            # Advance, but never move past the end value.
            next_map[key] = min(current_val + increment, end_val)
        current_map = next_map
        # Fixed point reached (start == end, zero step, or everything clamped).
        if current_map == maps[-1]:
            break

    # Ensure the end_map is included if it wasn't reached exactly
    if maps and maps[-1] != end_map:
        maps.append(end_map.copy())

    return maps

# Sweep configuration: groups 1 and 2 go from 150 to 550 in steps of 50,
# groups 3 and 4 stay at inf throughout.
start_eps_map = {1: 150, 2: 150, 3: float("inf"), 4: float("inf")}
end_eps_map = {1: 550, 2: 550, 3: float("inf"), 4: float("inf")}
step_eps_map = {1: 50, 2: 50, 3: 0, 4: 0}  # per-key increment

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)


for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")

    # Fresh accumulators for every epsilon map in the sweep.
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        ctx, q = ex["context"], ex["question"]
        answers = ex["answers"]["text"]

        # tau is held fixed for the whole sweep.
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,
        )

        # Bookkeeping for the average epsilon: tokens whose group has an
        # infinite epsilon are left out of both the sum and the count.
        finite_eps_values = [eps_map[group] for group in groups if eps_map[group] != float('inf')]
        total_epsilon_sum += sum(finite_eps_values)
        total_tokens_count += sum(
            1 for _tok, group in zip(tok_ids, groups) if eps_map[group] != float('inf')
        )

        # Polar STAMP on the token ids; only the first returned element is kept.
        token_tensor = torch.tensor(tok_ids, device=embedding_table.device)
        privatized_ctx_polar = apply_stamp_allow_inf_polar(
            token_tensor,
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,
        )[0]

        # Post-process the privatized text with repair_coherent_gpt4.
        repaired_ctx_polar = repair_coherent_gpt4(privatized_ctx_polar, answer_span=None)

        record = {
            "question": q,
            "answers": answers,
            "original_context": ctx,
            "privatized_context": privatized_ctx_polar,
            "repaired_context": repaired_ctx_polar,
        }
        results_polar.append(record)

    # Mean epsilon over the finite-epsilon tokens; 0 when there were none.
    if total_tokens_count > 0:
        average_epsilon_per_token = total_epsilon_sum / total_tokens_count
    else:
        average_epsilon_per_token = 0

    # One CSV per epsilon map, tagged with the average epsilon in its name.
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"squad_stamp_polar_sweep_inf_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to squad_stamp_polar_sweep_inf_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

NameError: name 'load_dataset' is not defined

In [None]:
# apply_stamp_allow_inf_laplace: STAMP with the Laplace mechanism,
# run on the first three validation examples.

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
# Per-group epsilon: groups 1 and 2 get finite budgets, groups 3 and 4 get inf.
eps_map = {1: 200.0, 2: 100.0, 3: float("inf"), 4: float("inf")}

for ex in ds:
    ctx = ex["context"]
    q = ex["question"]

    # Use the canonical partitioner; the middle return value is not needed here.
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,                 # or tune
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # Only the first element of the returned value is used as the privatized text.
    privatized_ctx = apply_stamp_allow_inf_laplace(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,
    )[0]

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the mechanism actually used in this cell (was "Polar").
    print("Privatized context (Laplace):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [None]:
# test # 0: drop ε=0, keep ε=+∞, pass-through otherwise

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")

# Groups 1 and 2 get ε=0; groups 3 and 4 get ε=+∞.
eps_map = {1: 0.0, 2: 0.0, 3: float("inf"), 4: float("inf")}

# Running tally of group assignments over all processed examples.
total = Counter()

for ex in ds:
    ctx, q = ex["context"], ex["question"]

    # 1) Partition once (for reporting)
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.30,
    )
    counts = Counter(groups)  # group tally for this example
    total.update(groups)

    # 2) Apply zero-drop policy; only the first returned element is used.
    privatized_ctx = apply_drop_zero_keep_inf(
        token_ids=torch.tensor(tok_ids, device=embedding_table.device),
        group_assignments=groups,
        tokenizer=tokenizer,
        eps_dir_by_group=eps_map,
    )[0]

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the policy actually applied in this cell (was "Polar").
    print("Privatized context (Zero-drop):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [None]:
# test # 1: only mask out groups 1 & 2 and fill with GPT-2

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")

# Policy used here: G1/G2 → ε=0 (drop + fill), G3/G4 → ε=∞ (keep)
eps_map = {1: 0, 2: 0, 3: float("inf"), 4: float("inf")}

# Running tally of group assignments over all processed examples.
total = Counter()

for ex in ds:
    ctx, q = ex["context"], ex["question"]

    # 1) Partition (for reporting and to align groups with BASE tokenizer)
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,              # BASE tokenizer (same one used by your embeddings)
        embedding_table=embedding_table,
        tau=0.50,
    )
    counts = Counter(groups)  # group tally for this example
    total.update(groups)

    # 2) Apply drop+GPT-2 fill (IDs variant; returns text + ids)
    # NOTE: per the original author, partly copied from the "SD" code.
    privatized_ctx, _ = apply_drop_zero_keep_inf_gpt2fill_ids(
        token_ids=tok_ids,
        groups=groups,
        eps_dir_by_group=eps_map,
        tokenizer=tokenizer,              # BASE tokenizer
        gpt2_tok=gpt2_tok,                # GPT-2 tokenizer
        gpt2_model=gpt2_model,            # GPT-2 model
        deterministic=True,               # set False for sampling
        temperature=0.2,
        top_p=0.9, # or top k=50
        context_window=256,
    )

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the method actually used in this cell (was "Polar").
    print("Privatized context (GPT-2 Mask Fill):", privatized_ctx)
    print("Repaired context:", repaired_ctx)


In [None]:
# test # 2: also mask out group 3 and fill with GPT-2

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")

# Policy used here: G1/G2/G3 → ε=0 (drop + fill), G4 → ε=∞ (keep)
eps_map = {1: 0, 2: 0, 3: 0, 4: float("inf")}

# Running tally of group assignments over all processed examples.
total = Counter()

for ex in ds:
    ctx, q = ex["context"], ex["question"]

    # 1) Partition (for reporting and to align groups with BASE tokenizer)
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,              # BASE tokenizer (same one used by your embeddings)
        embedding_table=embedding_table,
        tau=0.55,
    )
    counts = Counter(groups)  # group tally for this example
    total.update(groups)

    # 2) Apply drop+GPT-2 fill (IDs variant; returns text + ids)
    # NOTE: per the original author, partly copied from the "SD" code.
    privatized_ctx, _ = apply_drop_zero_keep_inf_gpt2fill_ids(
        token_ids=tok_ids,
        groups=groups,
        eps_dir_by_group=eps_map,
        tokenizer=tokenizer,              # BASE tokenizer
        gpt2_tok=gpt2_tok,                # GPT-2 tokenizer
        gpt2_model=gpt2_model,            # GPT-2 model
        deterministic=True,               # set False for sampling
        temperature=0.2,
        top_p=0.9, # or top k=50
        context_window=256,
    )

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the method actually used in this cell (was "Polar").
    print("Privatized context (GPT-2 Mask Fill):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [None]:
# test # 1: only mask out groups 1 & 2 and fill with GPT-4

# Policy used here: G1/G2 → ε=0 (drop + fill), G3/G4 → ε=∞ (keep)
eps_map = {1: 0.0, 2: 0.0, 3: float("inf"), 4: float("inf")}

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
# Running tally of group assignments over all processed examples.
total = Counter()

for ex in ds:
    ctx, q = ex["context"], ex["question"]

    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,           # BASE tokenizer
        embedding_table=embedding_table,
        tau=0.55,
    )
    counts = Counter(groups)  # group tally for this example
    total.update(groups)

    # Drop + fill via the OpenAI client; the second returned value is unused.
    privatized_ctx, _ = apply_drop_zero_keep_inf_gpt4fill_ids(
        token_ids=tok_ids,
        groups=groups,
        eps_dir_by_group=eps_map,
        tokenizer=tokenizer,
        openai_client=client,
        model="gpt-4o-mini",
        temperature=0.2,
        top_p=0.9,
        context_window=256,
        enforce_one_token=True,        # keep exactly 1 base token per drop
        # mask_groups={1,3},           # optional override independent of eps_map
    )

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the method actually used in this cell (was "Polar").
    print("Privatized context (GPT-4 Mask Fill):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [None]:
# test # 2: also mask out group 3 and fill with GPT-4

# Policy used here: G1/G2/G3 → ε=0 (drop + fill), G4 → ε=∞ (keep)
eps_map = {1: 0.0, 2: 0.0, 3: 0.0, 4: float("inf")}

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:3]")
# Running tally of group assignments over all processed examples.
total = Counter()

for ex in ds:
    ctx, q = ex["context"], ex["question"]

    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,           # BASE tokenizer
        embedding_table=embedding_table,
        tau=0.55,
    )
    counts = Counter(groups)  # group tally for this example
    total.update(groups)

    # Drop + fill via the OpenAI client; the second returned value is unused.
    privatized_ctx, _ = apply_drop_zero_keep_inf_gpt4fill_ids(
        token_ids=tok_ids,
        groups=groups,
        eps_dir_by_group=eps_map,
        tokenizer=tokenizer,
        openai_client=client,
        model="gpt-4o-mini",
        temperature=0.2,
        top_p=0.9,
        context_window=256,
        enforce_one_token=True,        # keep exactly 1 base token per drop
        # mask_groups={1,3},           # optional override independent of eps_map
    )

    # Apply repair_coherent_gpt4
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    print("Q:", q)
    # Label matches the method actually used in this cell (was "Polar").
    print("Privatized context (GPT-4 Mask Fill):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

In [None]:
# Full-validation run of test # 2 (groups 1-3 at eps=0, group 4 at eps=inf,
# filled via GPT-4), with the per-example results exported to CSV.

ds = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation")
eps_map = {1: 0.0, 2: 0.0, 3: 0.0, 4: float("inf")}

total = Counter()             # group tally across all examples
results_gpt4_mask_fill = []   # one record per example

# Arguments of the fill call that do not change between examples.
gpt4_fill_kwargs = dict(
    eps_dir_by_group=eps_map,
    tokenizer=tokenizer,
    openai_client=client,
    model="gpt-4o-mini",
    temperature=0.2,
    top_p=0.9,
    context_window=256,
    enforce_one_token=True,   # keep exactly 1 base token per drop
    # mask_groups={1,3},      # optional override independent of eps_map
)

for ex in ds:
    ctx = ex["context"]
    q = ex["question"]
    answers = ex["answers"]["text"]

    # Partition with the BASE tokenizer.
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.55,
    )
    counts = Counter(groups)
    total.update(groups)

    # Drop + fill; the second returned value is not used.
    privatized_ctx, _ = apply_drop_zero_keep_inf_gpt4fill_ids(
        token_ids=tok_ids,
        groups=groups,
        **gpt4_fill_kwargs,
    )

    # Post-process the privatized text with repair_coherent_gpt4.
    repaired_ctx = repair_coherent_gpt4(privatized_ctx, answer_span=None)

    record = {
        "question": q,
        "answers": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx,
        "repaired_context": repaired_ctx
    }
    results_gpt4_mask_fill.append(record)

    print("Q:", q)
    print("Privatized context (GPT-4 Mask Fill):", privatized_ctx)
    print("Repaired context:", repaired_ctx)

# Write all collected records to a single CSV (row index omitted).
df_results_gpt4_mask_fill = pd.DataFrame(results_gpt4_mask_fill)
df_results_gpt4_mask_fill.to_csv("squad_gpt4_mask_fill.csv", index=False)

print("\nProcessing complete. Results saved to squad_gpt4_mask_fill.csv")

In [None]:
import time
import torch

# Timing comparison between apply_stamp ("Polar") and apply_stamp_laplace
# on the first five validation examples.

# Accumulated wall-clock seconds and number of timed calls per mechanism.
total_polar_time = 0
polar_perturb_count = 0
total_laplace_time = 0
laplace_perturb_count = 0

# Define epsilon map (all groups finite here)
eps_map = {1: 200, 2: 100, 3: 500, 4: 400}

# Load the first 5 validation examples explicitly instead of indexing a
# leftover `ds`: earlier cells bind `ds` to a single split, for which
# ds['validation'] does not work.
validation_subset = load_dataset("Setpember/Fantasy-SQUAD_10", split="validation[:5]")

print("Starting timing comparison on 5 examples...")

for ex in validation_subset:
    ctx = ex["context"]
    q = ex["question"]

    # Partition tokens
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50
    )

    # Skip examples for which the partitioner returned no tokens.
    if tok_ids:
        # Measure Polar time. perf_counter is the monotonic, high-resolution
        # clock intended for measuring short intervals.
        start_time = time.perf_counter()
        _ = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,
        )
        end_time = time.perf_counter()
        total_polar_time += (end_time - start_time)
        polar_perturb_count += 1

        # Measure Laplace time
        start_time = time.perf_counter()
        _ = apply_stamp_laplace(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,
        )
        end_time = time.perf_counter()
        total_laplace_time += (end_time - start_time)
        laplace_perturb_count += 1

# Calculate averages (0 when nothing was timed)
avg_polar = total_polar_time / polar_perturb_count if polar_perturb_count > 0 else 0
avg_laplace = total_laplace_time / laplace_perturb_count if laplace_perturb_count > 0 else 0

print(f"\nProcessed {polar_perturb_count} examples.")
print(f"Average Polar Time: {avg_polar:.6f} s")
print(f"Average Laplace Time: {avg_laplace:.6f} s")