In [None]:
# --- Load My Functions ---
# Place functions.py (and GPT_function.py) in the notebook's working directory, or elsewhere on sys.path, so the imports below resolve
import functions
from functions import *

import GPT_function
from GPT_function import *

In [None]:
# Library

import torch
import math
import re
import numpy as np
import pandas as pd
from torch.distributions import Laplace

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForCausalLM
from transformers import GPT2LMHeadModel

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from scipy.stats import vonmises_fisher
import torch.nn.functional as F

from typing import Dict, List, Optional

from openai import OpenAI

from collections import Counter

In [None]:
# OpenAI client used by the GPT-based helpers.
# The key is read from the OPENAI_API_KEY environment variable so a real secret
# never has to be pasted into (and saved with) the notebook. The placeholder is
# kept as a fallback so this cell still runs when the variable is unset; API
# calls will fail until a real key is supplied.
# Per the original note, GPT_function has its own key setting -- update it too.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "Your_API_Key"))


In [None]:
# Load only the first 50 examples of the test split of the AG News dataset.
# Later experiment cells reassign `ds` with their own slice before using it.
ds = load_dataset("ag_news", split="test[:50]")

In [None]:
# --- Load tokenizer and GPT-2 model ---
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)
model.eval()

# The tokenizer may come without a pad token; reuse EOS so padding-aware calls work.
# (This was previously done twice in this cell; once is enough.)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Extract embedding table ---
# Static input-embedding matrix, detached from autograd; rows are indexed by token id.
embedding_table = model.get_input_embeddings().weight.detach()
# Row-normalized copy of the embedding table for search.
norm_embedding_table = torch.nn.functional.normalize(embedding_table, dim=1)

# --- Load a light GPT-2 model (distilgpt2), placed on GPU when one is available ---
gpt2_tok = AutoTokenizer.from_pretrained("distilgpt2")
if gpt2_tok.pad_token is None:
    gpt2_tok.pad_token = gpt2_tok.eos_token
gpt2_model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(
    "cuda" if torch.cuda.is_available() else "cpu"
).eval()

In [None]:
# Heuristic privacy flags at the word level (_privacy_flag_for_word)
# A word is marked private = True if any of these fire:

# 1. Looks like email or URL ([A-Z0-9._%+-]+@..., http(s):// or www.).

# 2. Contains a digit (covers dates, addresses, ages, years, etc.).

# 3. Alphanumeric mix like A12B or user123.

# 4. Is a month name/abbr. (e.g., jan, feb, september, …).

# 5. Ends with common location suffixes (e.g., -ville, -town, -city, -grad, …).

# 6. Proper-noun-ish capitalization or CamelCase chunk.

# 7. Starts with @ or # (handles/hashtags).

# 8. Long ID-ish tokens (≥6) with underscores/hyphens allowed.

# Every subtoken of that word gets the same private value.

In [None]:
# Important? (token-level cosine to the question)

# Question centroid

# 1. Take the (static) GPT-2 embedding for each question token, average them, then L2-normalize the mean → q. (get_task_vector does not normalize the individual token embeddings before averaging.)

# Token–question similarity

# 2. For each context token, take its (static) embedding, L2-normalize, compute cos = dot(token_emb, q).

# Threshold by τ, important = (cos ≥ τ), with τ set by --tau (default 0.5).

# Note: this is not a contextual encoder; it’s using the GPT-2 embedding table only, so it’s a simple, fast proxy for task relevance.

In [None]:
# --- STAMP partitioning ---
# Importance threshold: a context token is treated as "important" when its
# cosine similarity to the question vector is >= tau. Tunable; default 0.5.
tau = 0.50

@torch.no_grad()
def get_task_vector(question: str, tokenizer, embedding_table: torch.Tensor) -> torch.Tensor:
    """Return a unit-length direction summarizing the question.

    The question is tokenized without special tokens, the matching rows of
    ``embedding_table`` are averaged, and the mean is L2-normalized.

    Args:
        question: Text of the task/question.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``.
        embedding_table: 2-D tensor indexed by token id along dim 0.

    Returns:
        1-D tensor of length ``embedding_table.shape[1]`` with unit L2 norm.
        If the question yields no tokens, a random unit vector is returned
        instead (so the result is not deterministic in that case).
    """
    device = embedding_table.device
    question_ids = tokenizer.encode(question, add_special_tokens=False)

    if question_ids:
        index = torch.tensor(question_ids, dtype=torch.long, device=device)
        centroid = embedding_table[index].mean(dim=0)
    else:
        # Nothing to average: fall back to a random direction.
        centroid = torch.randn(embedding_table.shape[1], device=device)

    return F.normalize(centroid, dim=0)

# ---- Light NER-ish fallback (unchanged, just kept here)
_CAP  = re.compile(r"^[A-Z][a-z]{2,}$")
_EMAIL= re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
_NUM  = re.compile(r"^\d{2,}$")
def _clean_piece(p: str) -> str:
    return p.lstrip("Ġ▁").lstrip("##")
def fallback_ner(tokens):
    toks = [_clean_piece(t) for t in tokens]
    return [bool(_EMAIL.search(t) or _NUM.match(t) or _CAP.match(t)) for t in toks]

# ==== canonical partitioner ====
@torch.no_grad()
def partition_tokens_2x2(context: str,
                         question: str,
                         tokenizer,
                         embedding_table: torch.Tensor,
                         tau: float = tau,
                         ner_fn=None,
                         verbose: bool = True):
    """
    Split the context's tokens into the 2x2 privacy/importance grid.

    Args:
        context: Text to tokenize and partition.
        question: Text used to build the importance direction (see get_task_vector).
        tokenizer: Tokenizer providing ``encode`` and ``convert_ids_to_tokens``.
        embedding_table: 2-D embedding matrix indexed by token id.
        tau: Cosine threshold; a token is "important" when its similarity to the
            question vector is >= tau. The default is the value the module-level
            ``tau`` had when this function was defined.
        ner_fn: Optional callable mapping the list of pieces to one truthy/falsy
            privacy flag per piece. Defaults to ``fallback_ner``.
        verbose: When True (default, the original behavior) print the per-group
            token counts. Pass False to silence the line inside long sweeps.

    Returns: token_ids (List[int]), tokens (List[str]), groups (List[int] in {1,2,3,4})
      1 = High-privacy × High-importance
      2 = High-privacy × Low-importance
      3 = Low-privacy  × High-importance
      4 = Low-privacy  × Low-importance

    Raises:
        ValueError: if ``ner_fn`` returns a different number of flags than there
            are tokens (previously ``zip`` truncated silently, leaving ``groups``
            shorter than ``token_ids``).
    """
    token_ids = tokenizer.encode(context, add_special_tokens=False)
    pieces = tokenizer.convert_ids_to_tokens(token_ids)

    # Importance via cosine to task vector
    q_vec = get_task_vector(question, tokenizer, embedding_table)     # (d,)
    ids_t = torch.tensor(token_ids, dtype=torch.long, device=embedding_table.device)
    vecs  = F.normalize(embedding_table[ids_t], dim=1)                # [n,d], unit rows
    sims  = (vecs @ q_vec)                                            # [n]
    high_imp = sims.ge(tau).tolist()

    # Privacy via NER-ish heuristic by default
    high_priv = list((ner_fn or fallback_ner)(pieces))
    if len(high_priv) != len(pieces):
        raise ValueError(
            f"ner_fn returned {len(high_priv)} flags for {len(pieces)} tokens"
        )

    # Map (private, important) -> group id 1..4
    groups = [
        (1 if imp else 2) if priv else (3 if imp else 4)
        for priv, imp in zip(high_priv, high_imp)
    ]

    # Counts per group
    if verbose:
        cnt = Counter(groups)
        total = len(groups)
        print(f"τ={tau}  |  tokens={total}  |  G1={cnt.get(1,0)}  G2={cnt.get(2,0)}  G3={cnt.get(3,0)}  G4={cnt.get(4,0)}")

    return token_ids, pieces, groups


# ---- Back-compat alias so existing calls don't break (IMPORTANT):
partition_tokens_paper = partition_tokens_2x2

# ---- Helper that returns the group labels together with the token ids
# (for apply_stamp callers). Note the return order: (groups, token_ids).
def groups_2x2(context: str,
               question: str,
               tokenizer,
               embedding_table: torch.Tensor,
               **kw) -> tuple[list[int], list[int]]:
    """Run partition_tokens_2x2 and return ``(groups, token_ids)``.

    Extra keyword arguments are forwarded unchanged to partition_tokens_2x2.
    The token strings it produces are discarded.
    """
    tok_ids, _, groups = partition_tokens_2x2(context, question, tokenizer, embedding_table, **kw)
    return groups, tok_ids

In [None]:
# Unit test on partition

In [None]:
# ---- Print tokens by 2×2 group (uses current STAMP.ipynb code) ----
def print_tokens_by_group(
    context: str,
    question: str,
    tau: float | None = None,
    *,
    labels: dict | None = None,
    show_counts: bool = True,
    return_buckets: bool = False,
):
    """
    Prints the BPE tokens grouped into:
      G1: important + private
      G2: !important + private
      G3: important + !private
      G4: !important + !private

    Uses the notebook-level ``tokenizer`` and ``embedding_table`` together with:
      - partition_tokens_2x2(context, question, tokenizer, embedding_table, tau=...)
        -> (token_ids, pieces, groups)
    Falls back, only when partition_tokens_2x2 is not defined in the notebook, to:
      - groups_2x2(...) -> (groups, token_ids) + tokenizer.convert_ids_to_tokens(...)

    Args:
        context: Text whose tokens are partitioned.
        question: Question used for the importance score.
        tau: Importance threshold; None means "use the notebook's global ``tau``"
            (0.50 if no such global exists).
        labels: Optional {group_id: header} overrides; groups that are not
            listed keep their default header.
        show_counts: Append the number of tokens to each header.
        return_buckets: When True, return the {group_id: [tokens]} dict.

    Returns:
        The buckets dict if ``return_buckets`` is True, otherwise None.
    """
    # default to the notebook's global tau if not provided
    if tau is None:
        tau = globals().get("tau", 0.50)

    # Pick the API by checking what is defined instead of catching NameError
    # around the call: a NameError raised *inside* the partitioner must surface,
    # not be mistaken for "function missing".
    if "partition_tokens_2x2" in globals():
        token_ids, pieces, groups = partition_tokens_2x2(
            context=context,
            question=question,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=tau,
        )
    else:
        # Back-compat path if only groups_2x2 is defined
        groups, token_ids = groups_2x2(
            context=context,
            question=question,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=tau,
        )
        pieces = tokenizer.convert_ids_to_tokens(token_ids)

    # Bucket tokens
    buckets = {1: [], 2: [], 3: [], 4: []}
    for tok, g in zip(pieces, groups):
        buckets[g].append(tok)

    # Defaults first, then caller overrides, so a partial `labels` dict works.
    lab = {
        1: "G1 (imp+priv)",
        2: "G2 (!imp+priv)",
        3: "G3 (imp+!priv)",
        4: "G4 (!imp+!priv)",
    }
    if labels:
        lab.update(labels)

    for g in (1, 2, 3, 4):
        header = f"\n{lab[g]}"
        if show_counts:
            header += f" — {len(buckets[g])} tokens"
        print(header + ":")
        print(" ".join(buckets[g]) if buckets[g] else "—")

    if return_buckets:
        return buckets


In [None]:
# Generate sweep maps
def generate_eps_maps_with_step(start_map, end_map, step):
    """Generate a list of eps_map dictionaries stepping from start_map to end_map.

    Every key of ``end_map`` is advanced by its step on each iteration and
    clamped at its end value. Keys whose current value is ``inf`` (or that are
    missing from ``start_map``) stay at ``inf``. If the last generated map is
    not equal to ``end_map`` (e.g. a zero step), ``end_map`` is appended.

    Args:
        start_map: {key: starting epsilon}. The first returned map is a copy of it.
        end_map: {key: final epsilon}; its keys define what is swept.
        step: {key: increment}; a ``'default'`` entry is used for keys without
            their own increment (0 if that is missing too).

    Returns:
        List of dicts, in sweep order. Empty if some start value already
        exceeds its end value.

    Raises:
        ValueError: for inputs on which the sweep could never finish -- a
            negative step, or a positive step on a finite start value whose end
            value is ``inf`` (both previously looped forever).
    """
    inf = float("inf")

    def _increment(key):
        # Per-key step, else the shared 'default', else no movement.
        return step[key] if key in step else step.get("default", 0)

    # Reject sweeps that cannot terminate before entering the loop.
    for key, end_val in end_map.items():
        if start_map.get(key, inf) == inf:
            continue  # pinned at inf, never stepped
        inc = _increment(key)
        if inc < 0:
            raise ValueError(f"step for key {key!r} must be non-negative, got {inc}")
        if inc > 0 and end_val == inf:
            raise ValueError(
                f"key {key!r} has a finite start and a positive step but an infinite end"
            )

    maps = []
    current_map = start_map.copy()
    while all(current_map.get(k, inf) <= end_map[k] for k in end_map):
        maps.append(current_map.copy())
        next_map = {}
        for key, end_val in end_map.items():
            current_val = current_map.get(key, inf)
            if current_val == inf:
                next_map[key] = inf
            else:
                # Advance and clamp so the final map lands exactly on end_map.
                next_map[key] = min(current_val + _increment(key), end_val)
        current_map = next_map
        # Break if no values were incremented (handles cases where start == end or step is 0)
        if current_map == maps[-1]:
            break

    # Ensure the end_map is included if it wasn't reached exactly
    if maps and maps[-1] != end_map:
        maps.append(end_map.copy())

    return maps


In [None]:
# Let's try an example
ctx = "Barack Obama was born in Hawaii on August 4, 1961."
q   = "Where was Obama born?"
# Note: the importance groups printed by print_tokens_by_group are always
# relative to the question that is passed in.
print_tokens_by_group(ctx, q, tau=0.50)

ds = load_dataset("ag_news", split="test[:1]")
for ex in ds:
    # Article text is stored under the 'text' key
    ctx = ex["text"]
    # Value of the dataset's 'label' field. The variable keeps its historical
    # name `rating`; it is a news-category label, not a star rating.
    rating = ex["label"]

    print(f"\nArticle Text: {ctx}")
    print(f"Label: {rating}")

    # AG News has no per-example question, so one fixed classification prompt
    # stands in as the "question". Importance is therefore measured against
    # this generic prompt and may be less meaningful than for a real QA pair.
    q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"


    # Use the canonical partitioner
    print_tokens_by_group(ctx, q, tau=0.50)

In [None]:
# Smoke test on a single AG News example

In [None]:
# Apply apply_stamp to a single AG News example with a fixed per-group budget
results_polar = []

ds = load_dataset("ag_news", split="test[:1]")
eps_map = {1:200, 2:100, 3:500, 4:400}

for ex in ds:
    ctx = ex["text"]
    q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

    answers = ex["label"] # Dataset label, stored below under the "rating" key

    # Use the canonical partitioner
    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=tau,                  # same tau that is written into the file name below
        # ner_fn=your_spacy_ner   # optional: plug a heavier NER here
    )

    # apply_stamp's first return element is kept as the privatized context
    privatized_ctx_polar = apply_stamp(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,                       # <-- actually used
    )[0]

    results_polar.append({
        "question": q,
        "rating": answers,
        "original_context": ctx,
        "privatized_context": privatized_ctx_polar,
    })

# Create a pandas DataFrame and save it to CSV
df_results_polar = pd.DataFrame(results_polar)
out_path = f"AGnews_stamp_polar_tau_{tau:.1f}.csv"
df_results_polar.to_csv(out_path, index=False)

# Report the file that was actually written (the old message named a SQuAD file).
print(f"Processing complete. Results saved to {out_path}")

In [None]:
# Apply apply_stamp with per-group budgets over a sweep of privacy maps.
# One CSV is written per eps_map; the file name carries the corpus-level
# average epsilon per token for that map.

start_eps_map = {1: 150, 2: 50, 3: 450, 4: 350}
end_eps_map = {1: 350, 2: 250, 3: 650, 4: 550}
step_eps_map = {1: 50, 2: 50, 3: 50, 4: 50} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)

ds = load_dataset("ag_news", split="test[:50]")

for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text is stored under the 'text' key
        ctx = ex["text"]
        # AG News has no per-example question, so a fixed classification prompt is used
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        # Value of the dataset's 'label' field (the variable keeps the legacy name `rating`)
        rating = ex["label"]

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Accumulate the budget assigned to every token (by its group) and the token count
        sentence_epsilon_sum = sum(eps_map[group] for group in groups)
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len(tok_ids)


        # The first element returned by apply_stamp is kept as the privatized context
        privatized_ctx_polar = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,  # <-- actually used
        )[0]

        # No GPT repair step is applied in this sweep.

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # Calculate average epsilon per token for this eps_map (0 when no tokens were seen)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Create a pandas DataFrame and save it to CSV with average epsilon in the filename
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"AGnews_stamp_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to AGnews_stamp_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

In [None]:
# Special case: eps_map = {1: 100, 2: 1, 3: 350, 4: 250}
# NOTE(review): the original heading listed 2: 0, but the code uses 2: 1 -- confirm which is intended.

eps_map = {1: 100, 2: 1, 3: 350, 4: 250}

ds = load_dataset("ag_news", split="test[:50]")

print(f"Processing with eps_map = {eps_map}")
results_polar = []
total_epsilon_sum = 0
total_tokens_count = 0

for ex in ds:
    ctx = ex["text"]
    # Fixed classification prompt used as the "question" for every article
    q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

    # Value of the dataset's 'label' field (legacy variable name `rating`)
    rating = ex["label"]

    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,
    )

    # Accumulate the per-token budgets (by group) and the token count
    sentence_epsilon_sum = sum(eps_map[group] for group in groups)
    total_epsilon_sum += sentence_epsilon_sum
    total_tokens_count += len(tok_ids)

    # The first element returned by apply_stamp is kept as the privatized context
    privatized_ctx_polar = apply_stamp(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=eps_map,
    )[0]

    results_polar.append(
        {
            "question": q,
            "rating": rating,
            "original_context": ctx,
            "privatized_context": privatized_ctx_polar,
        }
    )

# Corpus-level average epsilon per token (0 when no tokens were seen); used in the file name
average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

df_results_polar = pd.DataFrame(results_polar)
df_results_polar.to_csv(f"AGnews_stamp_polar_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

print(f"Processing complete for eps_map. Results saved to AGnews_stamp_polar_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

In [2]:
# Uniform-budget baseline for the same sweep of privacy maps: for every article
# the per-group budgets are replaced by that article's average budget per token,
# and apply_stamp is run with this uniform map.

start_eps_map = {1: 150, 2: 50, 3: 450, 4: 350}
end_eps_map = {1: 350, 2: 250, 3: 650, 4: 550}
step_eps_map = {1: 50, 2: 50, 3: 50, 4: 50} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)

ds = load_dataset("ag_news", split="test[:50]")

for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text and the fixed classification prompt used as the "question"
        ctx = ex["text"]
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        rating = ex["label"]  # dataset label (legacy variable name)

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Calculate total epsilon and token count for average calculation
        sentence_epsilon_sum = sum(eps_map[group] for group in groups)
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len(tok_ids)

        # Per-token average budget of this article (the log line used to read
        # "Pre-token"; the variable keeps its original name).
        pre_token_average_budget = sentence_epsilon_sum / len(tok_ids) if len(tok_ids) > 0 else 0
        print(f"Per-token average budget for this sentence: {pre_token_average_budget:.2f}")

        # Same budget for every group. The group structure is deliberately
        # ignored here; use eps_map instead if per-group budgets are wanted.
        uniform_eps_map = {1: pre_token_average_budget, 2: pre_token_average_budget, 3: pre_token_average_budget, 4: pre_token_average_budget}


        # The first element returned by apply_stamp is kept as the privatized context
        privatized_ctx_polar = apply_stamp(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=uniform_eps_map,  # <-- actually used (using uniform map)
        )[0]

        # No GPT repair step is applied in this sweep.

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # Calculate average epsilon per token for this eps_map (corpus level, 0 when no tokens)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Create a pandas DataFrame and save it to CSV with average epsilon in the filename
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"AGnews_uniform_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to AGnews_uniform_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

NameError: name 'generate_eps_maps_with_step' is not defined

In [None]:
# Uniform-budget baseline for the special eps_map = {1: 100, 2: 1, 3: 350, 4: 250}
# NOTE(review): the original heading listed 2: 0, but the code uses 2: 1 -- confirm which is intended.

eps_map = {1: 100, 2: 1, 3: 350, 4: 250}

ds = load_dataset("ag_news", split="test[:50]")

print(f"Processing with eps_map = {eps_map} for uniform_laplace")

results_uniform = []
total_epsilon_sum_uniform = 0
total_tokens_count_uniform = 0


for ex in ds:
    ctx = ex["text"]
    # Fixed classification prompt used as the "question" for every article
    q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

    # Value of the dataset's 'label' field (legacy variable name `rating`)
    rating = ex["label"]

    tok_ids, _, groups = partition_tokens_2x2(
        context=ctx,
        question=q,
        tokenizer=tokenizer,
        embedding_table=embedding_table,
        tau=0.50,
    )

    # Total epsilon this article would receive under the per-group eps_map
    sentence_epsilon_sum = sum(eps_map[group] for group in groups)

    # Calculate the average budget per token for this sentence (0 for an empty article)
    avg_budget_for_uniform = sentence_epsilon_sum / len(tok_ids) if len(tok_ids) > 0 else 0

    # Create a uniform epsilon map with the calculated average budget
    uniform_eps_map = {1: avg_budget_for_uniform, 2: avg_budget_for_uniform, 3: avg_budget_for_uniform, 4: avg_budget_for_uniform}

    # Accumulate the budget actually applied (uniform map) and the token count
    total_epsilon_sum_uniform += sum(uniform_eps_map[group] for group in groups)
    total_tokens_count_uniform += len(tok_ids)


    # apply_stamp is called with the uniform map; its first return element is the privatized context
    privatized_ctx_uniform = apply_stamp(
        torch.tensor(tok_ids, device=embedding_table.device),
        groups,
        embedding_table,
        tokenizer,
        eps_dir_by_group=uniform_eps_map,
    )[0]

    results_uniform.append(
        {
            "question": q,
            "rating": rating,
            "original_context": ctx,
            "privatized_context": privatized_ctx_uniform,
        }
    )


# Corpus-level average epsilon per token under the uniform maps; used in the file name
average_epsilon_per_token_uniform = total_epsilon_sum_uniform / total_tokens_count_uniform if total_tokens_count_uniform > 0 else 0

df_results_uniform = pd.DataFrame(results_uniform)
df_results_uniform.to_csv(f"AGnews_uniform_avg_epsilon_{average_epsilon_per_token_uniform:.2f}.csv", index=False)

print(f"Processing complete for uniform_laplace. Results saved to AGnews_uniform_avg_epsilon_{average_epsilon_per_token_uniform:.2f}.csv")

In [None]:
# Simpler cases

In [None]:
# apply_stamp_allow_inf_polar sweep: groups 1 and 2 (the high-privacy groups) get
# a finite budget, groups 3 and 4 use eps = inf.
# NOTE(review): presumably eps = inf means "leave the token unperturbed" --
# confirm against apply_stamp_allow_inf_polar.

# Load the first 50 AG News test examples
ds = load_dataset("ag_news", split="test[:50]")

# Start / end epsilon maps and per-key step for the sweep
start_eps_map = {1: 150, 2: 150, 3: float("inf"), 4: float("inf")}
end_eps_map = {1: 550, 2: 550, 3: float("inf"), 4: float("inf")}
step_eps_map = {1: 50, 2: 50, 3: 0, 4: 0} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)


for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text and the fixed classification prompt used as the "question"
        ctx = ex["text"]
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        rating = ex["label"]  # dataset label (legacy variable name)

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Calculate total epsilon and token count for average calculation
        sentence_epsilon_sum = sum(eps_map[group] for group in groups if eps_map[group] != float('inf')) # Only sum finite epsilons
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len([g for g in groups if eps_map[g] != float('inf')]) # Only count tokens with finite epsilons


        # The first element returned by apply_stamp_allow_inf_polar is kept as the privatized context
        privatized_ctx_polar = apply_stamp_allow_inf_polar(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,                       # <-- actually used
        )[0]

        # No GPT repair step is applied in this sweep.

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # Calculate average epsilon per token for this eps_map (only considering tokens with finite epsilon)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Create a pandas DataFrame and save it to CSV with average epsilon in the filename
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(f"AGnews_inf_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv", index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to AGnews_inf_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv")

print("All sweeps complete.")

In [None]:
# apply_stamp_allow_inf_polar sweep: only group 2 (high-privacy, low-importance)
# gets a finite budget; groups 1, 3 and 4 use eps = inf.
# NOTE(review): presumably eps = inf means "leave the token unperturbed" --
# confirm against apply_stamp_allow_inf_polar.

# Load the first 50 AG News test examples
ds = load_dataset("ag_news", split="test[:50]")

# Start / end epsilon maps and per-key step for the sweep
start_eps_map = {1: float("inf"), 2: 150, 3: float("inf"), 4: float("inf")}
end_eps_map = {1: float("inf"), 2: 550, 3: float("inf"), 4: float("inf")}
step_eps_map = {1: 0, 2: 50, 3: 0, 4: 0} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)


for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text and the fixed classification prompt used as the "question"
        ctx = ex["text"]
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        rating = ex["label"]  # dataset label (legacy variable name)

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Calculate total epsilon and token count for average calculation
        sentence_epsilon_sum = sum(eps_map[group] for group in groups if eps_map[group] != float('inf')) # Only sum finite epsilons
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len([g for g in groups if eps_map[g] != float('inf')]) # Only count tokens with finite epsilons


        # The first element returned by apply_stamp_allow_inf_polar is kept as the privatized context
        privatized_ctx_polar = apply_stamp_allow_inf_polar(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,                       # <-- actually used
        )[0]

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # Calculate average epsilon per token for this eps_map (only considering tokens with finite epsilon)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Build the output path once so the log line always names the file that was
    # written (the message used to report an "AGnews_inf_polar_..." name).
    out_path = f"AGnews_1inf_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv"
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(out_path, index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to {out_path}")

print("All sweeps complete.")

In [None]:
# apply_stamp_allow_inf_polar sweep with the same finite budget for all four
# groups (no group uses eps = inf).

# Load the first 50 AG News test examples
ds = load_dataset("ag_news", split="test[:50]")

# Start / end epsilon maps and per-key step for the sweep
start_eps_map = {1: 150, 2: 150, 3: 150, 4: 150}
end_eps_map = {1: 550, 2: 550, 3: 550, 4: 550}
step_eps_map = {1: 50, 2: 50, 3: 50, 4: 50} # Step size for each key

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)


for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text and the fixed classification prompt used as the "question"
        ctx = ex["text"]
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        rating = ex["label"]  # dataset label (legacy variable name)

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Calculate total epsilon and token count for average calculation
        sentence_epsilon_sum = sum(eps_map[group] for group in groups if eps_map[group] != float('inf')) # Only sum finite epsilons
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len([g for g in groups if eps_map[g] != float('inf')]) # Only count tokens with finite epsilons


        # The first element returned by apply_stamp_allow_inf_polar is kept as the privatized context
        privatized_ctx_polar = apply_stamp_allow_inf_polar(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,                       # <-- actually used
        )[0]

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # Calculate average epsilon per token for this eps_map (only considering tokens with finite epsilon)
    average_epsilon_per_token = total_epsilon_sum / total_tokens_count if total_tokens_count > 0 else 0

    # Build the output path once so the log line always names the file that was
    # written (the message used to report an "AGnews_inf_polar_..." name).
    out_path = f"AGnews_no_inf_polar_sweep_avg_epsilon_{average_epsilon_per_token:.2f}.csv"
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(out_path, index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to {out_path}")

print("All sweeps complete.")

In [None]:
# apply_stamp_allow_inf_polar sweep with proportional per-group budgets:
# groups 1 / 2 / 3 / 4 receive 2x / 1x / 4x / 3x a base epsilon that runs
# from 150 to 550 in steps of 50.

# Load the first 50 AG News test examples
ds = load_dataset("ag_news", split="test[:50]")

# Start / end epsilon maps and per-key step for the sweep
start_eps_map = {1: 300, 2: 150, 3: 600, 4: 450}   # 2*150, 1*150, 4*150, 3*150
end_eps_map   = {1: 1100, 2: 550, 3: 2200, 4: 1650} # 2*550, 1*550, 4*550, 3*550
step_eps_map  = {1: 100, 2: 50, 3: 200, 4: 150}     # 2*50, 1*50, 4*50, 3*50

eps_maps_sweep = generate_eps_maps_with_step(start_eps_map, end_eps_map, step_eps_map)


for i, eps_map in enumerate(eps_maps_sweep):
    print(f"Processing with eps_map = {eps_map}")
    results_polar = []
    total_epsilon_sum = 0
    total_tokens_count = 0

    for ex in ds:
        # Article text and the fixed classification prompt used as the "question"
        ctx = ex["text"]
        q = "Which category does this news article belong to: World, Sports, Business, or Sci/Tech?"

        rating = ex["label"]  # dataset label (legacy variable name)

        # Use the canonical partitioner
        tok_ids, _, groups = partition_tokens_2x2(
            context=ctx,
            question=q,
            tokenizer=tokenizer,
            embedding_table=embedding_table,
            tau=0.50,  # Using a fixed tau for this sweep
        )

        # Running totals over finite-epsilon tokens. They are not used for this
        # cell's file name (see min_group_eps below) but are kept as notebook state.
        sentence_epsilon_sum = sum(eps_map[group] for group in groups if eps_map[group] != float('inf')) # Only sum finite epsilons
        total_epsilon_sum += sentence_epsilon_sum
        total_tokens_count += len([g for g in groups if eps_map[g] != float('inf')]) # Only count tokens with finite epsilons


        # The first element returned by apply_stamp_allow_inf_polar is kept as the privatized context
        privatized_ctx_polar = apply_stamp_allow_inf_polar(
            torch.tensor(tok_ids, device=embedding_table.device),
            groups,
            embedding_table,
            tokenizer,
            eps_dir_by_group=eps_map,                       # <-- actually used
        )[0]

        results_polar.append(
            {
                "question": q,
                "rating": rating, # dataset label, stored under the legacy "rating" column
                "original_context": ctx,
                "privatized_context": privatized_ctx_polar,
            }
        )

    # This sweep is keyed by the smallest finite group budget (the base epsilon),
    # not by the average epsilon per token.
    min_group_eps = min(v for v in eps_map.values() if math.isfinite(v))

    # The file name keeps its historical "avg_epsilon" tag although the number in
    # it is min_group_eps. The path is built once so the log line names the file
    # that was written; the old message read `average_epsilon_per_token`, which
    # this cell never sets, and reported a different file name.
    out_path = f"AGnews_power_inf_polar_sweep_avg_epsilon_{min_group_eps:.2f}.csv"
    df_results_polar = pd.DataFrame(results_polar)
    df_results_polar.to_csv(out_path, index=False)

    print(f"Processing complete for eps_map {i+1}. Results saved to {out_path}")

print("All sweeps complete.")