In [15]:
import pandas as pd
from pathlib import Path

In [16]:
def make_test_excel(path="test_docs.xlsx"):
    """
    Create a tiny Excel workbook for pipeline tests and print its resolved path.

    Sheets written, in this order:
      - docs
      - known
      - unknown
      - no context
      - metadata

    The 'known', 'unknown' and 'no context' sheets share the columns
    phrase_num, phrase_occurence, original_phrase, phrase_type and raw_prob.
    phrase_num holds string keys ('phrase_1', 'phrase_2'); raw_prob is
    re-normalised so that it sums to 1 within every
    (phrase_num, phrase_occurence) block.

    Parameters
    ----------
    path : str or path-like, default "test_docs.xlsx"
        Where the workbook is written (passed straight to ``pd.ExcelWriter``).

    Returns
    -------
    None
        The workbook is the only product; a "Created <path>" line is printed.
    """
    # NOTE: 'phrase_occurence' (sic) is the column name written to the
    # workbook, so the spelling is kept exactly as is.
    block_keys = ["phrase_num", "phrase_occurence"]

    # ---- sheet: docs ----
    docs = pd.DataFrame({
        "doc_id": [1],
        "title": ["Toy Example (string phrase ids)"],
        "notes": ["Tiny dataset for LLR test with phrase_num='phrase_*'"],
    })

    # ---- sheet: metadata ----
    metadata = pd.DataFrame({
        "source": ["synthetic"],
        "comment": ["for pipeline test with string keys"],
    })

    # ---- base table with string phrase IDs ----
    rows = [
        # phrase_1, occurrence 1
        ("phrase_1", 1, "I went to the shop",   "reference",  0.40),
        ("phrase_1", 1, "I visited the shop",   "paraphrase", 0.25),
        ("phrase_1", 1, "I walked to the shop", "paraphrase", 0.20),
        ("phrase_1", 1, "I drove to the shop",  "paraphrase", 0.15),

        # phrase_2, occurrence 1
        ("phrase_2", 1, "He likes music",       "reference",  0.60),
        ("phrase_2", 1, "He enjoys music",      "paraphrase", 0.30),
        ("phrase_2", 1, "He loves music",       "paraphrase", 0.10),
    ]
    base = pd.DataFrame(rows, columns=block_keys + ["original_phrase", "phrase_type", "raw_prob"])

    # ---- split into known / unknown / no context ----
    known = base.copy()
    unknown = base.copy()
    noctx = base.copy()

    # 'unknown': nudge individual phrase_1 paraphrases so the sheet differs from 'known'.
    in_phrase_1 = unknown["phrase_num"].eq("phrase_1")
    is_para = unknown["phrase_type"].eq("paraphrase")
    for word, factor in (("visited", 0.90), ("walked", 1.10), ("drove", 1.20)):
        hit = in_phrase_1 & is_para & unknown["original_phrase"].str.contains(word)
        unknown.loc[hit, "raw_prob"] *= factor

    # 'no context': global downscale.
    # NOTE(review): a uniform factor is cancelled by the per-block
    # re-normalisation below, so this sheet ends up equal to 'known' up to
    # float rounding. Kept as in the original; confirm whether that is intended.
    noctx["raw_prob"] = noctx["raw_prob"] * 0.80

    def _renorm(df):
        """Scale raw_prob so each (phrase_num, phrase_occurence) block sums to 1."""
        # transform("sum") broadcasts each block total back onto its rows, so
        # every column, the index and the row order are preserved. The previous
        # groupby(...).apply(...) form operated on the grouping columns, which
        # pandas has deprecated and which drops those columns in newer releases.
        block_total = df.groupby(block_keys, sort=False)["raw_prob"].transform("sum")
        return df.assign(raw_prob=df["raw_prob"] / block_total)

    # dict insertion order defines the sheet order in the workbook
    sheets = {
        "docs": docs,
        "known": _renorm(known),
        "unknown": _renorm(unknown),
        "no context": _renorm(noctx),
        "metadata": metadata,
    }

    with pd.ExcelWriter(path, engine="openpyxl") as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, index=False, sheet_name=sheet_name)

    print(f"Created {Path(path).resolve()}")

In [17]:
# Generate the synthetic workbook at a machine-specific absolute path.
make_test_excel(path='/Volumes/BCross/paraphrase examples slurm/Test blank doc.xlsx')

Created /Volumes/BCross/paraphrase examples slurm/Test blank doc.xlsx


