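## Synthetic data pre-processing

Validates, de-duplicates, and shuffles the synthetic NPC dialogue samples, then writes a cleaned corpus plus a train/validation split.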

In [1]:
import os, re, unicodedata, random

# Every input and output file lives in the sibling ``data`` directory.
DATA_DIR = os.path.join('..', 'data')

# Raw generator output and the cleaned corpus derived from it.
INPUT = os.path.join(DATA_DIR, 'synthetic_dialogue_1.txt')
OUTPUT = os.path.join(DATA_DIR, 'game_dialogue_10k_clean.txt')

# Train / validation files produced from the cleaned corpus.
TRAIN_OUT = os.path.join(DATA_DIR, 'game_dialogue_10k_train.txt')
VAL_OUT = os.path.join(DATA_DIR, 'game_dialogue_10k_val.txt')

# Speaker names accepted inside the <NPC>(...) tag.
VALID_NPCS = {"Jacky", "Mr Dawson", "Father Jacob", "Tom", "Sarah"}


In [2]:
def normalize_text(s: str) -> str:
    """Return ``s`` in NFKC form with space/tab runs collapsed and ends trimmed.

    NFKC folds compatibility characters (for example full-width letters and
    the ideographic space) into their plain equivalents; characters without a
    compatibility mapping, such as the curly apostrophe, pass through as-is.
    Afterwards every run of spaces and/or tabs becomes a single space and
    leading/trailing whitespace is removed. Newlines inside ``s`` are not
    collapsed.
    """
    folded = unicodedata.normalize("NFKC", s)
    collapsed = re.sub(r"[ \t]+", " ", folded)
    return collapsed.strip()

def parse_block(lines):
    """
    Pick the four tagged lines out of one dialogue sample.

    Expect lines containing (order can vary, but all must exist):
      <CONTEXT> ...
      <PLAYER> ...
      <NPC>(Name) ...
      <END>
    Lines that start with none of these tags are ignored. When a tag occurs
    more than once, the last occurrence wins.

    Returns (ctx, ply, npc, end_flag), where ctx/ply/npc are the normalized
    lines (tag included) and end_flag is the literal "<END>". Returns
    (None, None, None, None) if any of the four parts is missing, so callers
    never receive a partially filled tuple.
    """
    ctx = ply = npc = end_flag = None
    for ln in lines:
        if ln.startswith("<CONTEXT>"):
            ctx = normalize_text(ln)
        elif ln.startswith("<PLAYER>"):
            ply = normalize_text(ln)
        elif ln.startswith("<NPC>("):
            npc = normalize_text(ln)
        elif ln.startswith("<END>"):
            end_flag = "<END>"

    # Honour the documented contract: an incomplete block yields all-None
    # rather than a mix of values and None.
    if any(part is None for part in (ctx, ply, npc, end_flag)):
        return None, None, None, None
    return ctx, ply, npc, end_flag


In [3]:
# Read the whole generated corpus into memory.
with open(INPUT, encoding="utf-8") as f:
    raw = f.read()

# Samples are separated by one or more blank (or whitespace-only) lines;
# chunks that contain nothing but whitespace are discarded.
candidate_chunks = re.split(r"\n\s*\n", raw)
blocks = [chunk for chunk in candidate_chunks if chunk.strip()]
print(f"Raw blocks detected: {len(blocks)}")


Raw blocks detected: 5119


In [4]:
# Validate every raw block and keep only well-formed samples spoken by a known NPC.
clean_blocks = []
bad_npc = 0      # blocks whose <NPC>(...) name is not in VALID_NPCS
malformed = 0    # blocks missing a tag, a tag's text, or a parsable NPC name

for b in blocks:
    # Keep non-empty lines, strip trailing spaces
    lines = [ln.rstrip() for ln in b.splitlines() if ln.strip()]
    ctx, ply, npc, end_flag = parse_block(lines)

    # Basic presence check
    if not (ctx and ply and npc and end_flag):
        malformed += 1
        continue

    # Exact tag prefix validation: the tag must be followed by a space.
    # parse_block returns stripped lines, so a space here implies text follows.
    if not ctx.startswith("<CONTEXT> ") or not ply.startswith("<PLAYER> "):
        malformed += 1
        continue

    # Validate NPC name inside parentheses; the anchored pattern also enforces
    # the "<NPC>(" prefix and whitespace before the utterance.
    m = re.match(r"^<NPC>\(([^)]+)\)\s", npc)
    if not m:
        malformed += 1
        continue
    npc_name = m.group(1).strip()
    if npc_name not in VALID_NPCS:
        bad_npc += 1
        continue

    # Re-emit the NPC line with the stripped name that was just validated, so
    # a padded tag such as "<NPC>( Tom )" cannot reach the output (or defeat
    # exact-match de-duplication later). Clean lines come out unchanged.
    npc = f"<NPC>({npc_name}) {npc[m.end():].lstrip()}"

    # Canonicalize spacing & rebuild the block exactly (one blank line between samples later)
    block = "\n".join([ctx, ply, npc, "<END>"])
    clean_blocks.append(block)

# Every raw block must be accounted for exactly once.
assert len(clean_blocks) + malformed + bad_npc == len(blocks)

print(f"Kept: {len(clean_blocks)} | Skipped malformed: {malformed} | Skipped bad NPC: {bad_npc}")


Kept: 5119 | Skipped malformed: 0 | Skipped bad NPC: 0


In [5]:
# If your generator produced accidental duplicates, remove them:
# Drop exact-duplicate samples. Dict keys are unique and keep insertion order,
# so the first occurrence of each block survives in its original position.
before = len(clean_blocks)
clean_blocks = [*dict.fromkeys(clean_blocks)]
after = len(clean_blocks)
print(f"Deduplicated: {before - after} removed | Final unique blocks: {after}")


Deduplicated: 402 removed | Final unique blocks: 4717


In [6]:
# Seed the module-level RNG so the shuffle order, and therefore the train/val
# split sliced from this list further down, is the same on a fresh Run All.
# NOTE: the shuffle is in place; re-running only this cell reshuffles the
# already-shuffled list and produces a different order.
random.seed(42)
random.shuffle(clean_blocks)

In [7]:
# Make sure the destination folder exists before writing.
out_dir = os.path.dirname(OUTPUT)
os.makedirs(out_dir, exist_ok=True)

# Samples are separated by one blank line; the file ends with a newline.
cleaned_text = "\n\n".join(clean_blocks) + "\n"
with open(OUTPUT, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned file written to: {OUTPUT}")

Cleaned file written to: ..\data\game_dialogue_10k_clean.txt


In [8]:
# Eyeball the first three cleaned samples; each is followed by a blank line.
for idx, sample in enumerate(clean_blocks[:3], start=1):
    print(f"--- SAMPLE {idx} ---\n{sample}\n")


--- SAMPLE 1 ---
<CONTEXT> Someone claimed the well water tasted sweeter today; folks nodded.
<PLAYER> Mr Dawson, is Mr Dawson in the garden today? Is the west gate still squeaking?
<NPC>(Mr Dawson) The garden is calm. Reputation, like roses, rewards patience.
<END>

--- SAMPLE 2 ---
<CONTEXT> Morning sun felt warmer than usual; everyone kept to their chores.
<PLAYER> Jacky, is Mr Dawson in the garden today?
<NPC>(Jacky) Routine morning. Stay clear of the hedge and keep your word.
<END>

--- SAMPLE 3 ---
<CONTEXT> Player apologized to Tom near the scaffolding; laborers passed the word along. A rumor said a merchant wagon might arrive tomorrow.
<PLAYER> Tom, sorry if I sounded off before. Is the west gate still squeaking?
<NPC>(Tom) All good. Hand me those nails and we’ll call it even.
<END>



In [9]:
# 95/5 split; tweak as you like. The leading slice becomes the training set
# and the remainder the validation set.
split = int(0.95 * len(clean_blocks))
train_blocks, val_blocks = clean_blocks[:split], clean_blocks[split:]

# Same on-disk layout for both files: blank line between samples, trailing newline.
for path, subset in ((TRAIN_OUT, train_blocks), (VAL_OUT, val_blocks)):
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(subset) + "\n")

print(f"Train: {len(train_blocks)} -> {TRAIN_OUT}")
print(f"Val:   {len(val_blocks)} -> {VAL_OUT}")


Train: 4481 -> ..\data\game_dialogue_10k_train.txt
Val:   236 -> ..\data\game_dialogue_10k_val.txt
