## Synthetic data pre-processing

In [None]:
import os, re, unicodedata, random, json
from pathlib import Path
from typing import Optional

# All locations hang off the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = DATA_DIR.joinpath("clean")

# Raw synthetic corpus read by this notebook.
INPUT_FILE = DATA_DIR.joinpath("synthetic_dialogue.txt")

# Whole cleaned corpus, written as a single text file.
CLEAN_ALL = OUTPUT_DIR.joinpath("synthetic_dialogue_clean.txt")

# Plain-text splits.
TRAIN_OUT = OUTPUT_DIR.joinpath("train.txt")
VAL_OUT = OUTPUT_DIR.joinpath("val.txt")
TEST_OUT = OUTPUT_DIR.joinpath("test.txt")

# JSONL versions of the same splits.
TRAIN_JSONL = OUTPUT_DIR.joinpath("train.jsonl")
VAL_JSONL = OUTPUT_DIR.joinpath("val.jsonl")
TEST_JSONL = OUTPUT_DIR.joinpath("test.jsonl")

# Create the output directory (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Input:   {INPUT_FILE}")
print(f"Outputs: {OUTPUT_DIR}")


Input:   D:\Game\Backend\data\synthetic_dialogue.txt
Outputs: D:\Game\Backend\data\clean


In [None]:
# Speaker names accepted inside an <NPC>(Name) tag.
VALID_NPCS = {
    "Jacky",
    "Mr Dawson",
    "Father Jacob",
    "Tom",
    "Sarah",
}

def normalize_line_keep_punct(s: str) -> str:
    """
    Lightly normalize one line of text.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Every run of spaces/tabs becomes a single space.
      3. Leading and trailing whitespace is removed.

    Curly quotes and dashes are left as they are, but characters that have
    a compatibility mapping (for example the one-character ellipsis or
    full-width letters) are rewritten by NFKC.
    """
    normalized = unicodedata.normalize("NFKC", s)
    collapsed = re.sub(r"[ \t]+", " ", normalized)
    return collapsed.strip()

def canon_context_line(text: str) -> str:
    """
    Rewrite a context line as ``<CONTEXT> content``.

    The first ``len("<CONTEXT>")`` characters of ``text`` are discarded
    without being checked, so the caller must pass a line that starts with
    the tag. One leading ``>`` in the remaining content is removed as well.
    """
    prefix_len = len("<CONTEXT>")
    body = text[prefix_len:].strip()
    # Drop a single stray '>' (e.g. from "<CONTEXT>> ...").
    body = body[1:].strip() if body.startswith(">") else body
    return f"<CONTEXT> {body}"

def canon_player_line(text: str) -> str:
    """
    Rewrite a player line as ``<PLAYER> content``.

    The first ``len("<PLAYER>")`` characters of ``text`` are discarded
    without being checked, so the caller must pass a line that starts with
    the tag. One leading ``>`` in the remaining content is removed as well.
    """
    prefix_len = len("<PLAYER>")
    body = text[prefix_len:].strip()
    # Drop a single stray '>' (e.g. from "<PLAYER>> ...").
    body = body[1:].strip() if body.startswith(">") else body
    return f"<PLAYER> {body}"

def canon_npc_line(text: str) -> Optional[str]:
    """
    Canonicalize an ``<NPC>(Name) reply`` line.

    The speaker name and the reply are both trimmed. The name must be a
    member of VALID_NPCS.

    Returns the line rewritten as ``<NPC>(Name) reply``, or None when the
    line does not have the ``<NPC>(Name)`` shape or names an unknown NPC.
    An empty reply is not rejected here; it yields ``<NPC>(Name) `` with a
    trailing space.
    """
    m = re.match(r"^<NPC>\(([^)]+)\)\s*(.*)$", text)
    if not m:
        return None
    name, rest = m.group(1).strip(), m.group(2).strip()
    if name not in VALID_NPCS:
        return None
    return f"<NPC>({name}) {rest}"


In [4]:
def parse_block(block_text: str):
    """
    Extract the canonical lines of one dialogue block.

    A complete block contains, among its non-blank lines:
      <CONTEXT> ...
      <PLAYER> ...
      <NPC>(Name) ...
      <END>

    Each line is normalized and then dispatched on its tag prefix. Lines
    with any other prefix are ignored, line order is not enforced, and when
    a tag occurs more than once the last occurrence wins.

    Returns (ctx_line, ply_line, npc_line, True) when all four parts were
    found and are non-empty, otherwise (None, None, None, False).
    """
    ctx = ply = npc = None
    end_ok = False

    for line in block_text.splitlines():
        if not line.strip():
            continue  # skip blank lines inside the block
        norm = normalize_line_keep_punct(line)
        if norm.startswith("<CONTEXT>"):
            ctx = canon_context_line(norm)
        elif norm.startswith("<PLAYER>"):
            ply = canon_player_line(norm)
        elif norm.startswith("<NPC>("):
            # May be None when the name is not an accepted NPC.
            npc = canon_npc_line(norm)
        elif norm.startswith("<END>"):
            end_ok = True

    if ctx and ply and npc and end_ok:
        return ctx, ply, npc, True
    return None, None, None, False


In [None]:
# errors="ignore" silently drops any bytes that are not valid UTF-8.
raw = INPUT_FILE.read_text(encoding="utf-8", errors="ignore")
# Blocks are separated by one or more blank (or whitespace-only) lines;
# chunks that are empty after stripping are discarded.
raw_blocks = list(filter(str.strip, re.split(r"\n\s*\n", raw)))
print(f"Raw blocks detected: {len(raw_blocks)}")


Raw blocks detected: 5119


In [None]:
clean_blocks = []
malformed = 0
bad_npc = 0

for b in raw_blocks:
    ctx, ply, npc, ok = parse_block(b)
    if ok:
        # Re-emit the block in canonical four-line form.
        rebuilt = "\n".join([ctx, ply, npc, "<END>"])
        clean_blocks.append(rebuilt)
        continue

    # Rejected block: count it as "invalid NPC" only when a well-formed
    # <NPC>(Name) tag is present and its name is not accepted; every other
    # failure is counted as malformed.
    m = re.search(r"<NPC>\(([^)]+)\)", b)
    if m and m.group(1).strip() not in VALID_NPCS:
        bad_npc += 1
    else:
        malformed += 1

print(f"Kept: {len(clean_blocks)} | Malformed: {malformed} | Invalid NPC: {bad_npc}")


Kept: 5119 | Malformed: 0 | Invalid NPC: 0


In [7]:
# Preview the first three cleaned blocks, each followed by a blank line.
for i, blk in enumerate(clean_blocks[:3], start=1):
    print(f"--- SAMPLE {i} ---", blk, "", sep="\n")


--- SAMPLE 1 ---
<CONTEXT> Player was rude to Father Jacob this morning. Rumors spread quickly among the villagers.
<PLAYER> Jacky, I heard the talk—should I go apologize to Father Jacob?
<NPC>(Jacky) Yes, do it proper and soon. Respect keeps this gate quiet.
<END>

--- SAMPLE 2 ---
<CONTEXT> Player apologized to Father Jacob at the church door and was heard by visitors.
<PLAYER> Father Jacob, thank you for hearing me out earlier.
<NPC>(Father Jacob) We’re alright, Acool. Let today begin gentler than the last.
<END>

--- SAMPLE 3 ---
<CONTEXT> Player helped Tom carry timber behind the church; builders spoke well of it.
<PLAYER> Tom, need another pair of hands with those beams?
<NPC>(Tom) Always. Grab the end and mind your thumbs.
<END>



In [8]:
# Blocks are separated by one blank line; the file ends with a newline.
corpus_text = "\n\n".join(clean_blocks) + "\n"
CLEAN_ALL.write_text(corpus_text, encoding="utf-8")
print(f"Cleaned corpus written → {CLEAN_ALL}")


Cleaned corpus written → D:\Game\Backend\data\clean\synthetic_dialogue_clean.txt


In [None]:
# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(42)
shuffled = list(clean_blocks)
random.shuffle(shuffled)

# 5% test, 5% validation (at least one block each), remainder for training.
n = len(shuffled)
n_test = max(1, int(0.05 * n))
n_val = max(1, int(0.05 * n))
val_end = n_test + n_val
test, val, train = shuffled[:n_test], shuffled[n_test:val_end], shuffled[val_end:]

def write_blocks(path, blocks):
    """
    Write ``blocks`` to ``path`` as UTF-8 text.

    Blocks are separated by one blank line and the file ends with a single
    newline. ``path`` must provide ``write_text`` (e.g. a pathlib.Path).
    """
    payload = "\n\n".join(blocks)
    path.write_text(f"{payload}\n", encoding="utf-8")

# Write the three text splits, then report their sizes and locations.
for out_path, split in ((TRAIN_OUT, train), (VAL_OUT, val), (TEST_OUT, test)):
    write_blocks(out_path, split)

print(f"Train: {len(train)} → {TRAIN_OUT}")
print(f"Val:   {len(val)} → {VAL_OUT}")
print(f"Test:  {len(test)} → {TEST_OUT}")


Train: 4609 → D:\Game\Backend\data\clean\train.txt
Val:   255 → D:\Game\Backend\data\clean\val.txt
Test:  255 → D:\Game\Backend\data\clean\test.txt


In [None]:
def block_to_jsonl_item(block):
    """
    Convert one cleaned block into a prompt/target record.

    Blank lines are skipped; the first three remaining lines are read
    positionally as the <CONTEXT>, <PLAYER> and <NPC>(Name) lines. The
    <CONTEXT> and <PLAYER> tags are removed by length without being
    checked.

    Returns a dict with keys ``context``, ``player``, ``npc_name``,
    ``reference``, ``prompt`` and ``target``. When the third line is not
    an ``<NPC>(Name)`` line, ``npc_name`` and ``reference`` are empty
    strings.

    Raises IndexError if the block has fewer than three non-blank lines.
    """
    lines = [ln for ln in block.splitlines() if ln.strip()]
    ctx = lines[0][len("<CONTEXT>"):].strip()
    ply = lines[1][len("<PLAYER>"):].strip()
    # Use `\s*` (as canon_npc_line does) rather than a mandatory `\s`, so a
    # reply that directly follows ")" is parsed instead of silently
    # producing an empty name and reference.
    m = re.match(r"^<NPC>\(([^)]+)\)\s*(.*)$", lines[2])
    npc_name, npc_text = (m.group(1).strip(), m.group(2).strip()) if m else ("", "")
    return {
        "context": ctx,
        "player": ply,
        "npc_name": npc_name,
        "reference": npc_text,
        # The prompt stops right after the NPC tag; the target supplies the
        # separating space plus the reply.
        "prompt": f"<CONTEXT> {ctx}\n<PLAYER> {ply}\n<NPC>({npc_name})",
        "target": f" {npc_text}"
    }

def write_jsonl(path, blocks):
    """
    Write one JSON record per block to ``path`` (UTF-8, one per line).

    Each block is converted with block_to_jsonl_item. Non-ASCII characters
    are written as-is rather than escaped.
    """
    with path.open("w", encoding="utf-8") as sink:
        for block in blocks:
            record = block_to_jsonl_item(block)
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

# Write the JSONL version of each split, then list the files produced.
for jsonl_path, split in ((TRAIN_JSONL, train), (VAL_JSONL, val), (TEST_JSONL, test)):
    write_jsonl(jsonl_path, split)

print(f"JSONL written:\n- {TRAIN_JSONL}\n- {VAL_JSONL}\n- {TEST_JSONL}")


JSONL written:
- D:\Game\Backend\data\clean\train.jsonl
- D:\Game\Backend\data\clean\val.jsonl
- D:\Game\Backend\data\clean\test.jsonl


In [None]:
def assert_block_ok(block):
    """
    Verify that ``block`` has the canonical four-line layout.

    After dropping blank lines, the block must consist of, in order, a
    ``<CONTEXT> `` line, a ``<PLAYER> `` line, an ``<NPC>(Name) `` line
    whose name is in VALID_NPCS, and ``<END>``.

    Raises AssertionError naming the first failed check. The error is
    raised explicitly rather than with ``assert`` statements, so the checks
    still run when Python is started with -O. A block with too few lines
    fails the check for its first missing line instead of raising
    IndexError.
    """
    lines = [ln for ln in block.splitlines() if ln.strip()]

    def line_at(idx):
        # A missing line is treated as empty so it fails its own check.
        return lines[idx] if idx < len(lines) else ""

    def require(condition, message):
        if not condition:
            raise AssertionError(message)

    require(line_at(0).startswith("<CONTEXT> "), "Bad CONTEXT line")
    require(line_at(1).startswith("<PLAYER> "), "Bad PLAYER line")
    require(line_at(2).startswith("<NPC>("), "Bad NPC header")
    require(line_at(3).strip() == "<END>", "Missing END")

    m = re.match(r"^<NPC>\(([^)]+)\)\s", line_at(2))
    require(m and m.group(1).strip() in VALID_NPCS, "Invalid NPC name")

# Re-read each text split from disk and spot-check its first 50 blocks.
for path in (TRAIN_OUT, VAL_OUT, TEST_OUT):
    txt = path.read_text(encoding="utf-8")
    blks = list(filter(str.strip, re.split(r"\n\s*\n", txt)))
    for b in blks[:50]:
        assert_block_ok(b)
print("Integrity checks passed on samples.")


Integrity checks passed on samples.
