# In [None]:

# Enhanced QE Dataset Builder (Full Version with Stronger Degradation)
# All comments in English.

import os
import argparse
import random
import re
import csv

# Base paths.
# DATASET_DIR is built with os.path.join so the separator matches the host OS;
# on Windows the resulting string is the same as BASE_DIR + '\\dataset'.
BASE_DIR = os.getcwd()
DATASET_DIR = os.path.join(BASE_DIR, 'dataset')

# Merge text files inside a folder
def merge_text_files(folder_path, output_file="merged.txt", encoding="utf-8"):
    """Concatenate every ``.txt`` file in *folder_path* into *output_file*.

    Files are visited in sorted filename order; the extension check is
    case-insensitive. Each file's content is stripped of leading/trailing
    whitespace and written followed by a single newline.

    Args:
        folder_path: Directory whose ``.txt`` files are merged.
        output_file: Path of the file to (over)write with the merged text.
        encoding: Encoding used for both reading and writing.
    """
    with open(output_file, "w", encoding=encoding) as merged:
        for name in sorted(os.listdir(folder_path)):
            if not name.lower().endswith(".txt"):
                continue
            part_path = os.path.join(folder_path, name)
            with open(part_path, "r", encoding=encoding) as part:
                body = part.read().strip()
            merged.write(body + "\n")
    print(f"[OK] All text files have been combined into '{output_file}'.")

# Prepare source and target merged text files.
# Paths are assembled with os.path.join instead of a hard-coded '\\' so the
# script also works where the separator is '/'. The trailing '' keeps the
# trailing separator the folder paths previously had. Each merged file is
# written to the same path that was just checked for existence, so the check
# and the write can never disagree.
source = os.path.join(DATASET_DIR, 'ko', '')
src_path = os.path.join(BASE_DIR, 'source.txt')
if not os.path.exists(src_path):
    merge_text_files(source, output_file=src_path)

target = os.path.join(DATASET_DIR, 'en', '')
tgt_path = os.path.join(BASE_DIR, 'target.txt')
if not os.path.exists(tgt_path):
    merge_text_files(target, output_file=tgt_path)

# Command-line options.
# parse_known_args() is used instead of parse_args() so that arguments this
# script does not define are returned separately instead of aborting the run.
p = argparse.ArgumentParser(description="Build synthetic QE dataset CSV")
p.add_argument("--source", dest="source", default="source.txt")
p.add_argument("--target", dest="target", default="target.txt")
p.add_argument("--out", dest="out", default="dataset.csv")
p.add_argument("--seed", dest="seed", type=int, default=42)
p.add_argument("--no-perfect", dest="no_perfect", action="store_true", default=False)
args, _ = p.parse_known_args()

# File existence check.
# Report the paths that are actually missing: --source / --target may point
# somewhere other than the default source.txt / target.txt.
_missing = [path for path in (args.source, args.target) if not os.path.exists(path)]
if _missing:
    raise FileNotFoundError("Input file(s) not found: " + ", ".join(_missing))

# Read lines
def read_lines(path: str):
    """Return the lines of the UTF-8 text file at *path* as a list of strings.

    Trailing newline / carriage-return characters are removed from each line;
    all other whitespace is kept. Blank lines are returned as empty strings.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [raw.rstrip("\n\r") for raw in handle]

# Load both sides of the corpus.
srcs = read_lines(args.source)
tgts = read_lines(args.target)

# Rows are paired by line index later on, so both files must have the same
# number of lines.
if not len(srcs) == len(tgts):
    raise ValueError("Source and target file line counts do not match.")

# Regex helpers
_WORD_RE = re.compile(r"[^\W\d_]+", re.UNICODE)
_NUM_RE = re.compile(r"\b\d{1,4}\b")
_AMPM_RE = re.compile(r"\b(am|pm|a\.m\.|p\.m\.)\b", re.IGNORECASE)

# Lower-case calendar vocabularies used for month / weekday substitution.
MONTHS = (
    "january february march april may june "
    "july august september october november december"
).split()
DAYS = "monday tuesday wednesday thursday friday saturday sunday".split()

# Auxiliary / modal verbs; a "not" may be inserted after one of these.
AUX = (
    "am is are was were be been being "
    "do does did have has had "
    "can could will would shall should may might must"
).split()

# Short function words that are never dropped as "content words".
STOP_LITE = set(
    "a an the to of in on for and or but so "
    "as at by with from than then there here".split()
)

# Replacement letters for typo injection.
ALPHABET = "abcdefghijklmnopqrstuvwxyz"
# Filler expressions that may be inserted as noise.
FILLERS = [
    "actually",
    "really",
    "kind of",
    "maybe",
    "sort of",
    "in fact",
]

# Case preservation
def _preserve_case(src_token: str, repl: str) -> str:
    if src_token.isupper(): return repl.upper()
    if src_token[0].isupper(): return repl.capitalize()
    return repl

# Replace one vocabulary item
def replace_one_of_set(text: str, vocab: list[str]) -> str:
    """Swap one vocabulary word found in *text* for a different vocabulary word.

    A word from *vocab* that occurs in *text* (whole-word, matched against the
    lower-cased text) is picked at random, and its first occurrence is
    replaced by another randomly chosen entry of *vocab*, re-cased to follow
    the replaced token. *text* is returned unchanged when no entry occurs in
    it or when *vocab* offers no alternative.
    """
    haystack = text.lower()
    present = [
        word for word in vocab
        if re.search(rf"\b{re.escape(word)}\b", haystack)
    ]
    if not present:
        return text

    victim = random.choice(present)
    alternatives = [word for word in vocab if word != victim]
    if not alternatives:
        return text
    substitute = random.choice(alternatives)

    pattern = re.compile(rf"\b{re.escape(victim)}\b", flags=re.IGNORECASE)
    return pattern.sub(
        lambda match: _preserve_case(match.group(0), substitute),
        text,
        count=1,
    )

# Mutate numbers
def mutate_numbers(text: str) -> str:
    """Shift the first one or two numbers matched by ``_NUM_RE`` by +/-1.

    Values of 2 or less are always incremented; larger values are randomly
    decremented or incremented. A number written with a leading zero
    (e.g. the "05" in "10:05") keeps its original width, so the result stays
    "04" / "06" instead of collapsing to "4" / "6".
    """
    def _mut(m):
        raw = m.group(0)
        n = int(raw)
        if n <= 2:
            shifted = n + 1
        else:
            shifted = n - 1 if random.random() < 0.5 else n + 1
        out = str(shifted)
        # Only re-pad numbers that were zero-padded to begin with, so that
        # "10" -> 9 is not turned into "09".
        if len(raw) > 1 and raw[0] == "0":
            out = out.zfill(len(raw))
        return out

    return _NUM_RE.sub(_mut, text, count=random.randint(1, 2))

# Flip AM/PM
def flip_ampm(text: str) -> str:
    """Swap the first AM/PM marker matched by ``_AMPM_RE`` for its opposite.

    The dotted / undotted style and the casing of the original marker are
    kept ("AM" -> "PM", "a.m." -> "p.m."). Text without a marker is returned
    unchanged.
    """
    def _flip(m):
        found = m.group(0)
        low = found.lower()
        # Decide by the first letter: a substring test such as `"am" in low`
        # is False for the dotted form "a.m.", which would map a.m. -> a.m.
        alt = "pm" if low.startswith("a") else "am"
        if "." in low:
            alt = alt[0] + ".m."
        return _preserve_case(found, alt)

    return _AMPM_RE.sub(_flip, text, count=1)

# Insert or remove 'not'
def insert_or_remove_not(text: str) -> str:
    """Flip the polarity of *text* by removing or inserting a "not".

    If *text* contains "not", it is removed with probability 0.5. Otherwise
    (or when the coin flip says so) " not" is inserted after the first
    auxiliary from ``AUX`` (tried in list order) that occurs in *text*.
    *text* is returned unchanged when no suitable auxiliary is found.

    An auxiliary is skipped when it is
      * the stem of a contraction ("can't" must not become "can not't"), or
      * already followed by "not" (avoids producing "is not not").
    """
    if re.search(r"\bnot\b", text, flags=re.IGNORECASE) and random.random() < 0.5:
        return re.sub(r"\bnot\b", "", text, count=1, flags=re.IGNORECASE).replace("  ", " ")
    for aux in AUX:
        m = re.search(
            rf"\b{re.escape(aux)}\b(?!['\u2019])(?!\s+not\b)",
            text,
            flags=re.IGNORECASE,
        )
        if m:
            i = m.end()
            return text[:i] + " not" + text[i:]
    return text

# Drop content words
def drop_content_words(text: str, drop_ratio=(0.15, 0.35)) -> str:
    """Randomly delete some of the longer, non-stopword tokens of *text*.

    A drop probability is drawn uniformly from *drop_ratio* once per call.
    A whitespace-separated token is eligible when, after removing non-word
    characters, it consists only of letters, is longer than three characters
    and is not in ``STOP_LITE``. Each eligible token is dropped with the drawn
    probability. If nothing would remain, the original *text* is returned.
    """
    words = text.split()
    if not words:
        return text

    threshold = random.uniform(*drop_ratio)

    def _is_droppable(token):
        bare = re.sub(r"[^\w]", "", token)
        return (
            _WORD_RE.fullmatch(bare) is not None
            and len(bare) > 3
            and bare.lower() not in STOP_LITE
        )

    # The random draw only happens for eligible tokens (short-circuit `and`).
    survivors = [
        token for token in words
        if not (_is_droppable(token) and random.random() < threshold)
    ]
    result = " ".join(survivors)
    if result.strip():
        return result
    return text

# Shuffle chunks
def shuffle_chunks(text: str) -> str:
    """Randomly reorder the clauses of *text*.

    The text is split on ",", ";", " and " and " but ". When at least one
    delimiter is present, every clause keeps the delimiter that followed it,
    the clauses are shuffled and joined with single spaces. Without any
    delimiter the individual whitespace-separated tokens are shuffled instead.
    """
    parts = re.split(r"(,|;| and | but )", text)
    if len(parts) < 3:
        words = text.split()
        random.shuffle(words)
        return " ".join(words)

    # With one capturing group, re.split alternates text, delimiter, text, ...
    # and always ends on a (possibly empty) text piece.
    bodies = parts[0::2]
    delimiters = parts[1::2]
    chunks = [(body + delim).strip() for body, delim in zip(bodies, delimiters)]
    tail = parts[-1].strip()
    if tail:
        chunks.append(tail)

    random.shuffle(chunks)
    return " ".join(chunks)

# Typo word
def _typo_word(word: str) -> str:
    core = re.sub(r"^\W+|\W+$", "", word)
    if len(core) < 4:
        return word
    i = random.randrange(1, len(core) - 1)
    old = core[i]
    candidates = [c for c in ALPHABET if c != old.lower()]
    if not candidates: return word
    new = random.choice(candidates)
    if old.isupper(): new = new.upper()
    out = core[:i] + new + core[i+1:]
    prefix = word[:word.find(core)] if core in word else ""
    suffix = word[word.find(core)+len(core):] if core in word else ""
    return prefix + out + suffix

def introduce_typos(text: str, prob=0.3) -> str:
    """Pass each whitespace-separated token through ``_typo_word`` with probability *prob*.

    The tokens are re-joined with single spaces, so runs of whitespace in the
    input are collapsed.
    """
    return " ".join(
        _typo_word(token) if random.random() < prob else token
        for token in text.split()
    )

# Insert noise token
def insert_noise_token(text: str):
    """Insert one noise token at a random position after the first word.

    With probability 0.5 the inserted token is a random entry of ``FILLERS``;
    otherwise the word just before the insertion point is duplicated. Texts
    with fewer than three tokens are returned unchanged.
    """
    words = text.split()
    if len(words) < 3:
        return text

    pos = random.randrange(1, len(words))
    use_filler = random.random() < 0.5
    noise = random.choice(FILLERS) if use_filler else words[pos - 1]
    words.insert(pos, noise)
    return " ".join(words)

# Truncate tail
def truncate_tail(text: str, min_words=5, max_drop_ratio=0.4):
    """Cut off the end of *text*, dropping at most ``max_drop_ratio`` of its words.

    Args:
        text: Sentence to truncate.
        min_words: Minimum number of leading words that are always kept.
        max_drop_ratio: Upper bound on the fraction of words removed.

    Returns:
        The first ``cut`` words joined by single spaces, where at least one
        and at most ``int(n * max_drop_ratio)`` words are removed. *text* is
        returned unchanged when it has ``min_words + 2`` words or fewer, or
        when the ratio does not allow dropping even one word.
    """
    tokens = text.split()
    n = len(tokens)
    if n <= min_words + 2:
        return text
    max_drop = int(n * max_drop_ratio)
    if max_drop < 1:
        return text
    # Keep at least n - max_drop words (i.e. drop no more than max_drop) and
    # never fewer than min_words; n - 1 guarantees at least one word is cut.
    lowest_cut = max(min_words, n - max_drop)
    cut = random.randint(lowest_cut, n - 1)
    return " ".join(tokens[:cut])

# Corrupt punctuation
def corrupt_punctuation(text: str):
    """Damage the punctuation of *text* in one of two randomly chosen ways.

    * "drop": delete the first one to three of the characters ``, . ! ?``.
    * "swap": collapse "..." to ".", turn every "?" into "." and then replace
      the first "." with ",".

    Text containing none of ``, . ! ?`` is returned unchanged.
    """
    if re.search(r"[,.!?]", text) is None:
        return text

    if random.choice(["drop", "swap"]) == "drop":
        how_many = random.randint(1, 3)
        return re.sub(r"[,.!?]", "", text, count=how_many)

    collapsed = text.replace("...", ".")
    statements_only = re.sub(r"\?", ".", collapsed)
    return re.sub(r"\.", ",", statements_only, count=1)

# Strong degrade function
def degrade_mid(text: str) -> str:
    """Corrupt *text* by applying several randomly selected degradation steps.

    The eleven available steps are shuffled and the first few are applied in
    sequence. The number of steps is a base of 4 (fewer than 10 tokens) or 5,
    plus a random extra of 0..1 (fewer than 20 tokens) or 0..2. If the result
    is identical to the input apart from surrounding whitespace, a stronger
    content-word drop is applied. Whitespace in the result is normalised to
    single spaces.
    """
    def _swap_month(s):
        return replace_one_of_set(s, MONTHS)

    def _swap_day(s):
        return replace_one_of_set(s, DAYS)

    pipeline = [
        _swap_month,
        _swap_day,
        mutate_numbers,
        flip_ampm,
        insert_or_remove_not,
        drop_content_words,
        shuffle_chunks,
        introduce_typos,
        insert_noise_token,
        corrupt_punctuation,
        truncate_tail,
    ]
    random.shuffle(pipeline)

    word_count = len(text.split())
    if word_count < 10:
        base = 4
    else:
        base = 5
    if word_count < 20:
        extra = 1
    else:
        extra = 2
    budget = min(len(pipeline), base + random.randint(0, extra))

    degraded = text
    for step in pipeline[:budget]:
        degraded = step(degraded)

    # Fallback: none of the chosen steps changed anything.
    if degraded.strip() == text.strip():
        degraded = drop_content_words(degraded, drop_ratio=(0.30, 0.50))

    return re.sub(r"\s+", " ", degraded).strip()

# Build dataset
def make_dataset(sources, targets, include_perfect=True, seed=42):
    """Build ``[source, target, score]`` rows for the synthetic QE dataset.

    For every line index the following rows are appended, in this order:
      * score 100: the reference translation (only when *include_perfect*),
      * score 50:  the reference corrupted by ``degrade_mid``,
      * score 0:   the translation of a different line.

    Args:
        sources: Source-side sentences.
        targets: Reference translations, aligned with *sources* by index.
        include_perfect: Whether to emit the score-100 rows.
        seed: Seed passed to ``random.seed`` before anything random happens.

    Returns:
        A list of ``[source, target, score]`` lists.

    The score-0 row is only emitted when a target whose text differs from the
    reference exists. Without that check, duplicate lines in the corpus (or a
    one-line corpus) would yield a correct translation labelled with score 0.
    """
    random.seed(seed)
    n = len(sources)
    rows = []

    # Random permutation with fixed points removed: a fixed point is swapped
    # with its right neighbour (wrapping around), which un-fixes both slots
    # whenever n > 1.
    perm = list(range(n))
    random.shuffle(perm)
    for i in range(n):
        if perm[i] == i:
            j = (i + 1) % n
            perm[i], perm[j] = perm[j], perm[i]

    for i in range(n):
        src = sources[i].strip()
        good = targets[i].strip()

        if include_perfect:
            rows.append([src, good, 100])

        mid = degrade_mid(good)
        # Retry once when the degraded text came out too short.
        if len(mid.split()) <= max(2, len(good.split()) // 4):
            mid = degrade_mid(good)
        rows.append([src, mid, 50])

        # Start at the permuted partner and walk forward until a target with
        # different text is found. No random numbers are consumed here, so
        # the degraded rows stay the same for a given seed.
        wrong = None
        for offset in range(n):
            candidate = targets[(perm[i] + offset) % n].strip()
            if candidate != good:
                wrong = candidate
                break
        if wrong is not None:
            rows.append([src, wrong, 0])

    return rows

# Generate all rows; the score-100 rows are left out when --no-perfect is set.
rows = make_dataset(
    sources=srcs,
    targets=tgts,
    seed=args.seed,
    include_perfect=not args.no_perfect,
)

# Write CSV
def write_csv(rows, out_path: str):
    """Write *rows* to *out_path* as UTF-8 CSV under a SOURCE,TARGET,SCORE header.

    The file is opened with ``newline=""`` so that the csv module controls
    the line endings itself.
    """
    header = ["SOURCE", "TARGET", "SCORE"]
    with open(out_path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(rows)

# Persist the generated rows to the path given by --out.
write_csv(rows, out_path=args.out)

print("Dataset generation complete.")
