In [None]:
# Mount Google Drive at /content/drive so later cells can write their output
# to a location under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from openai import OpenAI
import json
from tqdm import tqdm
import os, random, time

Mounted at /content/drive


In [None]:
# WikiLarge sentence-simplification corpus (Hugging Face):
# https://huggingface.co/datasets/waboucay/wikilarge/tree/main
def _read_lines(path):
    """Return every line of ``path`` with surrounding whitespace stripped.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding (the corpus contains non-ASCII text).
    Blank lines are kept so that line numbers stay aligned across files.
    """
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


# The .src and .dst files are parallel: line i of one pairs with line i of the other.
complex_train = _read_lines("wiki.full.aner.ori.train.src")
simple_train = _read_lines("wiki.full.aner.ori.train.dst")

# Fail fast if the two files are out of step; every later cell pairs them by position.
assert len(complex_train) == len(simple_train), (
    f"Parallel files differ in length: {len(complex_train)} vs {len(simple_train)}"
)

len(complex_train), len(simple_train)


(296402, 296402)

In [None]:
# One row per sentence pair: the original sentence and its simplified version.
train_df = pd.DataFrame(dict(complex=complex_train, simple=simple_train))

# Preview the first five rows.
train_df.head(5)


Unnamed: 0,complex,simple
0,There is manuscript evidence that Austen conti...,There is some proof that Austen continued to w...
1,"In a remarkable comparative analysis , Mandaea...",Mandaean scholar Säve-Söderberg showed that Ma...
2,"Before Persephone was released to Hermes , who...",When Demeter went to the Underworld to rescue ...
3,Cogeneration plants are commonly found in dist...,Cogeneration plants are commonly found in dist...
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",The city 's main newspaper is the Tribune de G...


In [None]:
# Half-open row range of train_df used as few-shot seed examples.
SEED_START, SEED_STOP = 1100, 1300

# Slice first, then convert: avoids materialising the whole column as a list
# just to keep a couple of hundred rows.
seed_rows = train_df.iloc[SEED_START:SEED_STOP]

# Keep only pairs where both sides are non-blank. The seed loader skips blank
# lines, so a blank sentence on one side would shift every later pair out of
# alignment when the two files are read back.
seed_pairs = [
    (c, s)
    for c, s in zip(seed_rows["complex"], seed_rows["simple"])
    if c.strip() and s.strip()
]
seed_complex = [c for c, _ in seed_pairs]
seed_simple = [s for _, s in seed_pairs]

# Write as UTF-8 explicitly, matching the encoding the seed loader reads with.
with open("complex_seed.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(seed_complex))

with open("simple_seed.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(seed_simple))


In [None]:
# --- Generation settings -----------------------------------------------------
MODEL_NAME        = "gpt-4.1-nano"   # or use "gpt-4.1-mini"
# NOTE(review): the file name says "50k" while TARGET_SAMPLES is 100000 —
# confirm which figure is intended before relying on either.
OUTPUT_PATH       = "/content/drive/MyDrive/synthetic_50k.jsonl"
TARGET_SAMPLES    = 100000
CHECKPOINT_EVERY  = 100              # intended interval (in samples) between checkpoints
SEED_EXAMPLES_K   = 5                # intended number of seed pairs shown per prompt

# Never hard-code the API key in the notebook: read it from the environment and
# fail immediately with a clear message instead of at the first API request.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment "
        "(e.g. os.environ['OPENAI_API_KEY'] = ...) before running this cell."
    )

client = OpenAI(api_key=_api_key)

# Load seeds
def load_seed_file(path):
    """Read a seed file and return its non-blank lines, stripped.

    Args:
        path: Location of a UTF-8 text file with one sentence per line.

    Returns:
        A list of stripped, non-empty lines in file order, or an empty list
        when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return []

    kept = []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            sentence = raw.strip()
            if sentence:
                kept.append(sentence)
    return kept

# Read the two parallel seed files; entry i of one pairs with entry i of the other.
complex_seed = load_seed_file("complex_seed.txt")
simple_seed  = load_seed_file("simple_seed.txt")

if len(complex_seed) == 0 or len(simple_seed) == 0:
    raise RuntimeError("Seed files complex_seed.txt / simple_seed.txt not found or empty.")

# The loader drops blank lines, so a length mismatch means at least one pair is
# no longer aligned. zip() would hide that by truncating silently, so stop here.
if len(complex_seed) != len(simple_seed):
    raise RuntimeError(
        "Seed files are misaligned: "
        f"{len(complex_seed)} complex vs {len(simple_seed)} simple sentences."
    )

def build_examples(k=5):
    """Render randomly chosen seed pairs as a few-shot example block.

    Args:
        k: Maximum number of pairs to include; capped at the number of
            available seed pairs.

    Returns:
        A string of ``Complex: ...`` / ``Simple: ...`` entries, each followed
        by a blank line.
    """
    candidates = list(zip(complex_seed, simple_seed))
    n_draw = min(k, len(candidates))
    chosen = random.sample(candidates, k=n_draw)
    return "".join(
        f"Complex: {source}\nSimple: {target}\n\n" for source, target in chosen
    )

# System message sent with every generation request. It fixes the task
# (produce one new complex/simple pair) and the exact "Complex:" / "Simple:"
# output layout that the response parser looks for.
SYSTEM_PROMPT = """
You generate NEW synthetic sentence simplification pairs.

Use the examples only to learn the FORMAT and relationship between
complex and simple sentences, NOT the topics or specific wording.

Requirements:
- Do NOT copy topics, phrases, or rare words from the examples.
- Each new pair should clearly be about the requested TOPIC.
- Complex sentence: one detailed sentence, formal / academic style.
- Simple sentence: shorter, clearer version with the same meaning.
- Do NOT summarize away important details; keep content equivalent.
- Vary vocabulary and sentence structure across examples.
- Avoid repeating the same uncommon words too often across generations.

Output format ONLY:
Complex: ...
Simple: ...
"""

# Pool of subject areas; one is picked at random for each generated pair so the
# synthetic data is spread across several domains.
TOPICS = [
    "public transportation in a big city",
    "online learning and education",
    "climate change and extreme weather",
    "hospital emergency room procedures",
    "social media and mental health",
    "sports training for young athletes",
    "space exploration missions",
    "financial planning for young adults",
    "tourism in historical cities",
    "workplace communication and teamwork",
    "food safety and nutrition",
    "wildlife conservation",
]

def _parse_pair(text):
    """Extract the first ``Complex:`` / ``Simple:`` pair from a model reply.

    Returns:
        ``(complex, simple)`` as stripped strings, or ``(None, None)`` when the
        reply is empty, a label is missing, ``Simple:`` does not follow
        ``Complex:``, or either sentence is blank.
    """
    if not text:
        return None, None

    # partition() splits on the first occurrence only and reports whether the
    # separator was found, which also enforces the Complex-then-Simple order.
    _, complex_label, after_complex = text.partition("Complex:")
    comp, simple_label, after_simple = after_complex.partition("Simple:")
    if not complex_label or not simple_label:
        return None, None

    # If the model returned several pairs, cut the simple sentence off at the
    # next "Complex:" label so the second pair does not leak into this one.
    simp = after_simple.partition("Complex:")[0]

    comp, simp = comp.strip(), simp.strip()
    if not comp or not simp:
        return None, None
    return comp, simp


def generate_pair(topic: str):
    """Generate ONE complex/simple pair on a given topic.

    Builds a few-shot prompt from random seed pairs, asks the chat model for a
    single new pair about ``topic`` and parses the reply.

    Args:
        topic: Subject the generated sentences should be about.

    Returns:
        ``(complex, simple)`` strings, or ``(None, None)`` when the reply is
        empty or does not follow the required output format.

    Raises:
        Whatever the API client raises for a failed request; such errors are
        not caught here.
    """
    # Use the configured shot count instead of the helper's built-in default.
    examples = build_examples(k=SEED_EXAMPLES_K)
    user_content = (
        examples
        + f"\nNow generate ONE NEW pair on the topic: {topic}.\n"
        + "Do not mention the word 'topic' in the sentences.\n"
        + "Follow the required output format exactly.\n"
    )

    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        temperature=0.7,  # higher for more variety
    )

    # content can be None (no text returned); the parser treats that as malformed.
    return _parse_pair(resp.choices[0].message.content)


In [None]:
N_SAMPLES   = 30000  # number of records wanted in OUTPUT_PATH

# Resume support: records already in the output file count toward N_SAMPLES, so
# re-running this cell after an interruption continues instead of discarding
# (and paying again for) earlier generations.
already_done = 0
if os.path.exists(OUTPUT_PATH):
    with open(OUTPUT_PATH, encoding="utf-8") as prev_f:
        already_done = sum(1 for line in prev_f if line.strip())

# Append mode keeps existing records; the range is empty once the target is met.
with open(OUTPUT_PATH, "a", encoding="utf-8") as out_f:
    progress = tqdm(
        range(already_done, N_SAMPLES),
        desc=f"Generating {N_SAMPLES:,} synthetic pairs",
        initial=already_done,
        total=N_SAMPLES,
    )
    for i in progress:
        comp, simp = generate_pair(random.choice(TOPICS))
        if comp and simp:
            out_f.write(json.dumps({"complex": comp, "simple": simp}, ensure_ascii=False) + "\n")
        else:
            print(f"Skipped one malformed generation at i={i}") # if fail case

        # Push buffered records to disk periodically so an abrupt runtime
        # shutdown loses at most CHECKPOINT_EVERY iterations of work.
        if (i + 1) % CHECKPOINT_EVERY == 0:
            out_f.flush()

        time.sleep(0.05)
