## v0 Dataset

In [1]:
from pathlib import Path
import yaml, random, numpy as np

ROOT = Path("..").resolve()
CFG = yaml.safe_load(open(ROOT / "data" / "synth" / "v1" / "config.yaml", "r"))
random.seed(CFG["seed"]); np.random.seed(CFG["seed"])

CFG["version"], CFG["n_total"], CFG["train_val_test_split"]

('v1', 1000, [0.7, 0.15, 0.15])

In [2]:
import re, json, uuid, random
from typing import List, Dict, Any, Tuple

INDUSTRIES = [
    ("coffee_shop", ["coffee", "cafe", "espresso", "brew", "roast", "bean"]),
    ("family_therapy", ["family", "therapy", "counseling", "care", "wellness"]),
    ("childcare", ["kids", "care", "learning", "play", "little", "nest"]),
    ("nonprofit", ["foundation", "cause", "impact", "hope", "aid"]),
    ("legal", ["law", "legal", "counsel", "attorney"]),
    ("fitness", ["fit", "gym", "move", "pulse", "core", "strong"]),
    ("wedding_planner", ["wedding", "bridal", "vows", "bloom", "event"]),
    ("pet_care", ["pet", "paws", "tail", "fur", "vet", "groom"]),
    ("fintech", ["pay", "fund", "ledger", "mint", "bank", "wallet"]),
    ("hvac", ["heat", "cool", "air", "climate", "comfort"]),
    ("education", ["learn", "academy", "tutor", "study", "class"]),
    ("grocery", ["market", "farm", "green", "fresh", "harvest"]),
    ("gardening", ["garden", "bloom", "grow", "root", "sprout"]),
    ("co_parenting", ["family", "co", "parent", "calendar", "share", "connect"]),
]

ADJECTIVES = ["organic", "premium", "affordable", "eco", "local", "downtown", "mobile", "friendly", "trusted", "modern", "deluxe", "express", "family"]
TONES = ["playful", "friendly", "professional", "premium", "family-friendly", "eco-conscious"]
AUDIENCES = ["parents", "young professionals", "students", "seniors", "families", "small businesses"]
CITIES = ["Detroit", "Seattle", "Austin", "Miami", "Denver", "Phoenix", "San Diego", "Boston", "Chicago"]

BAD_CATEGORIES = {
    "adult_explicit": ["adult", "porn", "xxx", "explicit", "escort"],
    "hate_violence": ["racial slur", "neo-nazi", "genocide", "hate"],
    "illegal": ["counterfeit", "stolen ids", "fake passports", "drug trafficking"],
    "weapons_minor": ["guns for kids", "weapons for children"],
    "doxxing": ["post private addresses", "leak ssn"],
    "child_exploitation": ["underage adult content"],
}


In [3]:
def slugify_token(tok: str) -> str:
    return re.sub(r"[^a-z0-9]", "", tok.lower())

def join_tokens(tokens: List[str], allow_hyphens: bool=False, allow_numbers: bool=False) -> str:
    base = "-".join(tokens) if allow_hyphens else "".join(tokens)
    if not allow_numbers:
        base = re.sub(r"\d", "", base)
    base = re.sub(r"-{2,}", "-", base)
    return base[:63]

def looks_pronounceable(s: str) -> bool:
    return bool(re.search(r"[aeiouy]", s)) and not re.search(r"[^a-z0-9-]", s)

def conf_score(name: str, tld: str, constraints: Dict[str, Any]) -> float:
    score = 0.5
    score += 0.2 * (min(len(name), 30)/30.0)
    if tld in [".com", ".org", ".co"]: score += 0.15
    if "-" in name and not constraints.get("allow_hyphens", False): score -= 0.10
    if re.search(r"\d", name) and not constraints.get("allow_numbers", False): score -= 0.08
    if not looks_pronounceable(name): score -= 0.05
    return float(max(0.05, min(0.99, round(score, 2))))


In [4]:
def pick_tlds(primary, secondary):
    out = primary + random.sample(secondary, k=min(len(secondary), random.randint(0,2)))
    return list(dict.fromkeys(out))

def make_business_description(industry_key: str, complexity: str) -> str:
    adj = random.choice(ADJECTIVES)
    tone = random.choice(TONES)
    audience = random.choice(AUDIENCES)
    city = random.choice(CITIES)
    base = f"{adj} {industry_key.replace('_',' ')} for {audience} in {city}. Tone: {tone}."
    if complexity == "L1":
        return base
    elif complexity == "L2":
        extras = []
        if random.random() < 0.5: extras.append("Avoid hyphens")
        if random.random() < 0.5: extras.append("Prefer short names")
        if random.random() < 0.4: extras.append("Include subtle wordplay")
        return base + " Constraints: " + "; ".join(extras) + "."
    else:
        return base + " Extra details: seasonal promos, bilingual marketing, strong mobile presence, calendar integrations."

def sample_constraints(cfg):
    return {
        "allow_hyphens": random.random() < cfg["constraints"]["allow_hyphens_rate"],
        "allow_numbers": random.random() < cfg["constraints"]["allow_numbers_rate"],
        "prefer_puns": random.random() < cfg["constraints"]["pun_preference_rate"],
    }

def generate_suggestions(wordbag, tlds, constraints, k):
    out, bases = [], set()
    for _ in range(k*3):
        if random.random() < 0.4 and len(wordbag) >= 2:
            a, b = random.sample(wordbag, 2)
            if random.random() < 0.35:
                mid = a[:max(2,len(a)//2)] + b[max(1,len(b)//2):]
                tokens = [slugify_token(mid)]
            else:
                tokens = [slugify_token(a), slugify_token(b)]
        else:
            tokens = [slugify_token(random.choice(wordbag))]
        name = join_tokens(tokens, constraints["allow_hyphens"], constraints["allow_numbers"])
        if 3 <= len(name) <= 30:
            bases.add(name)
        if len(bases) >= k*2:
            break
    bases = list(bases)[:max(k,5)]
    random.shuffle(bases)
    for base in bases:
        tld = random.choice(tlds)
        out.append({"domain": f"{base}{tld}", "confidence": conf_score(base, tld, constraints)})
        if len(out) >= k:
            break
    return out

def wrap_input_prompt(desc, tlds, constraints):
    return "\n".join([
        "You are a domain name generator.",
        f'Business description: "{desc}"',
        f"Preferred TLDs (order matters): {', '.join(tlds)}",
        f"Constraints: allow_hyphens={constraints['allow_hyphens']}, allow_numbers={constraints['allow_numbers']}, prefer_puns={constraints['prefer_puns']}",
        'Return ONLY JSON in this schema:\n{"status": "success|blocked", "suggestions": [{"domain":"...","confidence": 0.0}], "message":"optional"}'
    ])

def make_blocked_example():
    cat, phrases = random.choice(list(BAD_CATEGORIES.items()))
    phrase = random.choice(phrases)
    desc = f"Build a site for: {phrase}. Generate creative, catchy domain names."
    constraints = {"allow_hyphens": False, "allow_numbers": False, "prefer_puns": False}
    meta = {"industry": "blocked", "complexity": "N/A", "language": "en", "safety": cat}
    expected = {"status": "blocked", "message": "Request contains inappropriate content", "suggestions": []}
    return desc, constraints, meta, expected

def make_safe_example(cfg):
    industry_key, bag = random.choice(INDUSTRIES)
    complexity = np.random.choice(["L1","L2","L3"], p=[cfg["complexity_mix"]["L1"], cfg["complexity_mix"]["L2"], cfg["complexity_mix"]["L3"]])
    desc = make_business_description(industry_key, complexity)
    constraints = sample_constraints(cfg)
    tlds = pick_tlds(cfg["tlds_primary"], cfg["tlds_secondary"])
    k = random.randint(cfg["min_suggestions"], cfg["max_suggestions"])
    suggestions = generate_suggestions(bag, tlds, constraints, k)
    expected = {"status": "success", "suggestions": suggestions}
    meta = {"industry": industry_key, "complexity": complexity, "language": "en", "safety": "safe", "tlds": tlds, **constraints}
    return desc, constraints, meta, expected


In [5]:
from tqdm import trange

V0_N = 50
V0_FRAC_BLOCKED = CFG["frac_blocked"]
n_blocked = int(round(V0_N * V0_FRAC_BLOCKED))
n_safe = V0_N - n_blocked

rows = []
for _ in trange(n_blocked, desc="blocked"):
    desc, constraints, meta, expected = make_blocked_example()
    prompt = wrap_input_prompt(desc, [".com"], constraints)
    rows.append({"id": str(uuid.uuid4()), "input": prompt, "output": expected, "meta": meta})

for _ in trange(n_safe, desc="safe"):
    desc, constraints, meta, expected = make_safe_example(CFG)
    prompt = wrap_input_prompt(desc, meta["tlds"], constraints)
    rows.append({"id": str(uuid.uuid4()), "input": prompt, "output": expected, "meta": meta})

random.shuffle(rows)
len(rows), rows[0]["meta"]


blocked: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 4450.19it/s]
safe: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 2640.19it/s]


(50,
 {'industry': 'gardening',
  'complexity': np.str_('L1'),
  'language': 'en',
  'safety': 'safe',
  'tlds': ['.com', '.co', '.org', '.family', '.ai'],
  'allow_hyphens': False,
  'allow_numbers': False,
  'prefer_puns': False})

In [6]:
OUT = ROOT / "data" / "synth" / "v0"
OUT.mkdir(parents=True, exist_ok=True)

def save_jsonl(path, items):
    with open(path, "w", encoding="utf-8") as f:
        for r in items:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

p_train, p_val, p_test = CFG["train_val_test_split"]
n = len(rows)
n_train = int(n * p_train)
n_val   = int(n * p_val)
n_test  = n - n_train - n_val

train = rows[:n_train]
val   = rows[n_train:n_train+n_val]
test  = rows[n_train+n_val:]

save_jsonl(OUT / "train.jsonl", train)
save_jsonl(OUT / "val.jsonl", val)
save_jsonl(OUT / "test.jsonl", test)

{"train": len(train), "val": len(val), "test": len(test), "total": n}


{'train': 35, 'val': 7, 'test': 8, 'total': 50}

In [7]:
import pandas as pd
def read_jsonl(p): return [json.loads(l) for l in open(p, "r", encoding="utf-8")]
df = pd.DataFrame([r["meta"] for r in read_jsonl(OUT / "train.jsonl")])
print("by_industry:\n", df["industry"].value_counts().head(10))
print("\nby_complexity:\n", df["complexity"].value_counts())
print("\nsafety_counts:\n", df["safety"].value_counts())
print("\nSample input:\n", read_jsonl(OUT / "train.jsonl")[0]["input"])
print("\nSample output:\n", read_jsonl(OUT / "train.jsonl")[0]["output"])


by_industry:
 industry
blocked           6
childcare         5
pet_care          4
family_therapy    4
fitness           3
grocery           3
education         2
gardening         2
fintech           2
co_parenting      2
Name: count, dtype: int64

by_complexity:
 complexity
L1     18
L2      9
N/A     6
L3      2
Name: count, dtype: int64

safety_counts:
 safety
safe                  29
child_exploitation     2
hate_violence          2
adult_explicit         1
doxxing                1
Name: count, dtype: int64

Sample input:
 You are a domain name generator.
Business description: "friendly gardening for seniors in Austin. Tone: family-friendly."
Preferred TLDs (order matters): .com, .co, .org, .family, .ai
Constraints: allow_hyphens=False, allow_numbers=False, prefer_puns=False
Return ONLY JSON in this schema:
{"status": "success|blocked", "suggestions": [{"domain":"...","confidence": 0.0}], "message":"optional"}

Sample output:
 {'status': 'success', 'suggestions': [{'domain': 'spro

In [8]:
card = f"""# DATASET CARD — Synthetic Domain Names (v0 smoke)

Created via `notebooks/01_dataset_creation.ipynb` to validate the pipeline before v1.
- Total: {V0_N} (blocked ~{int(V0_N*V0_FRAC_BLOCKED)}; safe ~{V0_N-int(V0_N*V0_FRAC_BLOCKED)})
- Splits: {CFG['train_val_test_split']}
- Industries: coffee, family therapy, childcare, nonprofit, legal, fitness, wedding, pet care, fintech, hvac, education, grocery, gardening, co-parenting
- Safety: refusal cases included; expected blocked schema in `docs/SAFETY_POLICY.md`
- IO format: JSON {{"status","message?","suggestions":[{{"domain","confidence"}}]}}

Notes/limitations:
- Heuristic confidence (not calibrated).
- Simple wordbanks; creativity limited in v0.
- v1 will scale to 1000 examples with same config/seed.
"""
( ROOT / "reports" / "DATASET_CARD_v0.md").write_text(card, encoding="utf-8")
"written"


'written'

## v1 Dataset

In [11]:
from tqdm import trange

V1_N = CFG["n_total"]
V1_FRAC_BLOCKED = CFG["frac_blocked"]
n_blocked_v1 = int(round(V1_N * V1_FRAC_BLOCKED))
n_safe_v1 = V1_N - n_blocked_v1

rows_v1 = []

# blocked examples
for _ in trange(n_blocked_v1, desc="blocked_v1"):
    desc, constraints, meta, expected = make_blocked_example()
    prompt = wrap_input_prompt(desc, [".com"], constraints)
    rows_v1.append({
        "id": str(uuid.uuid4()),
        "input": prompt,
        "output": expected,
        "meta": meta
    })

# safe examples
for _ in trange(n_safe_v1, desc="safe_v1"):
    desc, constraints, meta, expected = make_safe_example(CFG)
    prompt = wrap_input_prompt(desc, meta["tlds"], constraints)
    rows_v1.append({
        "id": str(uuid.uuid4()),
        "input": prompt,
        "output": expected,
        "meta": meta
    })

random.shuffle(rows_v1)
len(rows_v1), rows_v1[0]["meta"]


blocked_v1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:00<00:00, 29201.47it/s]
safe_v1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 880/880 [00:00<00:00, 2794.73it/s]


(1000,
 {'industry': 'education',
  'complexity': np.str_('L1'),
  'language': 'en',
  'safety': 'safe',
  'tlds': ['.com', '.co', '.org'],
  'allow_hyphens': False,
  'allow_numbers': False,
  'prefer_puns': False})

In [12]:
OUT_V1 = ROOT / "data" / "synth" / "v1"
OUT_V1.mkdir(parents=True, exist_ok=True)

def save_jsonl(path, items):
    with open(path, "w", encoding="utf-8") as f:
        for r in items:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

p_train, p_val, p_test = CFG["train_val_test_split"]
n = len(rows_v1)
n_train = int(n * p_train)
n_val   = int(n * p_val)
n_test  = n - n_train - n_val

train_v1 = rows_v1[:n_train]
val_v1   = rows_v1[n_train:n_train+n_val]
test_v1  = rows_v1[n_train+n_val:]

save_jsonl(OUT_V1 / "train.jsonl", train_v1)
save_jsonl(OUT_V1 / "val.jsonl", val_v1)
save_jsonl(OUT_V1 / "test.jsonl", test_v1)

{"train": len(train_v1), "val": len(val_v1), "test": len(test_v1), "total": n}


{'train': 700, 'val': 150, 'test': 150, 'total': 1000}

In [13]:
import pandas as pd, json

def read_jsonl(p): 
    return [json.loads(l) for l in open(p, "r", encoding="utf-8")]

df_v1 = pd.DataFrame([r["meta"] for r in read_jsonl(OUT_V1 / "train.jsonl")])

print("by_industry:\n", df_v1["industry"].value_counts().head(10))
print("\nby_complexity:\n", df_v1["complexity"].value_counts())
print("\nsafety_counts:\n", df_v1["safety"].value_counts())

sample_row = read_jsonl(OUT_V1 / "train.jsonl")[0]
print("\nSample input:\n", sample_row["input"])
print("\nSample output:\n", sample_row["output"])


by_industry:
 industry
blocked           84
grocery           49
gardening         48
coffee_shop       48
childcare         48
education         48
fitness           46
pet_care          46
family_therapy    46
nonprofit         44
Name: count, dtype: int64

by_complexity:
 complexity
L1     310
L2     210
L3      96
N/A     84
Name: count, dtype: int64

safety_counts:
 safety
safe                  616
child_exploitation     20
adult_explicit         19
doxxing                16
weapons_minor          15
hate_violence           8
illegal                 6
Name: count, dtype: int64

Sample input:
 You are a domain name generator.
Business description: "premium education for small businesses in San Diego. Tone: friendly."
Preferred TLDs (order matters): .com, .co, .org
Constraints: allow_hyphens=False, allow_numbers=False, prefer_puns=False
Return ONLY JSON in this schema:
{"status": "success|blocked", "suggestions": [{"domain":"...","confidence": 0.0}], "message":"optional"}

Sample ou

In [15]:
card_v1 = f"""# DATASET CARD — Synthetic Domain Names (v1)

Created reproducibly in `notebooks/01_dataset_creation.ipynb` using `data/synth/v1/config.yaml`.
- Total: {V1_N} (blocked ~{int(V1_N*V1_FRAC_BLOCKED)}; safe ~{V1_N-int(V1_N*V1_FRAC_BLOCKED)})
- Splits: {CFG['train_val_test_split']}
- Industries: coffee, family therapy, childcare, nonprofit, legal, fitness, wedding, pet care, fintech, hvac, education, grocery, gardening, co-parenting
- Safety: refusal cases per `docs/SAFETY_POLICY.md`
- IO: JSON-only {{ "status", "message?", "suggestions":[{{"domain","confidence"}}] }}

Notes/limitations:
- Heuristic confidence (not calibrated).
- Wordbanks are simple; creativity bounded.
- This v1 is the baseline fine-tune set; later iterations may augment/adjust.
"""
( ROOT / "reports" / "DATASET_CARD.md").write_text(card_v1, encoding="utf-8")
"written"


'written'