<a href="https://colab.research.google.com/github/Dhwani123p/MediScripts/blob/main/ML-model/ner/data/generating_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random
import os

# ===============================
# 1. Load real medicine names
# ===============================

def load_drug_list(path):
    """Read medicine names from a UTF-8 text file, one name per line.

    Leading/trailing whitespace is stripped from every line, and lines that
    are empty after stripping are skipped.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: The non-blank, stripped lines in file order.
    """
    names = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

# Path to the drug list: a UTF-8 text file with one medicine name per line.
# The path is relative, so it is resolved against the current working directory.
DRUG_LIST_PATH = "drug_list.txt"

# Loaded once, when this cell/module runs; the file must exist at that point.
DRUGS = load_drug_list(DRUG_LIST_PATH)

# ===============================
# 2. Controlled vocabularies
# ===============================

# Dose options as (number, unit) string pairs; the generators tag the number
# B-DOSE and the unit I-DOSE.
DOSES = [
    ("250", "mg"), ("500", "mg"), ("650", "mg"),
    ("10", "mg"), ("20", "mg"), ("40", "mg")
]

# English frequency phrases, pre-tokenised: one list of tokens per phrase.
# The first token of a phrase is tagged B-FREQ, the remaining ones I-FREQ.
FREQS_EN = [
    ["once", "daily"],
    ["twice", "a", "day"],
    ["thrice", "daily"],
    ["every", "six", "hours"],
    ["at", "night"]
]

# English duration phrases (tagged B-DUR / I-DUR); every entry here is two tokens.
DURS_EN = [
    ["3", "days"],
    ["5", "days"],
    ["7", "days"]
]

# English phrases emitted under the ROUTE label (B-ROUTE / I-ROUTE);
# every entry here is two tokens.
ROUTES_EN = [
    ["after", "food"],
    ["before", "breakfast"],
    ["after", "meals"]
]

# Romanised-Hindi frequency phrases, pre-tokenised (B-FREQ then I-FREQ).
FREQS_HI = [
    ["din", "mein", "ek", "baar"],
    ["din", "mein", "do", "baar"],
    ["roz", "ek", "baar"],
    ["raat", "ko"]
]

# Romanised-Hindi duration phrases (B-DUR / I-DUR); every entry here is two tokens.
DURS_HI = [
    ["3", "din"],
    ["5", "din"],
    ["7", "din"]
]

# Romanised-Hindi phrases emitted under the ROUTE label;
# every entry here is three tokens.
ROUTES_HI = [
    ["khane", "ke", "baad"],
    ["khane", "se", "pehle"]
]

# ===============================
# 3. Templates
# ===============================

# Slot sequences for English sentences. generate_english_sentence() picks one
# template at random and fills each slot name, in order, from the matching
# vocabulary. Every template starts with DRUG followed by DOSE.
TEMPLATES_EN = [
    ["DRUG", "DOSE", "FREQ", "DUR", "ROUTE"],
    ["DRUG", "DOSE", "FREQ", "DUR"],
    ["DRUG", "DOSE", "FREQ"],
    ["DRUG", "DOSE", "ROUTE"]
]

# ===============================
# 4. English sentence generator
# ===============================

def generate_english_sentence():
    """Build one synthetic English prescription line as BIO-tagged tokens.

    A slot template is drawn from ``TEMPLATES_EN`` and each slot is filled
    with a random entry from the matching vocabulary (``DRUGS``, ``DOSES``,
    ``FREQS_EN``, ``DURS_EN``, ``ROUTES_EN``). Within every slot the first
    token is labelled ``B-<SLOT>`` and each following token ``I-<SLOT>``.

    Returns:
        list[tuple[str, str]]: ``(token, label)`` pairs in sentence order.

    Raises:
        IndexError: If a vocabulary that has to be sampled is empty
            (raised by ``random.choice``).
    """
    template = random.choice(TEMPLATES_EN)
    conll = []

    for slot in template:
        if slot == "DRUG":
            # A drug name may contain several words. Each CoNLL line is
            # "token label" separated by whitespace, so the name has to be
            # split into one token per word instead of being written whole.
            tokens = random.choice(DRUGS).split()
        elif slot == "DOSE":
            tokens = random.choice(DOSES)
        elif slot == "FREQ":
            tokens = random.choice(FREQS_EN)
        elif slot == "DUR":
            tokens = random.choice(DURS_EN)
        elif slot == "ROUTE":
            tokens = random.choice(ROUTES_EN)
        else:
            # Unrecognised slot names contribute no tokens.
            continue

        # Tag every token of the entry, whatever its length, so vocabulary
        # entries are not silently truncated or indexed out of range.
        for position, token in enumerate(tokens):
            prefix = "B" if position == 0 else "I"
            conll.append((token, f"{prefix}-{slot}"))

    return conll

# ===============================
# 5. Hindi (Roman) sentence generator
# ===============================

def generate_hindi_sentence():
    """Build one synthetic romanised-Hindi prescription line as BIO-tagged tokens.

    The sentence always has the fixed slot order DRUG, DOSE, FREQ, DUR, ROUTE,
    filled with random entries from ``DRUGS``, ``DOSES``, ``FREQS_HI``,
    ``DURS_HI`` and ``ROUTES_HI``. Within every slot the first token is
    labelled ``B-<SLOT>`` and each following token ``I-<SLOT>``.

    Returns:
        list[tuple[str, str]]: ``(token, label)`` pairs in sentence order.

    Raises:
        IndexError: If any of the sampled vocabularies is empty
            (raised by ``random.choice``).
    """
    # The list literal is evaluated left to right, so the random draws happen
    # in slot order (drug, dose, frequency, duration, route).
    filled_slots = [
        # A drug name may contain several words; split it so that every
        # emitted token is free of whitespace, as the CoNLL format requires.
        ("DRUG", random.choice(DRUGS).split()),
        ("DOSE", random.choice(DOSES)),
        ("FREQ", random.choice(FREQS_HI)),
        ("DUR", random.choice(DURS_HI)),
        ("ROUTE", random.choice(ROUTES_HI)),
    ]

    conll = []
    for entity, tokens in filled_slots:
        # Tag every token of the entry, whatever its length, instead of
        # relying on fixed positions within the phrase.
        for position, token in enumerate(tokens):
            prefix = "B" if position == 0 else "I"
            conll.append((token, f"{prefix}-{entity}"))

    return conll

# ===============================
# 6. Write .conll file
# ===============================

def write_conll_file(filename, n_en=600, n_hi=400):
    """Generate synthetic sentences and write them to a CoNLL-style file.

    The file is opened for writing as UTF-8 (replacing any existing content).
    All English sentences are written first, followed by all Hindi ones. Each
    token goes on its own ``token label`` line and every sentence is followed
    by a blank line.

    Args:
        filename: Path of the output file.
        n_en: Number of English sentences to generate.
        n_hi: Number of Hindi sentences to generate.
    """
    # (sentence generator, how many sentences to request), in output order.
    batches = (
        (generate_english_sentence, n_en),
        (generate_hindi_sentence, n_hi),
    )

    with open(filename, "w", encoding="utf-8") as out:
        for make_sentence, count in batches:
            for _ in range(count):
                rows = [f"{token} {label}\n" for token, label in make_sentence()]
                out.write("".join(rows))
                # Blank line marks the end of the sentence.
                out.write("\n")

# ===============================
# 7. Run generator
# ===============================

if __name__ == "__main__":
    # Relative path: the dataset is written to the current working directory.
    output_file = "synthetic_train.conll"
    # 60 English + 40 Hindi sentences, i.e. fewer than the function defaults (600/400).
    write_conll_file(output_file, n_en=60, n_hi=40)
    print(f"✅ Synthetic dataset generated: {output_file}")


✅ Synthetic dataset generated: synthetic_train.conll
