In [None]:
import pandas as pd
import random
import os

# Progress banner so the start of the run is visible in the console/notebook.
# The rocket emoji replaces a mis-encoded byte sequence (UTF-8 read as cp1252).
print("🚀 Generating 1200 synthetic stance comments...")

# Create a "dataset" folder relative to the current working directory;
# exist_ok=True makes re-running the script harmless.
os.makedirs("dataset", exist_ok=True)

# Catalogue of policies that comments are generated about.
# Each entry is a (policy_id, policy_title, domain) triple; the title is
# embedded verbatim in the generated comment text, so it must be clean,
# correctly encoded text (the dashes below are real en dashes, U+2013,
# replacing a mis-encoded "â€“" sequence).
policies = [
    ("P001", "Sukanya Samriddhi Account Scheme", "Financial_Inclusion"),
    ("P002", "NPEGEL – Girls Education Initiative", "Girls_Education"),
    ("P003", "RTE Act 12(1)(c) – EWS Quota", "School_Access"),
    ("P004", "Kasturba Gandhi Balika Vidyalaya (KGBV)", "Residential_Girls_Education"),
    ("P005", "Mid-Day Meal Scheme (MDM)", "School_Nutrition"),
    ("P006", "Swachh Survekshan Grameen / Swachh Bharat", "Sanitation_Rural"),
    ("P007", "National Education Policy 2020 (NEP 2020)", "Education_Reform"),
    ("P008", "Ayushman Bharat – PM-JAY Health Insurance", "Public_Health"),
    ("P009", "Digital India Mission", "Digital_Governance"),
    ("P010", "PM-KISAN Income Support Scheme", "Farmer_Support"),
    ("P011", "MGNREGA – Rural Employment Guarantee", "Employment"),
    ("P012", "PM Awas Yojana (PMAY) – Housing for All", "Housing"),
    ("P013", "Atal Pension Yojana (APY)", "Social_Security"),
    ("P014", "National Food Security Act / PDS", "Food_Security"),
    ("P015", "Beti Bachao Beti Padhao (BBBP)", "Women_Child"),
    ("P016", "Jal Jeevan Mission (JJM)", "Drinking_Water"),
    ("P017", "Smart Cities Mission", "Urban_Infra"),
    ("P018", "Data Protection / DPDP Act", "Data_Protection"),
    ("P019", "Electric Vehicle (EV) Policy & FAME-II", "Clean_Mobility"),
    ("P020", "Goods and Services Tax (GST)", "Tax_Economy"),
]

# Location pool: every state listed here is paired with exactly one district.
_district_by_state = {
    "Delhi": "New Delhi",
    "Uttar Pradesh": "Lucknow",
    "Maharashtra": "Mumbai",
    "Karnataka": "Bengaluru",
    "Tamil Nadu": "Chennai",
    "Rajasthan": "Jaipur",
    "Assam": "Guwahati",
    "Punjab": "Ludhiana",
    "Haryana": "Gurugram",
    "West Bengal": "Kolkata",
}

# Flattened to a list of (state, district) tuples, in the order declared above,
# so a single random.choice() yields a matching state/district pair.
states_districts = list(_district_by_state.items())

# Sentiment label recorded alongside each stance label.
sentiment_map = {
    "Support": "Positive",
    "Oppose": "Negative",
    "Neutral": "Neutral",
}

# The stance vocabulary is exactly the keys of the mapping, in declaration order.
stance_labels = list(sentiment_map)

# Who the comment is attributed to; values are pluralized for display in the text.
respondent_types = [
    "citizen",
    "parent",
    "student",
    "farmer",
    "teacher",
    "shopkeeper",
    "employee",
    "official",
    "NGO_rep",
]

# Channel the comment is tagged as coming from.
source_types = [
    "news_comment",
    "play_store_review",
    "survey_feedback",
    "social_media_post",
]

# Core opinion phrases, grouped by stance label. Each phrase is written to
# follow a policy title directly ("<policy title> has ..."/"is ..."), so it
# starts with a verb and carries no trailing punctuation.
core_comments = dict(
    Support=[
        "has genuinely improved access for ordinary people",
        "is a step in the right direction and should be strengthened further",
        "has made real difference on the ground in our area",
        "reduced our daily struggles and brought some relief",
        "helps poor and middle-class families who had no support earlier",
        "is one of the few schemes that actually reaches villages",
    ],
    Oppose=[
        "looks good on paper but implementation is very weak",
        "has too much paperwork and corruption at local level",
        "is not reaching the actual beneficiaries in our district",
        "has created more confusion than benefit for common people",
        "feels more like a political advertisement than a real solution",
        "has many loopholes and people are being harassed instead of helped",
    ],
    Neutral=[
        "has mixed results and people are still trying to understand its impact",
        "seems useful for some groups but others are left out",
        "needs proper monitoring and transparency before we can judge it",
        "is still new and long-term results are not visible yet",
        "could become effective if local officials take it seriously",
        "has potential but the government needs to fix practical issues",
    ],
)

# Sentence openers. They are filled in with str.format(); the available
# placeholders are {district} and {respondent_type_plural}, and a template
# may use either, both, or only {district}.
_PREFIX_CHOICES = (
    "In {district}, many {respondent_type_plural} feel that",
    "From the perspective of {respondent_type_plural} in {district},",
    "According to feedback collected in {district},",
    "On the ground in {district}, it is observed that",
    "People in {district} often say that",
)
prefix_templates = list(_PREFIX_CHOICES)

# Sentence closers appended after the core phrase. The final empty entry
# means "no closer": the comment then ends right after the core phrase.
_SUFFIX_CHOICES = (
    "based on their day-to-day experience.",
    "when asked in local meetings and discussions.",
    "as reflected in recent conversations on social media.",
    "according to informal surveys done by community groups.",
    "whenever the topic comes up in public meetings.",
    "",
)
suffix_templates = list(_SUFFIX_CHOICES)

def pluralize(rt: str) -> str:
    """Return the plural display form of a respondent-type label.

    The rules are a small English heuristic, not a full inflection engine:

    * ``"NGO_rep"`` is an identifier rather than a word and maps to
      ``"NGO representatives"``.
    * consonant + ``y`` becomes ``ies`` (``"beneficiary"`` -> ``"beneficiaries"``);
      vowel + ``y`` just takes ``s`` (``"attorney"`` -> ``"attorneys"``).
    * words ending in ``s``, ``x``, ``ch`` or ``sh`` take ``es``.
    * everything else takes ``s``.

    Args:
        rt: Singular respondent-type label, e.g. ``"farmer"``.

    Returns:
        The pluralized label for use inside a sentence.
    """
    # Handle the identifier-style label first so no suffix rule can touch it.
    if rt == "NGO_rep":
        return "NGO representatives"

    if rt.endswith("y"):
        # Only a consonant before the "y" triggers the "-ies" form; a bare
        # "y" keeps the historical "-ies" result as well.
        if len(rt) < 2 or rt[-2].lower() not in "aeiou":
            return rt[:-1] + "ies"
        return rt + "s"

    # Sibilant endings need "-es" to stay pronounceable ("coach" -> "coaches").
    if rt.endswith(("s", "x", "ch", "sh")):
        return rt + "es"

    return rt + "s"

# How many synthetic comments to produce, and the list that collects them.
target_rows = 1200
rows = []

for _ in range(target_rows):
    # Every attribute is drawn independently and uniformly from its pool.
    policy_id, policy_title, domain = random.choice(policies)
    state, district = random.choice(states_districts)
    stance = random.choice(stance_labels)
    respondent = random.choice(respondent_types)
    src_type = random.choice(source_types)

    # The core phrase must agree with the stance; opener and closer are free.
    core = random.choice(core_comments[stance])
    opener = random.choice(prefix_templates).format(
        district=district,
        respondent_type_plural=pluralize(respondent),
    )
    closer = random.choice(suffix_templates)

    # Sentence layout: "<opener> <policy title> <core phrase> <closer>".
    # Splitting on whitespace and re-joining trims the ends and collapses
    # double spaces (e.g. the trailing gap left by an empty closer).
    pieces = (opener, policy_title, core, closer)
    comment = " ".join(" ".join(pieces).split())

    rows.append(
        dict(
            policy_id=policy_id,
            policy_title=policy_title,
            domain=domain,
            state=state,
            district=district,
            source_type=src_type,
            respondent_type=respondent,
            comment_text=comment,
            stance_label=stance,
            # Sentiment is derived from the stance, never sampled on its own.
            sentiment_label=sentiment_map[stance],
        )
    )

# Turn the list of row dictionaries into a table; the columns follow the key
# order used when the rows were built.
df = pd.DataFrame(rows)

# NOTE(review): this is a machine-specific absolute path. It is kept as-is so
# the output location does not change; consider making it configurable.
out_path = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\synthetic_policy_comments_1200.csv"

# The script only guarantees a "dataset" folder relative to the working
# directory, which is not necessarily the folder written to here. Create the
# real parent folder so to_csv() does not fail on a missing directory.
out_dir = os.path.dirname(out_path)
if out_dir:  # empty when the path has no directory component on this platform
    os.makedirs(out_dir, exist_ok=True)

# index=False keeps the DataFrame's positional index out of the CSV.
df.to_csv(out_path, index=False)

# Status messages; the check mark replaces a mis-encoded byte sequence.
print(f"✅ Synthetic dataset generated: {out_path}")
print(f"Total rows: {len(df)}")


🚀 Generating 1200 synthetic stance comments...
✅ Synthetic dataset generated: C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\synthetic_policy_comments_1200.csv
Total rows: 1200
