In [2]:
import pandas as pd
import random
import os

# =========================
#  DATASET GENERATOR SCRIPT
# =========================

# Announce the start of the run.
print("Generating dataset...")

# The CSV is written under ./data, so create that folder up front;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(name="data", exist_ok=True)

# Policy definitions based on uploaded reports.
# Each entry is a (policy_id, policy_title, domain) tuple; the policy_id is
# also the key used to look up sample comments for that policy.
# The titles use a real en dash (U+2013); keep this file saved as UTF-8 so
# the character is not garbled into mojibake in the generated CSV.
policies = [
    ("P001", "Sukanya Samriddhi Account Scheme", "Financial_Inclusion"),
    ("P002", "National Programme for Education of Girls at the Elementary Level (NPEGEL)", "Girls_Education"),
    ("P003", "RTE Act Section 12(1)(c) – EWS Quota in Private Schools", "School_Access_Equity"),
    ("P004", "Kasturba Gandhi Balika Vidyalaya (KGBV) Scheme", "Residential_Girls_Education"),
    ("P005", "Mid-Day Meal Scheme (MDM)", "School_Nutrition"),
    ("P006", "Swachh Survekshan Grameen 2022 – Rural Sanitation Survey", "Sanitation_Rural"),
    ("P007", "National Education Policy 2020 (NEP 2020)", "Education_Reform"),
]

# Example locations as (state, district) pairs.
# Written grouped by state and flattened in insertion order, so the
# resulting list keeps one tuple per district.
state_districts = [
    (state, district)
    for state, districts in {
        "Haryana": ["Rohtak"],
        "Assam": ["Udalguri"],
        "Delhi": ["New Delhi", "South Delhi"],
        "Madhya Pradesh": ["Bhopal", "Indore"],
        "Karnataka": ["Bengaluru Urban", "Mysuru"],
        "Uttar Pradesh": ["Lucknow", "Varanasi"],
        "Rajasthan": ["Jaipur", "Udaipur"],
        "Chhattisgarh": ["Raipur"],
        "Tamil Nadu": ["Chennai", "Madurai", "Coimbatore"],
    }.items()
    for district in districts
]

# Stance labels, and the sentiment label paired with each stance
# (Support -> Positive, Oppose -> Negative, Neutral -> Neutral).
stance_labels = ["Support", "Oppose", "Neutral"]
sentiment_map = dict(zip(stance_labels, ["Positive", "Negative", "Neutral"]))

# Kinds of source a comment can come from; each one is also a key of the
# source-name lookup table.
source_types = [
    "survey_report",
    "citizen_interview",
    "focus_group",
    "key_informant",
]

# Roles a respondent can have.
respondent_types = [
    "parent",
    "student",
    "girl_student",
    "teacher",
    "headmaster",
    "official",
    "investor",
    "citizen",
    "NGO_rep",
]

# Source names, keyed by source type: for each kind of source, the list of
# concrete source names a row of that type can be attributed to.
# The names use a real en dash (U+2013); keep this file saved as UTF-8 so
# the character is not garbled into mojibake in the generated CSV.
source_names = {
    "survey_report": [
        "Survey Report – RTE", "Survey – School Meals", "Swachh Bharat Survey",
        "KGBV Field Evaluation", "NPEGEL Impact Study",
    ],
    "citizen_interview": [
        "Interview – Parent", "Interview – Student", "Interview – Teacher", "Interview – Community",
    ],
    "focus_group": [
        "FGD with Mothers", "FGD with Girls", "FGD with SMC", "FGD with Panchayat",
    ],
    "key_informant": [
        "Interview – Headmaster", "Interview – Education Officer",
        "Interview – District Coordinator", "Interview – NGO Leader",
    ],
}

# Sample comment texts, looked up as comments[policy_id][stance].
# Every policy has the same three stances, with two candidate comments each.
comments = {
    # P001: Sukanya Samriddhi Account Scheme
    "P001": dict(
        Support=[
            "The scheme gives us a safe way to save for our daughters' education.",
            "High interest rate makes this a good option for securing my girl's future.",
        ],
        Oppose=[
            "The long lock-in period makes the scheme impractical for many families.",
            "Documentation and bank procedures are too complicated for rural parents.",
        ],
        Neutral=[
            "The scheme is useful but more awareness camps are required.",
            "People have accounts but many do not understand the conditions.",
        ],
    ),
    # P002: NPEGEL
    "P002": dict(
        Support=[
            "NPEGEL helped tribal girls attend school more regularly.",
            "Community mobilization meetings changed parents' attitudes.",
        ],
        Oppose=[
            "The programme is active only on paper in our cluster.",
            "Funds are not reaching remote habitations.",
        ],
        Neutral=[
            "Teachers attended training but behaviour has changed only slightly.",
            "The programme has potential if monitoring improves.",
        ],
    ),
    # P003: RTE Act Section 12(1)(c), EWS quota
    "P003": dict(
        Support=[
            "RTE opened the doors of good private schools for poor children.",
            "Parents feel proud that their children study in reputed schools.",
        ],
        Oppose=[
            "Private schools resist admitting EWS children and create hidden costs.",
            "Reimbursement delays make implementation difficult.",
        ],
        Neutral=[
            "Awareness is still low in many neighbourhoods.",
            "Admission procedures are confusing for new applicants.",
        ],
    ),
    # P004: KGBV Scheme
    "P004": dict(
        Support=[
            "KGBV hostel allows girls from remote villages to continue education.",
            "Parents feel their daughters are safe and supervised.",
        ],
        Oppose=[
            "Infrastructure is inadequate, especially toilets and bathrooms.",
            "Staff vacancies create pressure on existing wardens.",
        ],
        Neutral=[
            "Life skills sessions should be more frequent.",
            "Some KGBVs need urgent repairs.",
        ],
    ),
    # P005: Mid-Day Meal Scheme
    "P005": dict(
        Support=[
            "Mid-day meals increased attendance in our school.",
            "Children from poor families depend on the meal for nutrition.",
        ],
        Oppose=[
            "Quality of food is inconsistent and sometimes poor.",
            "Kitchen facilities are insufficient.",
        ],
        Neutral=[
            "Menu is simple; children want more variety.",
            "Monitoring should be strengthened at village level.",
        ],
    ),
    # P006: Swachh Survekshan Grameen 2022
    "P006": dict(
        Support=[
            "Cleanliness drive reduced open defecation in our village.",
            "Women report better dignity after toilets were built.",
        ],
        Oppose=[
            "Many toilets are unused due to water shortage.",
            "Waste disposal systems exist but are poorly maintained.",
        ],
        Neutral=[
            "Awareness is improving but habits change slowly.",
            "Survey scores do not fully reflect ground issues.",
        ],
    ),
    # P007: National Education Policy 2020
    "P007": dict(
        Support=[
            "NEP 2020 emphasizes critical thinking over rote learning.",
            "Flexible subject choices will help students.",
        ],
        Oppose=[
            "Implementation roadmap is unclear at ground level.",
            "Rural schools lack infrastructure for the new curriculum.",
        ],
        Neutral=[
            "People have heard about NEP but details are unclear.",
            "Results depend on funding and coordination.",
        ],
    ),
}

# Generate dataset: 500 rows, each assembled from independent random picks
# out of the lookup tables defined earlier in this script.
# NOTE: no random seed is set in this cell, so each run produces different rows.
rows = []
for i in range(500):
    policy_id, policy_title, domain = random.choice(policies)
    state, district = random.choice(state_districts)
    stance = random.choice(stance_labels)
    # Sentiment is derived from the stance, not sampled separately.
    sentiment = sentiment_map[stance]
    source_type = random.choice(source_types)
    # The source name is drawn only from names belonging to the chosen type.
    source_name = random.choice(source_names[source_type])
    respondent = random.choice(respondent_types)
    # The comment text always matches both the chosen policy and the stance.
    comment = random.choice(comments[policy_id][stance])

    rows.append({
        "policy_id": policy_id,
        "policy_title": policy_title,
        "domain": domain,
        "state": state,
        "district": district,
        "source_type": source_type,
        "source_name": source_name,
        "respondent_type": respondent,
        "comment_text": comment,
        "stance_label": stance,
        "sentiment_label": sentiment,
    })

df = pd.DataFrame(rows)

# Write without the index column. The encoding is spelled out (UTF-8 is
# already the pandas default) because the data can contain non-ASCII text.
df.to_csv("data/policy_comment_dataset_500.csv", index=False, encoding="utf-8")

# Status messages: the emoji and arrow were previously stored as mojibake.
print("\nDataset Created Successfully! 🎉")
print("Files saved to /data folder:")
print(" → policy_comment_dataset_500.csv")


Generating dataset...

Dataset Created Successfully! 🎉
Files saved to /data folder:
 → policy_comment_dataset_500.csv
