# In [12]:
import pandas as pd
import numpy as np
import random
import os

# Reproducibility: seed both RNGs, since the script draws from the stdlib
# `random` module as well as from NumPy's global generator.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Constants
# Districts are listed urban first, then rural; the list order is kept because
# later code iterates over `districts` and indexes into it via random.choice.
urban_district_names = ["Kathmandu", "Lalitpur", "Bhaktapur"]
rural_district_names = ["Dhading", "Nuwakot", "Rasuwa", "Sindhupalchok"]
districts = urban_district_names + rural_district_names
urban_districts = set(urban_district_names)
rural_districts = set(rural_district_names)

# Dataset sizes: 120 students and 10 teachers per school on average.
num_schools = 1000
num_students = 120 * num_schools
num_teachers = 10 * num_schools

# 1. SCHOOL_INFO
# Builds one row per school: ID, district, location type, infrastructure score
# and Y/N flags for electricity, internet and library.

# Per-location parameters: mean of the infrastructure score and the
# probability of each facility being present.
location_profiles = {
    "Urban": {"infra_mean": 4, "elec": 0.9, "net": 0.8, "lib": 0.7},
    "Rural": {"infra_mean": 2.5, "elec": 0.6, "net": 0.3, "lib": 0.4},
}


def _yes_no(probability):
    """Return "Y" with the given probability, else "N" (one random.random() draw)."""
    return "Y" if random.random() < probability else "N"


school_ids = [f"SCH{i:04d}" for i in range(1, num_schools + 1)]
school_data = []

for school_id in school_ids:
    district = random.choice(districts)
    # Location type is drawn independently of the district (about 30% Urban);
    # it is not derived from `urban_districts`.
    loc_type = "Urban" if random.random() > 0.7 else "Rural"
    profile = location_profiles[loc_type]

    # Normally distributed score, clamped to the 1-5 scale.
    raw_score = np.random.normal(loc=profile["infra_mean"], scale=1)
    infra_score = np.clip(raw_score, 1, 5)

    # The three facility flags are drawn in this order: electricity,
    # internet, library (list elements are evaluated left to right).
    school_data.append([
        school_id,
        district,
        loc_type,
        round(infra_score, 1),
        _yes_no(profile["elec"]),
        _yes_no(profile["net"]),
        _yes_no(profile["lib"]),
    ])

school_columns = [
    "School_ID", "District", "Location_Type", "Infrastructure_Score",
    "Electricity_Access", "Internet_Access", "Library_Status",
]
df_school = pd.DataFrame(school_data, columns=school_columns)

# Inject 5% missing Library_Status
rows_missing_library = df_school.sample(frac=0.05).index
df_school.loc[rows_missing_library, "Library_Status"] = np.nan

# Mapping School_ID → Location_Type for fast lookup
school_loc_map = dict(zip(df_school["School_ID"], df_school["Location_Type"]))

# 2. STUDENT_RECORDS
# Builds one row per student, assigned to a uniformly random school. The
# socioeconomic mix and attendance depend on the school's location type.
student_ids = [f"STU{i:05d}" for i in range(1, num_students + 1)]

# Grade weights fall linearly from grade 1 to grade 12 and are normalized so
# they sum to 1.
raw_grade_weights = np.linspace(0.12, 0.02, 12)
grade_probs = raw_grade_weights / raw_grade_weights.sum()
grade_levels = np.arange(1, 13)

# Category labels and weights, built once instead of on every iteration.
ses_levels = ["Low", "Medium", "High"]
rural_ses_probs = [0.6, 0.3, 0.1]
urban_ses_probs = [0.2, 0.5, 0.3]
gender_levels = ["F", "M", "Other"]
gender_probs = [0.5, 0.48, 0.02]

student_data = []
for student_id in student_ids:
    school = random.choice(school_ids)
    loc_type = school_loc_map[school]
    is_rural = loc_type == "Rural"

    ses = np.random.choice(ses_levels, p=rural_ses_probs if is_rural else urban_ses_probs)
    grade = np.random.choice(grade_levels, p=grade_probs)
    gender = np.random.choice(gender_levels, p=gender_probs)

    # Rural students lose up to 10 points of attendance; the result is
    # clamped back into the 50-100 range.
    base_attendance = np.random.uniform(50, 100)
    rural_penalty = np.random.uniform(0, 10) if is_rural else 0.0
    attendance = np.clip(base_attendance - rural_penalty, 50, 100)

    gpa = np.clip(np.random.normal(2.5, 0.7), 0, 4)

    # Attendance below 70 always marks a dropout. Otherwise a Low-SES student
    # drops out with probability 0.3; the random draw happens only in that case.
    if attendance < 70:
        dropout = "Y"
    elif ses == "Low" and random.random() < 0.3:
        dropout = "Y"
    else:
        dropout = "N"

    student_data.append([
        student_id, school, grade, gender, ses,
        round(attendance, 1), round(gpa, 2), dropout,
    ])

student_columns = [
    "Student_ID", "School_ID", "Grade", "Gender", "Socioeconomic_Status",
    "Attendance_Rate", "Previous_Year_GPA", "Dropout_Status",
]
df_student = pd.DataFrame(student_data, columns=student_columns)

# Inject 5% missing Attendance_Rate
rows_missing_attendance = df_student.sample(frac=0.05).index
df_student.loc[rows_missing_attendance, "Attendance_Rate"] = np.nan

# 3. TEACHER_DEPLOYMENT
# Builds one row per teacher, assigned to a uniformly random school. Training,
# experience and student-teacher ratio depend on the school's location type.
teacher_ids = [f"TCH{i:04d}" for i in range(1, num_teachers + 1)]
subjects = ["Math", "Science", "Nepali", "English", "Social Studies"]

teacher_data = []
for teacher_id in teacher_ids:
    school = random.choice(school_ids)
    loc_type = school_loc_map[school]

    # Rural teachers are "Basic" with probability 0.7; everyone else is
    # "Advanced". The random draw happens only for rural schools.
    if loc_type == "Rural" and random.random() < 0.7:
        training = "Basic"
    else:
        training = "Advanced"

    # np.random.randint excludes the upper bound, so urban experience is 1-30
    # and ratio 10-14, while rural experience is 1-19 and ratio 20-29.
    if loc_type == "Urban":
        experience_bounds, ratio_bounds = (1, 31), (10, 15)
    else:
        experience_bounds, ratio_bounds = (1, 20), (20, 30)
    experience = np.random.randint(*experience_bounds)
    ratio = np.random.randint(*ratio_bounds)

    subject = random.choice(subjects)
    teacher_data.append([teacher_id, school, subject, training, experience, ratio])

teacher_columns = [
    "Teacher_ID", "School_ID", "Subject", "Training_Level",
    "Years_of_Experience", "Student_Teacher_Ratio",
]
df_teacher = pd.DataFrame(teacher_data, columns=teacher_columns)

# 4. DISTRICT_LEVEL_STATS
# One row per district with a literacy rate, a poverty rate and an average
# school distance, each drawn uniformly from a range that depends on whether
# the district is in `urban_districts`.
urban_stat_ranges = {"literacy": (85, 95), "poverty": (10, 20), "distance": (1, 3)}
rural_stat_ranges = {"literacy": (70, 85), "poverty": (25, 40), "distance": (5, 10)}

district_stats = []
for district in districts:
    stat_ranges = urban_stat_ranges if district in urban_districts else rural_stat_ranges
    # Draw order (literacy, poverty, distance) is fixed so seeded runs match.
    literacy = np.random.uniform(*stat_ranges["literacy"])
    poverty = np.random.uniform(*stat_ranges["poverty"])
    dist_km = np.random.uniform(*stat_ranges["distance"])
    district_stats.append([district, round(literacy, 2), round(poverty, 2), round(dist_km, 1)])

district_columns = ["District", "Literacy_Rate", "Poverty_Rate", "Avg_School_Distance_km"]
df_district = pd.DataFrame(district_stats, columns=district_columns)

# SAVE TO CSV
# Write each table to its own file under `education_dataset/`, creating the
# directory if needed. The row index is not written.
output_dir = "education_dataset"
os.makedirs(output_dir, exist_ok=True)

csv_outputs = [
    ("School_Info.csv", df_school),
    ("Student_Records.csv", df_student),
    ("Teacher_Deployment.csv", df_teacher),
    ("District_Level_Stats.csv", df_district),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(f"{output_dir}/{csv_name}", index=False)

print("✅ All CSVs generated in `education_dataset/`")

# Output: ✅ All CSVs generated in `education_dataset/`
