In [None]:
import pandas as pd
import numpy as np
from io import StringIO
import sys

# Optional: Colab's file-download helper. Outside Colab the package is missing,
# which is the only condition that should switch the notebook to local mode.
try:
    from google.colab import files
    IN_COLAB = True
except ImportError:
    # Catch only the "package not installed" case; a bare `except:` would also
    # hide KeyboardInterrupt and genuine errors raised while importing.
    IN_COLAB = False

# =============================================
# STEP 1: GENERATE 1050+ CAREER OPPORTUNITIES
# =============================================

# Reference pools that every synthetic row samples from
sectors = [
    "IT & Technology",
    "Healthcare",
    "Engineering",
    "Finance & Commerce",
    "Management",
    "Arts & Humanities",
    "Law",
    "Education",
    "Science & Research",
    "Media & Design",
    "Government & Defense",
    "Agriculture & Allied",
    "Hospitality & Travel",
    "Green Energy",
    "AI & ML",
    "Space Tech",
    "Gig Economy",
    "FinTech",
    "E-Commerce",
    "Cybersecurity",
]

streams = [
    "Science (PCM)",
    "Science (PCB)",
    "Science (PCM/CS)",
    "Commerce",
    "Arts",
    "Any Stream",
]

qualifications = [
    "10+2 + JEE",
    "10+2 + NEET",
    "10+2 + Bachelor's",
    "Bachelor's + CAT/MAT",
    "10+2 + CLAT",
    "Bachelor's + UPSC",
    "10+2 + Diploma",
    "Bachelor's + Cert",
]

# Career titles grouped by template family (not every sector has its own family)
career_templates = {
    "IT & Technology": [
        "Software Engineer", "Data Scientist", "Cloud Architect", "DevOps Engineer",
        "Cybersecurity Analyst", "Full-Stack Developer", "AI Engineer", "Blockchain Developer",
    ],
    "Healthcare": [
        "Doctor", "Nurse", "Pharmacist", "Physiotherapist",
        "Medical Researcher", "Ayurvedic Doctor", "Dentist", "Genetic Counselor",
    ],
    "Engineering": [
        "Civil Engineer", "Mechanical Engineer", "Electrical Engineer", "Aerospace Engineer",
        "Robotics Engineer", "EV Designer", "3D Printing Specialist",
    ],
    "Finance & Commerce": [
        "CA", "Investment Banker", "Financial Analyst", "Tax Consultant",
        "Actuary", "Forensic Accountant", "ESG Analyst",
    ],
    "Management": [
        "HR Manager", "Marketing Manager", "Operations Manager",
        "Startup Founder", "Supply Chain Manager",
    ],
    "Emerging": [
        "Prompt Engineer", "Metaverse Designer", "Drone Pilot",
        "Sustainability Consultant", "Quantum Computing Researcher",
    ],
}


def _career_pool_for(sector_name):
    """Return the candidate career titles for a sector, chosen by keyword match."""
    emerging = career_templates.get("Emerging", [])
    if any(tag in sector_name for tag in ("IT", "AI", "Cyber")):
        return career_templates.get("IT & Technology", []) + emerging
    # Remaining keyword families are checked in a fixed order; first hit wins.
    for tag, family in (("Health", "Healthcare"), ("Eng", "Engineering"), ("Fin", "Finance & Commerce")):
        if tag in sector_name:
            return career_templates.get(family, [])
    # Every other sector falls back to management plus emerging roles.
    return career_templates.get("Management", []) + emerging


def _course_for(qualification):
    """Pick a course for the qualification route (draws from np.random except for the CAT route)."""
    if "10+2 + JEE" in qualification:
        return np.random.choice(["B.Tech CSE", "B.Tech Mechanical", "B.Tech Civil", "B.Tech Aerospace"])
    if "10+2 + NEET" in qualification:
        return np.random.choice(["MBBS", "BDS", "B.Sc Nursing", "B.Pharma"])
    if "Bachelor's + CAT" in qualification:
        return "MBA Finance/Marketing/HR"
    return np.random.choice(["B.Com", "BA Psychology", "B.Sc Data Science", "LLB", "BHM"])


def _build_row(row_number):
    """Assemble one opportunity record; the order of random draws is significant for the seed."""
    sector_name = np.random.choice(sectors)
    stream_name = np.random.choice(streams)
    qualification = np.random.choice(qualifications)

    pool = _career_pool_for(sector_name)
    career_title = np.random.choice(pool) if pool else "Professional"
    course_name = _course_for(qualification)

    salary_lpa = round(np.random.uniform(3, 25), 1)
    demand_level = np.random.choice(["High", "Growing", "Stable", "Emerging", "Evergreen"])

    return {
        "ID": row_number,
        "Sector": sector_name,
        "Stream": stream_name,
        "Course": course_name,
        "Qualification": qualification,
        "Career": f"{career_title} ({sector_name.split()[0]})",
        "Salary_LPA": salary_lpa,
        "Demand_2025": demand_level,
    }


# Generate 1050 rows with IDs 1..1050 (seeded for reproducibility)
np.random.seed(42)
data = [_build_row(row_number) for row_number in range(1, 1051)]

# Create DataFrame
df = pd.DataFrame(data)

# =============================================
# STEP 2: SAVE TO CSV
# =============================================

# Write the table without the pandas index column, then report the row count
csv_filename = "career_opportunities_1000plus.csv"
df.to_csv(csv_filename, index=False)
opportunity_count = len(df)
print(f"Dataset saved as '{csv_filename}' with {opportunity_count} opportunities.")

# =============================================
# STEP 3: DOWNLOAD (Colab or Local)
# =============================================

if not IN_COLAB:
    # Local run: the file is already on disk, so just tell the user where it is
    local_messages = (
        f"File saved locally: {csv_filename}",
        "You can find it in your current working directory.",
        "Tip: Use pd.read_csv('career_opportunities_1000plus.csv') to load it later.",
    )
    for message in local_messages:
        print(message)
else:
    # Colab run: hand the saved file to the browser download helper
    print("Downloading in Colab...")
    files.download(csv_filename)

Dataset saved as 'career_opportunities_1000plus.csv' with 1050 opportunities.
Downloading in Colab...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install pyspark
!pip install pyspark --quiet

In [None]:
import pandas as pd
import numpy as np
import random

# Set seed for reproducibility (both the stdlib and NumPy generators are used below)
np.random.seed(42)
random.seed(42)

# Lists for realistic data (focused on rural India).
# Districts are grouped under their state so each record gets a consistent
# (state, district) pair; drawing the two independently paired e.g. Bihar with Kolkata.
districts_by_state = {
    'Uttar Pradesh': ['Allahabad', 'Lucknow', 'Varanasi', 'Gorakhpur', 'Agra'],
    'Bihar': ['Patna', 'Muzaffarpur', 'Gaya', 'Darbhanga', 'Bhagalpur'],
    'Rajasthan': ['Jaipur', 'Jodhpur', 'Udaipur', 'Bikaner', 'Ajmer'],
    'Madhya Pradesh': ['Bhopal', 'Indore', 'Jabalpur', 'Gwalior', 'Sagar'],
    'Maharashtra': ['Nagpur', 'Pune', 'Nashik', 'Aurangabad', 'Amravati'],
    'Andhra Pradesh': ['Visakhapatnam', 'Guntur', 'Vijayawada', 'Kurnool', 'Nellore'],
    'West Bengal': ['Kolkata', 'Bardhaman', 'Howrah', 'Durgapur', 'Asansol'],
    'Tamil Nadu': ['Chennai', 'Coimbatore', 'Madurai', 'Tiruchirappalli', 'Salem'],
    'Karnataka': ['Bengaluru', 'Mysore', 'Hubli', 'Belgaum', 'Davanagere'],
    'Odisha': ['Bhubaneswar', 'Cuttack', 'Rourkela', 'Berhampur', 'Brahmapur'],
}
# Flat views kept under their original names
states = list(districts_by_state)
districts = [name for names in districts_by_state.values() for name in names]
languages_home = ['Hindi', 'Bhojpuri', 'Rajasthani', 'Marathi', 'Telugu', 'Bengali',
                  'Tamil', 'Kannada', 'Odia', 'Maithili', 'Magahi', 'Bundeli']
languages_teach = ['Hindi', 'English']

# Generate 1000 records
data = []
for i in range(1, 1001):
    # Pick the state first, then a district inside it. Every state has five
    # districts, so both stay uniformly distributed.
    state = random.choice(states)
    district = random.choice(districts_by_state[state])
    village = f"Village_{random.randint(1, 999)}_{district[:3]}"
    total_students = np.random.randint(50, 501)
    primary_lang = random.choice(languages_home)
    teach_lang = random.choice(languages_teach)
    language_match = primary_lang == teach_lang

    # Simulate % non-native: higher if home and teaching language differ
    mismatch = 0 if language_match else 1
    base_non_native = np.random.uniform(30, 90)
    percentage_non_native = base_non_native + (mismatch * 20)  # Adjust for mismatch
    percentage_non_native = min(100, max(0, percentage_non_native))  # Clamp 0-100

    # Barrier severity based on % non-native
    if percentage_non_native < 50:
        severity = 'Low'
    elif percentage_non_native < 75:
        severity = 'Medium'
    else:
        severity = 'High'

    # Learning impact score (inverse: higher barrier = lower score).
    # A language match softens the penalty. The factors were previously swapped,
    # which gave mismatched schools the better score.
    match_factor = 0.6 if language_match else 1.0
    impact_score = max(1, min(10, 11 - (match_factor * percentage_non_native / 10)))

    survey_year = int(np.random.choice([2023, 2024, 2025]))

    data.append({
        'school_id': i,
        'state': state,
        'district': district,
        'village': village,
        'total_students': total_students,
        'primary_language_spoken': primary_lang,
        'teaching_language': teach_lang,
        'percentage_non_native_speakers': round(percentage_non_native, 1),
        'barrier_severity': severity,
        'learning_impact_score': round(impact_score),
        'survey_year': survey_year
    })

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv('language_barriers_rural_schools.csv', index=False)
print("Dataset generated and saved as 'language_barriers_rural_schools.csv'")
print(df.head())  # Preview first 5 rows
print(f"\nDataset shape: {df.shape}")

Dataset generated and saved as 'language_barriers_rural_schools.csv'
   school_id           state     district          village  total_students  \
0          1           Bihar      Kolkata  Village_760_Kol             152   
1          2  Madhya Pradesh      Kurnool  Village_755_Kur             156   
2          3          Odisha    Bengaluru   Village_33_Ben             152   
3          4  Madhya Pradesh  Bhubaneswar  Village_518_Bhu             380   
4          5       Karnataka    Allahabad  Village_734_All             149   

  primary_language_spoken teaching_language  percentage_non_native_speakers  \
0                  Telugu             Hindi                            97.8   
1                Bhojpuri             Hindi                            96.8   
2                   Hindi             Hindi                            56.7   
3                Maithili             Hindi                            77.6   
4                  Magahi           English                        

In [None]:
import pandas as pd
import numpy as np
import random

# Set seed for reproducibility
# Both generators are seeded because the cell draws from `random` and `np.random`.
np.random.seed(42)
random.seed(42)

# === REALISTIC DATA: States, Districts, and Actual Village Names ===
# Format: {state: {district: [list of real villages]}}
# The insertion order of this literal matters: the pool built from it is indexed
# by the seeded `random.choice`, so reordering entries changes the generated rows.
# NOTE(review): the place names are hard-coded here and not checked against any
# official source in this notebook — verify before presenting them as real.
# NOTE(review): 'Berhampur' and 'Brahmapur' under Odisha look like two spellings
# of the same place — confirm whether both entries are intended.
village_data = {
    'Uttar Pradesh': {
        'Allahabad': ['Koraon', 'Meja', 'Soraon', 'Phulpur', 'Handia', 'Bara', 'Chaka', 'Karchana'],
        'Varanasi': ['Pindra', 'Harhua', 'Sevapuri', 'Baragaon', 'Cholapur', 'Arajiline', 'Kashividyapeeth'],
        'Lucknow': ['Mohanlalganj', 'Bakshi Ka Talab', 'Mal', 'Sarojaninagar', 'Gosainganj'],
        'Gorakhpur': ['Pipraich', 'Sahjanwa', 'Khorabar', 'Campierganj', 'Bansgaon'],
        'Agra': ['Kheragarh', 'Fatehabad', 'Etmadpur', 'Bah', 'Jagner']
    },
    'Bihar': {
        'Patna': ['Phulwari', 'Daniawan', 'Masaurhi', 'Bihta', 'Maner', 'Paliganj'],
        'Gaya': ['Belaganj', 'Wazirganj', 'Mohanpur', 'Imamganj', 'Sherghati'],
        'Muzaffarpur': ['Muraul', 'Kanti', 'Marwan', 'Bochaha', 'Sakra'],
        'Bhagalpur': ['Sabour', 'Kahalgaon', 'Naugachhia', 'Pirpainti', 'Bihpur'],
        'Darbhanga': ['Benipur', 'Bahadurpur', 'Hanumannagar', 'Jale', 'Keotiranway']
    },
    'Rajasthan': {
        'Jaipur': ['Amber', 'Phagi', 'Chaksu', 'Sanganer', 'Jamwa Ramgarh'],
        'Jodhpur': ['Balesar', 'Luni', 'Osian', 'Bilara', 'Mandor'],
        'Udaipur': ['Girwa', 'Gogunda', 'Kherwara', 'Sarada', 'Salumbar'],
        'Bikaner': ['Kolayat', 'Nokha', 'Lunkaransar', 'Khajuwala'],
        'Ajmer': ['Kishangarh', 'Pushkar', 'Peesangan', 'Masuda']
    },
    'Madhya Pradesh': {
        'Bhopal': ['Phanda', 'Huzur', 'Berasia', 'Kolar'],
        'Indore': ['Depalpur', 'Sanwer', 'Mhow', 'Hatod'],
        'Jabalpur': ['Patan', 'Sihora', 'Panagar', 'Shahpura'],
        'Gwalior': ['Dabra', 'Bhander', 'Bhitarwar', 'Ghatigaon'],
        'Sagar': ['Rehli', 'Khurai', 'Bina', 'Rahatgarh']
    },
    'Maharashtra': {
        'Pune': ['Haveli', 'Khed', 'Junnar', 'Ambegaon', 'Mawal'],
        'Nagpur': ['Umred', 'Hingna', 'Kalmeshwar', 'Saoner', 'Parseoni'],
        'Nashik': ['Malegaon', 'Niphad', 'Sinnar', 'Igatpuri', 'Dindori'],
        'Aurangabad': ['Kannad', 'Phulambri', 'Khuldabad', 'Gangapur'],
        'Amravati': ['Daryapur', 'Anjangaon', 'Chandur Bazar', 'Achalpur']
    },
    'Andhra Pradesh': {
        'Visakhapatnam': ['Anandapuram', 'Padmanabham', 'Bheemunipatnam', 'Pendurthi'],
        'Guntur': ['Tadepalle', 'Mangalagiri', 'Tulluru', 'Amaravati', 'Pedakakani'],
        'Vijayawada': ['Ibrahimpatnam', 'Gannavaram', 'Mylavaram', 'Nandigama'],
        'Kurnool': ['Adoni', 'Alur', 'Aspari', 'Devanakonda'],
        'Nellore': ['Kavali', 'Atmakur', 'Udayagiri', 'Podili']
    },
    'West Bengal': {
        'Kolkata': ['Rajarhat', 'Bhangar', 'Sonarpur', 'Baruipur'],
        'Bardhaman': ['Memari', 'Kalna', 'Katwa', 'Manteswar'],
        'Howrah': ['Amta', 'Udaynarayanpur', 'Jagatballavpur', 'Shyampur'],
        'Durgapur': ['Andal', 'Pandabeswar', 'Kanksa', 'Faridpur'],
        'Asansol': ['Jamuria', 'Raniganj', 'Barabani']
    },
    'Tamil Nadu': {
        'Chennai': ['Sholinganallur', 'Maduravoyal', 'Ambattur', 'Avadi'],
        'Coimbatore': ['Sulur', 'Kinathukadavu', 'Annur', 'Mettupalayam'],
        'Madurai': ['Melur', 'Vadipatti', 'Alanganallur', 'Thiruparankundram'],
        'Salem': ['Attur', 'Gangavalli', 'Vazhapadi', 'Yercaud'],
        'Tiruchirappalli': ['Manapparai', 'Thuraiyur', 'Lalgudi', 'Manachanallur']
    },
    'Karnataka': {
        'Bengaluru': ['Anekal', 'Devanahalli', 'Hosakote', 'Nelamangala'],
        'Mysore': ['Nanjangud', 'T.Narsipur', 'Hunsur', 'Periyapatna'],
        'Hubli': ['Kundgol', 'Kalghatgi', 'Navalgund', 'Alnavar'],
        'Belgaum': ['Hukkeri', 'Athani', 'Chikodi', 'Ramdurg'],
        'Davanagere': ['Harihar', 'Honnali', 'Channagiri', 'Jagalur']
    },
    'Odisha': {
        'Bhubaneswar': ['Balianta', 'Balipatna', 'Jatni', 'Khurda'],
        'Cuttack': ['Baranga', 'Nischintakoili', 'Salepur', 'Tangi'],
        'Rourkela': ['Bisra', 'Lahunipara', 'Rajgangpur', 'Kuanrmunda'],
        'Berhampur': ['Ganjam', 'Chhatrapur', 'Hinjilicut', 'Digapahandi'],
        'Brahmapur': ['Kukudakhandi', 'Golabandha', 'Patrapur']
    }
}

# Flatten to list of (state, district, village) tuples.
# The comprehension uses its own names so it no longer rebinds the module-level
# `state`, `districts`, `district` and `village` variables.
location_pool = [
    (state_name, district_name, village_name)
    for state_name, district_map in village_data.items()
    for district_name, village_names in district_map.items()
    for village_name in village_names
]

# random.choice samples with replacement, so the pool does not need to be as
# long as the number of records. The 4x repetition is kept only so the seeded
# location draws stay identical to earlier runs of this notebook.
location_pool = location_pool * 4

# Languages
languages_home = ['Hindi', 'Bhojpuri', 'Rajasthani', 'Marathi', 'Telugu', 'Bengali',
                  'Tamil', 'Kannada', 'Odia', 'Maithili', 'Magahi', 'Bundeli']
languages_teach = ['Hindi', 'English']

# Generate 1000 records
data = []
for i in range(1, 1001):
    state, district, village = random.choice(location_pool)
    total_students = np.random.randint(50, 501)
    primary_lang = random.choice(languages_home)
    teach_lang = random.choice(languages_teach)
    language_match = primary_lang == teach_lang

    # Language mismatch logic: a mismatch adds 25 points, capped at 100
    mismatch = 0 if language_match else 1
    base_non_native = np.random.uniform(30, 90)
    percentage_non_native = min(100, base_non_native + (mismatch * 25))
    percentage_non_native = max(0, round(percentage_non_native, 1))

    # Barrier severity
    if percentage_non_native < 50:
        severity = 'Low'
    elif percentage_non_native < 75:
        severity = 'Medium'
    else:
        severity = 'High'

    # Learning impact (1-10, lower = worse).
    # A language match softens the penalty. The factors were previously swapped,
    # which gave mismatched schools the better score.
    match_factor = 0.6 if language_match else 1.0
    impact_score = max(1, min(10, round(11 - (match_factor * percentage_non_native / 10))))

    survey_year = np.random.choice([2023, 2024, 2025])

    data.append({
        'school_id': i,
        'state': state,
        'district': district,
        'village': village,  # Name taken from village_data
        'total_students': total_students,
        'primary_language_spoken': primary_lang,
        'teaching_language': teach_lang,
        'percentage_non_native_speakers': percentage_non_native,
        'barrier_severity': severity,
        'learning_impact_score': impact_score,
        'survey_year': survey_year
    })

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('language_barriers_rural_schools_real_villages.csv', index=False)

# Preview
print("Dataset saved as 'language_barriers_rural_schools_real_villages.csv'")
print(f"Total records: {len(df)}")
print("\nFirst 10 records:")
print(df.head(10)[['school_id', 'state', 'district', 'village', 'primary_language_spoken', 'teaching_language']])

Dataset saved as 'language_barriers_rural_schools_real_villages.csv'
Total records: 1000

First 10 records:
   school_id           state    district         village  \
0          1          Odisha    Rourkela      Rajgangpur   
1          2  Madhya Pradesh       Sagar          Khurai   
2          3   Uttar Pradesh   Allahabad        Karchana   
3          4   Uttar Pradesh        Agra          Jagner   
4          5      Tamil Nadu     Chennai  Sholinganallur   
5          6           Bihar       Patna        Phulwari   
6          7   Uttar Pradesh     Lucknow             Mal   
7          8  Andhra Pradesh  Vijayawada      Gannavaram   
8          9   Uttar Pradesh   Allahabad          Handia   
9         10      Tamil Nadu     Chennai           Avadi   

  primary_language_spoken teaching_language  
0                Bhojpuri             Hindi  
1                  Telugu             Hindi  
2              Rajasthani             Hindi  
3                 Bundeli             Hindi  
4

In [None]:
import pandas as pd
import numpy as np
from faker import Faker  # pip install faker (for realistic names; optional)

# Initialize Faker for names (hi_IN locale)
fake = Faker('hi_IN')

# Set seeds for reproducibility.
# Faker keeps its own random generator, so np.random.seed alone leaves the
# generated names different on every run.
np.random.seed(42)
Faker.seed(42)

# Number of records
n_records = 100000

# Generate synthetic data for rural government school students
data = {
    'student_id': range(1, n_records + 1),
    'name': [fake.name() for _ in range(n_records)],  # Locale-specific names from Faker
    'age': np.random.randint(6, 18, n_records),  # Ages 6-17 (upper bound is exclusive)
    'gender': np.random.choice(['Male', 'Female'], n_records, p=[0.52, 0.48]),  # Slight male bias in rural
    'caste_category': np.random.choice(['SC', 'ST', 'OBC', 'General'], n_records, p=[0.25, 0.15, 0.40, 0.20]),
    'parent_education': np.random.choice(['Illiterate', 'Primary', 'Secondary', 'Graduate'], n_records, p=[0.40, 0.30, 0.25, 0.05]),
    'family_income_monthly': np.random.exponential(5000, n_records).astype(int),  # Low income skew (mean ~5k INR)
    'distance_to_school_km': np.random.exponential(2, n_records),  # Rural travel distances
    'attendance_rate': np.random.beta(2, 5, n_records) * 100,  # Skewed low: Beta(2, 5) has mean 2/7, about 29%
    'school_type': 'Rural Government',  # Fixed as per query
    'state': np.random.choice(['Uttar Pradesh', 'Bihar', 'Rajasthan', 'Madhya Pradesh', 'Odisha'], n_records, p=[0.25, 0.20, 0.15, 0.20, 0.20]),
    # Learning outcomes (scores out of 100; gaps simulated: lower in govt rural)
    'math_score': np.maximum(0, np.random.normal(45, 15, n_records)),  # Mean 45 (gap vs. urban ~65)
    'reading_score': np.maximum(0, np.random.normal(50, 18, n_records)),  # Mean 50 (gap vs. urban ~70)
    'language_score': np.maximum(0, np.random.normal(55, 16, n_records)),  # Slightly higher
    'grade': np.random.randint(1, 10, n_records),  # Classes 1-9 (upper bound is exclusive)
    'dropout_risk': np.random.choice([0, 1], n_records, p=[0.85, 0.15]),  # 15% at risk
}

# Create DataFrame
df = pd.DataFrame(data)

# Add correlations for realism (lower income / lower attendance -> lower math score).
# The income coefficient is positive; with the earlier negative sign a higher
# income reduced the score, the opposite of the stated intent.
df['math_score'] = df['math_score'] + (df['family_income_monthly'] / 1000 * 0.5) + (df['attendance_rate'] * 0.2) + np.random.normal(0, 2, n_records)
df['reading_score'] = df['reading_score'] + (df['parent_education'].map({'Illiterate': -10, 'Primary': -5, 'Secondary': 0, 'Graduate': 10})) + np.random.normal(0, 2, n_records)

# Clip only the score columns to 0-100.
# Clipping every numeric column also capped student_id, family_income_monthly,
# age, grade and distance at 100 and corrupted them.
score_cols = ['math_score', 'reading_score', 'language_score']
df[score_cols] = df[score_cols].clip(lower=0, upper=100)

# Save as CSV (simulates "download")
df.to_csv('rural_student_learning_data.csv', index=False)
print(f"Dataset generated and saved: {n_records} records in rural_student_learning_data.csv")
print(df.head())
print(f"Dataset shape: {df.shape}")
print(f"Sample stats:\n{df[['math_score', 'reading_score']].describe()}")

Dataset generated and saved: 100000 records in rural_student_learning_data.csv
   student_id             name  age  gender caste_category parent_education  \
0           1      मञ्जु घोषजी   12    Male            OBC          Primary   
1           2        गोतम झाजी    9  Female            OBC       Illiterate   
2           3    काशी बनर्जीजी   16    Male        General       Illiterate   
3           4      चण्डा चौहान   13    Male        General        Secondary   
4           5  लीलावती चोपराजी   10  Female            OBC          Primary   

   family_income_monthly  distance_to_school_km  attendance_rate  \
0                    100               0.812143        27.667086   
1                    100               0.817527        13.134414   
2                    100               0.160323        39.287899   
3                    100               0.042609        29.472503   
4                    100               0.215314        50.726744   

        school_type          state  m

In [None]:
# Installs the faker package used by the student-data cells.
# NOTE: an earlier cell already runs `from faker import Faker`, so on a fresh
# kernel this install has to be executed before that cell.
%pip install faker --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.0 MB[0m [31m119.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker for Indian names
fake = Faker('hi_IN')

# Seed both random sources. Faker keeps its own generator, so without
# Faker.seed the 'name' column differs on every run.
np.random.seed(42)
Faker.seed(42)

n_records = 100000

# Rural household income (in thousands INR) - log-normal distribution.
# The median is exp(log-mean) = 8k, with log-mean = ln(8) ≈ 2.08 and log-sd = 0.55.
# These parameters imply a mean of 8 * exp(0.55**2 / 2) ≈ 9.3k.
# NOTE(review): an earlier comment here cited NSSO/PLFS with a mean of ~10.4k,
# which these parameters do not reproduce — confirm the intended target.
income_log_mean = np.log(8)  # Median = 8k
income_log_sd = 0.55

# Generate monthly income in thousands (e.g., 8.5 = ₹8,500)
family_income_thousands = np.random.lognormal(mean=income_log_mean, sigma=income_log_sd, size=n_records)
family_income_thousands = np.clip(family_income_thousands, 1, 50)  # Floor at ₹1k, cap at ₹50k

# Round to 1 decimal (e.g., 7.8 = ₹7,800)
family_income_thousands = np.round(family_income_thousands, 1)

# Generate dataset
data = {
    'student_id': range(1, n_records + 1),
    'name': [fake.name() for _ in range(n_records)],
    'age': np.random.randint(6, 18, n_records),  # 6-17 (upper bound is exclusive)
    'gender': np.random.choice(['Male', 'Female'], n_records, p=[0.52, 0.48]),
    'caste_category': np.random.choice(['SC', 'ST', 'OBC', 'General'], n_records, p=[0.25, 0.15, 0.40, 0.20]),
    'parent_education': np.random.choice(['Illiterate', 'Primary', 'Secondary', 'Graduate'], n_records, p=[0.40, 0.30, 0.25, 0.05]),
    'family_income_monthly_thousands': family_income_thousands,  # In thousands (e.g., 8.2 = ₹8,200)
    'distance_to_school_km': np.random.exponential(2, n_records),
    'attendance_rate': np.random.beta(2, 5, n_records) * 100,  # Beta(2, 5) has mean 2/7, about 29%
    'school_type': 'Rural Government',
    'state': np.random.choice(['Uttar Pradesh', 'Bihar', 'Rajasthan', 'Madhya Pradesh', 'Odisha'], n_records, p=[0.25, 0.20, 0.15, 0.20, 0.20]),
    'math_score': np.maximum(0, np.random.normal(45, 15, n_records)),
    'reading_score': np.maximum(0, np.random.normal(50, 18, n_records)),
    'language_score': np.maximum(0, np.random.normal(55, 16, n_records)),
    'grade': np.random.randint(1, 10, n_records),  # 1-9 (upper bound is exclusive)
    'dropout_risk': np.random.choice([0, 1], n_records, p=[0.85, 0.15]),
}

df = pd.DataFrame(data)

# Add correlations: income and attendance raise math; parent education shifts reading
df['math_score'] = df['math_score'] + (df['family_income_monthly_thousands'] * 0.8) + (df['attendance_rate'] * 0.15) + np.random.normal(0, 3, n_records)
df['reading_score'] = df['reading_score'] + (df['parent_education'].map({'Illiterate': -12, 'Primary': -6, 'Secondary': 0, 'Graduate': 12})) + np.random.normal(0, 3, n_records)

# Clip scores (score columns only, so ids and income are left untouched)
df[['math_score', 'reading_score', 'language_score']] = df[['math_score', 'reading_score', 'language_score']].clip(0, 100)

# Save dataset
df.to_csv('rural_student_learning_data_updated.csv', index=False)

# Summary
print(f"Dataset saved: rural_student_learning_data_updated.csv ({n_records} records)")
print("\nIncome Distribution (in ₹ thousands):")
print(df['family_income_monthly_thousands'].describe())
print(f"Median: ₹{df['family_income_monthly_thousands'].median() * 1000:,.0f}")
print(f"Mean:   ₹{df['family_income_monthly_thousands'].mean() * 1000:,.0f}")

Dataset saved: rural_student_learning_data_updated.csv (100000 records)

Income Distribution (in ₹ thousands):
count    100000.000000
mean          9.309628
std           5.501281
min           1.000000
25%           5.500000
50%           8.000000
75%          11.600000
max          50.000000
Name: family_income_monthly_thousands, dtype: float64
Median: ₹8,000
Mean:   ₹9,310
