In [0]:
from pyspark.sql.functions import avg

In [0]:
spark_df = spark.table("school_enrollment_db.cleaned_school_enrollment")

In [0]:
spark_df.show(5)

+------------+--------------------+------+---------------+-------------+-----+------+-----------------+-----------------+---------------+---------------+---------------------+-------------------+-----------------+---------------+----------------------+------------------+-----------------------+-------------------+---------------------+-------------------------+----------------------+-----------------+-----------+
|   school_id|         school_name|  city|       district|academic_year|grade|gender|enrolled_students|   avg_exam_score|pass_percentage|attendance_rate|learning_growth_index|        source_file|median_exam_score|fail_percentage|distinction_percentage|avg_internal_score|exam_participation_rate|remedial_percentage|student_teacher_ratio|digital_access_percentage|scholarship_percentage|subject_pass_rate|skill_index|
+------------+--------------------+------+---------------+-------------+-----+------+-----------------+-----------------+---------------+---------------+-------------

In [0]:
def printSparkResult(df, title, n=20):
    # Prints the title for the Spark DataFrame output
    print(f"\n{title}")  
    # Displays the top `n` rows of the Spark DataFrame without truncating column values          
    df.show(n, truncate=False)       


Year wise Enrollment Trend

In [0]:
from pyspark.sql.functions import sum as spark_sum
# Group the data by academic year, calculate total enrollment, and sort by year
yearly_enrollment = (
    # Group records based on academic year
    spark_df
    .groupBy("academic_year")
    # Aggregate total enrolled students for each academic year
    .agg(
        spark_sum("enrolled_students").alias("total_enrollment")
    )
    # Order the result by academic year for trend analysis
    .orderBy("academic_year")
)
printSparkResult(yearly_enrollment, "Year wise Enrollment Trend")



Year wise Enrollment Trend
+-------------+----------------+
|academic_year|total_enrollment|
+-------------+----------------+
|2020         |831474.5        |
|2021         |834422.5        |
|2022         |824672.5        |
|2023         |825296.0        |
|2024         |832112.0        |
+-------------+----------------+



Gender wise Enrollment Trend

In [0]:
# Group the data by gender and calculate total enrollment for each gender
gender_enrollment = (
    spark_df
    .groupBy("gender")    
    # Aggregate total enrolled students for each gender
    .agg(
        spark_sum("enrolled_students").alias("total_enrollment")
    )
)
printSparkResult(gender_enrollment, "Gender wise Enrollment Distribution")


Gender wise Enrollment Distribution
+------+----------------+
|gender|total_enrollment|
+------+----------------+
|Male  |2494890.0       |
|Female|1653087.5       |
+------+----------------+



Grade Level Enrollment

In [0]:
# Group the data by grade level, calculate total enrollment, and sort by grade
grade_enrollment = (
    spark_df
    .groupBy("grade")
    # Aggregate total enrolled students for each grade
    .agg(
        spark_sum("enrolled_students").alias("total_enrollment")
    )
    # Order the results by grade level
    .orderBy("grade")
)
printSparkResult(grade_enrollment, "Grade Level Enrollment")


Grade Level Enrollment
+-----+----------------+
|grade|total_enrollment|
+-----+----------------+
|1    |339056.0        |
|2    |346081.0        |
|3    |346966.0        |
|4    |345384.0        |
|5    |343651.0        |
|6    |350771.0        |
|7    |339795.0        |
|8    |350540.0        |
|9    |352377.0        |
|10   |346221.0        |
|11   |343267.0        |
|12   |343868.5        |
+-----+----------------+



District wise Enrollment

In [0]:
# Group the data by district, calculate total enrollment, and sort by highest enrollment first
district_enrollment = (
    spark_df
    .groupBy("district")
    # Aggregate total enrolled students for each district
    .agg(
        spark_sum("enrolled_students").alias("total_enrollment")
    )
    # Order districts by total enrollment in descending order
    .orderBy("total_enrollment", ascending=False)
)
printSparkResult(district_enrollment, "District wise Enrollment")


District wise Enrollment
+-----------------+----------------+
|district         |total_enrollment|
+-----------------+----------------+
|Raigad Rural     |864772.5        |
|New Delhi        |445210.5        |
|Chennai          |360338.0        |
|Bangalore Urban  |331399.5        |
|Mumbai Suburban  |327441.5        |
|Pune Rural       |297453.5        |
|Lucknow Rural    |272352.5        |
|Ranga Reddy      |267309.0        |
|Ahmedabad Rural  |266854.5        |
|Bangalore Rural  |239153.5        |
|North 24 Parganas|208235.0        |
|Jaipur Urban     |146737.5        |
|Pune Urban       |120720.0        |
+-----------------+----------------+



Average Score by Year

In [0]:
# Group the data by academic year, calculate the average exam score, and sort the results by year
avgscore_year = (
    spark_df
    .groupBy("academic_year")
    # Compute the average exam score for each academic year
    .agg(
        avg("avg_exam_score").alias("avg_exam_score")
    )
    # Order the output by academic year
    .orderBy("academic_year")
)
printSparkResult(avgscore_year, "Average Exam Score by Year")


Average Exam Score by Year
+-------------+-----------------+
|academic_year|avg_exam_score   |
+-------------+-----------------+
|2020         |74.23961309074657|
|2021         |74.45818389332263|
|2022         |74.46198580937349|
|2023         |74.18933385436154|
|2024         |74.40807421986436|
+-------------+-----------------+



Pass Rate by District

In [0]:
# Group the data by district, calculate the average pass rate, and sort by highest pass rate
passrate_district = (
    spark_df
    .groupBy("district")
    # Compute the average pass percentage for each district
    .agg(
        avg("pass_percentage").alias("avg_pass_rate")
    )
    # Order districts by average pass rate in descending order
    .orderBy("avg_pass_rate", ascending=False)
)
printSparkResult(passrate_district, "Pass Rate by District")


Pass Rate by District
+-----------------+-----------------+
|district         |avg_pass_rate    |
+-----------------+-----------------+
|North 24 Parganas|81.9179523809524 |
|Lucknow Rural    |81.77051851851844|
|Ranga Reddy      |81.59799999999973|
|Jaipur Urban     |81.56959999999998|
|Bangalore Urban  |81.5166363636364 |
|Ahmedabad Rural  |81.51437037037005|
|Pune Urban       |81.49633333333337|
|Bangalore Rural  |81.4654583333334 |
|Chennai          |81.46480555555551|
|Raigad Rural     |81.44547126436768|
|New Delhi        |81.33580000000023|
|Mumbai Suburban  |81.21784848484869|
|Pune Rural       |81.09099999999991|
+-----------------+-----------------+



Attendance vs Performance

In [0]:
# Group the data by district to analyze attendance and academic performance together
attendance_perf = (
    spark_df
    .groupBy("district")    
    # Calculate average attendance rate and average exam score for each district
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("avg_exam_score").alias("avg_exam_score")
    )
)
printSparkResult(attendance_perf, "Attendance vs Performance")



Attendance vs Performance
+-----------------+-----------------+-----------------+
|district         |avg_attendance   |avg_exam_score   |
+-----------------+-----------------+-----------------+
|Mumbai Suburban  |85.05196969696979|73.86800156832608|
|Lucknow Rural    |85.09100000000011|74.29948865791057|
|Pune Rural       |84.87370000000008|74.36627079470499|
|Raigad Rural     |85.10868965517223|74.32300193916949|
|New Delhi        |84.88033333333318|74.24764656372902|
|Bangalore Urban  |85.18836363636377|74.32613476269252|
|Ranga Reddy      |85.12433333333323|74.62764945714149|
|Chennai          |84.82261111111113|74.92851851336323|
|Bangalore Rural  |84.84958333333324|74.10615743529803|
|Jaipur Urban     |85.34006666666669|74.30500966987076|
|Ahmedabad Rural  |85.17011111111134|74.15630793985946|
|North 24 Parganas|85.03314285714292|74.52911778133698|
|Pune Urban       |85.2880833333334 |74.7496437213605 |
+-----------------+-----------------+-----------------+



Year wise Learning Growth Trend

In [0]:
# Group the data by academic year to analyze learning growth over time
learning_growth = (
    spark_df
    .groupBy("academic_year")    
    # Calculate the average learning growth index for each academic year
    .agg(
        avg("learning_growth_index").alias("avg_learning_growth")
    )
    # Order the results by academic year to observe trends
    .orderBy("academic_year")
)
printSparkResult(learning_growth, "Learning Growth Trend")


Learning Growth Trend
+-------------+-------------------+
|academic_year|avg_learning_growth|
+-------------+-------------------+
|2020         |0.5009100719424461 |
|2021         |0.5047865707434069 |
|2022         |0.5016342925659476 |
|2023         |0.5004808153477209 |
|2024         |0.4983441247002392 |
+-------------+-------------------+



Skill Index by District

In [0]:
# Group the data by district to evaluate skill levels across regions
skill_district = (
    spark_df
    .groupBy("district")
    # Calculate the average skill index for each district
    .agg(
        avg("skill_index").alias("avg_skill_index")
    )    
    # Order districts by average skill index in descending order
    .orderBy("avg_skill_index", ascending=False)
)
printSparkResult(skill_district, "Skill Index by District")



Skill Index by District
+-----------------+-----------------+
|district         |avg_skill_index  |
+-----------------+-----------------+
|Lucknow Rural    |65.19377777777781|
|Pune Rural       |65.15149999999993|
|Jaipur Urban     |65.12566666666663|
|Bangalore Rural  |65.06308333333325|
|Ahmedabad Rural  |65.03396296296286|
|New Delhi        |64.98637777777803|
|Bangalore Urban  |64.88509090909096|
|Raigad Rural     |64.85219540229829|
|Mumbai Suburban  |64.8146363636364 |
|North 24 Parganas|64.74528571428577|
|Pune Urban       |64.7398333333333 |
|Chennai          |64.70258333333328|
|Ranga Reddy      |64.2905185185185 |
+-----------------+-----------------+



In [0]:
spark.sql("USE school_enrollment_db")

DataFrame[]

In [0]:
%python

# Write yearly enrollment aggregation to a Delta table
yearly_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`yearly_enrollment`")
# Write gender-wise enrollment aggregation to a Delta table
gender_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`gender_enrollment`")
# Write grade-level enrollment aggregation to a Delta table
grade_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`grade_enrollment`")
# Write district-wise enrollment aggregation to a Delta table
district_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`district_enrollment`")
# Write average exam score by academic year to a Delta table
avgscore_year.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`average_score_by_year`")
# Write average pass rate by district to a Delta table
passrate_district.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`pass_rate_by_district`")
# Write attendance versus performance analysis to a Delta table
attendance_perf.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`attendance_vs_performance`")
# Write learning growth trend by academic year to a Delta table
learning_growth.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`learning_growth_by_year`")
# Write skill index analysis by district to a Delta table
skill_district.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`skill_index_by_district`")


School Analysis

In [0]:
from pyspark.sql.functions import avg
# Aggregate school-level performance metrics across academic years
school_performance = (
    spark_df
    # Group data by school and academic context
    .groupBy("school_id","school_name","district","academic_year")
    # Calculate average performance indicators for each school per academic year
    .agg(
        avg("avg_exam_score").alias("avg_exam_score"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("attendance_rate").alias("avg_attendance"),
        avg("learning_growth_index").alias("avg_learning_growth"),
        avg("skill_index").alias("avg_skill_index")
    )
)

In [0]:
# Write the school-level performance metrics to a Delta table
school_performance.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("School_Performance")

In [0]:
# Filter schools with high academic performance based on pass rate and attendance thresholds
high_performing_schools = school_performance.filter(
    # Select schools with an average attendance rate of 85% or higher
    (school_performance.avg_pass_rate >= 80) &   
    # Select schools with an average attendance rate of 85% or higher
    (school_performance.avg_attendance >= 85)
)
# Write the high-performing schools data to a Delta table
high_performing_schools.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("High_Performing_Schools")


In [0]:
from pyspark.sql.functions import avg
# Aggregate key academic and infrastructure metrics at the school level by academic year
school_metrics = (
    spark_df
    # Group data by school identity, location, and academic year
    .groupBy("school_id","school_name","district","academic_year")
    # Calculate average values for multiple school performance and access indicators
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth"),
        avg("skill_index").alias("avg_skill_index"),
        avg("digital_access_percentage").alias("avg_digital_access"),
        avg("remedial_percentage").alias("avg_remedial")
    )
)


In [0]:
from pyspark.sql.functions import col, when
# Calculate a composite dropout risk score based on multiple weighted academic and access factors
dropout_risk = school_metrics.withColumn(
    "dropout_risk_score",
    # Contribution from low attendance (higher risk if attendance is lower)
    (1 - col("avg_attendance") / 100) * 0.25 +
    # Contribution from low pass rate (higher risk if pass rate is lower)
    (1 - col("avg_pass_rate") / 100) * 0.25 +
    # Contribution from low learning growth index
    (1 - col("avg_learning_growth")) * 0.15 +
    # Contribution from low skill index
    (1 - col("avg_skill_index") / 100) * 0.15 +
    # Contribution from low digital access
    (1 - col("avg_digital_access") / 100) * 0.10 +
    # Contribution from high remedial requirement
    (col("avg_remedial") / 100) * 0.10
)


In [0]:
# Assign categorical risk levels based on the calculated dropout risk score
dropout_risk = dropout_risk.withColumn(
    "risk_level",
    # Mark schools as High risk if the dropout risk score is 0.35 or above
    when(col("dropout_risk_score") >= 0.35, "High")
    # Mark schools as Medium risk if the score is between 0.20 and 0.35
    .when(col("dropout_risk_score") >= 0.20, "Medium")
    # Mark schools as Low risk if the score is below 0.20
    .otherwise("Low")
)

In [0]:
# Write the dropout risk analysis results to a Delta table
dropout_risk.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Dropout_Risk")

In [0]:
from pyspark.sql.functions import avg, stddev
# Analyze school performance stability using mean and variability metrics
school_stability = (
    spark_df    
    # Group data by school identity and district
    .groupBy("school_id","school_name","district")
    # Calculate mean and standard deviation for exam scores and pass rates
    .agg(
        avg("avg_exam_score").alias("Exam_mean_score"),
        stddev("avg_exam_score").alias("Exam_standard_deviation_score"),
        avg("pass_percentage").alias("Mean_Pass_rate"),
        stddev("pass_percentage").alias("Pass_rate_standard_deviation")
    )
)
# Write the school stability analysis to a Delta table
school_stability.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("School_Stability")


In [0]:
from pyspark.sql.functions import min, max, first, last
# Determine the first and last academic years available for each school
yearsBoundaries = (
    spark_df
    # Group data by school
    .groupBy("school_id")
    # Identify the earliest and latest academic year for each school
    .agg(
        min("academic_year").alias("First_year"),
        max("academic_year").alias("Last_year")
    )
)
# Analyze improvement or decline in key performance indicators across years
schoolTrends = (
    # Join the year boundaries back to the main dataset
    spark_df
    .join(yearsBoundaries, "school_id")
    # Group data by school identity and location
    .groupBy("school_id","school_name","district")
    # Calculate changes between the first and last available records
    .agg(
        (last("pass_percentage") - first("pass_percentage"))
            .alias("pass_rate_change"),
        (last("learning_growth_index") - first("learning_growth_index"))
            .alias("learning_growth_change"),
        (last("skill_index") - first("skill_index"))
            .alias("skill_index_change")
    )
)
# Write the school improvement vs decline analysis to a Delta table
schoolTrends.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("School_Improvement_Vs_Decline")


In [0]:
from pyspark.sql.functions import corr
# Analyze the impact of remedial programs on academic performance at the school level
remedialImpact = (
    spark_df
    # Group data by school identity and district
    .groupBy("school_id","school_name","district")
    # Calculate average remedial participation and performance indicators
    .agg(
        avg("remedial_percentage").alias("avg_remedial"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth")
    )
)
# Write the remedial impact analysis to a Delta table
remedialImpact.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Remedial_Impact")


In [0]:
from pyspark.sql.functions import max, min
# Analyze performance disparity within each district
district_gap = (
    spark_df    
    # Group data by district
    .groupBy("district")
    # Identify the best and worst school pass rates within each district
    .agg(
        max("pass_percentage").alias("Best_school_pass_rate"),
        min("pass_percentage").alias("Worst_school_pass_rate")
    )
    # Calculate the performance gap between best and worst schools
    .withColumn(
        "performance_gap",
        col("Best_school_pass_rate") - col("Worst_school_pass_rate")
    )
)
# Write the district-level performance gap analysis to a Delta table
district_gap.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("District_Performance_Gap")

In [0]:
# Compare enrollment, performance, and access metrics across districts
UrbanRuralComparison = (
    spark_df
    # Group data by district 
    .groupBy("district")
    # Calculate average enrollment, performance, and access indicators
    .agg(
        avg("enrolled_students").alias("avg_enrollment"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("digital_access_percentage").alias("avg_digital_access"),
        avg("skill_index").alias("avg_skill_index")
    )
)
# Write the urban–rural comparison analysis to a Delta table
UrbanRuralComparison.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Urban_Rural_Comparison")


In [0]:
# Analyze how efficiently educational resources are utilized across districts
resourceEfficiency = (
    spark_df    
    # Group data by district
    .groupBy("district")
    # Calculate average resource and performance indicators
    .agg(
        avg("student_teacher_ratio").alias("avg_student_teacher_ratio"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth")
    )
)
# Write the resource efficiency analysis to a Delta table
resourceEfficiency.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Resource_Efficiency")

In [0]:
# Analyze learning growth trends over time at the district level
districtGrowth = (
    spark_df    
    # Group data by district and academic year to capture yearly growth patterns
    .groupBy(
        "district",
        "academic_year"
    )
    # Calculate the average learning growth index for each district-year combination
    .agg(
        avg("learning_growth_index").alias("avg_learning_growth")
    )
    # Order results by district and academic year for time-series analysis
    .orderBy("district", "academic_year")
)
# Write the district growth momentum analysis to a Delta table
districtGrowth.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("District_Growth_Momentum")


In [0]:
# Analyze the relationship between student enrollment and academic performance at the school level
EnrollmentvsPerformance = (
    spark_df
    # Group data by school identity, location, and academic year
    .groupBy(
        "school_id",
        "school_name",
        "district",
        "academic_year"
    )
    # Calculate average enrollment and pass rate for each school-year combination
    .agg(
        avg("enrolled_students").alias("avg_enrollment"),
        avg("pass_percentage").alias("avg_pass_rate")
    )
)
# Write the enrollment versus performance analysis to a Delta table
EnrollmentvsPerformance.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Enrollment_vs_Performance")


In [0]:
from pyspark.sql.functions import when
# Classify schools based on their learning growth trajectory over time
learningTrajectory = (
    schoolTrends    
    # Assign a trajectory type based on change in learning growth index
    .withColumn(
        "trajectory_type",
        # Schools with significant positive growth
        when(col("learning_growth_change") > 0.10, "Fast Improver")
        # Schools with moderate positive growth
        .when(col("learning_growth_change") > 0.02, "Slow Improver")
        # Schools showing a decline in learning growth
        .when(col("learning_growth_change") < -0.02, "Declining")
        # Schools with minimal or no change
        .otherwise("Stagnant")
    )
)
# Write the learning trajectory classification to a Delta table
learningTrajectory.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Learning_Trajectory")


In [0]:
# Aggregate key early warning indicators at the school level
earlyWarning = (
    spark_df
    # Group data by school identity and district
    .groupBy("school_id","school_name","district")    
    # Calculate average values for attendance, exam participation, and remedial need
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("exam_participation_rate").alias("avg_exam_participation"),
        avg("remedial_percentage").alias("avg_remedial")
    )
)
# Write the early warning indicators to a Delta table
earlyWarning.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("Early_Warning_Indicators")


Rank Schools using Score

In [0]:
# Import column reference function from PySpark
from pyspark.sql.functions import col
# Calculate a composite school performance score using weighted indicators
schoolScore = (
    school_metrics
    # Create a weighted school score combining academic and engagement factors
    .withColumn(
        "school_score",
        (
            # Weight contribution from pass rate
            col("avg_pass_rate") * 0.35 +
            # Weight contribution from attendance rate
            col("avg_attendance") * 0.25 +
            # Weight contribution from learning growth (scaled to percentage)
            col("avg_learning_growth") * 100 * 0.20 +
            # Weight contribution from skill index
            col("avg_skill_index") * 0.20
        )
    )
)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, lit
# Add a dummy column for global ranking
schoolScore_rank_base = schoolScore.withColumn("global_partition", lit(1))
# Global ranking window (no performance warning)
overall_window = (
    Window.partitionBy("global_partition")
          .orderBy(col("school_score").desc())
)
# District-wise ranking window (already correct)
district_window = (
    Window.partitionBy("district")
          .orderBy(col("school_score").desc())
)
# Apply rankings
school_ranking = (
    schoolScore_rank_base
        .withColumn("overall_rank", rank().over(overall_window))
        .withColumn("district_rank", rank().over(district_window))
        .drop("global_partition")
)


In [0]:
school_ranking.write.format("delta").mode("overwrite") \
    .saveAsTable("School_Ranking")

Machine Learning Techniques

In [0]:
# Select relevant academic, performance, and infrastructure columns for pandas analysis
pand = (
    # Use the main Spark DataFrame
    spark_df
    # Select required columns for downstream pandas-based analysis or modeling
    .select(
        "school_id",
        "academic_year",
        "enrolled_students",
        "attendance_rate",
        "pass_percentage",
        "avg_exam_score",
        "learning_growth_index",
        "skill_index",
        "digital_access_percentage",
        "scholarship_percentage",
        "student_teacher_ratio"
    )
    # Convert the Spark DataFrame to a pandas DataFrame
    .toPandas()
)

In [0]:
# Aggregate school-level yearly information using pandas
schoolyear_information = (
    # Use the pandas DataFrame converted from Spark
    pand
    # Group data by school and academic year
    .groupby(
        ["school_id", "academic_year"],
        as_index=False
    )
    # Compute yearly aggregates for enrollment, performance, and resource indicators
    .agg(
        # Total number of enrolled students per school-year
        total_enrollment=("enrolled_students", "sum"),
        # Average attendance rate per school-year
        averageattendance=("attendance_rate", "mean"),
        # Average pass rate per school-year
        averagepassrate=("pass_percentage", "mean"),    
        # Average exam score per school-year
        averageexamscore=("avg_exam_score", "mean"),
        # Average learning growth index per school-year
        averagelearninggrowth=("learning_growth_index", "mean"),
        # Average skill index per school-year
        averageskillindex=("skill_index", "mean"),
        # Average digital access percentage per school-year
        averagedigitalaccess=("digital_access_percentage", "mean"),
        # Average scholarship coverage per school-year
        averagescholarship=("scholarship_percentage", "mean"),
        # Average student–teacher ratio per school-year
        averagestudentteacherratio=("student_teacher_ratio", "mean")
    )
)


In [0]:
# Sort the data by school and academic year to ensure correct temporal ordering
schoolyear_information = (
    schoolyear_information
    .sort_values(["school_id", "academic_year"])
)
# Create a column containing the previous year's enrollment for each school
schoolyear_information["prev_enrollment"] = (
    schoolyear_information
    .groupby("school_id")["total_enrollment"]
    .shift(1)
)
# Calculate year-over-year enrollment growth for each school
schoolyear_information["EnrollmentGrowth"] = (
    (schoolyear_information["total_enrollment"] -
     schoolyear_information["prev_enrollment"]) /
     schoolyear_information["prev_enrollment"]
)
# Remove rows with missing values created due to the shift operation
enrollment = schoolyear_information.dropna()

In [0]:
from sklearn.ensemble import RandomForestRegressor
# Define the feature set used to predict enrollment growth
features = [
    "averageattendance",
    "averagepassrate",
    "averageexamscore",
    "averagelearninggrowth",
    "averageskillindex",
    "averagedigitalaccess",
    "averagescholarship",
    "averagestudentteacherratio"
]
# Create the input feature matrix from the enrollment DataFrame
X = enrollment[features]
# Define the target variable as year-over-year enrollment growth
y = enrollment["EnrollmentGrowth"]
# Initialize the Random Forest Regressor with chosen hyperparameters
randomforest = RandomForestRegressor(
    # Number of decision trees in the forest
    n_estimators=100, 
    # Maximum depth of each tree to control overfitting
    max_depth=4,   
    # Seed for reproducibility    
    random_state=40   
)
# Train the Random Forest model on the input features and target variable
randomforest.fit(X, y)

In [0]:
import pandas as pd
# Create a DataFrame to store feature importance values from the trained Random Forest model
featureImportance = pd.DataFrame({
    "feature": features,
    "importance": randomforest.feature_importances_ * 100
})
# Sort features by importance in descending order
featureImportance = featureImportance.sort_values(
    "importance",
    ascending=False
)
print(featureImportance)

                      feature  importance
2            averageexamscore   20.044368
4           averageskillindex   17.230824
6          averagescholarship   12.950768
3       averagelearninggrowth   12.731256
7  averagestudentteacherratio   10.359344
0           averageattendance    9.509665
1             averagepassrate    9.103673
5        averagedigitalaccess    8.070101


In [0]:
spark.createDataFrame(featureImportance) \
    .write.format("delta").mode("overwrite") \
    .saveAsTable("Enrollment_Feature_Importance")


In [0]:
# Convert Spark to Pandas
inputs = spark_df.select(
    "school_id",
    "attendance_rate",
    "pass_percentage",
    "avg_exam_score",
    "learning_growth_index",
    "skill_index",
    "digital_access_percentage",
    "remedial_percentage",
    "exam_participation_rate",
    "student_teacher_ratio"
).toPandas()
# Aggregate to school level
schools = (
    inputs
    .groupby("school_id", as_index=False)
    .agg(
        averageattendance=("attendance_rate", "mean"),
        averagepass_rate=("pass_percentage", "mean"),
        averageexam_score=("avg_exam_score", "mean"),
        averagelearning_growth=("learning_growth_index", "mean"),
        averageskill_index=("skill_index", "mean"),
        averagedigital_access=("digital_access_percentage", "mean"),
        averageremedial=("remedial_percentage", "mean"),
        averageexam_participation=("exam_participation_rate", "mean"),
        averagestudent_teacher_ratio=("student_teacher_ratio", "mean")
    )
)


In [0]:
# Compute the lower quartilethresholds for key school performance indicators
# Attendance cutoff to identify low-attendance schools
attendance_cutoff = schools["averageattendance"].quantile(0.25)
# Pass rate cutoff to identify low academic success
passrate_cutoff = schools["averagepass_rate"].quantile(0.25)
# Learning growth cutoff to identify weak learning progression
learninggrowth_cutoff = schools["averagelearning_growth"].quantile(0.25)
# Skill index cutoff to identify low skill development
skillindex_cutoff = schools["averageskill_index"].quantile(0.25)
# Digital access cutoff to identify limited technology access
digitalaccess_cutoff = schools["averagedigital_access"].quantile(0.25)
# Remedial percentage cutoff to identify high remedial dependency
remedial_cutoff = schools["averageremedial"].quantile(0.25)
# Exam participation cutoff to identify low exam engagement
exam_participation = schools["averageexam_participation"].quantile(0.25)
# Student–teacher ratio cutoff to identify high classroom load
student_teacher_ratio = schools["averagestudent_teacher_ratio"].quantile(0.25)

In [0]:
# Create a binary dropout risk indicator based on multiple low-performance thresholds
schools["dropout_risk"] = (
    (schools["averageattendance"] <= attendance_cutoff) |    
    (schools["averagepass_rate"] <= passrate_cutoff) |
    (schools["averagelearning_growth"] <= learninggrowth_cutoff) |
    (schools["averageskill_index"] <= skillindex_cutoff) |
    (schools["averagedigital_access"] <= digitalaccess_cutoff) |
    (schools["averageremedial"] <= remedial_cutoff) |
    (schools["averageexam_participation"] <= exam_participation) |
    (schools["averagestudent_teacher_ratio"] <= student_teacher_ratio)
).astype(int)   # Convert boolean values to binary


In [0]:
from sklearn.ensemble import RandomForestClassifier
# Define the feature set used to predict dropout risk
features = [
    "averageattendance",
    "averagepass_rate",
    "averageexam_score",
    "averagelearning_growth",
    "averageskill_index",
    "averagedigital_access",
    "averageremedial",
    "averageexam_participation",
    "averagestudent_teacher_ratio"
]
# Create the input feature matrix from the schools DataFrame
X = schools[features]

# Define the target variable indicating dropout risk
y = schools["dropout_risk"]
# Initialize the Random Forest Classifier with selected hyperparameters
randomforest = RandomForestClassifier(
    # Number of trees in the forest
    n_estimators=100, 
    # Maximum depth to control model complexity 
    max_depth=5, 
    # Seed for reproducibility      
    random_state=4    
)
# Train the Random Forest classifier on the input features and target labels
randomforest.fit(X, y)


In [0]:
import pandas as pd
# Create a DataFrame to display feature importance from the trained dropout risk model
dropoutfeature = pd.DataFrame({
    # Feature names used in the classification model
    "feature": features,
    # Corresponding importance scores converted to percentages
    "importance": randomforest.feature_importances_ * 100
})
# Sort features by importance in descending order
dropoutfeature = dropoutfeature.sort_values(
    "importance",
    ascending=False
)
# Display the feature importance table
print(dropoutfeature)

                        feature  importance
0             averageattendance   21.858281
6               averageremedial   16.040056
8  averagestudent_teacher_ratio   10.670944
1              averagepass_rate   10.181498
4            averageskill_index   10.059705
3        averagelearning_growth    9.010947
2             averageexam_score    8.665146
7     averageexam_participation    7.054047
5         averagedigital_access    6.459375


In [0]:
spark.createDataFrame(dropoutfeature) \
    .write.format("delta").mode("overwrite") \
    .saveAsTable("Dropout_Features")