In [0]:
from pyspark.sql.functions import avg

In [0]:
spark_df = spark.table("school_enrollment_db.cleaned_school_enrollment")

In [0]:
spark_df.show(5)

+------------+--------------------+------+---------------+-------------+-----+------+-----------------+-----------------+---------------+---------------+---------------------+-------------------+-----------------+---------------+----------------------+------------------+-----------------------+-------------------+---------------------+-------------------------+----------------------+-----------------+-----------+
|   school_id|         school_name|  city|       district|academic_year|grade|gender|enrolled_students|   avg_exam_score|pass_percentage|attendance_rate|learning_growth_index|        source_file|median_exam_score|fail_percentage|distinction_percentage|avg_internal_score|exam_participation_rate|remedial_percentage|student_teacher_ratio|digital_access_percentage|scholarship_percentage|subject_pass_rate|skill_index|
+------------+--------------------+------+---------------+-------------+-----+------+-----------------+-----------------+---------------+---------------+-------------

In [0]:
def printSparkResult(df, title, n=20):
    print(f"\n{title}")
    df.show(n, truncate=False)

Year wise Enrollment Trend

In [0]:
from pyspark.sql.functions import sum as spark_sum
yearly_enrollment = (
    spark_df
    .groupBy("academic_year")
    .agg(spark_sum("enrolled_students").alias("total_enrollment"))
    .orderBy("academic_year")
)
printSparkResult(yearly_enrollment, "Year wise Enrollment Trend")


Year wise Enrollment Trend
+-------------+----------------+
|academic_year|total_enrollment|
+-------------+----------------+
|2020         |831474.5        |
|2021         |834422.5        |
|2022         |824672.5        |
|2023         |825296.0        |
|2024         |832112.0        |
+-------------+----------------+



Gender wise Enrollment Trend

In [0]:
gender_enrollment = (
    spark_df
    .groupBy("gender")
    .agg(spark_sum("enrolled_students").alias("total_enrollment"))
)
printSparkResult(gender_enrollment, "Gender wise Enrollment Distribution")


Gender wise Enrollment Distribution
+------+----------------+
|gender|total_enrollment|
+------+----------------+
|Male  |2494890.0       |
|Female|1653087.5       |
+------+----------------+



Grade Level Enrollment

In [0]:
grade_enrollment = (
    spark_df
    .groupBy("grade")
    .agg(spark_sum("enrolled_students").alias("total_enrollment"))
    .orderBy("grade")
)
printSparkResult(grade_enrollment, "Grade Level Enrollment")


Grade Level Enrollment
+-----+----------------+
|grade|total_enrollment|
+-----+----------------+
|1    |339056.0        |
|2    |346081.0        |
|3    |346966.0        |
|4    |345384.0        |
|5    |343651.0        |
|6    |350771.0        |
|7    |339795.0        |
|8    |350540.0        |
|9    |352377.0        |
|10   |346221.0        |
|11   |343267.0        |
|12   |343868.5        |
+-----+----------------+



District wise Enrollment

In [0]:
district_enrollment = (
    spark_df
    .groupBy("district")
    .agg(spark_sum("enrolled_students").alias("total_enrollment"))
    .orderBy("total_enrollment", ascending=False)
)
printSparkResult(district_enrollment, "District wise Enrollment")


District wise Enrollment
+-----------------+----------------+
|district         |total_enrollment|
+-----------------+----------------+
|Raigad Rural     |864772.5        |
|New Delhi        |445210.5        |
|Chennai          |360338.0        |
|Bangalore Urban  |331399.5        |
|Mumbai Suburban  |327441.5        |
|Pune Rural       |297453.5        |
|Lucknow Rural    |272352.5        |
|Ranga Reddy      |267309.0        |
|Ahmedabad Rural  |266854.5        |
|Bangalore Rural  |239153.5        |
|North 24 Parganas|208235.0        |
|Jaipur Urban     |146737.5        |
|Pune Urban       |120720.0        |
+-----------------+----------------+



Average Score by Year

In [0]:
avgscore_year = (
    spark_df
    .groupBy("academic_year")
    .agg(
        avg("avg_exam_score").alias("avg_exam_score"))
    .orderBy("academic_year")
)
printSparkResult(avgscore_year, "Average Exam Score by Year")


Average Exam Score by Year
+-------------+-----------------+
|academic_year|avg_exam_score   |
+-------------+-----------------+
|2020         |74.23961309074657|
|2021         |74.45818389332263|
|2022         |74.46198580937349|
|2023         |74.18933385436154|
|2024         |74.40807421986436|
+-------------+-----------------+



Pass Rate by District

In [0]:
passrate_district = (
    spark_df
    .groupBy("district")
    .agg(avg("pass_percentage").alias("avg_pass_rate"))
    .orderBy("avg_pass_rate", ascending=False)
)
printSparkResult(passrate_district, "Pass Rate by District")



Pass Rate by District
+-----------------+-----------------+
|district         |avg_pass_rate    |
+-----------------+-----------------+
|North 24 Parganas|81.9179523809524 |
|Lucknow Rural    |81.77051851851844|
|Ranga Reddy      |81.59799999999973|
|Jaipur Urban     |81.56959999999998|
|Bangalore Urban  |81.5166363636364 |
|Ahmedabad Rural  |81.51437037037005|
|Pune Urban       |81.49633333333337|
|Bangalore Rural  |81.4654583333334 |
|Chennai          |81.46480555555551|
|Raigad Rural     |81.44547126436768|
|New Delhi        |81.33580000000023|
|Mumbai Suburban  |81.21784848484869|
|Pune Rural       |81.09099999999991|
+-----------------+-----------------+



Attendance vs Performance

In [0]:
attendance_perf = (
    spark_df
    .groupBy("district")
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("avg_exam_score").alias("avg_exam_score")
    )
)
printSparkResult(attendance_perf, "Attendance vs Performance")



Attendance vs Performance
+-----------------+-----------------+-----------------+
|district         |avg_attendance   |avg_exam_score   |
+-----------------+-----------------+-----------------+
|Mumbai Suburban  |85.05196969696979|73.86800156832608|
|Lucknow Rural    |85.09100000000011|74.29948865791057|
|Pune Rural       |84.87370000000008|74.36627079470499|
|Raigad Rural     |85.10868965517223|74.32300193916949|
|New Delhi        |84.88033333333318|74.24764656372902|
|Bangalore Urban  |85.18836363636377|74.32613476269252|
|Ranga Reddy      |85.12433333333323|74.62764945714149|
|Chennai          |84.82261111111113|74.92851851336323|
|Bangalore Rural  |84.84958333333324|74.10615743529803|
|Jaipur Urban     |85.34006666666669|74.30500966987076|
|Ahmedabad Rural  |85.17011111111134|74.15630793985946|
|North 24 Parganas|85.03314285714292|74.52911778133698|
|Pune Urban       |85.2880833333334 |74.7496437213605 |
+-----------------+-----------------+-----------------+



Year wise Learning Growth Trend

In [0]:
learning_growth = (
    spark_df
    .groupBy("academic_year")
    .agg(avg("learning_growth_index").alias("avg_learning_growth"))
    .orderBy("academic_year")
)
printSparkResult(learning_growth, "Learning Growth Trend")



Learning Growth Trend
+-------------+-------------------+
|academic_year|avg_learning_growth|
+-------------+-------------------+
|2020         |0.5009100719424461 |
|2021         |0.5047865707434069 |
|2022         |0.5016342925659476 |
|2023         |0.5004808153477209 |
|2024         |0.4983441247002392 |
+-------------+-------------------+



Skill Index by District

In [0]:
skill_district = (
    spark_df
    .groupBy("district")
    .agg(avg("skill_index").alias("avg_skill_index"))
    .orderBy("avg_skill_index", ascending=False)
)
printSparkResult(skill_district, "Skill Index by District")


Skill Index by District
+-----------------+-----------------+
|district         |avg_skill_index  |
+-----------------+-----------------+
|Lucknow Rural    |65.19377777777781|
|Pune Rural       |65.15149999999993|
|Jaipur Urban     |65.12566666666663|
|Bangalore Rural  |65.06308333333325|
|Ahmedabad Rural  |65.03396296296286|
|New Delhi        |64.98637777777803|
|Bangalore Urban  |64.88509090909096|
|Raigad Rural     |64.85219540229829|
|Mumbai Suburban  |64.8146363636364 |
|North 24 Parganas|64.74528571428577|
|Pune Urban       |64.7398333333333 |
|Chennai          |64.70258333333328|
|Ranga Reddy      |64.2905185185185 |
+-----------------+-----------------+



In [0]:
spark.sql("USE school_enrollment_db")

DataFrame[]

In [0]:
%python
yearly_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`yearly_enrollment`")

gender_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`gender_enrollment`")

grade_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`grade_enrollment`")

district_enrollment.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`district_enrollment`")

avgscore_year.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`average_score_by_year`")

passrate_district.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`pass_rate_by_district`")

attendance_perf.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`attendance_vs_performance`")

learning_growth.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`learning_growth_by_year`")

skill_district.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("`skill_index_by_district`")

School Analysis

In [0]:
from pyspark.sql.functions import avg
school_performance = (
    spark_df
    .groupBy("school_id", "school_name", "district","academic_year")
    .agg(
        avg("avg_exam_score").alias("avg_exam_score"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("attendance_rate").alias("avg_attendance"),
        avg("learning_growth_index").alias("avg_learning_growth"),
        avg("skill_index").alias("avg_skill_index")
    )
)

In [0]:
school_performance.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("School_Performance")

In [0]:
high_performing_schools = school_performance.filter(
    (school_performance.avg_pass_rate >= 80) &
    (school_performance.avg_attendance >= 85)
)
high_performing_schools.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("High_Performing_Schools")

In [0]:
from pyspark.sql.functions import avg
school_metrics = (
    spark_df
    .groupBy("school_id", "school_name", "district","academic_year")
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth"),
        avg("skill_index").alias("avg_skill_index"),
        avg("digital_access_percentage").alias("avg_digital_access"),
        avg("remedial_percentage").alias("avg_remedial")
    )
)

In [0]:
from pyspark.sql.functions import col, when
dropout_risk = school_metrics.withColumn(
    "dropout_risk_score",
    (1 - col("avg_attendance") / 100) * 0.25 +
    (1 - col("avg_pass_rate") / 100) * 0.25 +
    (1 - col("avg_learning_growth")) * 0.15 +
    (1 - col("avg_skill_index") / 100) * 0.15 +
    (1 - col("avg_digital_access") / 100) * 0.10 +
    (col("avg_remedial") / 100) * 0.10
)

In [0]:

dropout_risk = dropout_risk.withColumn(
    "risk_level",
    when(col("dropout_risk_score") >= 0.35, "High")
    .when(col("dropout_risk_score") >= 0.20, "Medium")
    .otherwise("Low")
)

In [0]:
dropout_risk.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("Dropout_Risk")

In [0]:
from pyspark.sql.functions import avg, stddev
school_stability = (
    spark_df
    .groupBy("school_id", "school_name", "district")
    .agg(
        avg("avg_exam_score").alias("Exam_mean_score"),
        stddev("avg_exam_score").alias("Exam_standard_deviation_score"),
        avg("pass_percentage").alias("Mean_Pass_rate"),
        stddev("pass_percentage").alias("Pass_rate_standard_deviation")
    )
)
school_stability.write.format("delta").mode("overwrite") \
    .saveAsTable("School_Stability")


In [0]:
from pyspark.sql.functions import min, max, first, last
yearsBoundaries= (
    spark_df
    .groupBy("school_id")
    .agg(
        min("academic_year").alias("First_year"),
        max("academic_year").alias("Last_year")
    )
)
schoolTrends = (
    spark_df
    .join(yearsBoundaries, "school_id")
    .groupBy("school_id", "school_name", "district")
    .agg(
        (last("pass_percentage") - first("pass_percentage")).alias("pass_rate_change"),
        (last("learning_growth_index") - first("learning_growth_index")).alias("learning_growth_change"),
        (last("skill_index") - first("skill_index")).alias("skill_index_change")
    )
)
schoolTrends.write.format("delta").mode("overwrite") \
    .saveAsTable("School_Improvement_Vs_Decline")

In [0]:
from pyspark.sql.functions import corr
remedialImpact = (
    spark_df
    .groupBy("school_id", "school_name", "district")
    .agg(
        avg("remedial_percentage").alias("avg_remedial"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth")
    )
)
remedialImpact.write.format("delta").mode("overwrite") \
    .saveAsTable("Remedial_Impact")

In [0]:
from pyspark.sql.functions import max, min
district_gap = (
    spark_df
    .groupBy("district")
    .agg(
        max("pass_percentage").alias("Best_school_pass_rate"),
        min("pass_percentage").alias("Worst_school_pass_rate")
    )
    .withColumn(
        "performance_gap",
        col("best_school_pass_rate") - col("worst_school_pass_rate")
    )
)
district_gap.write.format("delta").mode("overwrite") \
    .saveAsTable("District_Performance_Gap")


In [0]:
UrbanRuralComparison = (
    spark_df
    .groupBy("district") 
    .agg(
        avg("enrolled_students").alias("avg_enrollment"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("digital_access_percentage").alias("avg_digital_access"),
        avg("skill_index").alias("avg_skill_index")
    )
)
UrbanRuralComparison.write.format("delta").mode("overwrite") \
    .saveAsTable("Urban_Rural_Comparison")

In [0]:
resourceEfficiency = (
    spark_df
    .groupBy("district")
    .agg(
        avg("student_teacher_ratio").alias("avg_student_teacher_ratio"),
        avg("pass_percentage").alias("avg_pass_rate"),
        avg("learning_growth_index").alias("avg_learning_growth")
    )
)
resourceEfficiency.write.format("delta").mode("overwrite") \
    .saveAsTable("Resource_Efficiency")

In [0]:
districtGrowth = (
    spark_df
    .groupBy("district", "academic_year")
    .agg(avg("learning_growth_index").alias("avg_learning_growth"))
    .orderBy("district", "academic_year")
)
districtGrowth.write.format("delta").mode("overwrite") \
    .saveAsTable("District_Growth_Momentum")

In [0]:
EnrollmentvsPerformance = (
    spark_df
    .groupBy("school_id", "school_name", "district", "academic_year")
    .agg(
        avg("enrolled_students").alias("avg_enrollment"),
        avg("pass_percentage").alias("avg_pass_rate")
    )
)
EnrollmentvsPerformance.write.format("delta").mode("overwrite") \
    .saveAsTable("Enrollment_vs_Performance")


In [0]:
from pyspark.sql.functions import when
learningTrajectory = (
    schoolTrends
    .withColumn(
        "trajectory_type",
        when(col("learning_growth_change") > 0.10, "Fast Improver")
        .when(col("learning_growth_change") > 0.02, "Slow Improver")
        .when(col("learning_growth_change") < -0.02, "Declining")
        .otherwise("Stagnant")
    )
)
learningTrajectory.write.format("delta").mode("overwrite") \
    .saveAsTable("Learning_Trajectory")


In [0]:
earlyWarning = (
    spark_df
    .groupBy("school_id", "school_name", "district")
    .agg(
        avg("attendance_rate").alias("avg_attendance"),
        avg("exam_participation_rate").alias("avg_exam_participation"),
        avg("remedial_percentage").alias("avg_remedial")
    )
)
earlyWarning.write.format("delta").mode("overwrite") \
    .saveAsTable("Early_Warning_Indicators")


Rank Schools using Score

In [0]:
from pyspark.sql.functions import col
schoolScore = school_metrics.withColumn(
    "school_score",
    (
        col("avg_pass_rate") * 0.35 +
        col("avg_attendance") * 0.25 +
        col("avg_learning_growth") * 100 * 0.20 +
        col("avg_skill_index") * 0.20
    )
)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, lit

# Add a dummy column for global ranking
schoolScore_rank_base = schoolScore.withColumn("global_partition", lit(1))

# Global ranking window (no performance warning)
overall_window = (
    Window.partitionBy("global_partition")
          .orderBy(col("school_score").desc())
)

# District-wise ranking window (already correct)
district_window = (
    Window.partitionBy("district")
          .orderBy(col("school_score").desc())
)

# Apply rankings
school_ranking = (
    schoolScore_rank_base
        .withColumn("overall_rank", rank().over(overall_window))
        .withColumn("district_rank", rank().over(district_window))
        .drop("global_partition")
)


In [0]:
school_ranking.write.format("delta").mode("overwrite") \
    .saveAsTable("School_Ranking")

Machine Learning Techniques

In [0]:
pand = spark_df.select(
    "school_id",
    "academic_year",
    "enrolled_students",
    "attendance_rate",
    "pass_percentage",
    "avg_exam_score",
    "learning_growth_index",
    "skill_index",
    "digital_access_percentage",
    "scholarship_percentage",
    "student_teacher_ratio"
).toPandas()

In [0]:
schoolyear_information = (
    pand
    .groupby(["school_id", "academic_year"], as_index=False)
    .agg(
        total_enrollment=("enrolled_students", "sum"),
        averageattendance=("attendance_rate", "mean"),
        averagepassrate=("pass_percentage", "mean"),
        averageexamscore=("avg_exam_score", "mean"),
        averagelearninggrowth=("learning_growth_index", "mean"),
        averageskillindex=("skill_index", "mean"),
        averagedigitalaccess=("digital_access_percentage", "mean"),
        averagescholarship=("scholarship_percentage", "mean"),
        averagestudentteacherratio=("student_teacher_ratio", "mean")
    )
)


In [0]:
schoolyear_information = schoolyear_information.sort_values(
    ["school_id", "academic_year"]
)
schoolyear_information["prev_enrollment"] = (
    schoolyear_information
    .groupby("school_id")["total_enrollment"]
    .shift(1)
)
schoolyear_information["EnrollmentGrowth"] = (
    (schoolyear_information["total_enrollment"] -
     schoolyear_information["prev_enrollment"]) /
     schoolyear_information["prev_enrollment"]
)
enrollment=schoolyear_information.dropna()

In [0]:
from sklearn.ensemble import RandomForestRegressor
features = [
    "averageattendance",
    "averagepassrate",
    "averageexamscore",
    "averagelearninggrowth",
    "averageskillindex",
    "averagedigitalaccess",
    "averagescholarship",
    "averagestudentteacherratio"
]
X = enrollment[features]
y = enrollment["EnrollmentGrowth"]
randomforest = RandomForestRegressor(
    n_estimators=100,
    max_depth=4,
    random_state=40
)
randomforest.fit(X, y)


In [0]:
import pandas as pd
featureImportance = pd.DataFrame({
    "feature": features,
    "importance": randomforest.feature_importances_*100
}).sort_values("importance", ascending=False)

print(featureImportance)


                      feature  importance
2            averageexamscore   20.044368
4           averageskillindex   17.230824
6          averagescholarship   12.950768
3       averagelearninggrowth   12.731256
7  averagestudentteacherratio   10.359344
0           averageattendance    9.509665
1             averagepassrate    9.103673
5        averagedigitalaccess    8.070101


In [0]:
spark.createDataFrame(featureImportance) \
    .write.format("delta").mode("overwrite") \
    .saveAsTable("Enrollment_Feature_Importance")


In [0]:
# Convert Spark to Pandas
inputs = spark_df.select(
    "school_id",
    "attendance_rate",
    "pass_percentage",
    "avg_exam_score",
    "learning_growth_index",
    "skill_index",
    "digital_access_percentage",
    "remedial_percentage",
    "exam_participation_rate",
    "student_teacher_ratio"
).toPandas()
# Aggregate to school level
schools = (
    inputs
    .groupby("school_id", as_index=False)
    .agg(
        averageattendance=("attendance_rate", "mean"),
        averagepass_rate=("pass_percentage", "mean"),
        averageexam_score=("avg_exam_score", "mean"),
        averagelearning_growth=("learning_growth_index", "mean"),
        averageskill_index=("skill_index", "mean"),
        averagedigital_access=("digital_access_percentage", "mean"),
        averageremedial=("remedial_percentage", "mean"),
        averageexam_participation=("exam_participation_rate", "mean"),
        averagestudent_teacher_ratio=("student_teacher_ratio", "mean")
    )
)


In [0]:
attendance_cutoff = schools["averageattendance"].quantile(0.25)
passrate_cutoff = schools["averagepass_rate"].quantile(0.25)
learninggrowth_cutoff = schools["averagelearning_growth"].quantile(0.25)
skillindex_cutoff=schools["averageskill_index"].quantile(0.25)
digitalaccess_cutoff=schools["averagedigital_access"].quantile(0.25)
remedial_cutoff=schools["averageremedial"].quantile(0.25)
exam_participation=schools["averageexam_participation"].quantile(0.25)
student_teacher_ratio=schools["averagestudent_teacher_ratio"].quantile(0.25)

In [0]:
schools["dropout_risk"] = (
    (schools["averageattendance"]<= attendance_cutoff)|
    (schools["averagepass_rate"]<= passrate_cutoff)|
    (schools["averagelearning_growth"]<= learninggrowth_cutoff)|
    (schools["averageskill_index"]<= skillindex_cutoff)|
    (schools["averagedigital_access"]<= digitalaccess_cutoff)|
    (schools["averageremedial"]<= remedial_cutoff)|
    (schools["averageexam_participation"]<= exam_participation)|
    (schools["averagestudent_teacher_ratio"]<=student_teacher_ratio)
).astype(int)


In [0]:
from sklearn.ensemble import RandomForestClassifier
features = [
    "averageattendance",
    "averagepass_rate",
    "averageexam_score",
    "averagelearning_growth",
    "averageskill_index",
    "averagedigital_access",
    "averageremedial",
    "averageexam_participation",
    "averagestudent_teacher_ratio"
]
X = schools[features]
y = schools["dropout_risk"]
randomforest = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=4
)
randomforest.fit(X, y)

In [0]:
import pandas as pd

dropoutfeature = pd.DataFrame({
    "feature": features,
    "importance": randomforest.feature_importances_*100
}).sort_values("importance", ascending=False)
print(dropoutfeature)


                        feature  importance
0             averageattendance   21.858281
6               averageremedial   16.040056
8  averagestudent_teacher_ratio   10.670944
1              averagepass_rate   10.181498
4            averageskill_index   10.059705
3        averagelearning_growth    9.010947
2             averageexam_score    8.665146
7     averageexam_participation    7.054047
5         averagedigital_access    6.459375


In [0]:
spark.createDataFrame(dropoutfeature) \
    .write.format("delta").mode("overwrite") \
    .saveAsTable("Dropout_Features")