In [0]:
import pandas as pd
import os 

In [0]:
def loadData(file_path):
    # Loads the combined raw enrollment dataset
    try:
        df = pd.read_csv(file_path)
        return df
    except FileNotFoundError:
        raise FileNotFoundError(f"Raw data file not found at path: {file_path}")
    except Exception as e:
        raise Exception(f"Failed to load raw data: {e}")

In [0]:
# Load the table as a Spark DataFrame
rawData = spark.table("combined_school_enrollment")
print("Raw data loaded successfully")
print("Number of rows:", rawData.count())
print("Number of columns:", len(rawData.columns))

display(rawData.limit(20))

Raw data loaded successfully
Number of rows: 41700
Number of columns: 24


school_id,school_name,city,district,academic_year,grade,gender,enrolled_students,avg_exam_score,pass_percentage,attendance_rate,learning_growth_index,source_file,median_exam_score,fail_percentage,distinction_percentage,avg_internal_score,exam_participation_rate,remedial_percentage,student_teacher_ratio,digital_access_percentage,scholarship_percentage,subject_pass_rate,skill_index
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,4,Male,102.0,82.3,76.1,84.7,0.71,enrollment_2023.csv,63.2,23.9,31.6,83.9,86.9,26.1,45.8,84.4,40.5,79.1,81.1
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,4,Female,94.0,58.7,93.2,71.4,0.2,enrollment_2023.csv,94.3,6.8,25.6,77.2,77.3,8.6,39.8,86.8,49.6,71.9,56.1
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,4,M,32.0,53.8,66.2,85.3,0.32,enrollment_2023.csv,66.6,33.8,31.2,87.4,98.1,13.1,35.5,87.3,56.4,92.9,88.7
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,4,F,,59.6,65.7,76.3,0.27,enrollment_2023.csv,73.3,34.3,10.4,63.3,89.1,6.6,22.5,34.7,20.6,77.0,60.3
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,4,male,116.0,103.3,76.2,70.1,0.55,enrollment_2023.csv,93.7,23.8,13.1,68.7,96.9,13.5,19.7,57.7,61.7,75.8,57.0
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,5,Male,86.0,88.5,89.2,90.2,0.21,enrollment_2023.csv,55.6,10.8,23.3,52.0,84.6,7.5,26.3,78.7,23.4,87.6,36.9
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,5,Female,68.0,48.5,70.1,99.6,0.61,enrollment_2023.csv,91.9,29.9,14.1,50.2,86.7,28.1,28.5,67.0,61.3,68.6,59.2
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,5,M,131.0,64.1,73.3,84.1,0.32,enrollment_2023.csv,89.3,26.7,12.7,77.6,94.1,29.2,34.1,39.9,58.7,72.3,80.6
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,5,F,,96.7,78.9,78.0,0.89,enrollment_2023.csv,88.2,21.1,28.6,61.6,91.9,25.2,39.9,81.2,51.1,93.8,63.1
IND_SCH_0105,Govt High School Mumbai,Mumbai,Mumbai suburban,2023,5,male,108.0,44.2,74.9,83.6,0.62,enrollment_2023.csv,74.4,25.1,9.9,88.6,81.5,21.4,46.7,59.7,23.0,82.9,63.3


In [0]:
rawData = rawData.toPandas()

In [0]:
def datasetOverview(df):
    # Prints basic information about the dataset
    print("Dataset shape:", df.count())
    print("\nColumn names:")
    print(len(df.columns))
    print("\nData types:")
    print(df.dtypes)

In [0]:
def MissingValues(df):
    # Displays count of missing values per column
    print("\nMissing values per column:")
    print(df.isnull().sum())

In [0]:
def Duplicates(df):
    # Checks number of duplicate rows
    duplicate_count = df.duplicated().sum()
    print("\nDuplicate rows:", duplicate_count)

In [0]:
def CategoricalValues(df, column_name):
    # Displays unique values in a categorical column
    print(f"\nUnique values in '{column_name}':")
    print(df[column_name].value_counts())

In [0]:
def NumericRanges(df, column_name):
    # Displays min and max values for a numeric column
    print(f"\nRange check for '{column_name}':")
    print("Min:", df[column_name].min())
    print("Max:", df[column_name].max())

In [0]:
def AcademicYears(df):
    # Displays unique academic years present in the dataset
    print("\nAcademic years present:")
    print(sorted(df["academic_year"].dropna().unique()))

In [0]:
def checkdataQuality(df):
    # Runs all data quality checks without modifying data
    datasetOverview(df)
    MissingValues(df)
    Duplicates(df)
    AcademicYears(df)
    CategoricalValues(df, "gender")
    CategoricalValues(df, "district")
    NumericRanges(df, "avg_exam_score")
    NumericRanges(df, "pass_percentage")
    NumericRanges(df, "attendance_rate")

In [0]:
checkdataQuality(rawData)

Dataset shape: school_id                    41700
school_name                  41700
city                         41700
district                     41700
academic_year                41700
grade                        41700
gender                       41700
enrolled_students            38299
avg_exam_score               37627
pass_percentage              41700
attendance_rate              41700
learning_growth_index        41700
source_file                  41700
median_exam_score            41700
fail_percentage              41700
distinction_percentage       41700
avg_internal_score           41700
exam_participation_rate      41700
remedial_percentage          41700
student_teacher_ratio        41700
digital_access_percentage    41700
scholarship_percentage       41700
subject_pass_rate            41700
skill_index                  41700
dtype: int64

Column names:
24

Data types:
school_id                     object
school_name                   object
city                       

In [0]:
def cleanGenderColumn(df):
    # Standardizes gender values to Male and Female
    gender = {
        "male": "Male",
        "m": "Male",
        "M": "Male",
        "female": "Female",
        "f": "Female",
        "F": "Female"
    }
    df["gender"] = df["gender"].str.lower().map(gender)
    return df

In [0]:
def cleanDistrictNames(df):
    # Standardizes district naming and casing
    df["district"] = df["district"].str.strip().str.title()
    district = {
        "Raigad Rural": "Raigad Rural",
        "Raigad Rural ": "Raigad Rural",
        "Pune Urban": "Pune Urban",
        "Jaipur Urban": "Jaipur Urban",
        "New Delhi": "New Delhi",
        "Ranga Reddy": "Ranga Reddy",
        "Ahmedabad Rural": "Ahmedabad Rural"
    }
    df["district"] = df["district"].replace(district)
    return df

In [0]:
def cleanMissingValues(df):
    # Handles missing values using statistics
    df["enrolled_students"] = df.groupby("grade")["enrolled_students"].transform(lambda x: x.fillna(x.median()))
    df["avg_exam_score"] = df.groupby("grade")["avg_exam_score"].transform(lambda x: x.fillna(x.mean()))
    df["pass_percentage"] = df.groupby("grade")["pass_percentage"].transform(lambda x: x.fillna(x.mean()))
    df["attendance_rate"] = df.groupby("grade")["attendance_rate"].transform(lambda x: x.fillna(x.mean()))
    df["learning_growth_index"] = df.groupby("grade")["learning_growth_index"].transform(lambda x: x.fillna(x.median()))
    return df

In [0]:
def cleanNumericRanges(df):
    # Fits numeric values to valid ranges
    df["avg_exam_score"] = df["avg_exam_score"].clip(0, 100)
    df["pass_percentage"] = df["pass_percentage"].clip(0, 100)
    df["attendance_rate"] = df["attendance_rate"].clip(0, 100)
    df["learning_growth_index"] = df["learning_growth_index"].clip(0, 1)
    return df

In [0]:
def cleanEnrollmentData(df):
    # Orchestrates full data cleaning process
    df = cleanGenderColumn(df)
    df = cleanDistrictNames(df)
    df = cleanMissingValues(df)
    df = cleanNumericRanges(df)
    return df

In [0]:
clean_df = cleanEnrollmentData(rawData)
print("Cleaning completed")
print("Cleaned dataset shape:", clean_df.shape)

Cleaning completed
Cleaned dataset shape: (41700, 24)


In [0]:
def CleaningValidation(df):
    # Validates that dataset is clean after transformations
    print("Cleaning validation summary\n")
    # Missing values check
    missing = df.isnull().sum().sum()
    print("Total missing values:", missing)
    # Gender validation
    invalid_gender = df[~df["gender"].isin(["Male", "Female"])].shape[0]
    print("Invalid gender values:", invalid_gender)
    # Academic year validation
    invalid_years = df[~df["academic_year"].between(2020, 2024)].shape[0]
    print("Invalid academic years:", invalid_years)
    # Numeric range validation
    invalid_scores = df[(df["avg_exam_score"] < 0) | (df["avg_exam_score"] > 100)].shape[0]
    invalid_pass = df[(df["pass_percentage"] < 0) | (df["pass_percentage"] > 100)].shape[0]
    invalid_attendance = df[(df["attendance_rate"] < 0) | (df["attendance_rate"] > 100)].shape[0]
    invalid_growth = df[(df["learning_growth_index"] < 0) | (df["learning_growth_index"] > 1)].shape[0]
    print("Invalid exam scores:", invalid_scores)
    print("Invalid pass percentages:", invalid_pass)
    print("Invalid attendance rates:", invalid_attendance)
    print("Invalid learning growth values:", invalid_growth)


In [0]:
CleaningValidation(clean_df)

Cleaning validation summary

Total missing values: 0
Invalid gender values: 0
Invalid academic years: 0
Invalid exam scores: 0
Invalid pass percentages: 0
Invalid attendance rates: 0
Invalid learning growth values: 0


In [0]:
def CleanedData(df, table_name):
    #Saves cleaned enrollment data as a Delta table
    try:
        (
            df.write
              .format("delta")
              .mode("overwrite")
              .saveAsTable(table_name)
        )
        print(f"Cleaned data saved successfully as Delta table: {table_name}")
        return df
    except Exception as e:
        raise Exception(f"Failed to save cleaned data as Delta table: {e}")


In [0]:
spark.sql("USE school_enrollment_db")

DataFrame[]

In [0]:
clean_df = spark.createDataFrame(clean_df)

In [0]:
cleaned_df = CleanedData(
    clean_df,
    "cleaned_school_enrollment"
)


Cleaned data saved successfully as Delta table: cleaned_school_enrollment


In [0]:
%sql
OPTIMIZE school_enrollment_db.cleaned_school_enrollment;

path,metrics
,"List(1, 8, List(765940, 765940, 765940.0, 1, 765940), List(116135, 116342, 116262.875, 8, 930103), 0, null, null, 0, 1, 8, 0, true, 0, 0, 1767378366826, 1767378369853, 8, 1, null, List(0, 0), null, 24, 24, 667, 0, null)"
