In [0]:
import pandas as pd
import numpy as np
import os
from typing import List

In [0]:
# Directory that is scanned for raw enrollment CSV files; passed to ingestionEnrollment().
# NOTE(review): the /Volumes/... prefix looks like a Databricks Unity Catalog volume path — confirm.
raw = "/Volumes/workspace/default/capstone"

In [0]:
def csvFiles(directory_path):
    """Return the paths of all ``.csv`` files directly inside a directory.

    The result is sorted so that steps which depend on row order (the
    concatenation and the seeded synthetic columns added afterwards) see the
    files in the same order on every run; ``os.listdir`` by itself returns
    entries in arbitrary order.

    Args:
        directory_path: Directory to scan. Sub-directories are not searched.

    Returns:
        Sorted list of ``directory_path`` joined with every entry whose name
        ends with ``.csv`` (case-sensitive). Empty when nothing matches.

    Raises:
        Exception: If listing or filtering fails; the underlying error is
            attached as ``__cause__``.
    """
    try:
        return sorted(
            os.path.join(directory_path, entry)
            for entry in os.listdir(directory_path)
            if entry.endswith(".csv")
        )
    except Exception as e:
        # Keep the generic Exception type callers already catch, but chain the
        # original error so the real cause stays visible in the traceback.
        raise Exception(f"Failed to list files from directory: {e}") from e

In [0]:
def combineFiles(file_paths):
    """Load every CSV in ``file_paths`` and stack them into one DataFrame.

    Each loaded frame gets a ``source_file`` column holding the base name of
    the file it came from, so rows stay traceable after concatenation.

    Args:
        file_paths: Iterable of CSV file paths.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        Exception: If any file cannot be read (message names the file).
        ValueError: If ``file_paths`` yields no files at all.
    """

    def _load_one(csv_path):
        # Read one file and tag its rows with the originating file name.
        try:
            frame = pd.read_csv(csv_path)
            frame["source_file"] = os.path.basename(csv_path)
        except Exception as e:
            raise Exception(f"Error reading file {csv_path}: {e}")
        return frame

    frames = [_load_one(csv_path) for csv_path in file_paths]
    if not frames:
        raise ValueError("No dataframes were created during ingestion")
    return pd.concat(frames, ignore_index=True)

In [0]:
def validateData(df):
    """Run structural checks on the ingested DataFrame.

    The frame must be non-empty and must contain every core column
    (school_id, academic_year, district, grade, gender).

    Args:
        df: DataFrame produced by ingestion.

    Returns:
        None. The function only raises on failure.

    Raises:
        ValueError: If ``df`` is empty, or if any core column is absent (the
            message lists the absent columns in their canonical order).
    """
    if df.empty:
        raise ValueError("Ingested dataset is empty")

    required = ("school_id", "academic_year", "district", "grade", "gender")
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)

    if len(absent) > 0:
        raise ValueError(f"Missing core columns: {absent}")

In [0]:
def StudentPerformanceColumns(df):
    """Attach synthetic student-performance indicator columns to ``df``.

    Every indicator except ``fail_percentage`` is drawn from a uniform
    distribution with a fixed seed (40), so the generated values are
    reproducible for a given row count. ``fail_percentage`` is derived as
    ``100 - pass_percentage``.

    The draws come from a private ``RandomState`` rather than from
    ``np.random.seed``: the generated numbers are the same, but the
    process-wide NumPy RNG is no longer reseeded as a hidden side effect.

    Args:
        df: DataFrame to enrich. It is modified in place.

    Returns:
        The same ``df`` object, with the indicator columns added (or
        overwritten if they already exist).
    """
    rng = np.random.RandomState(40)
    row_count = len(df)

    # (column, low, high, decimals). The draw order determines the values, so
    # the sequence below must not be reordered.
    uniform_specs = (
        ("median_exam_score", 55, 95, 1),
        ("pass_percentage", 65, 98, 1),
        ("distinction_percentage", 5, 35, 1),
        ("attendance_rate", 70, 100, 1),
        ("avg_internal_score", 50, 90, 1),
        ("exam_participation_rate", 75, 100, 1),
        ("remedial_percentage", 5, 30, 1),
        ("student_teacher_ratio", 15, 50, 1),
        ("digital_access_percentage", 30, 95, 1),
        ("scholarship_percentage", 10, 65, 1),
        ("learning_growth_index", 0.1, 0.9, 2),
        ("subject_pass_rate", 65, 98, 1),
        ("skill_index", 35, 95, 1),
    )

    for column, low, high, decimals in uniform_specs:
        df[column] = rng.uniform(low, high, row_count).round(decimals)
        if column == "pass_percentage":
            # Derived (not drawn) column; inserted right after its source so
            # the resulting column order is unchanged.
            df["fail_percentage"] = (100 - df["pass_percentage"]).round(1)
    return df

In [0]:
def ingestionEnrollment(raw_dir):
    """Run the raw enrollment ingestion pipeline end to end.

    Steps: discover CSV files in ``raw_dir``, combine them into one
    DataFrame, add the student-performance columns, then validate the result.

    Args:
        raw_dir: Directory containing the raw enrollment CSV files.

    Returns:
        The combined, enriched and validated DataFrame.

    Raises:
        FileNotFoundError: If ``raw_dir`` contains no CSV files.
        Exception / ValueError: Propagated from the helper steps.
    """
    discovered_paths = csvFiles(raw_dir)
    if not discovered_paths:
        raise FileNotFoundError("No CSV files found in data directory")

    enriched = StudentPerformanceColumns(combineFiles(discovered_paths))
    validateData(enriched)
    return enriched

In [0]:
# Run the ingestion. On failure, log a short message and re-raise so the
# notebook stops here: swallowing the error would let later cells run against
# a missing (or stale, from a previous kernel run) `enrollmentData`.
try:
    enrollmentData = ingestionEnrollment(raw)
except Exception as e:
    print("Data ingestion failed:", e)
    raise
else:
    print("Data ingestion completed successfully")
    print("Total records ingested:", enrollmentData.shape[0])
    # A bare `enrollmentData.head()` inside a try block is not the cell's last
    # top-level expression, so the notebook never renders it; display it explicitly.
    display(enrollmentData.head())

Data ingestion completed successfully
Total records ingested: 41700


In [0]:
def ingestedFiles(df):
    """List the distinct source file names present in ``df``.

    Args:
        df: DataFrame carrying a ``source_file`` column.

    Returns:
        A plain Python list of the unique ``source_file`` values, in order of
        first appearance.
    """
    source_column = df["source_file"]
    distinct_names = source_column.unique()
    return distinct_names.tolist()

In [0]:
# Report which source files contributed rows to the ingested dataset,
# one file name per line.
ingested_files = ingestedFiles(enrollmentData)
print("Files successfully ingested:")
for file in ingested_files:
    print(file)

Files successfully ingested:
enrollment_2020.csv
enrollment_2021.csv
enrollment_2022.csv
enrollment_2023.csv
enrollment_2024.csv


In [0]:
def combinedData(df, table_name, mode="append"):
    """Save the combined enrollment data as a Delta table.

    Args:
        df: Object exposing a Spark-style ``write`` interface
            (``df.write.format(...).mode(...).saveAsTable(...)``).
        table_name: Name of the table to write to.
        mode: Save mode handed to the writer. Defaults to ``"append"``, the
            previous hard-coded behaviour; note that appending adds the rows
            again on every run. Pass ``"overwrite"`` to make a re-run replace
            the table contents instead.

    Returns:
        The same ``df`` that was passed in.

    Raises:
        Exception: If the write fails; the underlying error is attached as
            ``__cause__``.
    """
    try:
        (
            df.write
              .format("delta")
              .mode(mode)
              .saveAsTable(table_name)
        )
    except Exception as e:
        # Keep the generic Exception type callers already catch, but chain the
        # original error so the real cause stays visible in the traceback.
        raise Exception(f"Failed to save combined raw data as Delta table: {e}") from e

    print(f"Combined raw data saved as Delta table: {table_name}")
    return df


In [0]:
# Create the school_enrollment_db database unless it already exists.
# NOTE(review): `spark` is not defined anywhere in this notebook — presumably the
# session object injected by the notebook runtime; confirm.
spark.sql("CREATE DATABASE IF NOT EXISTS school_enrollment_db")

DataFrame[]

In [0]:
from pyspark.sql import DataFrame as SparkDataFrame
import pandas as pd
# combinedData writes through `df.write`, so enrollmentData has to be a Spark
# DataFrame by the time it gets there. Reject anything that is neither pandas
# nor Spark first, then convert pandas input; Spark input is left untouched.
if not isinstance(enrollmentData, (pd.DataFrame, SparkDataFrame)):
    raise TypeError("enrollmentData must be either a Pandas or Spark DataFrame")
if isinstance(enrollmentData, pd.DataFrame):
    enrollmentData = spark.createDataFrame(enrollmentData)


In [0]:
# Persist the ingested data as a Delta table. combinedData writes in append
# mode, so running this cell again adds the same rows to the table a second time.
# NOTE(review): the table name is not qualified with school_enrollment_db (the
# database created in an earlier cell) — confirm which database this should land in.
combined_df = combinedData(
    enrollmentData,
    "combined_school_enrollment"
)

Combined raw data saved as Delta table: combined_school_enrollment
