In [0]:
// Cleans every yearly schools-funding CSV under `root_folder` and writes one
// Parquet file per school year (named "<year>-<year+1>-schools-funding-clean.parquet").
val root_folder = "/user/yl12043_nyu_edu/schools-funding/"
var totalyears = 2017 to 2022
// Start years whose CSV headers carry a "(Part X) " prefix on column names.
var newformatyears = 2019 to 2022
val prefixes: List[String] = List("B", "C", "D", "E")
// Columns that are normalised to decimal(20,2) below.
val colNames: List[String] = List("K-12 Enrollment", "Pre-K Enrollment", "Classroom Teachers w/ 0-3 Years Experience", "Classroom Teachers w/ More than 3 Years Experience", "Paraprofessional Classroom Staff", "Principals & Other Admin Staff", "Pupil Support Services Staff", "All Remaining Staff", "Total Staff", "Classroom Teacher Salaries", "All Other Salaries", "Employee Benefits", "BOCES Services", "All Other", "General Ed K-12", "Pre-K", "Preschool", "School Administration", "Instructional Media", "Pupil Support Services", "State & Local Funding", "Federal Funding", "Total Funding Source by School", "State & Local Funding per Pupil", "Federal Funding per Pupil", "Total School Funding per Pupil")
// Identifying columns that are carried through without numeric cleaning.
val idColNames: List[String] = List("School Year", "School District", "School BEDS Code", "School Name", "School Type")
val outputpathstart = "schools-funding/"
val outputpathend = "-schools-funding-clean.parquet"

for (cur <- totalyears) {
    val next = cur + 1

    val rawDf = spark.read.option("header", "true")
        .option("multiLine", "true")
        .option("inferSchema", "true")
        .option("escape", "\"")
        .csv(s"${root_folder}${cur}-${next}-schools-funding.csv")

    // Step 1: bring both header layouts to the same column names.
    val harmonisedDf =
        if (newformatyears.contains(cur)) {
            // Rename the Part D columns first so that stripping the "(Part X) " prefix
            // does not make them collide with the same-named columns of other parts.
            // (The original also tried to rename "(Part D) All Other" a second time,
            // to "F1"; the column no longer exists at that point, so it was a no-op.)
            val disambiguatedDf = rawDf
                .withColumnRenamed("(Part D) All Other", "Other")
                .withColumnRenamed("(Part D) Federal Funding", "F2")
            prefixes.map(p => "(Part " + p + ") ").foldLeft(disambiguatedDf) { (df, prefix) =>
                df.columns.filter(_.startsWith(prefix)).foldLeft(df) { (acc, c) =>
                    acc.withColumnRenamed(c, c.stripPrefix(prefix))
                }
            }
        } else {
            // Headers that differ only in surrounding whitespace are renamed BEFORE
            // trimming so they cannot collapse onto the same trimmed name. The trim
            // must fold over the renamed frame; folding over `rawDf` would silently
            // discard these three renames.
            val disambiguatedDf = rawDf
                .withColumnRenamed(" All Other  ", "Other")
                .withColumnRenamed(" Federal Funding  ", "F1")
                .withColumnRenamed(" Federal Funding   ", "F2")
            val trimmedDf = disambiguatedDf.columns.foldLeft(disambiguatedDf) { (df, c) =>
                df.withColumnRenamed(c, c.trim())
            }
            trimmedDf
                .withColumnRenamed("BEDS Code", "School BEDS Code")
                .withColumnRenamed("Classroom Teachers", "Classroom Teacher Salaries")
                .withColumnRenamed("Grades K-12", "General Ed K-12")
                .filter(col("School BEDS Code") =!= "(blank)")
        }

    // Step 2: keep only rows that identify a school, and only the columns of interest.
    val selectedDf = harmonisedDf
        .filter(col("School BEDS Code").isNotNull)
        .select((idColNames ++ colNames).map(col): _*)

    // Step 3: strip formatting characters and cast every measure to decimal(20,2).
    val numericDf = colNames.foldLeft(selectedDf) { (df, c) =>
        val noCommas = regexp_replace(df(c), "\\,", "")
        // NOTE(review): removing every "-" also drops the sign of negative
        // values; presumably "-" is only a placeholder for zero -- confirm.
        val digitsOnly = regexp_replace(regexp_replace(trim(noCommas), "-", ""), " ", "")
        // Empty (and null) cells fall through to otherwise(0).
        df.withColumn(c, when(digitsOnly =!= "", digitsOnly).otherwise(0).cast("decimal(20,2)"))
    }

    // DataFrames are immutable: the filled frame has to be captured, otherwise
    // values that failed the decimal cast stay null.
    val filledDf = numericDf.na.fill(0)

    // Step 4: derived metrics. Enrollment sums the two *enrollment* columns, and the
    // per-pupil ratios divide by that enrollment.
    // NOTE(review): for schools with zero enrollment the ratio is expected to be
    // null under Spark's default (non-ANSI) division semantics -- confirm.
    val filteredDf = filledDf
        .withColumn("Total Enrollment", col("K-12 Enrollment") + col("Pre-K Enrollment"))
        .withColumn("Total Teachers", col("Classroom Teachers w/ 0-3 Years Experience") + col("Classroom Teachers w/ More than 3 Years Experience"))
        .withColumn("Teacher per Pupil", (col("Total Teachers") / col("Total Enrollment")).cast("decimal(20,2)"))
        .withColumn("Staff per Pupil", (col("Total Staff") / col("Total Enrollment")).cast("decimal(20,2)"))

    /*
    filteredDf.printSchema
    z.show(filteredDf.limit(5))
    */
    z.show(filteredDf.summary())

    val outputpath = s"${outputpathstart}${cur}-${next}${outputpathend}"
    filteredDf.write.mode("overwrite").parquet(outputpath)
}

In [1]:
// Stack the per-year cleaned Parquet files (2017-2018 through 2022-2023) into a
// single DataFrame and persist the combined result.
val years = 2018 to 2022
var totalDf = {
    // The first school year seeds the fold; the remaining years are appended in order.
    val firstYearDf = spark.read.parquet("schools-funding/2017-2018-schools-funding-clean.parquet")
    years.foldLeft(firstYearDf) { (combined, startYear) =>
        val endYear = startYear + 1
        val yearPath = outputpathstart + startYear + "-" + endYear + outputpathend
        // union matches columns by position.
        combined.union(spark.read.parquet(yearPath))
    }
}
totalDf.printSchema
z.show(totalDf.limit(5))
totalDf.write.mode("overwrite").parquet("schools-funding/2017-2023-schools-funding-clean.parquet")

In [2]:
Total Means & Standard Deviations

In [3]:
var df = spark.read.parquet("schools-funding/2017-2023-schools-funding-clean.parquet")

// Builds, for one column, a table with one row per "School Year" holding the
// column's mean and standard deviation, both cast to decimal(20,2).
// `label` is the text used in the two output column names.
def yearlyStats(column: String, label: String) =
    df.groupBy("School Year").agg(
        mean(column).cast("decimal(20,2)").alias("Average " + label),
        stddev(column).cast("decimal(20,2)").alias("Standard Deviation " + label))

val TotalFunding = yearlyStats("Total Funding Source by School", "Total Funding")
val PupilTotalFunding = yearlyStats("Total School Funding per Pupil", "Total Funding per Pupil")
val TotalStaff = yearlyStats("Total Staff", "Total Staff")
val StateLocalFunding = yearlyStats("State & Local Funding", "State & Local Funding")
val PupilStateLocalFunding = yearlyStats("State & Local Funding per Pupil", "State & Local Funding per Pupil")
val FederalFunding = yearlyStats("Federal Funding", "Federal Funding")
val PupilFederalFunding = yearlyStats("Federal Funding per Pupil", "Federal Funding per Pupil")
val TotalEnrollment = yearlyStats("Total Enrollment", "Total Enrollment")
val TeacherPerPupil = yearlyStats("Teacher per Pupil", "Teacher per Pupil")
val StaffPerPupil = yearlyStats("Staff per Pupil", "Staff per Pupil")

// Render the tables in the same order as they were defined.
Seq(
    TotalFunding,
    PupilTotalFunding,
    TotalStaff,
    StateLocalFunding,
    PupilStateLocalFunding,
    FederalFunding,
    PupilFederalFunding,
    TotalEnrollment,
    TeacherPerPupil,
    StaffPerPupil
).foreach(table => z.show(table))

In [4]:
// One row per distinct "School BEDS Code", reduced to the school's descriptive columns.
val schoolCodeTypeDf = df
    .dropDuplicates(Seq("School BEDS Code"))
    .select(Seq("School District", "School BEDS Code", "School Name", "School Type").map(col): _*)
z.show(schoolCodeTypeDf.limit(5))