In [0]:
// ---------------------------------------------------------------------------
// Shared configuration for the school-funding cleaning cells below.
// ---------------------------------------------------------------------------

// Folder holding the raw "<start>-<end>-schools-funding.csv" files.
val root_folder = "/user/yl12043_nyu_edu/schools-funding/"

// Starting calendar year of every school year to process (2017 => "2017-2018" file).
val totalyears = 2017 to 2022

// Starting years whose CSV headers carry the "(Part X) " prefixes handled by the cleaning loop.
val newformatyears = 2019 to 2022

// Letters of the "(Part X) " header prefixes stripped from the newer files.
val prefixes: List[String] = List("B", "C", "D", "E")

// Columns that are cleaned and cast to decimal(20,2) by the cleaning loop.
val colNames: List[String] = List(
    "K-12 Enrollment",
    "Pre-K Enrollment",
    "Classroom Teachers w/ 0-3 Years Experience",
    "Classroom Teachers w/ More than 3 Years Experience",
    "Paraprofessional Classroom Staff",
    "Principals & Other Admin Staff",
    "Pupil Support Services Staff",
    "All Remaining Staff",
    "Total Staff",
    "Classroom Teacher Salaries",
    "All Other Salaries",
    "Employee Benefits",
    "BOCES Services",
    "All Other",
    "General Ed K-12",
    "Pre-K",
    "Preschool",
    "School Administration",
    "Instructional Media",
    "Pupil Support Services",
    "State & Local Funding",
    "Federal Funding",
    "Total Funding",
    "State & Local Funding per Pupil",
    "Federal Funding per Pupil",
    "Total Funding per Pupil")

// Cleaned output files are written to outputpathstart + "<start>-<end>" + outputpathend.
val outputpathstart = "schools-funding/"
val outputpathend = "-schools-funding-clean.parquet"

// The 2021-2022 file is the only source used for the school -> district mapping.
val year2022Df = spark.read
    .option("header", "true")
    .option("multiLine", "true")
    .option("inferSchema", "true")
    .option("escape", "\"")
    .csv(s"${root_folder}2021-2022-schools-funding.csv")

// Lookup (District_BEDS_Code, School_BEDS_Code) joined onto every cleaned year.
// distinct() guards the join: a repeated (district, school) pair in the source file would
// otherwise duplicate that school's rows in every year it is joined to.
val districtCodeDf = year2022Df
    .select("School District BEDS Code", "School BEDS Code")
    .withColumnRenamed("School BEDS Code", "School_BEDS_Code")
    .withColumnRenamed("School District BEDS Code", "District_BEDS_Code")
    .distinct()

// ---------------------------------------------------------------------------
// Cleans every yearly school-funding CSV into one common schema and writes one
// parquet file per school year (outputpathstart + "<start>-<end>" + outputpathend).
// ---------------------------------------------------------------------------

/** Reads the raw funding CSV of the school year that starts in `startYear`. */
def readRawFunding(startYear: Int): org.apache.spark.sql.DataFrame = {
    spark.read
        .option("header", "true")
        .option("multiLine", "true")
        .option("inferSchema", "true")
        .option("escape", "\"")
        .csv(s"${root_folder}${startYear}-${startYear + 1}-schools-funding.csv")
}

/** Normalizes headers of the newer files by stripping the "(Part X) " prefixes listed in `prefixes`. */
def normalizeNewFormat(raw: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // Rename these two Part D columns before the prefixes are stripped.
    // NOTE(review): presumably they would otherwise collide with same-named columns of another
    // part once the prefix is gone -- confirm against the CSV header.
    // (The original also tried to rename "(Part D) All Other" a second time, to "F1"; that call
    // could never match because the column had just been renamed, so it is dropped here.)
    val disambiguated = raw
        .withColumnRenamed("(Part D) All Other", "Other")
        .withColumnRenamed("(Part D) Federal Funding", "F2")

    prefixes.foldLeft(disambiguated) { (acc, part) =>
        val marker = "(Part " + part + ") "
        acc.columns.foldLeft(acc) { (df, name) =>
            if (name.startsWith(marker)) df.withColumnRenamed(name, name.substring(marker.length)) else df
        }
    }
}

/** Normalizes headers of the older files: trims padding, aligns names with the newer files, drops "(blank)" rows. */
def normalizeOldFormat(raw: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // These headers differ from their siblings only by trailing spaces, so they must be renamed
    // BEFORE trimming.
    val disambiguated = raw
        .withColumnRenamed(" All Other  ", "Other")
        .withColumnRenamed(" Federal Funding  ", "F1")
        .withColumnRenamed(" Federal Funding   ", "F2")

    // Fold over the renamed frame. The original folded over `originalDf`, which silently threw
    // the three renames above away and left those columns untrimmed.
    val trimmed = disambiguated.columns.foldLeft(disambiguated) { (df, name) =>
        df.withColumnRenamed(name, name.trim)
    }

    trimmed
        .withColumnRenamed("BEDS Code", "School BEDS Code")
        .withColumnRenamed("Classroom Teachers", "Classroom Teacher Salaries")
        .withColumnRenamed("Grades K-12", "General Ed K-12")
        .filter(col("School BEDS Code") =!= "(blank)")
}

/**
 * Converts a formatted numeric column to decimal(20,2): removes commas, surrounding and inner
 * spaces and hyphens, and maps empty / null text to 0.
 *
 * NOTE(review): removing every "-" also strips the sign of negative numbers. Kept as in the
 * original; confirm that the source files never contain negative amounts.
 */
def cleanNumeric(raw: org.apache.spark.sql.Column): org.apache.spark.sql.Column = {
    val withoutCommas = regexp_replace(raw, "\\,", "")
    val withoutHyphens = regexp_replace(trim(withoutCommas), "-", "")
    val compact = regexp_replace(withoutHyphens, " ", "")
    when(compact =!= "", compact).otherwise(0).cast("decimal(20,2)")
}

for (cur <- totalyears) {
    val next = cur + 1

    val raw = readRawFunding(cur)
    val normalized =
        if (newformatyears.contains(cur)) normalizeNewFormat(raw) else normalizeOldFormat(raw)
    val withSchoolCode = normalized.filter(col("School BEDS Code").isNotNull)

    // "School Year" labels look like "<cur>-<two-digit next>" (e.g. "2017-18"); Year is the ending year.
    val schoolYearLabel = cur + "-" + (next - 2000).toString
    val withYear = withSchoolCode
        .withColumn("Year", regexp_replace(col("School Year"), schoolYearLabel, next.toString).cast("Int"))
        .withColumnRenamed("Total Funding Source by School", "Total Funding")
        .withColumnRenamed("Total School Funding per Pupil", "Total Funding per Pupil")

    // Identifier columns followed by the numeric columns, in the order of `colNames`.
    val idColumns = List("School District", "School BEDS Code", "School Name", "School Type")
    val selected = withYear.select("Year", (idColumns ++ colNames): _*)

    // One projection per numeric column instead of six.
    val casted = colNames.foldLeft(selected) { (df, name) =>
        df.withColumn(name, cleanNumeric(df(name)))
    }
    // Values that could not be parsed become null after the cast; replace them with 0.
    // The original called `na.fill(0)` but discarded the returned DataFrame, so nothing was filled.
    // The fill is limited to the cleaned numeric columns so that an unparsable Year stays null.
    val filteredDf = casted.na.fill(0L, colNames)

    filteredDf.printSchema

    // Parquet-friendly names: whitespace -> underscore.
    val underscored = filteredDf.columns.foldLeft(filteredDf) { (df, name) =>
        df.withColumnRenamed(name, name.replaceAll("\\s", "_"))
    }

    // Inner join: schools missing from the district lookup are dropped.
    val withDistrict = underscored.join(districtCodeDf, "School_BEDS_Code")

    val newDf = withDistrict
        // Fixed: the original added "Pre-K" (presumably a spending column -- it sits among the
        // program amounts in the column list) instead of "Pre-K_Enrollment", which corrupted
        // Total_Enrollment and both per-pupil ratios derived from it.
        .withColumn("Total_Enrollment", col("K-12_Enrollment") + col("Pre-K_Enrollment"))
        .withColumn("Total_Teachers",
            col("Classroom_Teachers_w/_0-3_Years_Experience") + col("Classroom_Teachers_w/_More_than_3_Years_Experience"))
        .withColumn("Teacher_per_Pupil", (col("Total_Teachers") / col("Total_Enrollment")).cast("decimal(20,2)"))
        .withColumn("Staff_per_Pupil", (col("Total_Staff") / col("Total_Enrollment")).cast("decimal(20,2)"))

    z.show(newDf.summary())

    val outputpath = s"${outputpathstart}${cur}-${next}${outputpathend}"
    newDf.write.mode("overwrite").parquet(outputpath)
}

In [1]:
// Stack the cleaned per-year parquet files into one DataFrame and persist it.
// The 2017-2018 file seeds the fold; every later year is appended with a positional union.
val years = 2018 to 2022
var totalDf = years.foldLeft(spark.read.parquet("schools-funding/2017-2018-schools-funding-clean.parquet")) {
    (stacked, startYear) =>
        val endYear = startYear + 1
        val yearlyPath = outputpathstart + startYear + "-" + endYear + outputpathend
        stacked.union(spark.read.parquet(yearlyPath))
}

// Quick sanity check of the combined schema and a few rows.
totalDf.printSchema
z.show(totalDf.limit(5))

totalDf.write
    .mode("overwrite")
    .parquet("schools-funding/2017-2023-schools-funding-clean.parquet")

In [2]:
Total Means & Standard Deviations (per Year, across all schools)

In [3]:
// Per-year mean and standard deviation for each headline metric of the combined data set.
var df = spark.read.parquet("schools-funding/2017-2023-schools-funding-clean.parquet")

/**
 * Groups `source` by "Year" and returns, for `metric`, the columns
 * "Average_<metric>" and "Standard_Deviation_<metric>", both cast to decimal(20,2).
 */
def yearlyMeanAndStddev(source: org.apache.spark.sql.DataFrame, metric: String): org.apache.spark.sql.DataFrame = {
    val average = mean(metric).cast("decimal(20,2)").alias("Average_" + metric)
    val deviation = stddev(metric).cast("decimal(20,2)").alias("Standard_Deviation_" + metric)
    source.groupBy("Year").agg(average, deviation)
}

val TotalFunding = yearlyMeanAndStddev(df, "Total_Funding")
val PupilTotalFunding = yearlyMeanAndStddev(df, "Total_Funding_per_Pupil")
val TotalStaff = yearlyMeanAndStddev(df, "Total_Staff")
val StateLocalFunding = yearlyMeanAndStddev(df, "State_&_Local_Funding")
val PupilStateLocalFunding = yearlyMeanAndStddev(df, "State_&_Local_Funding_per_Pupil")
val FederalFunding = yearlyMeanAndStddev(df, "Federal_Funding")
val PupilFederalFunding = yearlyMeanAndStddev(df, "Federal_Funding_per_Pupil")
val TotalEnrollment = yearlyMeanAndStddev(df, "Total_Enrollment")
val TeacherPerPupil = yearlyMeanAndStddev(df, "Teacher_per_Pupil")
val StaffPerPupil = yearlyMeanAndStddev(df, "Staff_per_Pupil")

// Render the tables in the same order as they were defined.
List(
    TotalFunding,
    PupilTotalFunding,
    TotalStaff,
    StateLocalFunding,
    PupilStateLocalFunding,
    FederalFunding,
    PupilFederalFunding,
    TotalEnrollment,
    TeacherPerPupil,
    StaffPerPupil
).foreach(table => z.show(table))

In [4]:
// Export the analysis subset of the combined data set as parquet and as CSV.
val processedColumns = List(
    "Year",
    "School_BEDS_Code",
    "District_BEDS_Code",
    "School_Type",
    "Total_Enrollment",
    "Total_Teachers",
    "Teacher_per_Pupil",
    "Total_Staff",
    "Staff_per_Pupil",
    "Total_Funding",
    "Total_Funding_per_Pupil",
    "Federal_Funding",
    "Federal_Funding_per_Pupil",
    "State_&_Local_Funding",
    "State_&_Local_Funding_per_Pupil")

var df = spark.read
    .parquet("schools-funding/2017-2023-schools-funding-clean.parquet")
    .select(processedColumns.head, processedColumns.tail: _*)

z.show(df.limit(5))

// Same rows written twice; the CSV copy uses the writer's default options.
val processedBase = "schools-funding/2018-2023-schools-funding-processed"
df.write.mode("overwrite").parquet(processedBase + ".parquet")
df.write.mode("overwrite").csv(processedBase + ".csv")


In [5]:
// Inspect the externally produced parquet file: preview a few rows, then count rows per Year.
var df = spark.read.parquet("/user/yx3494_nyu_edu/scr_data/funding_safety_nrc_inexp.parquet")

val preview = df.limit(5)
z.show(preview)

val rowsPerYear = df.groupBy("Year").count()
z.show(rowsPerYear)
