In [0]:
// Load the funding_safety parquet dataset. `df` is a var because a later cell
// reassigns it with derived columns.
var df = spark.read
  .format("parquet")
  .load("/user/yx3494_nyu_edu/scr_data/funding_safety.parquet")

In [1]:
// Render a 10-row sample in the notebook, then print the column names and types.
z.show(df.limit(10))
df.printSchema()

In [2]:
// Add two derived columns to `df`:
//   total_safety_incidents - sum of every incident-category column, with a null count treated as 0
//   incidents_per_student  - total_safety_incidents divided by Total_Enrollment
df = {
  // Category columns are summed left to right in this order.
  val incidentCategories = Seq(
    "Homocide",
    "Sexual_Offense",
    "Assault",
    "Weapons_Possession",
    "Dignity_Act_Excluding_Cyberbullying",
    "Dignity_Act_Cyberbullying",
    "Bomb_Threat",
    "False_Alarm",
    "Drugs",
    "Alcohol"
  )
  val totalIncidents = incidentCategories
    .map(name => coalesce(col(name), lit(0)))
    .reduce(_ + _)

  df
    .withColumn("total_safety_incidents", totalIncidents)
    .withColumn("incidents_per_student", col("total_safety_incidents") / col("Total_Enrollment"))
}


## Correlation Inspection

### Global Level

In [5]:
// Pearson correlation for every ordered pair of distinct metric columns.
// Result: Array of (column1, column2, correlation), in the same order as a nested
// loop over `cols` (outer = column1, inner = column2).
val cols = Array("total_safety_incidents", "Total_Enrollment", "Total_Teachers", "Total_Staff", "Total_Funding", "Staff_per_Pupil", "Total_Funding_per_Pupil", "incidents_per_student")
val global_correlations = {
  val pairs = for {
    col1 <- cols
    col2 <- cols if col2 != col1
  } yield (col1, col2)

  // Evaluate all corr() aggregates in ONE select so Spark runs a single job over `df`
  // instead of one job per pair (8 * 7 = 56 jobs).
  val corrRow = df.select(pairs.map { case (col1, col2) => corr(col1, col2) }: _*).first()

  pairs.zipWithIndex.map { case ((col1, col2), i) =>
    // corr() can come back as SQL NULL (e.g. no usable rows); surface that explicitly as NaN
    // instead of relying on a null silently unboxing to 0.0.
    val value = if (corrRow.isNullAt(i)) Double.NaN else corrRow.getDouble(i)
    (col1, col2, value)
  }
}

In [6]:
// Tabulate the global correlation triples as (info1, info2, correlation),
// substituting 0.0 wherever the correlation is NaN.
val corrDF = {
  val rows = for ((left, right, value) <- global_correlations.toSeq)
    yield (left, right, if (value.isNaN) 0.0 else value)
  rows.toDF("info1", "info2", "correlation")
}

z.show(corrDF)

### County Level

In [8]:
// One row per County_Name: mean per-school ratios plus county-wide totals.
// count() only counts rows whose School_BEDS_Code is non-null.
val countyMetrics = {
  val byCounty = df.groupBy(col("County_Name"))
  byCounty.agg(
    avg(col("incidents_per_student")).as("avg_incidents_per_student"),
    avg(col("Total_Funding_per_Pupil")).as("avg_funding_per_student"),
    avg(col("Teacher_per_Pupil")).as("avg_teacher_ratio"),
    avg(col("Staff_per_Pupil")).as("avg_staff_ratio"),
    sum(col("Total_Enrollment")).as("total_county_enrollment"),
    sum(col("total_safety_incidents")).as("total_county_incidents"),
    count(col("School_BEDS_Code")).as("total_county_schools")
  )
}

In [9]:
// Pearson correlation between every ordered pair of distinct county-level metrics
// (all columns of countyMetrics except the County_Name key).
// Result: Array of (column1, column2, correlation), outer loop = column1.
val columnsToCorrelate = countyMetrics.drop("County_Name").columns
val correlations = {
  val pairs = for {
    col1 <- columnsToCorrelate
    col2 <- columnsToCorrelate if col2 != col1
  } yield (col1, col2)

  // A single select evaluates every corr() at once, so the county groupBy behind
  // countyMetrics is executed once rather than once per pair.
  val corrRow = countyMetrics
    .select(pairs.map { case (col1, col2) => corr(col1, col2) }: _*)
    .first()

  pairs.zipWithIndex.map { case ((col1, col2), i) =>
    // corr() can come back as SQL NULL (e.g. no usable rows); surface that explicitly as NaN
    // instead of relying on a null silently unboxing to 0.0.
    val value = if (corrRow.isNullAt(i)) Double.NaN else corrRow.getDouble(i)
    (col1, col2, value)
  }
}

In [10]:
// Print the per-county metrics to stdout as a text table.
countyMetrics.show()

In [11]:
// Render the school-level DataFrame in the notebook.
z.show(df)

In [12]:
// Build a displayable DataFrame (info1, info2, correlation) from the county-level
// correlation triples; a NaN correlation is shown as 0.0.
val corrDF = correlations
  .toSeq
  .map { triple =>
    val (first, second, value) = triple
    val shown = if (value.isNaN) 0.0 else value
    (first, second, shown)
  }
  .toDF("info1", "info2", "correlation")

z.show(corrDF)

## Linear Regression Prediction on incidents_per_student

In [14]:
// Prepare the regression inputs: cast the two features and the label to double,
// then discard any row where one of those three columns is null.
var df_dropped = Seq("Total_Funding_per_Pupil", "Staff_per_Pupil", "incidents_per_student")
  .foldLeft(df) { (acc, name) => acc.withColumn(name, col(name).cast("double")) }

df_dropped = df_dropped.na.drop(
  "any",
  Seq("Total_Funding_per_Pupil", "Staff_per_Pupil", "incidents_per_student")
)

In [15]:
// Min-max normalization of the label: (x - min) / (max - min), using the min and max
// of incidents_per_student over df_dropped.
import org.apache.spark.sql.functions._

// A global aggregation always yields exactly one row.
val stats = df_dropped.agg(
  min(col("incidents_per_student")).alias("incidents_per_student_min"),
  max(col("incidents_per_student")).alias("incidents_per_student_max")
).first()

val incidents_min = stats.getAs[Double]("incidents_per_student_min")
val incidents_max = stats.getAs[Double]("incidents_per_student_max")

val df_normalized = {
  val incidentsRange = incidents_max - incidents_min
  // When every label is identical (or there are no rows) the range is 0; dividing by it
  // would turn the whole label column into null, so emit a constant 0.0 instead.
  val normalized =
    if (incidentsRange == 0.0) lit(0.0)
    else (col("incidents_per_student").cast("double") - lit(incidents_min)) / lit(incidentsRange)

  df_dropped.withColumn("incidents_per_student_normalized", normalized)
}

In [16]:
// Inspect the rows that now carry the incidents_per_student_normalized column.
z.show(df_normalized)

In [17]:
// train test split: 80/20, seeded so the split is reproducible
val Array(trainDF, testDF) = df_normalized.randomSplit(Array(.8, .2), seed=42)

// feature columns
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

val categoricalCols = Array("District_Name", "County_Name")
val indexOutputCols = categoricalCols.map(_ + "_index")
val oheOutputCols = categoricalCols.map(_ + "_OHE")

// "keep" (rather than "skip") retains rows whose district/county was not seen while
// fitting: "skip" silently removes those rows at transform time, so the test metrics
// would be computed on a filtered subset of testDF.
val stringIndexer = new StringIndexer()
  .setInputCols(categoricalCols)
  .setOutputCols(indexOutputCols)
  .setHandleInvalid("keep")

// The encoder must also accept the extra "unseen" index produced by the indexer;
// with its default setting it would raise an error on that index.
val oheEncoder = new OneHotEncoder()
  .setInputCols(indexOutputCols)
  .setOutputCols(oheOutputCols)
  .setHandleInvalid("keep")

// Final feature vector = one-hot encoded categoricals followed by the numeric ratios.
val numericCols = Array("Total_Funding_per_Pupil", "Staff_per_Pupil")
val assemblerInputs = oheOutputCols ++ numericCols

val vecAssembler = new VectorAssembler()
  .setInputCols(assemblerInputs)
  .setOutputCol("features")
  
// linear regression on the min-max normalized label
import org.apache.spark.ml.regression.LinearRegression
val lr = new LinearRegression()
  .setLabelCol("incidents_per_student_normalized")
  .setFeaturesCol("features")

// prediction: fit every stage on the training split only, then score the test split
import org.apache.spark.ml.Pipeline
val pipeline = new Pipeline()
  .setStages(Array(stringIndexer, oheEncoder, vecAssembler, lr))

val pipelineModel = pipeline.fit(trainDF)
val predDF = pipelineModel.transform(testDF)
z.show(predDF.select("features", "incidents_per_student_normalized", "prediction"))

In [18]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

// Score the test-set predictions against the normalized label.
// The same evaluator instance is reused for both metrics: it starts as RMSE and is
// switched to R2 afterwards, so it is left configured for "r2".
val regressionEvaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setLabelCol("incidents_per_student_normalized")
  .setPredictionCol("prediction")

val rmse = regressionEvaluator.evaluate(predDF)
println("RMSE is %1.2f".format(rmse))

val r2 = {
  regressionEvaluator.setMetricName("r2")
  regressionEvaluator.evaluate(predDF)
}
println("R2 is %1.2f".format(r2))

### Compute Incident Index 

Here we use (total incident count / enrolled students) as the index.

In [20]:
// Per School_Type: mean incident index and mean funding per pupil.
// "Incidents" averages incidents_per_student (incidents / enrolled students), the index
// this section is defined on, rather than the raw incident count, so that school types
// with larger enrollments are not ranked higher purely because of their size.
val schoolTypeAnalysis = df
  .groupBy("School_Type")
  .agg(
    avg("incidents_per_student").alias("Incidents"),
    avg("Total_Funding_per_Pupil").alias("Avg Funding")
  )

In [21]:
// School types ranked from highest to lowest "Incidents".
z.show(schoolTypeAnalysis.sort(col("Incidents").desc))

In [22]:
// School types ranked from highest to lowest "Avg Funding".
z.show(schoolTypeAnalysis.sort(col("Avg Funding").desc))

### Area Analysis

In [24]:
// Area analysis: per county, the summed Total_Funding and the mean Total_Funding_per_Pupil.
// The aggregates are left unaliased, so the result columns carry Spark's generated names
// "sum(Total_Funding)" and "avg(Total_Funding_per_Pupil)".
val countyTotalFundingAnaluysis = {
  val byCounty = df.groupBy(col("County_Name"))
  byCounty.agg(
    sum(col("Total_Funding")),
    avg(col("Total_Funding_per_Pupil"))
  )
}

In [25]:
// Counties ranked by summed funding, largest first.
z.show(countyTotalFundingAnaluysis.sort(col("sum(Total_Funding)").desc))

In [26]:
// Counties ranked by mean per-pupil funding, largest first.
z.show(countyTotalFundingAnaluysis.sort(col("avg(Total_Funding_per_Pupil)").desc))

In [27]:
// Per county: mean of the per-school incident index and the summed incident count.
val countySafetyAnalysis = {
  val byCounty = df.groupBy(col("County_Name"))
  byCounty.agg(
    avg(col("incidents_per_student")).as("avg_incidents_per_student"),
    sum(col("total_safety_incidents")).as("total_incidents")
  )
}

In [28]:
// Counties ranked by mean incidents per student, highest first.
z.show(countySafetyAnalysis.sort(col("avg_incidents_per_student").desc))

In [29]:
// Counties ranked by summed incident count, highest first.
z.show(countySafetyAnalysis.sort(col("total_incidents").desc))

## Long Term Trend Analysis

In [31]:
import org.apache.spark.sql.expressions.Window

// Year-over-year changes per school. Rows are ordered by Year within each
// School_BEDS_Code; the first year of every school has no predecessor, so its prev_*,
// *_change and *_pct columns are null.
val windowBySchool = Window.partitionBy("School_BEDS_Code").orderBy("Year")
val schoolChanges = df
  // incidents_per_student is a ratio (incidents / enrollment), so the lagged value must stay
  // a double: casting it to int would truncate it toward zero before the change is computed.
  .withColumn("prev_incidents", lag("incidents_per_student", 1).over(windowBySchool).cast("double"))
  .withColumn("prev_funding", lag("Total_Funding_per_Pupil", 1).over(windowBySchool).cast("double"))
  .withColumn("incident_change", col("incidents_per_student") - col("prev_incidents"))
  .withColumn("funding_change", col("Total_Funding_per_Pupil") - col("prev_funding"))
  // A previous value of exactly 0 has no defined percentage change; it is reported as 0.
  .withColumn("incident_change_pct", when(col("prev_incidents") === 0, 0.0).otherwise(col("incident_change") / col("prev_incidents") * 100))
  .withColumn("funding_change_pct", col("funding_change") / col("prev_funding") * 100)

In [32]:
// Collapse the yearly rows to one row per school (keyed by code, name, district, county):
// first/last year, min/max/mean of the incident index and of per-pupil funding, and the
// sum of the absolute year-over-year changes (total movement, regardless of direction).
val overallChanges = {
  val perSchool = schoolChanges.groupBy(
    col("School_BEDS_Code"),
    col("School_Name"),
    col("District_Name"),
    col("County_Name")
  )
  perSchool.agg(
    max(col("Year")).as("last_year"),
    min(col("Year")).as("first_year"),
    max(col("incidents_per_student")).as("max_incidents"),
    min(col("incidents_per_student")).as("min_incidents"),
    max(col("Total_Funding_per_Pupil")).as("max_funding"),
    min(col("Total_Funding_per_Pupil")).as("min_funding"),
    avg(col("incidents_per_student")).as("avg_incidents"),
    avg(col("Total_Funding_per_Pupil")).as("avg_funding"),
    sum(abs(col("incident_change"))).as("total_incident_change"),
    sum(abs(col("funding_change"))).as("total_funding_change")
  )
}

In [33]:
// The 10 schools with the largest cumulative absolute change in incidents per student.
val topIncidentChangeSchools = overallChanges
  .sort(col("total_incident_change").desc)
  .limit(10)
  .select(
    "School_Name",
    "District_Name",
    "County_Name",
    "total_incident_change",
    "total_funding_change",
    "avg_incidents",
    "avg_funding"
  )
z.show(topIncidentChangeSchools)

In [34]:
// The 10 schools with the largest cumulative absolute change in funding per pupil.
val topFundingChangeSchools = overallChanges
  .sort(col("total_funding_change").desc)
  .limit(10)
  .select(
    "School_Name",
    "District_Name",
    "County_Name",
    "total_incident_change",
    "total_funding_change",
    "avg_incidents",
    "avg_funding"
  )
z.show(topFundingChangeSchools)

In [35]:
// Schools with a strictly positive mean incident index, lowest first.
// Schools whose mean is 0 (or null) are excluded by the filter.
val lowestIncidentSchools = overallChanges
  .filter(col("avg_incidents") > 0)
  .sort(col("avg_incidents").asc)
  .select("School_Name", "District_Name", "County_Name", "avg_incidents")

// Show only the first 20 rows of that ranking.
z.show(lowestIncidentSchools.limit(20))