# Data Source

The safety dataset is from [IRS SCHOOL SAFETY AND THE EDUCATIONAL CLIMATE](https://www.p12.nysed.gov/irs/school_safety/school_safety_data_reporting.html).


# Data Analysis on Safety Data

In [2]:
// Incident-count columns that every aggregation in this notebook iterates over.
// Each entry is used verbatim as a backtick-quoted column name in the SQL below,
// so the spelling (including spaces and hyphens) must be kept exactly as written.
val safetyIssueColumns: Seq[String] = Seq(
  "Homocide", "Sexual_Offense", "Assault", "Weapons_Possession",
  "Dignity Act-Excluding_Cyberbullying", "Dignity Act-Cyberbullying",
  "Bomb_Threat", "False_Alarm",
  "Drugs", "Alcohol"
)

In [3]:
// Parenthesised, single-quoted list of the County values treated as NYC; it is interpolated
// directly after `in` / `not in` in the SQL queries below.
// NOTE(review): assumes the County column uses these exact upper-case spellings - confirm against the data.
val nyccounties = Seq("QUEENS", "MANHATTAN", "BROOKLYN", "BRONX", "RICHMOND")
  .map(county => s"'$county'")
  .mkString("(", ", ", ")")

# Per District

## In NYC

In [6]:

import org.apache.spark.sql.DataFrame

// Per-county averages of every safety-issue column for the NYC counties, one row per
// (Year, County), stacked over the 2018-2023 files.
// Side effect: each year's cleaned CSV is registered as the temp view `safety<year>`;
// later paragraphs query those views by name.
// The yearly results are built with map + reduce instead of a mutable accumulator, which
// avoids running an `isEmpty` action on the growing union once per year.
val totalNYCCountyDF: DataFrame = (2018 to 2023).map { year =>
  val yearlySafetyDF = spark.read
    .option("multiLine", "true")
    .option("inferSchema", "true")
    .option("escape", "\"")
    .option("header", true)
    .csv(s"project/cleaned_data/safety$year.csv")

  yearlySafetyDF.createOrReplaceTempView(s"safety$year")

  // Spaces and hyphens in the source column names are replaced so the aliases are plain identifiers.
  val averages = safetyIssueColumns.map { issue =>
    val alias = issue.replace(" ", "_").replace("-", "_")
    s"avg(`$issue`) as Avg_$alias"
  }.mkString(", ")

  // Year is selected first in the query itself, so no positional column slicing is needed afterwards.
  spark.sql(s"""
    select $year as Year, `County`, $averages
    from safety$year
    where `County` in $nyccounties
    group by `County`
  """)
}.reduce(_ union _)

z.show(totalNYCCountyDF)

In [7]:
// Names of the averaged-issue columns: every column of totalNYCCountyDF except the keys Year and County.
val avgSafetyIssueColumns = totalNYCCountyDF.columns.filterNot(name => name == "Year" || name == "County")

In [8]:
totalNYCCountyDF.createOrReplaceTempView("totalNYCCounty")

// Year-over-year percentage change of each averaged issue, per NYC county.
// The view is inner-joined with itself so that `cur` is a given year and `prev` is the same
// county one year earlier; a year without a preceding year produces no output row.
// nullif(prev, 0) turns a zero previous-year average into NULL, so the change is reported as
// NULL instead of attempting a division by zero (which raises an error when ANSI mode is on).
val NYCcountyYoYDF = {
  val yoyColumns = avgSafetyIssueColumns.map { avgCol =>
    val name = avgCol.replace(" ", "_").replace("-", "_")
    s"round((cur.`$avgCol` - prev.`$avgCol`) / nullif(prev.`$avgCol`, 0) * 100, 2) AS `${name}_YoY (%)`"
  }.mkString(",\n    ")

  spark.sql(s"""
  select
    cur.County,
    cur.Year,
    $yoyColumns
  from
    totalNYCCounty cur
  join
    totalNYCCounty prev
  on
    cur.Year = prev.Year + 1
    and cur.County = prev.County
""")
}

z.show(NYCcountyYoYDF)


## Outside of NYC

In [10]:
// Per-county averages of every safety-issue column for the counties NOT in the NYC list,
// one row per (Year, County), stacked over 2018-2023.
// Reads the `safety<year>` temp views by name, so those views must already be registered.
// The yearly results are built with map + reduce instead of a mutable accumulator, which
// avoids running an `isEmpty` action on the growing union once per year.
val totalOutCountyDF: DataFrame = (2018 to 2023).map { year =>
  // Spaces and hyphens in the source column names are replaced so the aliases are plain identifiers.
  val averages = safetyIssueColumns.map { issue =>
    val alias = issue.replace(" ", "_").replace("-", "_")
    s"avg(`$issue`) as Avg_$alias"
  }.mkString(", ")

  // Year is selected first in the query itself, so no positional column slicing is needed afterwards.
  spark.sql(s"""
    select $year as Year, `County`, $averages
    from safety$year
    where `County` not in $nyccounties
    group by `County`
  """)
}.reduce(_ union _)

z.show(totalOutCountyDF)

In [11]:
totalOutCountyDF.createOrReplaceTempView("totalOutCounty")

// Year-over-year percentage change of each averaged issue, per county outside the NYC list.
// The view is inner-joined with itself so that `cur` is a given year and `prev` is the same
// county one year earlier; a year without a preceding year produces no output row.
// nullif(prev, 0) turns a zero previous-year average into NULL, so the change is reported as
// NULL instead of attempting a division by zero (which raises an error when ANSI mode is on).
val outCountyYoYDF = {
  val yoyColumns = avgSafetyIssueColumns.map { avgCol =>
    val name = avgCol.replace(" ", "_").replace("-", "_")
    s"round((cur.`$avgCol` - prev.`$avgCol`) / nullif(prev.`$avgCol`, 0) * 100, 2) AS `${name}_YoY (%)`"
  }.mkString(",\n    ")

  spark.sql(s"""
  select
    cur.County,
    cur.Year,
    $yoyColumns
  from
    totalOutCounty cur
  join
    totalOutCounty prev
  on
    cur.Year = prev.Year + 1
    and cur.County = prev.County
""")
}

z.show(outCountyYoYDF)

# Per School

In [13]:
// Stack the per-year `safety<year>` temp views (2018 through 2023) into a single DataFrame,
// tagging every row with the Year it came from, and expose the result as the temp view `all`.
val allDF = spark.sql(
  (2018 to 2023)
    .map(year => s"  select * , $year as Year from safety$year")
    .mkString("\n", "\n  union all\n", "\n")
)

allDF.createOrReplaceTempView("all")

z.show(allDF)

## In NYC

In [15]:
// Per-school (BEDS_Code, County) average of each safety-issue column over all years in the
// `all` view, restricted to the NYC counties, plus Sum_Avg_Safety_Issues: the row-wise total
// of the averaged-issue columns.
val NYCschoolDF = {
  val averagedIssues = safetyIssueColumns
    .map { issue =>
      val alias = issue.replace(" ", "_").replace("-", "_")
      s"avg(`$issue`) as Avg_$alias"
    }
    .mkString(", ")

  // Column expression adding up every Avg_* column of a row.
  val sumOfAverages = avgSafetyIssueColumns
    .map(name => col(name))
    .reduce((left, right) => left + right)

  spark.sql(s"""
    select BEDS_Code, County, $averagedIssues
    from all
    where County in $nyccounties
    group by BEDS_Code, County
""").withColumn("Sum_Avg_Safety_Issues", sumOfAverages)
}

z.show(NYCschoolDF)

In [16]:
NYCschoolDF.createOrReplaceTempView("NYCschool")

// The ten NYC schools with the largest Sum_Avg_Safety_Issues, highest first.
val topSchoolDF = NYCschoolDF
  .orderBy(NYCschoolDF("Sum_Avg_Safety_Issues").desc)
  .limit(10)

z.show(topSchoolDF)

## Outside of NYC

In [18]:
// Per-school (BEDS_Code, County) average of each safety-issue column over all years in the
// `all` view, restricted to counties outside the NYC list, plus Sum_Avg_Safety_Issues: the
// row-wise total of the averaged-issue columns.
val outSchoolDF = {
  val averagedIssues = safetyIssueColumns
    .map { issue =>
      val alias = issue.replace(" ", "_").replace("-", "_")
      s"avg(`$issue`) as Avg_$alias"
    }
    .mkString(", ")

  // Column expression adding up every Avg_* column of a row.
  val sumOfAverages = avgSafetyIssueColumns
    .map(name => col(name))
    .reduce((left, right) => left + right)

  spark.sql(s"""
    select BEDS_Code, County, $averagedIssues
    from all
    where County not in $nyccounties
    group by BEDS_Code, County
""").withColumn("Sum_Avg_Safety_Issues", sumOfAverages)
}

z.show(outSchoolDF)

In [19]:
outSchoolDF.createOrReplaceTempView("outSchool")

// The ten schools outside the NYC list with the largest Sum_Avg_Safety_Issues, highest first.
val topOutSchoolDF = outSchoolDF
  .orderBy(outSchoolDF("Sum_Avg_Safety_Issues").desc)
  .limit(10)

z.show(topOutSchoolDF)

# Per Safety Issue

## In NYC

In [22]:
// Average number of incidents of each issue type per row of the NYC subset, one output row
// per year (2018-2023). Reads the `safety<year>` temp views, which must already be registered.
// The divisor is count(*) evaluated by the same filtered query, so it is the number of rows
// for that year AND that region. The previous divisor came from a `safetydf` value that is
// local to the loading loop and therefore neither in scope here nor specific to year/region.
// NOTE(review): "per school" holds only if each safety<year> view has one row per school - confirm.
val avgNYCSafetyIssueDF: DataFrame = (2018 to 2023).map { year =>
  // Spaces and hyphens in the source column names are replaced so the aliases are plain identifiers.
  val perRowAverages = safetyIssueColumns.map { issue =>
    val alias = issue.replace(" ", "_").replace("-", "_")
    s"sum(`$issue`) / count(*) as Avg_$alias"
  }.mkString(", ")

  // Year is selected first in the query itself, so no positional column slicing is needed afterwards.
  spark.sql(s"""
    select $year as Year, $perRowAverages
    from safety$year
    where County in $nyccounties
  """)
}.reduce(_ union _)

z.show(avgNYCSafetyIssueDF)

In [23]:
avgNYCSafetyIssueDF.createOrReplaceTempView("avg_nyc_safety_issues")

// Adds Max_Safety_Issue: the name of the averaged-issue column holding the largest value in
// each row. The CASE branches are generated from avgSafetyIssueColumns (the same list that
// feeds greatest(...)), so the compared columns and the reported names cannot drift apart.
// Ties resolve to the first matching column in list order; 'Unknown' is produced when no
// column compares equal to the greatest value.
val maxNYCSafetyIssuesDF = {
  val greatestExpr = avgSafetyIssueColumns
    .map(name => s"`$name`")
    .mkString("greatest(", ", ", ")")

  val branches = avgSafetyIssueColumns
    .map(name => s"when $greatestExpr = `$name` then '$name'")
    .mkString("\n        ")

  spark.sql(s"""
    select *,
      case
        $branches
        else 'Unknown'
      end as Max_Safety_Issue
    from avg_nyc_safety_issues
  """)
}

z.show(maxNYCSafetyIssuesDF)


## Outside of NYC

In [25]:
// Average number of incidents of each issue type per row of the non-NYC subset, one output
// row per year (2018-2023). Reads the `safety<year>` temp views, which must already be registered.
// The divisor is count(*) evaluated by the same filtered query, so it is the number of rows
// for that year AND that region. The previous divisor came from a `safetydf` value that is
// local to the loading loop and therefore neither in scope here nor specific to year/region.
// NOTE(review): "per school" holds only if each safety<year> view has one row per school - confirm.
val avgOutSafetyIssueDF: DataFrame = (2018 to 2023).map { year =>
  // Spaces and hyphens in the source column names are replaced so the aliases are plain identifiers.
  val perRowAverages = safetyIssueColumns.map { issue =>
    val alias = issue.replace(" ", "_").replace("-", "_")
    s"sum(`$issue`) / count(*) as Avg_$alias"
  }.mkString(", ")

  // Year is selected first in the query itself, so no positional column slicing is needed afterwards.
  spark.sql(s"""
    select $year as Year, $perRowAverages
    from safety$year
    where County not in $nyccounties
  """)
}.reduce(_ union _)

z.show(avgOutSafetyIssueDF)

In [26]:
avgOutSafetyIssueDF.createOrReplaceTempView("avg_out_safety_issues")

// Adds Max_Safety_Issue: the name of the averaged-issue column holding the largest value in
// each row. The CASE branches are generated from avgSafetyIssueColumns (the same list that
// feeds greatest(...)), so the compared columns and the reported names cannot drift apart.
// Ties resolve to the first matching column in list order; 'Unknown' is produced when no
// column compares equal to the greatest value.
val maxOutSafetyIssuesDF = {
  val greatestExpr = avgSafetyIssueColumns
    .map(name => s"`$name`")
    .mkString("greatest(", ", ", ")")

  val branches = avgSafetyIssueColumns
    .map(name => s"when $greatestExpr = `$name` then '$name'")
    .mkString("\n        ")

  spark.sql(s"""
    select *,
      case
        $branches
        else 'Unknown'
      end as Max_Safety_Issue
    from avg_out_safety_issues
  """)
}

z.show(maxOutSafetyIssuesDF)
