In [0]:
// Read the Parquet file at filePath into a DataFrame, preview it in the
// notebook, and expose it to Spark SQL as the temp view "all".
val filePath = "/user/yx3494_nyu_edu/scr_data/funding_safety.parquet"
val df = spark.read.format("parquet").load(filePath)

// Render the loaded rows as a notebook table.
z.show(df)

// SQL cells refer to this DataFrame through the view name.
df.createOrReplaceTempView("all")

In [1]:
// Count the distinct School_BEDS_Code values present in the "all" view.
// The view name is backtick-quoted because ALL is a reserved SQL keyword:
// left unquoted, the query fails to parse when Spark is configured to enforce
// ANSI reserved keywords. The quoted form parses in every mode.
val temp1 = spark.sql("""
    select count(distinct(School_BEDS_Code))
    from `all`
""")

// Display the single-row count result.
z.show(temp1)

In [2]:
// Names of the (up to ten) columns sitting at positions 5 through 14 of df;
// later cells add these columns together into Sum_Safety_Issues.
// NOTE(review): the selection is purely positional — confirm these positions
// still hold the intended columns whenever the Parquet schema changes.
val safetyIssueColumns = df.columns.drop(5).take(10)

In [3]:
// Append Sum_Safety_Issues: the row-wise total of every column named in
// safetyIssueColumns.
// NOTE(review): Spark's `+` yields null when an operand is null, so a row
// missing any one of the summed values gets a null total — confirm that is intended.
val fundingSafetyDF = {
  val issueColumns = safetyIssueColumns.map(col(_))
  df.withColumn("Sum_Safety_Issues", issueColumns.reduceLeft(_ + _))
}

// Preview the DataFrame with the new column.
z.show(fundingSafetyDF)

// Make the augmented data queryable from SQL.
fundingSafetyDF.createOrReplaceTempView("fundingSafety")

In [4]:
import org.apache.spark.sql.functions._

// Min-max normalize Total_Funding and Sum_Safety_Issues of fundingSafetyDF and
// report the correlation between the two normalized columns.

// Collect the extremes of both columns (cast to double) in one aggregation.
val stats = fundingSafetyDF
  .agg(
    min(col("Total_Funding").cast("double")).as("Total_Funding_min"),
    max(col("Total_Funding").cast("double")).as("Total_Funding_max"),
    min(col("Sum_Safety_Issues").cast("double")).as("Sum_Safety_Issues_min"),
    max(col("Sum_Safety_Issues").cast("double")).as("Sum_Safety_Issues_max")
  )
  .collect()
  .head

// Pull the four bounds out of the single result row as plain doubles.
val totalFundingMin = stats.getAs[Double]("Total_Funding_min")
val totalFundingMax = stats.getAs[Double]("Total_Funding_max")
val sumSafetyIssuesMin = stats.getAs[Double]("Sum_Safety_Issues_min")
val sumSafetyIssuesMax = stats.getAs[Double]("Sum_Safety_Issues_max")

// Rescale each column as (value - min) / (max - min).
// NOTE(review): if a column is constant, max - min is 0 and the divisor is
// zero — confirm how that case should be handled.
val normalizedData = {
  val funding = col("Total_Funding").cast("double")
  val safetyIssues = col("Sum_Safety_Issues").cast("double")

  fundingSafetyDF
    .withColumn(
      "Total_Funding_normalized",
      (funding - totalFundingMin) / (totalFundingMax - totalFundingMin))
    .withColumn(
      "Sum_Safety_Issues_normalized",
      (safetyIssues - sumSafetyIssuesMin) / (sumSafetyIssuesMax - sumSafetyIssuesMin))
}

// Correlation of the two normalized columns.
val correlation =
  normalizedData.stat.corr("Total_Funding_normalized", "Sum_Safety_Issues_normalized")

// Report the coefficient.
println(s"Correlation between Total_Funding and Sum_Safety_Issues: ${correlation}")

// Display the DataFrame including both normalized columns.
z.show(normalizedData)

In [5]:
// Read the second Parquet file (at filePath2) into a DataFrame, preview it in
// the notebook, and expose it to Spark SQL as the temp view "all2".
val filePath2 = "/user/yx3494_nyu_edu/scr_data/funding_safety_nrc_inexp_gradRate.parquet"
val df2 = spark.read.format("parquet").load(filePath2)

// Render the loaded rows as a notebook table.
z.show(df2)

// SQL cells refer to this DataFrame through the view name.
df2.createOrReplaceTempView("all2")

In [6]:
// Count the distinct School_BEDS_Code values present in the "all2" view.
val temp2 = {
  val distinctSchoolsQuery = """
    select count(distinct(School_BEDS_Code))
    from all2
"""
  spark.sql(distinctSchoolsQuery)
}

// Display the single-row count result.
z.show(temp2)

In [7]:
// Append Sum_Safety_Issues to df2: the row-wise total of the columns named in
// safetyIssueColumns.
// NOTE(review): safetyIssueColumns was derived from df's schema — this assumes
// df2 carries columns with the same names; confirm.
val fundingSafetyGradDF = {
  val totalSafetyIssues = safetyIssueColumns.map(col(_)).reduceLeft(_ + _)
  df2.withColumn("Sum_Safety_Issues", totalSafetyIssues)
}

// Preview the DataFrame with the new column.
z.show(fundingSafetyGradDF)

// Make the augmented data queryable from SQL.
fundingSafetyGradDF.createOrReplaceTempView("fundingSafetyGrad")

In [8]:
// Select every row of the fundingSafetyGrad view whose Graduation_Rate is below 40.
val temp3 = {
  val lowGraduationQuery = """
    select *
    from fundingSafetyGrad
    where Graduation_Rate < 40
"""
  spark.sql(lowGraduationQuery)
}

// Display the matching rows.
z.show(temp3)

In [9]:
import org.apache.spark.sql.functions._

// Min-max normalize Total_Funding and Sum_Safety_Issues of fundingSafetyGradDF
// and report the correlation between the two normalized columns.

// Collect the extremes of both columns (cast to double) in one aggregation.
val stats2 = fundingSafetyGradDF
  .agg(
    min(col("Total_Funding").cast("double")).as("Total_Funding_min"),
    max(col("Total_Funding").cast("double")).as("Total_Funding_max"),
    min(col("Sum_Safety_Issues").cast("double")).as("Sum_Safety_Issues_min"),
    max(col("Sum_Safety_Issues").cast("double")).as("Sum_Safety_Issues_max")
  )
  .collect()
  .head

// Pull the four bounds out of the single result row as plain doubles.
val totalFundingMin2 = stats2.getAs[Double]("Total_Funding_min")
val totalFundingMax2 = stats2.getAs[Double]("Total_Funding_max")
val sumSafetyIssuesMin2 = stats2.getAs[Double]("Sum_Safety_Issues_min")
val sumSafetyIssuesMax2 = stats2.getAs[Double]("Sum_Safety_Issues_max")

// Rescale each column as (value - min) / (max - min).
// NOTE(review): if a column is constant, max - min is 0 and the divisor is
// zero — confirm how that case should be handled.
val normalizedData2 = {
  val funding = col("Total_Funding").cast("double")
  val safetyIssues = col("Sum_Safety_Issues").cast("double")

  fundingSafetyGradDF
    .withColumn(
      "Total_Funding_normalized",
      (funding - totalFundingMin2) / (totalFundingMax2 - totalFundingMin2))
    .withColumn(
      "Sum_Safety_Issues_normalized",
      (safetyIssues - sumSafetyIssuesMin2) / (sumSafetyIssuesMax2 - sumSafetyIssuesMin2))
}

// Correlation of the two normalized columns.
val correlation2 =
  normalizedData2.stat.corr("Total_Funding_normalized", "Sum_Safety_Issues_normalized")

// Report the coefficient.
println(s"Correlation between Total_Funding and Sum_Safety_Issues: $correlation2")

// Display the DataFrame including both normalized columns.
z.show(normalizedData2)

In [10]:
import org.apache.spark.sql.functions._

// Min-max normalize Graduation_Rate and Sum_Safety_Issues of
// fundingSafetyGradDF and report the correlation between the normalized columns.
//
// The Sum_Safety_Issues bounds are computed here, in the same aggregation as
// the Graduation_Rate bounds, rather than reused from values defined by an
// earlier cell. This keeps the cell runnable on its own; the bounds are taken
// from the same DataFrame, so the results are unchanged.
val stats3 = fundingSafetyGradDF.agg(
  min(col("Graduation_Rate").cast("double")).alias("Graduation_Rate_min"),
  max(col("Graduation_Rate").cast("double")).alias("Graduation_Rate_max"),
  min(col("Sum_Safety_Issues").cast("double")).alias("Sum_Safety_Issues_min"),
  max(col("Sum_Safety_Issues").cast("double")).alias("Sum_Safety_Issues_max")
).first()

// Extract the bounds from the single result row as plain doubles.
val graduationRateMin = stats3.getAs[Double]("Graduation_Rate_min")
val graduationRateMax = stats3.getAs[Double]("Graduation_Rate_max")
val sumSafetyIssuesMin3 = stats3.getAs[Double]("Sum_Safety_Issues_min")
val sumSafetyIssuesMax3 = stats3.getAs[Double]("Sum_Safety_Issues_max")

// Rescale each column as (value - min) / (max - min).
// NOTE(review): if a column is constant, max - min is 0 and the divisor is
// zero — confirm how that case should be handled.
val normalizedData3 = fundingSafetyGradDF
  .withColumn("Graduation_Rate_normalized",
    (col("Graduation_Rate").cast("double") - lit(graduationRateMin)) / lit(graduationRateMax - graduationRateMin))
  .withColumn("Sum_Safety_Issues_normalized",
    (col("Sum_Safety_Issues").cast("double") - lit(sumSafetyIssuesMin3)) / lit(sumSafetyIssuesMax3 - sumSafetyIssuesMin3))

// Correlation of the two normalized columns.
val correlation3 = normalizedData3.stat.corr("Graduation_Rate_normalized", "Sum_Safety_Issues_normalized")

println(s"Correlation between Graduation_Rate and Sum_Safety_Issues: ${correlation3}")

// Display the DataFrame including both normalized columns.
z.show(normalizedData3)