## Data Merge and validation

Here we will check and join the cleaned data from the 3 sources, then prepare the merged data for analysis.

In [1]:
// imports and input datasets (read from parquet)
import org.apache.spark.sql.DataFrame

// Input datasets, each read from a parquet path. They are declared as `var`
// because later cells may re-bind them (e.g. to change a column type).
var grad_rate_data: DataFrame =
  spark.read.parquet("scr_data/grad_rate_cleaned.parquet")
var nrc_data: DataFrame =
  spark.read.parquet("scr_data/nrc_cleaned.parquet")
var safety_data: DataFrame =
  spark.read.parquet("BDAD10/safety_dataset.parquet")
var funding_data: DataFrame =
  spark.read.parquet("BDAD10/2018-2023-schools-funding-processed.parquet")

### NR/C Data Validation

In [3]:
// NR/C dataset check: column types, a 5-row sample, and row counts per Year.
nrc_data.printSchema()
z.show(nrc_data.limit(5))
z.show(
  nrc_data
    .groupBy("Year")
    .count()
    .sort("Year")
)

### Safety Data Validation

In [5]:
// Safety dataset check: column types, a 5-row sample, and row counts per Year.
safety_data.printSchema()
z.show(safety_data.limit(5))
z.show(
  safety_data
    .groupBy("Year")
    .count()
    .sort("Year")
)

Fix the `School_BEDS_Code` type to string

In [7]:
// Re-bind safety_data with School_BEDS_Code cast to string, then print the
// schema so the new column type can be confirmed.
safety_data = safety_data.withColumn(
  "School_BEDS_Code",
  safety_data.col("School_BEDS_Code").cast("string")
)
safety_data.printSchema()

### Funding Data Validation

In [9]:
// Funding dataset check: column types, a 5-row sample, and row counts per Year.
funding_data.printSchema()
z.show(funding_data.limit(5))
z.show(
  funding_data
    .groupBy("Year")
    .count()
    .sort("Year")
)

### Graduation Rate Validation

In [11]:
// Graduation-rate dataset check: column types, a 5-row sample, and row counts per Year.
grad_rate_data.printSchema()
z.show(grad_rate_data.limit(5))
z.show(
  grad_rate_data
    .groupBy("Year")
    .count()
    .sort("Year")
)

In [12]:
// Right join on (School_BEDS_Code, Year): every funding row is kept, and the
// graduation-rate columns are null where no matching graduation record exists.
val checkDf = grad_rate_data.join(
  funding_data,
  Seq("School_BEDS_Code", "Year"),
  "right"
)

In [13]:
// Per Year: number of joined rows that have a graduation rate.
z.show(
  checkDf
    .where("Graduation_Rate is not null")
    .groupBy("Year")
    .count()
)

In [14]:
// Per School_Type: number of joined rows that have a graduation rate.
z.show(
  checkDf
    .where("Graduation_Rate is not null")
    .groupBy("School_Type")
    .count()
)

## Global Reference Table

This table lists the schools with their name, ID, district ID, district name and school type.

In [16]:
// Global school reference table: one row per School_BEDS_Code, carrying
// District_BEDS_Code and School_Type from the funding data plus District_Name,
// County_Name and School_Name from the safety data. The inner join keeps only
// schools that appear in both datasets.
//
// Each side is reduced to one row per school BEFORE the join. Both inputs also
// have a Year column, so joining on School_BEDS_Code alone would otherwise
// pair every funding row of a school with every safety row of that school,
// only for dropDuplicates to discard almost all of those rows afterwards.
// NOTE(review): dropDuplicates keeps an unspecified row per key (as it did
// when applied after the join), so if a school's attributes differ between
// rows, the surviving values are arbitrary — confirm that is acceptable.
val refDf = funding_data
  .select("School_BEDS_Code", "District_BEDS_Code", "School_Type")
  .dropDuplicates("School_BEDS_Code")
  .join(
    safety_data
      .select("District_Name", "School_BEDS_Code", "County_Name", "School_Name")
      .dropDuplicates("School_BEDS_Code"),
    Seq("School_BEDS_Code"),
    "inner"
  )
z.show(refDf.limit(10))
refDf.count

## Merge the funding + safety data

In [18]:
// Safety and funding records matched on (School_BEDS_Code, Year); rows without
// a counterpart on the other side are dropped by the inner join.
val funding_safety_df = safety_data.join(
  funding_data,
  Seq("School_BEDS_Code", "Year"),
  "inner"
)

In [19]:
// Inspect the funding + safety merge: schema, a 5-row sample, rows per Year
// and the total row count.
funding_safety_df.printSchema()
z.show(funding_safety_df.limit(5))
z.show(
  funding_safety_df
    .groupBy("Year")
    .count()
    .sort("Year")
)
funding_safety_df.count()

// Persist the merged data, replacing any previous output at this path.
funding_safety_df.write
  .mode("overwrite")
  .parquet("scr_data/funding_safety.parquet")

## Merge all the datasets except the graduation rate data into a combined dataframe

In [21]:
// Safety + NR/C + funding, all matched on (School_BEDS_Code, Year) with inner
// joins. School_Name and Total_Teachers are dropped from the NR/C side before
// joining.
val funding_safety_nrc_df = safety_data
  .join(
    nrc_data.drop("School_Name", "Total_Teachers"),
    Seq("School_BEDS_Code", "Year"),
    "inner"
  )
  .join(
    funding_data,
    Seq("School_BEDS_Code", "Year"),
    "inner"
  )

// Persist the combined data, replacing any previous output at this path.
funding_safety_nrc_df.write
  .mode("overwrite")
  .parquet("scr_data/funding_safety_nrc.parquet")

In [22]:
// Inspect the safety + NR/C + funding merge: schema, a 5-row sample, rows per
// Year and the total row count.
funding_safety_nrc_df.printSchema()
z.show(funding_safety_nrc_df.limit(5))
z.show(
  funding_safety_nrc_df
    .groupBy("Year")
    .count()
    .sort("Year")
)
funding_safety_nrc_df.count()

## Merge all the datasets

This dataset is for the analysis of high school performance, because the graduation rate is only available for high schools.

In [24]:
// Attach graduation rates to the combined safety/NR-C/funding data with an
// inner join on (School_BEDS_Code, Year). School_Name is dropped from the
// combined side before joining.
val full_data_df = grad_rate_data.join(
  funding_safety_nrc_df.drop("School_Name"),
  Seq("School_BEDS_Code", "Year"),
  "inner"
)

// Persist the full merge, replacing any previous output at this path.
full_data_df.write
  .mode("overwrite")
  .parquet("scr_data/funding_safety_nrc_gradRate.parquet")

In [25]:
// Inspect the full merge: schema, a 5-row sample, rows per Year and the total
// row count.
full_data_df.printSchema()
z.show(full_data_df.limit(5))
z.show(
  full_data_df
    .groupBy("Year")
    .count()
    .sort("Year")
)
full_data_df.count()

## Update the funding table with the global ref table to get full school info

In [27]:
// Enrich the funding rows with the reference-table columns (left join, so
// funding rows without a reference match are kept), then inner-join the NR/C
// data on (School_BEDS_Code, Year). School_Name and Total_Teachers are dropped
// from the NR/C side before joining.
val funding_refactor_df = funding_data
  .join(
    refDf,
    Seq("School_BEDS_Code", "District_BEDS_Code", "School_Type"),
    "left"
  )
  .join(
    nrc_data.drop("School_Name", "Total_Teachers"),
    Seq("School_BEDS_Code", "Year"),
    "inner"
  )

// Sample rows and schema of the enriched table.
z.show(funding_refactor_df.limit(10))
funding_refactor_df.printSchema()

// Row counts per Year.
z.show(
  funding_refactor_df
    .groupBy("Year")
    .count()
    .sort("Year")
)

// Compare the row count of the raw funding data with the enriched table.
val count_1 = funding_data.count()
val count_2 = funding_refactor_df.count()
println(s"Count 1: $count_1, Count 2: $count_2, Diff: ${count_1 - count_2}")

// Persist the enriched funding table, replacing any previous output at this path.
funding_refactor_df.write
  .mode("overwrite")
  .parquet("scr_data/funding_refact.parquet")