<h1>Spark basics practice</h1>
<li>The code in the next cell extracts covid data from New York State's covid repository</li>
<li>The extracted data is stored in an RDD of (String, String, Int, Int) tuples matching (date, borough, positive cases, tests), with one record per borough for each day since March 1st, 2020 (the data is ordered by time)</li>
<li>Use this RDD to answer the questions below</li>

In [1]:
// County names of the five NYC boroughs, spelled as they appear in the query string
// ("New+York" is the form-encoded spelling of "New York").
val counties = Array("New+York", "Bronx", "Kings", "Queens", "Richmond")
val base_url = "https://health.data.ny.gov/resource/xdss-u53e.json?County="
// One request URL per county.
// NOTE(review): no explicit row limit is sent; if the endpoint caps the number of rows per
// response, longer histories would be truncated silently -- confirm against the API docs.
val urls = counties.map(a => base_url + a)
// Download each county's JSON document as one string. The source is closed in a finally
// block so the underlying connection is released even when the read fails.
val results = urls.map { u =>
  val src = scala.io.Source.fromURL(u)
  try src.mkString finally src.close()
}
// Parse the JSON documents and keep (date, borough, positive cases, tests) per record.
// Fields are looked up by name instead of by position, so the extraction keeps working if
// the inferred schema gains or reorders columns.
val data_rdd = spark.read.json(sc.parallelize(results)).rdd.map { r =>
  def text(name: String): String = r.get(r.fieldIndex(name)).toString
  (
    text("test_date").slice(0, 10), // keep only the yyyy-MM-dd part of the timestamp
    text("county"),
    text("new_positives").toInt,
    text("total_number_of_tests").toInt
  )
}

Intitializing Scala interpreter ...

Spark Web UI available at http://localhost:4041
SparkContext available as 'sc' (version = 3.2.0, master = local[*], app id = local-1646000884172)
SparkSession available as 'spark'


counties: Array[String] = Array(New+York, Bronx, Kings, Queens, Richmond)
base_url: String = https://health.data.ny.gov/resource/xdss-u53e.json?County=
urls: Array[String] = Array(https://health.data.ny.gov/resource/xdss-u53e.json?County=New+York, https://health.data.ny.gov/resource/xdss-u53e.json?County=Bronx, https://health.data.ny.gov/resource/xdss-u53e.json?County=Kings, https://health.data.ny.gov/resource/xdss-u53e.json?County=Queens, https://health.data.ny.gov/resource/xdss-u53e.json?County=Richmond)
results: Array[String] =
Array("[{"test_date":"2020-03-01T00:00:00.000","county":"New York","new_positives":"0","cumulative_number_of_positives":"0","total_number_of_tests":"0","cumulative_number_of_tests":"0"}
,{"test_date":"2020-03-02T00:00:00.000","county":"New York","new_positives...


In [2]:
// Evaluate the bare name so the REPL echoes the RDD's element type.
data_rdd

res0: org.apache.spark.rdd.RDD[(String, String, Int, Int)] = MapPartitionsRDD[13] at map at <console>:28


In [3]:
// Fetch the first three records as a quick check of the (date, borough, cases, tests) layout.
data_rdd.take(3)

res1: Array[(String, String, Int, Int)] = Array((2020-03-01,New York,0,0), (2020-03-02,New York,0,0), (2020-03-03,New York,0,3))


<h1>Question 1</h1>
<li>Using <span style="color:blue">reduce</span> calculate the total number of cases and total number of tests in New York City</li>

In [4]:
// City-wide totals: drop date and borough, then add the (cases, tests) pairs component-wise.
val total_cases_tests = data_rdd
  .map { case (_, _, cases, tests) => (cases, tests) }
  .reduce { case ((c1, t1), (c2, t2)) => (c1 + c2, t1 + t2) }
println(s"NYC total: ${total_cases_tests._1} positive cases from ${total_cases_tests._2} tests")


NYC total: 2270783 positive cases from 48013596 tests


total_cases_tests: (Int, Int) = (2270783,48013596)


<h1>Question 2</h1>
Using <span style="color:blue">reduceByKey</span> calculate the number of cases and total number of tests by borough

In [5]:
// Key every record by borough and add the (positive cases, tests) pairs component-wise.
val cases_tests_by_borough = data_rdd.map(x => (x._2, (x._3, x._4))).reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
// collect() brings the per-borough totals to the driver before printing. Calling foreach
// directly on the RDD runs println on the executors, so on a cluster the lines would end up
// in executor logs instead of the notebook output.
cases_tests_by_borough.collect().foreach { case (borough, (cases, tests)) =>
  println(s"$borough: $cases positive cases from $tests tests")
}

Richmond: 161960 positive cases from 3043718 tests
Kings: 680089 positive cases from 14553145 tests
New York: 401978 positive cases from 11563532 tests
Bronx: 392556 positive cases from 7089709 tests
Queens: 634200 positive cases from 11763492 tests


cases_tests_by_borough: org.apache.spark.rdd.RDD[(String, (Int, Int))] = ShuffledRDD[16] at reduceByKey at <console>:24


<h1>Question 3</h1>
Using <span style="color:blue">reduceByKey</span> and <a href="http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html#sortBy">sortBy</a>, calculate the number of cases and number of tests by month, and return an RDD of (month, (cases, tests)) pairs sorted by the number of cases

In [6]:
// Totals per month, largest case count first.
// The key is the two-digit month taken from characters 5-6 of the yyyy-MM-dd date, so rows
// from different years that share a month number are added together.
val cases_tests_by_month = data_rdd
  .map { case (date, _, cases, tests) => (date.slice(5, 7), (cases, tests)) }
  .reduceByKey((left, right) => (left._1 + right._1, left._2 + right._2))
  .sortBy(_._2._1, ascending = false)
// Materialise the sorted result so the REPL displays it.
cases_tests_by_month.collect()

cases_tests_by_month: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[23] at sortBy at <console>:26
res4: Array[(String, (Int, Int))] = Array((01,(809738,7659628)), (12,(576298,7019201)), (04,(200562,3039529)), (03,(169018,3020578)), (02,(150012,4526606)), (11,(86162,4329754)), (08,(66212,3255962)), (09,(58244,3620381)), (05,(53328,2658541)), (10,(51554,4235084)), (07,(31769,2280865)), (06,(17886,2367467)))


<h1>Question 4</h1>
Return an RDD of (date, borough, positivity), where positivity is the percentage of tests that are positive. For this problem, you must use the Option type (Some/None) to handle the case where the divisor (the number of tests) is zero

In [28]:
// Positivity as a Double percentage. Floating-point division does not throw on a zero
// divisor: 0 cases / 0 tests gives NaN, and a positive case count / 0 tests gives Infinity.
val positivity_double = data_rdd.map { case (date, borough, cases, tests) =>
  (date, borough, 100.0 * cases / tests)
}

positivity_double: org.apache.spark.rdd.RDD[(String, String, Double)] = MapPartitionsRDD[36] at map at <console>:24


In [49]:
// Positivity as a whole-number percentage (integer division truncates).
// Days with zero tests have no defined positivity and are represented as None. The zero
// divisor is checked explicitly rather than relying on a caught exception.
val positivity_int = data_rdd.map { case (date, borough, cases, tests) =>
  (date, borough, if (tests == 0) None else Some(100 * cases / tests))
}
        

positivity_int: org.apache.spark.rdd.RDD[(String, String, Option[Int])] = MapPartitionsRDD[52] at map at <console>:24


<h1>Question 5</h1>
Return the tuple (date,borough,positivity) where the positivity was the highest (use <span style="color:blue">takeOrdered</span>)

In [58]:
// Record with the highest Double positivity.
// takeOrdered(1) returns the smallest element under the given ordering, so the ordering is
// reversed to obtain the maximum. Values from a zero divisor (NaN for 0/0, Infinity for
// n/0) are not real percentages; they are mapped to Double.MinValue so they can never be
// selected ahead of a finite positivity.
val highest_positive_double = positivity_double
            .takeOrdered(1)(Ordering[Double].reverse.on { x =>
              if (x._3.isNaN || x._3.isInfinite) Double.MinValue else x._3
            })(0)

highest_positive_double: (String, String, Double) = (2020-03-29,Queens,65.69190600522194)


In [59]:
// Record with the highest integer positivity.
// Undefined positivities (None) are replaced by Int.MinValue so they sort last, then the
// single largest record is taken using a descending ordering on the percentage.
val highest_positive_int = positivity_int
  .map { case (date, borough, pct) => (date, borough, pct.getOrElse(Int.MinValue)) }
  .takeOrdered(1)(Ordering.by[(String, String, Int), Int](_._3).reverse)(0)

highest_positive_int: (String, String, Int) = (2020-03-29,Queens,65)
