In [1]:

val sampleDF = spark
 .read
 .option("samplingRatio", 0.001)
 .option("header", true)
 .csv("""C:/Users/alejandro.perez/Documents/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv""")


Intitializing Scala interpreter ...

Spark Web UI available at http://L2202039.bosonit.local:4040
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1649147642249)
SparkSession available as 'spark'


sampleDF: org.apache.spark.sql.DataFrame = [CallNumber: string, UnitID: string ... 26 more fields]


In [2]:
// In Scala it would be similar
import org.apache.spark.sql.types._
val fireSchema = StructType(Array(StructField("CallNumber", IntegerType, true),
 StructField("UnitID", StringType, true),
 StructField("IncidentNumber", IntegerType, true),
 StructField("CallType", StringType, true), 
 StructField("CallDate", StringType, true), 
 StructField("WatchDate", StringType, true),
 StructField("CallFinalDisposition", StringType, true),
 StructField("AvailableDtTm", StringType, true),
 StructField("Address", StringType, true), 
 StructField("City", StringType, true), 
 StructField("Zipcode", IntegerType, true), 
 StructField("Battalion", StringType, true), 
 StructField("StationArea", StringType, true), 
 StructField("Box", StringType, true), 
 StructField("OriginalPriority", StringType, true), 
 StructField("Priority", StringType, true), 
 StructField("FinalPriority", IntegerType, true), 
 StructField("ALSUnit", BooleanType, true), 
 StructField("CallTypeGroup", StringType, true),
 StructField("NumAlarms", IntegerType, true),
 StructField("UnitType", StringType, true),
 StructField("UnitSequenceInCallDispatch", IntegerType, true),
 StructField("FirePreventionDistrict", StringType, true),
 StructField("SupervisorDistrict", StringType, true),
 StructField("Neighborhood", StringType, true),
 StructField("Location", StringType, true),
 StructField("RowID", StringType, true),
 StructField("Delay", FloatType, true)))
// Read the file using the CSV DataFrameReader
val sfFireFile="C:/Users/alejandro.perez/Documents/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv"
val fireDF = spark.read.schema(fireSchema)
 .option("header", "true")
 .csv(sfFireFile)

import org.apache.spark.sql.types._
fireSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CallNumber,IntegerType,true), StructField(UnitID,StringType,true), StructField(IncidentNumber,IntegerType,true), StructField(CallType,StringType,true), StructField(CallDate,StringType,true), StructField(WatchDate,StringType,true), StructField(CallFinalDisposition,StringType,true), StructField(AvailableDtTm,StringType,true), StructField(Address,StringType,true), StructField(City,StringType,true), StructField(Zipcode,IntegerType,true), StructField(Battalion,StringType,true), StructField(StationArea,StringType,true), StructField(Box,StringType,true), StructField(OriginalPriority,StringType,true), StructField(Priority,StringType,true), StructField(FinalPriority,IntegerType,true), ...


In [3]:
val fewFireDF = fireDF
 .select("IncidentNumber", "AvailableDtTm", "CallType")
 .where($"CallType" =!= "Medical Incident") 
fewFireDF.show(5, false)


+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



fewFireDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [IncidentNumber: int, AvailableDtTm: string ... 1 more field]


In [4]:
import org.apache.spark.sql.functions._
fireDF
 .select("CallType")
 .where(col("CallType").isNotNull)
 .agg(countDistinct('CallType) as 'DistinctCallTypes)
 .show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



import org.apache.spark.sql.functions._


In [5]:
// In Scala
fireDF
 .select("CallType")
 .where($"CallType".isNotNull)
 .distinct()
 .show(10, false)


+-----------------------------------+
|CallType                           |
+-----------------------------------+
|Elevator / Escalator Rescue        |
|Marine Fire                        |
|Aircraft Emergency                 |
|Confined Space / Structure Collapse|
|Administrative                     |
|Alarms                             |
|Odor (Strange / Unknown)           |
|Citizen Assist / Service Call      |
|HazMat                             |
|Watercraft in Distress             |
+-----------------------------------+
only showing top 10 rows



In [6]:
// In Scala
val newFireDF = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")
newFireDF
 .select("ResponseDelayedinMins")
 .where($"ResponseDelayedinMins" > 5)
 .show(5, false)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



newFireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [7]:
// In Scala
val fireTsDF = newFireDF
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm")
// Select the converted columns
fireTsDF
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, false)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



fireTsDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [8]:
// In Scala
fireTsDF
 .select(year($"IncidentDate"))
 .distinct()
 .orderBy(year($"IncidentDate"))
 .show()


+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [9]:
// In Scala 
fireTsDF
 .select("CallType")
 .where(col("CallType").isNotNull)
 .groupBy("CallType")
 .count()
 .orderBy(desc("count"))
 .show(10, false)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [10]:
// In Scala
import org.apache.spark.sql.{functions => F}
fireTsDF
 .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
 F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
 .show()

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



import org.apache.spark.sql.{functions=>F}


In [11]:
//What were all the different types of fire call in 2018?


fireTsDF
 .select($"CallType")
 .where(year($"IncidentDate")===2018)
 .distinct()
 .show()

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|           Explosion|
|        Vehicle Fire|
|  Suspicious Package|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
|   Electrical Hazard|
|      Structure Fire|
|    Medical Incident|
|          Fuel Spill|
|Smoke Investigati...|
|Train / Rail Inci...|
+--------------------+



In [12]:
// What months whithin the year 2018 saw the highest number of fire calls?


fireTsDF
 .select(month($"IncidentDate"))
 .where(year($"IncidentDate")===2018 && month($"IncidentDate").isNotNull)
 .groupBy("month(IncidentDate)")
 .count()
 .orderBy(desc("count"))
 .show()



+-------------------+-----+
|month(IncidentDate)|count|
+-------------------+-----+
|                 10| 1068|
|                  5| 1047|
|                  3| 1029|
|                  8| 1021|
|                  1| 1007|
|                  7|  974|
|                  6|  974|
|                  9|  951|
|                  4|  947|
|                  2|  919|
|                 11|  199|
+-------------------+-----+



In [13]:
// Which neighborhood in San Francisco generated the most fire calls in 2018?

fireTsDF
 .select("Zipcode")
 .where(year($"IncidentDate")===2018)
 .groupBy("Zipcode")
 .count()
 .orderBy(desc("count"))
 .show()


+-------+-----+
|Zipcode|count|
+-------+-----+
|  94103| 1312|
|  94102| 1305|
|  94109|  865|
|  94110|  854|
|  94124|  508|
|  94112|  462|
|  94107|  432|
|  94115|  395|
|  94133|  354|
|  94122|  340|
|  94117|  308|
|  94114|  305|
|  94105|  280|
|  94118|  279|
|  94121|  263|
|  94132|  251|
|  94108|  243|
|  94134|  231|
|  94123|  204|
|  94111|  189|
+-------+-----+
only showing top 20 rows



In [14]:
//Which neighboorhoods had the worst response times to fire calls in 2018?

fireTsDF
 .select("Zipcode","ResponseDelayedinMins")        
 .where(year($"IncidentDate")===2018)
 .groupBy("Zipcode")
 .avg("ResponseDelayedinMins")
 .orderBy(desc("avg(ResponseDelayedinMins)"))
 .show()

+-------+--------------------------+
|Zipcode|avg(ResponseDelayedinMins)|
+-------+--------------------------+
|  94129|         5.981565614541371|
|  94130|         5.453703684111436|
|  94105|         5.363809512981347|
|  94133|         5.230131806626832|
|  94127|         4.818269235296891|
|  94124|        4.6581693111559535|
|  94111|         4.231922368050883|
|  94118|        4.0838112442830985|
|  94132|          3.98419654689937|
|  94107|        3.9793209974412567|
|  94109|         3.976974945812556|
|  94134|        3.9474026205993833|
|  94102|        3.9365134215903006|
|  94122|        3.8774509791065666|
|  94117|        3.8475649381032238|
|  94114|        3.7992349614373975|
|  94116|        3.7678315418381847|
|  94103|        3.7607596581404255|
|   null|         3.705555518468221|
|  94121|        3.6857414451842074|
+-------+--------------------------+
only showing top 20 rows



In [15]:
// Which week in the year in 2018 had the most fire calls?


fireTsDF
 .select(weekofyear($"IncidentDate"))
 .where(year($"IncidentDate") === 2018)
 .groupBy("weekofyear(IncidentDate)")
 .count()
 .orderBy(desc("count"))
 .show()


+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|                      22|  259|
|                      40|  255|
|                      43|  250|
|                      25|  249|
|                       1|  246|
|                      44|  244|
|                      32|  243|
|                      13|  243|
|                      11|  240|
|                       5|  236|
|                      18|  236|
|                      23|  235|
|                      31|  234|
|                       2|  234|
|                      42|  234|
|                      19|  233|
|                       8|  232|
|                      10|  232|
|                      34|  232|
|                      21|  231|
+------------------------+-----+
only showing top 20 rows



In [16]:
// Is there a correlation between neighboorhood, zip code and number of fire calls ?

fireTsDF
 .select($"Zipcode")
 .groupBy("Zipcode")
 .count()
 .show()

+-------+-----+
|Zipcode|count|
+-------+-----+
|  94109|14686|
|  94115| 7812|
|  94112| 8421|
|  94127| 1881|
|  94108| 4084|
|  94121| 4555|
|  94105| 4236|
|   null|  142|
|  94131| 3236|
|  94116| 3933|
|  94134| 5009|
|  94124| 9236|
|  94102|21840|
|  94114| 5175|
|  94107| 6941|
|  94111| 2974|
|  94103|20897|
|  94117| 5804|
|  94122| 6355|
|  94110|14801|
+-------+-----+
only showing top 20 rows



In [19]:
// How can we use Parquet files or SQL tables to stores this data and read it back?


fireTsDF.write.parquet("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/Guardar7/fire.parquet")



In [24]:
var fireDF2 = spark.read.schema(fireSchema)

fireDF2: org.apache.spark.sql.DataFrameReader = org.apache.spark.sql.DataFrameReader@ca19022


In [26]:

val parqDF = spark.read.parquet("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/Guardar7/fire.parquet")


parqDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [27]:
parqDF.show()

+----------+------+--------------+-----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+--------------+---------------------+-------------------+-------------------+-------------------+
|CallNumber|UnitID|IncidentNumber|         CallType|CallFinalDisposition|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|         RowID|ResponseDelayedinMins|       IncidentDate|        OnWatchDate|      AvailableDtTS|
+----------+------+--------------+-----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+-