## Hackathon 04: Performance Tuning
## Find the average taxi-in time (Flights) per carrier description (Carriers) per aircraft manufacturer (Planes)
### Take the query below (run via RDDs) and performance-tune it to see how fast you can get it to run

In [0]:
# Confirm that all 3 input files are present by listing each one.
# If any is missing, notify the instructor, who will email you the files.
for csv_path in (
    "dbfs:/FileStore/tables/header_flights.csv",
    "dbfs:/FileStore/tables/header_carriers.csv",
    "dbfs:/FileStore/tables/header_planes.csv",
):
    display(dbutils.fs.ls(csv_path))

path,name,size
dbfs:/FileStore/tables/header_flights.csv,header_flights.csv,136035443


path,name,size
dbfs:/FileStore/tables/header_carriers.csv,header_carriers.csv,37794


path,name,size
dbfs:/FileStore/tables/header_planes.csv,header_planes.csv,432996


In [0]:
%scala

// Baseline: run the query with plain RDDs first to benchmark how long it takes
// before any tuning is applied.
// Every CSV line is split on commas; for planes only rows with exactly 9 fields are kept.
val flight1 = sc.textFile("dbfs:/FileStore/tables/header_flights.csv").map(line => line.split(","))
val carrier1 = sc.textFile("dbfs:/FileStore/tables/header_carriers.csv").map(line => line.split(","))
val planes1 = sc.textFile("dbfs:/FileStore/tables/header_planes.csv").map(line => line.split(",")).filter(line => line.length == 9)

// The first row of each RDD is treated as that file's header row.
val flight_header = flight1.first()
val carrier_header = carrier1.first()
val planes_header = planes1.first()

// Drop the header rows. Arrays compare by reference under ==/!=, so `row != header`
// holds for every row and filters nothing; sameElements compares the contents.
val flight2 = flight1.filter(row => !row.sameElements(flight_header))
val carrier2 = carrier1.filter(row => !row.sameElements(carrier_header))
val planes2 = planes1.filter(row => !row.sameElements(planes_header))

// Key flights by field 5 and carriers by field 0 so they can be joined
// (presumably the carrier code on both sides -- confirm against the CSV headers).
val flightC = flight2.keyBy(line => line(5))
val carrierC = carrier2.keyBy(line => line(0))

flightC.first()
carrierC.first()

// Join flights with carriers, then re-key the result by flight field 7
// (presumably the tail number -- confirm) for the join with planes.
val join1 = flightC.join(carrierC)
val join1P = join1.map { case (carrierKey, (flightRow, carrierRow)) =>
  (flightRow(7), (carrierKey, flightRow, carrierRow))
}

val planesP = planes2.keyBy(line => line(0))
val finalRdd = join1P.join(planesP)
finalRdd.first()

// Reduce each joined record to ((carrier field 1, plane field 2), flight field 10 as Int).
// NOTE(review): toInt throws NumberFormatException on a non-numeric field 10 -- confirm
// the flights file has no such values in rows that survive both joins.
val neededData = finalRdd.map { case (_, ((_, flightRow, carrierRow), planeRow)) =>
  ((carrierRow(1), planeRow(2)), flightRow(10).toInt)
}
// Average per key. groupByKey is left unchanged so the baseline timing stays comparable.
val avgTaxiInByAir = neededData.groupByKey().mapValues(list => list.sum.toFloat / list.size)

avgTaxiInByAir.take(10)

In [0]:
# Load the three CSV inputs (header row present, column types inferred) and
# redistribute each one over 2 partitions.
flightsDF = (
    spark.read.options(header=True, inferSchema=True)
    .csv("dbfs:/FileStore/tables/header_flights.csv")
    .repartition(2)
)
carriersDF = (
    spark.read.options(header=True, inferSchema=True)
    .csv("dbfs:/FileStore/tables/header_carriers.csv")
    .repartition(2)
)
planesDF = (
    spark.read.options(header=True, inferSchema=True)
    .csv("dbfs:/FileStore/tables/header_planes.csv")
    .repartition(2)
)
## airportsDF = spark.read.csv("dbfs:/FileStore/tables/header_airports.csv", header=True, inferSchema=True).repartition(2)

# Keep only the columns the query uses.
fDF = flightsDF.select("uniquecarrier", "tailnum", "taxiin")
cDF = carriersDF.select("code", "description")
pDF = planesDF.select("tailnum", "manufacturer")

# Persist the trimmed tables as parquet, replacing any previous output.
fDF.write.mode("overwrite").format("parquet").save("dbfs:/FileStore/tables/flight2")
cDF.write.mode("overwrite").format("parquet").save("dbfs:/FileStore/tables/carrier2/")
pDF.write.mode("overwrite").format("parquet").save("dbfs:/FileStore/tables/plane2/")


In [0]:
# Reload the trimmed parquet tables, reducing flights to at most 2 partitions
# and the two lookup tables to a single partition each.
flightsDF = spark.read.load("dbfs:/FileStore/tables/flight2/", format="parquet").coalesce(2)
carriersDF = spark.read.load("dbfs:/FileStore/tables/carrier2/", format="parquet").coalesce(1)
planesDF = spark.read.load("dbfs:/FileStore/tables/plane2/", format="parquet").coalesce(1)
# airportsDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/airport1/")

In [0]:
# Expose each DataFrame to Spark SQL under the table name the query uses.
for frame, view_name in (
    (flightsDF, "flights"),
    (carriersDF, "carriers"),
    (planesDF, "planes"),
):
    frame.createOrReplaceTempView(view_name)
# airportsDF.coalesce(4).createOrReplaceTempView("airports")

In [0]:
# Mark the three temp views for in-memory caching. cacheTable is lazy: nothing
# is stored until an action actually scans the data.
spark.catalog.cacheTable("flights")
spark.catalog.cacheTable("carriers")
spark.catalog.cacheTable("planes")

# Force a full scan of every view so that all partitions land in the cache.
# show() on its own only reads as many partitions as it needs for its preview
# rows, which would leave the rest to be loaded during the timed query.
spark.table("flights").count()
spark.table("carriers").count()
spark.table("planes").count()

# Preview each cached view.
spark.sql("select * from flights").show()
spark.sql("select * from carriers").show()
spark.sql("select * from planes").show()

# Use 2 shuffle partitions for the joins/aggregation in the query that follows.
spark.sql("set spark.sql.shuffle.partitions=2")

In [0]:
# Average taxi-in per carrier description and plane manufacturer, highest first.
# Adjacent literals concatenate into a single-line SQL statement.
spark.sql(
    "select c.description, p.manufacturer,  avg(f.taxiin) "
    "from flights f "
    "JOIN carriers c on f.uniquecarrier = c.code "
    "JOIN planes p on f.tailnum = p.tailnum "
    "group by c.description, p.manufacturer "
    "order by avg(taxiin) desc"
).show()