In [None]:
import pyspark

# Relative paths to the two input files used throughout the notebook.
airportsFilePath = 'data/airline/airport_codes.txt'
flightDelayPath = 'data/airline/departuredelays.csv'

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

These are premade files collected from [Airline On-Time Performance and Causes of Flight Delays: On Time Data](https://catalog.data.gov/dataset/airline-on-time-performance-and-causes-of-flight-delays-on-time-data). The flight delay file is missing the carrier ID (i.e., which airline operated the flight).

You can generate your own data set, with a custom date range and your choice of fields, by selecting the appropriate boxes [here](https://transtats.bts.gov/DL_SelectFields.asp).

In [None]:
# Airports lookup table: tab-separated, with a header row; column types are inferred.
airports = spark.read.csv(
    airportsFilePath,
    sep='\t',
    header='true',
    inferSchema='true',
)

# Departure delays: header row, default separator. inferSchema is not passed
# here, so Spark reads every column as a string (the reader's default).
flightPerf = spark.read.csv(
    flightDelayPath,
    header='true',
)

# Register both DataFrames as temp views so they can be queried with spark.sql.
airports.createOrReplaceTempView("airports")
flightPerf.createOrReplaceTempView("FlightPerformance")

In [None]:
# Print the column names and types of the departure-delay DataFrame.
flightPerf.printSchema()

In [None]:
# Print the column names and (inferred) types of the airports DataFrame.
airports.printSchema()

In [None]:
# City and IATA code of every airport whose state is "IL".
illinois_airports = spark.sql('SELECT CITY, IATA FROM airports WHERE state="IL" ')
illinois_airports.show()

In [None]:
# Peek at the first five rows of the FlightPerformance view.
flight_sample = spark.sql('SELECT * FROM FlightPerformance')
flight_sample.show(5)

In [None]:
# For flights departing ORD: mean distance and mean delay per destination,
# listed from the highest average delay to the lowest.
query = '''
SELECT destination, avg(distance) as distance, avg(delay) as avg_delay FROM FlightPerformance  
   WHERE origin="ORD"
   GROUP BY destination
   ORDER BY avg_delay DESC
'''

delay_by_destination = spark.sql(query)
delay_by_destination.show()

## Exercise 1: 

What is the average delay for flights coming _into_ ORD?

In [None]:
# Exercise 1 answer: average delay over all flights arriving at ORD.
# The WHERE clause leaves exactly one destination group, so the result is a
# single row and sorting it would be a no-op; GROUP BY is kept only so the
# destination column still appears next to the average.
query = '''
SELECT destination, avg(delay) as avg_delay FROM FlightPerformance
   WHERE destination="ORD"
   GROUP BY destination
'''

spark.sql(query).show()

## Exercise 2:
What is the average delay for flights leaving from any airport in Illinois, broken up by destination?

In [None]:
# Exercise 2 answer: average delay per destination for flights whose origin
# airport is in Illinois, listed from the lowest average delay upward.
#
# The join attaches the origin airport's row so flights can be filtered by the
# airport's state. Every matching flight is averaged, including rows whose
# delay is zero or negative; filtering on delay > 0 would average only the
# late flights, which is not what the exercise asks for.
#
# max(a.State) is always "IL" because of the WHERE clause, and max(a.IATA) is
# just the alphabetically last Illinois origin code seen for that destination;
# both are sanity-check columns, not part of the answer.
query = '''
SELECT destination, avg(delay) as avg_delay, max(a.State), max(a.IATA)
   FROM FlightPerformance as f
   JOIN airports as a
   ON a.IATA = f.origin
   WHERE a.State="IL"
   GROUP BY destination
   ORDER BY avg_delay
'''

spark.sql(query).show()