In [6]:
import pyspark
import time
import os


# Arguments handed to spark-submit when PySpark starts: pin the Spark UI port
# and pull in the Kafka streaming and Cassandra connector packages.
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join([
    "--conf spark.ui.port=4040",
    "--packages " + ",".join([
        "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0",
        "com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3",
    ]),
    "pyspark-shell",
])

# 0. Define Spark Datasets from CSVs 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

# Create the SparkSession used by every cell below, or reuse one that is
# already running.
spark = (
    SparkSession.builder
    .appName("csv file reader")
    .getOrCreate()
)

# Root folder of the CSV data (relative path); the loading cell appends
# year/month sub-folders to it.
data_path = "../Data"

In [6]:
from pyspark.sql import DataFrame
import functools


# Define helper function to concatenate streaming DataFrames
def unionAll(df_list):
    """Fold a sequence of DataFrames into a single DataFrame.

    The DataFrames are combined left to right. Before each union, the incoming
    DataFrame is projected onto the columns of the result accumulated so far,
    so the output keeps the column order of the first DataFrame in ``df_list``.

    :param df_list: non-empty sequence of DataFrames; ``functools.reduce``
        raises ``TypeError`` if it is empty.
    :return: the union of all DataFrames (the single element itself when
        ``df_list`` has exactly one entry).
    """
    def _append(merged, incoming):
        # Reorder/select the incoming columns to match the accumulated frame.
        aligned = incoming.select(merged.columns)
        return merged.union(aligned)

    return functools.reduce(_append, df_list)


# Load one static DataFrame per monthly CSV and concatenate them into `df`.
# Static (batch) reads are used instead of spark.readStream because, per the
# original author's note, aggregation operations (e.g., JOINS) are not
# supported for streaming datasets.
# NOTE(review): inferSchema runs separately for every file, so each monthly
# DataFrame gets its own inferred column types — consider an explicit schema.
dfs = []
for year in range(1995, 2009):
    print(year)
    for month in range(1, 13):
        # Files are laid out as <data_path>/<year>/<month>/<month>.csv
        csv_path = data_path + "/" + str(year) + "/" + str(month)
        csv_file = csv_path + "/" + str(month) + ".csv"
        dfs.append(
            spark.read
            .options(header='True', inferSchema='True', delimiter=',')
            .csv(csv_file)
        )

df = unionAll(dfs)


1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008


In [7]:
# Display the combined DataFrame's repr (column names and their types).
df

DataFrame[YEAR: int, MONTH: int, DAY_OF_MONTH: int, DAY_OF_WEEK: int, FL_DATE: string, OP_UNIQUE_CARRIER: string, OP_CARRIER_FL_NUM: int, ORIGIN_AIRPORT_ID: int, ORIGIN: string, DEST_AIRPORT_ID: int, DEST: string, CRS_DEP_TIME: int, DEP_TIME: int, DEP_DELAY: int, DEP_DELAY_NEW: int, CRS_ARR_TIME: int, ARR_TIME: int, ARR_DELAY: int, ARR_DELAY_NEW: int, CANCELLED: int]

# 1. Q1 - Queries

In [8]:
# Register the combined DataFrame as the temp view "flights" so the queries
# below can reference it from spark.sql.
df.createOrReplaceTempView("flights")

## 1.1 Rank the top 10 most popular airports by number of flights to/from the airport.

In [9]:
from pyspark.sql.functions import coalesce, col, lit


# Per-airport flight counts: departures keyed by ORIGIN, arrivals keyed by DEST.
origin = spark.sql("SELECT ORIGIN, count(OP_CARRIER_FL_NUM) AS all_departures FROM flights GROUP BY ORIGIN")
destination = spark.sql("SELECT DEST, count(OP_CARRIER_FL_NUM) AS all_arrivals FROM flights GROUP BY DEST")

# Outer join so that an airport appearing only as an origin or only as a
# destination is still counted (an inner join would drop it entirely).
# Missing sides are filled in: the airport code is taken from whichever side
# is present and the absent count is treated as 0.
joined_df = (origin
             .join(destination, origin.ORIGIN == destination.DEST, "outer")
             .select(
                 coalesce(col("ORIGIN"), col("DEST")).alias("ORIGIN"),
                 (coalesce(col("all_departures"), lit(0))
                  + coalesce(col("all_arrivals"), lit(0))).alias("all_flights"),
             )
            )

# Expose the per-airport totals to spark.sql as the temp view "Q1".
joined_df.createOrReplaceTempView("Q1")


In [10]:
# Q1 already holds exactly one row per airport (it is built from per-airport
# aggregates joined on the airport code), so no GROUP BY is needed here:
# just sort by total traffic and display the ten busiest airports.
query_1_1 = spark.sql("SELECT * FROM Q1 ORDER BY all_flights DESC")
query_1_1.show(10)

+------+-----------+
|ORIGIN|all_flights|
+------+-----------+
|   ORD|    9252980|
|   ATL|    8885867|
|   DFW|    7926248|
|   LAX|    5844795|
|   PHX|    5047005|
|   DEN|    4508601|
|   IAH|    4423809|
|   DTW|    4197592|
|   LAS|    4024215|
|   MSP|    3939329|
+------+-----------+
only showing top 10 rows



## 1.2 Rank the top 10 airlines by on-time arrival performance.

In [11]:
# Average arrival delay per carrier, rounded to 3 decimals, over flights that
# were not cancelled and have a recorded ARR_DELAY. Sorted ascending, so the
# carriers with the lowest average delay come first; the top 10 are shown.
query_1_2 = spark.sql("""
    SELECT
        OP_UNIQUE_CARRIER AS airline,
        round(avg(ARR_DELAY), 3) AS avg_arr_delay
    FROM flights
    WHERE CANCELLED = 0 AND ARR_DELAY IS NOT NULL
    GROUP BY OP_UNIQUE_CARRIER
    ORDER BY avg_arr_delay ASC
""")
query_1_2.show(10)

+-------+-------------+
|airline|avg_arr_delay|
+-------+-------------+
|     HA|       -0.775|
|     KH|        1.157|
|     F9|        5.693|
|     WN|         5.84|
|     OO|        5.877|
|     9E|        6.108|
|     TZ|        6.129|
|     NW|        6.298|
|     US|        6.471|
|     DH|        6.798|
+-------+-------------+
only showing top 10 rows



## 1.3 Rank the days of the week by on-time arrival performance.

In [12]:
# Average arrival delay per day of week, rounded to 3 decimals, over flights
# that were not cancelled and have a recorded ARR_DELAY. Sorted ascending, so
# the day with the lowest average delay comes first; all days are shown.
query_1_3 = spark.sql("""
    SELECT
        DAY_OF_WEEK AS dow,
        round(avg(ARR_DELAY), 3) AS avg_arr_delay
    FROM flights
    WHERE CANCELLED = 0 AND ARR_DELAY IS NOT NULL
    GROUP BY DAY_OF_WEEK
    ORDER BY avg_arr_delay ASC
""")
query_1_3.show()

+---+-------------+
|dow|avg_arr_delay|
+---+-------------+
|  6|        4.281|
|  2|        6.008|
|  3|        7.172|
|  7|        7.249|
|  1|        7.297|
|  4|        9.341|
|  5|       10.257|
+---+-------------+

