In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, DateType, DoubleType
from pyspark.sql import functions

In [3]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-active session if there is one, otherwise it builds a new one
# under the application name "airlinedelay".
spark = (
    SparkSession.builder
    .appName("airlinedelay")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
context = spark.sparkContext



In [4]:
# Column layout of the input CSV files as (column name, Spark type) pairs,
# listed in file order.
column_types = [
    ("FL_DATE", DateType),
    ("OP_CARRIER", StringType),
    ("OP_CARRIER_FL_NUM", IntegerType),
    ("ORIGIN", StringType),
    ("DEST", StringType),
    ("CRS_DEP_TIME", DoubleType),
    ("DEP_TIME", DoubleType),
    ("DEP_DELAY", DoubleType),
    ("TAXI_OUT", DoubleType),
    ("WHEELS_OFF", DoubleType),
    ("WHEELS_ON", DoubleType),
    ("TAXI_IN", DoubleType),
    ("CRS_ARR_TIME", DoubleType),
    ("ARR_TIME", DoubleType),
    ("ARR_DELAY", DoubleType),
    ("CANCELLED", DoubleType),
    ("CANCELLATION_CODE", StringType),
    ("DIVERTED", DoubleType),
    ("CRS_ELAPSED_TIME", DoubleType),
    ("ACTUAL_ELAPSED_TIME", DoubleType),
    ("AIR_TIME", DoubleType),
    ("DISTANCE", DoubleType),
    ("CARRIER_DELAY", DoubleType),
    ("WEATHER_DELAY", DoubleType),
    ("NAS_DELAY", DoubleType),
    ("SECURITY_DELAY", DoubleType),
    ("LATE_AIRCRAFT_DELAY", DoubleType),
]

# Explicit schema for the reader; every field is declared nullable.
schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in column_types
])

# Load every file matching "*.csv" (relative to the working directory) as
# comma-separated data with a header row, applying the schema above.
df = (
    spark.read.format("csv")
    .option("sep", ",")
    .option("header", True)
    .schema(schema)
    .load("*.csv")
)

# Register the DataFrame as a temp view so later cells can query it via spark.sql.
df.createOrReplaceTempView("airlinedelay")

In [5]:
# Preview the loaded data; 20 rows is show()'s default limit, spelled out here.
df.show(n=20)

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|2016-01-01|        DL|             1

TOTAL RECORDS

In [6]:
# Count every row exposed through the "airlinedelay" temp view.
total_records_query = "SELECT count(*) FROM airlinedelay"
spark.sql(total_records_query).show()

+--------+
|count(1)|
+--------+
|18505725|
+--------+



AVERAGE DEPARTURE DELAY PER CARRIER

In [7]:
# Mean DEP_DELAY for each OP_CARRIER; the query has no ORDER BY, so the row
# order of the displayed table is not specified.
avg_delay_by_carrier_query = "SELECT OP_CARRIER, AVG(DEP_DELAY) AS AverageDelayPerCarrier FROM airlinedelay GROUP BY OP_CARRIER"
spark.sql(avg_delay_by_carrier_query).show()

+----------+----------------------+
|OP_CARRIER|AverageDelayPerCarrier|
+----------+----------------------+
|        UA|    10.278980344114846|
|        NK|    10.700513437694813|
|        AA|     9.066959081056941|
|        EV|    11.148128206783854|
|        B6|    15.200864970979463|
|        DL|     7.960301749447361|
|        OO|     9.429711727536278|
|        F9|      15.2304576842065|
|        HA|    0.7659547001036925|
|        AS|    1.8643307703390406|
|        VX|    11.589373221751949|
|        WN|    10.171177289646476|
|        YV|    11.226540641280463|
|        MQ|     7.765102407574455|
|        OH|    12.038331703552174|
|        G4|    12.922574968839358|
|        YX|    7.3396549135325335|
|        9E|    10.732733205423273|
+----------+----------------------+



CARRIER WITH THE LARGEST AVERAGE DEPARTURE DELAY

In [16]:
# Same per-carrier average as above, sorted descending and cut to one row,
# which leaves only the carrier with the highest mean DEP_DELAY.
worst_carrier_query = "SELECT OP_CARRIER, AVG(DEP_DELAY) AS AverageDelayPerCarrier FROM airlinedelay GROUP BY OP_CARRIER ORDER BY AVG(DEP_DELAY) DESC LIMIT 1"
spark.sql(worst_carrier_query).show()

+----------+----------------------+
|OP_CARRIER|AverageDelayPerCarrier|
+----------+----------------------+
|        F9|      15.2304576842065|
+----------+----------------------+



AVERAGE DEPARTURE DELAY PER ORIGIN (DEPARTURE LOCATION)

In [14]:
# Mean DEP_DELAY for each ORIGIN, largest average first; show the top 50 rows.
# The aggregate is grouped by ORIGIN, so the result column is named
# AverageDelayPerOrigin (not "...PerCarrier") to match what it contains.
# NOTE(review): the ranking uses the plain average with no minimum row count,
# so an ORIGIN with very few rows can rank at the top — consider selecting
# COUNT(*) alongside the average when interpreting this table.
spark.sql("SELECT ORIGIN, AVG(DEP_DELAY) AS AverageDelayPerOrigin FROM airlinedelay GROUP BY ORIGIN ORDER BY AVG(DEP_DELAY) DESC").show(50)

+------+----------------------+
|ORIGIN|AverageDelayPerCarrier|
+------+----------------------+
|   ENV|                 157.0|
|   YNG|                  63.0|
|   TKI|                  45.0|
|   ART|    36.333333333333336|
|   PPG|     35.97506925207756|
|   OTH|     28.42184154175589|
|   OWB|    27.635514018691588|
|   HGR|     26.76086956521739|
|   LWB|     25.70921985815603|
|   MMH|    25.641196013289036|
|   UST|           24.77734375|
|   SCK|    24.265856950067477|
|   PSM|    24.225589225589225|
|   MVY|     23.75155925155925|
|   ACK|     22.99314442413163|
|   SLN|    22.279411764705884|
|   BLV|    21.368941641938676|
|   PGV|    20.689987937273823|
|   LCK|    19.771668219944083|
|   HYA|     19.64569536423841|
|   OGD|                19.064|
|   MEI|    18.834141610087293|
|   HTS|     18.10320284697509|
|   SWF|    18.082712765957446|
|   BKG|    17.536231884057973|
|   CKB|    17.374164810690424|
|   EGE|     17.31171101184696|
|   USA|    17.065372829417772|
|   TTN|

ORIGIN (DEPARTURE LOCATION) WITH THE LARGEST AVERAGE DEPARTURE DELAY

In [11]:
# Mean DEP_DELAY per ORIGIN, sorted descending and limited to one row, which
# leaves only the origin with the highest average.
# The result column is named AverageDelayPerOrigin because the grouping key
# is ORIGIN, not the carrier.
# NOTE(review): no minimum row count is applied, so the winner may be an
# ORIGIN with very few rows — verify with COUNT(*) before drawing conclusions.
spark.sql("SELECT ORIGIN, AVG(DEP_DELAY) AS AverageDelayPerOrigin FROM airlinedelay GROUP BY ORIGIN ORDER BY AVG(DEP_DELAY) DESC LIMIT 1").show()

+------+----------------------+
|ORIGIN|AverageDelayPerCarrier|
+------+----------------------+
|   ENV|                 157.0|
+------+----------------------+

