In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, DateType, DoubleType
from pyspark.sql import functions

In [12]:
spark = SparkSession \
    .builder \
    .appName("airlinedelay") \
    .getOrCreate()

context = spark.sparkContext

In [13]:
schema = StructType([
    StructField("FL_DATE", DateType(), True),
    StructField("OP_CARRIER", StringType(), True),
    StructField("OP_CARRIER_FL_NUM", IntegerType(), True),
    StructField("ORIGIN", StringType(), True),
    StructField("DEST", StringType(), True),
    StructField("CRS_DEP_TIME", DoubleType(), True),
    StructField("DEP_TIME", DoubleType(), True),
    StructField("DEP_DELAY", DoubleType(), True),
    StructField("TAXI_OUT", DoubleType(), True),
    StructField("WHEELS_OFF", DoubleType(), True),
    StructField("WHEELS_ON", DoubleType(), True),
    StructField("TAXI_IN", DoubleType(), True),
    StructField("CRS_ARR_TIME", DoubleType(), True),
    StructField("ARR_TIME", DoubleType(), True),
    StructField("ARR_DELAY", DoubleType(), True),
    StructField("CANCELLED", DoubleType(), True),
    StructField("CANCELLATION_CODE", StringType(), True),
    StructField("DIVERTED", DoubleType(), True),
    StructField("CRS_ELAPSED_TIME", DoubleType(), True),
    StructField("ACTUAL_ELAPSED_TIME", DoubleType(), True),
    StructField("AIR_TIME", DoubleType(), True),
    StructField("DISTANCE", DoubleType(), True),
    StructField("CARRIER_DELAY", DoubleType(), True),
    StructField("WEATHER_DELAY", DoubleType(), True),
    StructField("NAS_DELAY", DoubleType(), True),
    StructField("SECURITY_DELAY", DoubleType(), True),
    StructField("LATE_AIRCRAFT_DELAY", DoubleType(), True)
])

df = spark.read.format("csv") \
    .option("sep", ",") \
    .option("header", True) \
    .schema(schema) \
    .load("*.csv")

df.createOrReplaceTempView("airlinedelay")

In [14]:
df.show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|2016-01-01|        DL|             1

TOTAL RECORDS

In [15]:
spark.sql("SELECT count(*) FROM airlinedelay").show()

+--------+
|count(1)|
+--------+
|18505725|
+--------+



 AVERAGE DELAY PER TIME OF YEAR

In [16]:
spark.sql("SELECT CASE EXTRACT(month from FL_DATE) WHEN 1 THEN 'January'  WHEN 2 THEN 'February' WHEN 3 THEN 'March' WHEN 4 THEN 'April' WHEN 5 THEN 'May' WHEN 6 THEN 'June' WHEN 7 THEN 'July' WHEN 8 THEN 'August' WHEN 9 THEN 'September' WHEN 10 THEN 'October' WHEN 11 THEN 'November' WHEN 12 THEN 'December' ELSE 'OTHER' END AS Month, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerMonth FROM airlinedelay GROUP BY Month ORDER BY second(Month)").show()

+---------+--------------------+
|    Month|AverageDelayPerMonth|
+---------+--------------------+
| February|  30.735431890196587|
|  January|   32.18828878699574|
|    March|  28.878667663204894|
|      May|   31.56993455797027|
|    April|   31.86281084247858|
|     July|  37.292714589718095|
|     June|   34.88871055929404|
|  October|   27.35363224271801|
|   August|   36.20854900123184|
|September|  28.909569347281945|
| November|   27.16239535928725|
| December|   31.40340308209513|
+---------+--------------------+



AVERAGE DELAY PER CARRIER

In [17]:
spark.sql("SELECT OP_CARRIER, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerCarrier FROM airlinedelay GROUP BY OP_CARRIER").show()

+----------+----------------------+
|OP_CARRIER|AverageDelayPerCarrier|
+----------+----------------------+
|        UA|     35.27940895187003|
|        NK|    40.004839624648405|
|        AA|     32.15946837312331|
|        EV|     50.41203436537526|
|        B6|     41.84271868513793|
|        DL|    28.354805562577056|
|        OO|     46.45076746046316|
|        F9|     43.52142862418115|
|        HA|    15.389813961056571|
|        AS|    24.853551713125597|
|        VX|    32.075166777615685|
|        WN|    21.579956134966892|
|        YV|     43.88718991345109|
|        MQ|    34.695277302275755|
|        OH|     34.92862912682075|
|        G4|     38.84540625521069|
|        YX|     41.46272772954086|
|        9E|     52.29959991640044|
+----------+----------------------+



COMPANY WITH THE LARGEST AVERAGE DELAY

In [18]:
spark.sql("SELECT OP_CARRIER, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerCarrier FROM airlinedelay GROUP BY OP_CARRIER ORDER BY AverageDelayPerCarrier DESC LIMIT 1").show()

+----------+----------------------+
|OP_CARRIER|AverageDelayPerCarrier|
+----------+----------------------+
|        9E|     52.29959991640044|
+----------+----------------------+



AVERAGE DELAY PER DEPARTURE LOCATION

In [19]:
spark.sql("SELECT ORIGIN, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerCarrier FROM airlinedelay GROUP BY ORIGIN ORDER BY AverageDelayPerCarrier DESC").show(50)

+------+----------------------+
|ORIGIN|AverageDelayPerCarrier|
+------+----------------------+
|   ENV|                 157.0|
|   PPG|               121.875|
|   SLN|     88.72018348623853|
|   LWB|     87.99009900990099|
|   SHD|     82.14367816091954|
|   CMX|     75.48494983277592|
|   MQT|     74.36679536679537|
|   LBL|     74.19811320754717|
|   MEI|      73.9291553133515|
|   PUB|     72.18181818181819|
|   DVL|     71.73023255813953|
|   RDD|     68.96987253765933|
|   EGE|      68.9584552369807|
|   BGM|      68.3667711598746|
|   MBS|     68.17650834403081|
|   CKB|     67.97377049180328|
|   LCH|     67.80292942743009|
|   RHI|     67.14951456310679|
|   OTH|     66.96222222222222|
|   PIB|     66.46075085324232|
|   CGI|     66.34939759036145|
|   ITH|     66.28314917127072|
|   SAF|      65.7936210131332|
|   ISN|     65.73929471032746|
|   JMS|     64.92836257309942|
|   LRD|     64.62005856515373|
|   SCE|     64.50854700854701|
|   UIN|     64.31189710610933|
|   CLL|

DEPARTURE LOCATION WITH THE LARGEST AVERAGE DELAY

In [25]:
spark.sql("SELECT ORIGIN, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerCarrier,EXTRACT(year from FL_DATE) AS Year FROM airlinedelay GROUP BY ORIGIN, Year ORDER BY Year").show()

+------+----------------------+----+
|ORIGIN|AverageDelayPerCarrier|Year|
+------+----------------------+----+
|   BUF|     32.75690419365837|2016|
|   CLL|     45.86407766990291|2016|
|   GSP|    36.842450765864335|2016|
|   MLB|     32.50222222222222|2016|
|   MLI|    55.489291598023065|2016|
|   EVV|     54.53286713286713|2016|
|   GUC|     65.82894736842105|2016|
|   LIH|      20.0140476968311|2016|
|   ABE|     41.85941644562334|2016|
|   LBE|     45.79381443298969|2016|
|   ADK|    18.627906976744185|2016|
|   MSP|    30.962899490504864|2016|
|   CVG|     37.61518003273322|2016|
|   TYR|     35.96610169491525|2016|
|   ORD|      34.5931868719955|2016|
|   BNA|    26.882162811100336|2016|
|   JNU|    28.637476459510356|2016|
|   PNS|     34.76499690785405|2016|
|   PIT|    26.614831076960606|2016|
|   BHM|      32.7430724355858|2016|
+------+----------------------+----+
only showing top 20 rows

