In [74]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, DateType, DoubleType
from pyspark.sql import functions
import math
import plotly.offline as pyo
import plotly.graph_objs as go
from calendar import month
from pandas import to_datetime
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import col
import pandas as pd

In [75]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("airlinedelay")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
context = spark.sparkContext

In [76]:
# Explicit schema for the flight CSV files: one nullable field per column,
# listed in the same order as the columns appear in the files.
schema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("FL_DATE", DateType()),
        ("OP_CARRIER", StringType()),
        ("OP_CARRIER_FL_NUM", IntegerType()),
        ("ORIGIN", StringType()),
        ("DEST", StringType()),
        ("CRS_DEP_TIME", DoubleType()),
        ("DEP_TIME", DoubleType()),
        ("DEP_DELAY", DoubleType()),
        ("TAXI_OUT", DoubleType()),
        ("WHEELS_OFF", DoubleType()),
        ("WHEELS_ON", DoubleType()),
        ("TAXI_IN", DoubleType()),
        ("CRS_ARR_TIME", DoubleType()),
        ("ARR_TIME", DoubleType()),
        ("ARR_DELAY", DoubleType()),
        ("CANCELLED", DoubleType()),
        ("CANCELLATION_CODE", StringType()),
        ("DIVERTED", DoubleType()),
        ("CRS_ELAPSED_TIME", DoubleType()),
        ("ACTUAL_ELAPSED_TIME", DoubleType()),
        ("AIR_TIME", DoubleType()),
        ("DISTANCE", DoubleType()),
        ("CARRIER_DELAY", DoubleType()),
        ("WEATHER_DELAY", DoubleType()),
        ("NAS_DELAY", DoubleType()),
        ("SECURITY_DELAY", DoubleType()),
        ("LATE_AIRCRAFT_DELAY", DoubleType()),
    ]
])

# Read every CSV matching the glob (comma separated, first line is a header)
# using the schema above instead of letting Spark infer the types.
df = (
    spark.read.format("csv")
    .option("sep", ",")
    .option("header", True)
    .schema(schema)
    .load("*.csv")
)

# Register the DataFrame so the cells below can query it with spark.sql().
df.createOrReplaceTempView("airlinedelay")

In [77]:
# Preview the loaded DataFrame to check that the schema was applied as expected.
df.show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|2016-01-01|        DL|             1

TOTAL RECORDS

In [78]:
# Count every row available through the "airlinedelay" temp view.
spark.sql("SELECT count(*) FROM airlinedelay").show()

+--------+
|count(1)|
+--------+
|18505725|
+--------+



 AVERAGE DEPARTURE/ARRIVAL DELAY PER MONTH IN ALL YEARS

In [79]:

# Average departure and arrival delay per calendar month, pooled over all years.
# - The CASE expression turns the month number of FL_DATE into its English name.
#   Its ELSE branch labels every value other than 1..11 as 'December'
#   (NOTE(review): a NULL FL_DATE would also end up under 'December' - confirm
#   the data has no unparseable dates).
# - IF(x >= 0, x, NULL) drops negative (early) values, so each AVG only covers
#   rows whose delay is zero or positive.
# - No ORDER BY is used, so the row order of the result is not defined.
averageDepartureDelayPerMonth = spark.sql("SELECT CASE EXTRACT(month from FL_DATE) WHEN 1 THEN 'January'  WHEN 2 THEN 'February' WHEN 3 THEN 'March' WHEN 4 THEN 'April' WHEN 5 THEN 'May' \
WHEN 6 THEN 'June' WHEN 7 THEN 'July' WHEN 8 THEN 'August' WHEN 9 THEN 'September' \
WHEN 10 THEN 'October' WHEN 11 THEN 'November' ELSE 'December' END AS Month, \
AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDepartureDelayPerMonth, AVG(IF(ARR_DELAY >= 0 , ARR_DELAY, NULL)) AS AverageArrivalDelayPerMonth \
FROM airlinedelay GROUP BY Month")
averageDepartureDelayPerMonth.show()


+---------+-----------------------------+---------------------------+
|    Month|AverageDepartureDelayPerMonth|AverageArrivalDelayPerMonth|
+---------+-----------------------------+---------------------------+
| February|           30.735431890196587|          33.71100564562122|
|  January|            32.18828878699574|          35.25426456636053|
|    March|           28.878667663204894|         30.986196824550824|
|      May|            31.56993455797027|          34.25158585502806|
|    April|            31.86281084247858|          34.09016641899735|
|     July|           37.292714589718095|          40.61094899363225|
|     June|            34.88871055929404|          38.14195843797705|
|  October|            27.35363224271801|           29.2535701323158|
|   August|            36.20854900123184|         38.999591241859285|
|September|           28.909569347281945|          31.14594036226316|
| November|            27.16239535928725|         30.007738214623405|
| December|         

In [101]:
# Calendar order, used to sort the month-name rows coming back from Spark.
months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

# Collect the monthly averages on the driver and order them January..December
# by treating the month names as an ordered categorical while sorting.
dfpandas = averageDepartureDelayPerMonth.toPandas()
dfpandasaverageDepartureDelayPerMonth = dfpandas.sort_values(
    'Month',
    key = lambda monthNames: pd.Categorical(monthNames, categories=months, ordered=True))

# One line per metric; both share the month names as x values and hover text.
averageDepartureDelayPerMonthGraph = go.Scatter(
    x = dfpandasaverageDepartureDelayPerMonth['Month'],
    y = dfpandasaverageDepartureDelayPerMonth['AverageDepartureDelayPerMonth'],
    text = dfpandasaverageDepartureDelayPerMonth['Month'],
    name = 'Average Departure Delay',
    mode = 'lines+markers',
    marker = {'color': 'red'})

averageArrivalDelayPerMonthGraph = go.Scatter(
    x = dfpandasaverageDepartureDelayPerMonth['Month'],
    y = dfpandasaverageDepartureDelayPerMonth['AverageArrivalDelayPerMonth'],
    text = dfpandasaverageDepartureDelayPerMonth['Month'],
    name = 'Average Arrival Delay',
    mode = 'lines+markers',
    marker = {'color': 'yellow'})

data = [averageDepartureDelayPerMonthGraph, averageArrivalDelayPerMonthGraph]

layout = {
    'title': 'Evolution Of Average Departure/Arrival Delay Month-Over-Month Over All Years',
    'xaxis': {'title': 'Month', 'ticklen': 5, 'zeroline': False, 'tickformat': 'd'},
    'yaxis': {'title': 'Average Delay (Minutes)'},
    'height': 600,
    'width': 900,
}
fig = {'data': data, 'layout': layout}

pyo.iplot(fig)

 AVERAGE DEPARTURE/ARRIVAL DELAY PER MONTH AND PER CARRIER IN ALL YEARS

In [81]:
# Average departure and arrival delay per (carrier, month number), pooled over
# all years. IF(x >= 0, x, NULL) drops negative (early) values before averaging.
# The result is ordered by carrier and then by month number (1..12).
averageDepartureArrivalDelayPerMonthPerCarier = spark.sql("SELECT OP_CARRIER,  EXTRACT(Month from FL_DATE) AS Month, \
AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDepartureDelayPerMonth, AVG(IF(ARR_DELAY >= 0 , ARR_DELAY, NULL)) \
AS AverageArrivalDelayPerMonth FROM airlinedelay GROUP BY OP_CARRIER, Month ORDER BY OP_CARRIER, Month")
averageDepartureArrivalDelayPerMonthPerCarier.show()

+----------+-----+-----------------------------+---------------------------+
|OP_CARRIER|Month|AverageDepartureDelayPerMonth|AverageArrivalDelayPerMonth|
+----------+-----+-----------------------------+---------------------------+
|        9E|    1|            58.31797235023041|           57.1671469740634|
|        9E|    2|            61.01737492149885|          57.75359424920128|
|        9E|    3|            42.15660542432196|          37.37869643934822|
|        9E|    4|           54.845143449706185|         49.817087845968715|
|        9E|    5|            51.54504504504504|          45.63453608247423|
|        9E|    6|            49.84719136781974|          47.91642824875772|
|        9E|    7|           60.290930910337735|         55.701403722917306|
|        9E|    8|            65.23912040596647|          60.75072744907857|
|        9E|    9|            49.55501321944275|          44.64089818306479|
|        9E|   10|            39.08286674132139|           34.0815987933635|

In [102]:
# Plot, for every carrier, the average departure delay per calendar month.
#
# Each trace takes its x values from the carrier's own Month column instead of
# from a fixed 12-entry list, so a carrier that has no flights in some month is
# not drawn with its points shifted onto the wrong months.
dfpandasaverageDepartureArrivalDelayPerMonthPerCarier = averageDepartureArrivalDelayPerMonthPerCarier.toPandas()
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]

# Rows without a month number cannot be placed on the axis; sorting makes the
# per-carrier month order explicit instead of relying on the SQL ORDER BY.
departureRowsPerCarrier = (
    dfpandasaverageDepartureArrivalDelayPerMonthPerCarier
    .dropna(subset=['Month'])
    .sort_values(['OP_CARRIER', 'Month'])
)

traces = []
changePerCarrierMap = dict()  # carrier code -> list of monthly averages

for carrier, carrierRows in departureRowsPerCarrier.groupby('OP_CARRIER', sort=False):
    changePerCarrierMap[carrier] = carrierRows['AverageDepartureDelayPerMonth'].tolist()
    traces.append(go.Scatter(
        y = changePerCarrierMap[carrier],
        # Month numbers are 1-based, the months list is 0-based.
        x = [months[int(monthNumber) - 1] for monthNumber in carrierRows['Month']],
        mode = 'lines+markers',
        name = carrier))

# categoryorder/categoryarray pin the axis to calendar order even when the
# first trace does not contain every month.
layout = dict(title = 'Evolution of Average Departure Delay For Each Carrier Month-Over-Month Over All Years',
              xaxis= dict(title= 'Month',zeroline= False,tickformat='d',
                          categoryorder= 'array',categoryarray= months),
              yaxis= dict(title= 'Flight Delay (Minutes)',tickformat='d'),
              height= 650,
              width= 900
             )

fig = dict(data = traces, layout = layout)

pyo.iplot(fig)

In [103]:
# Plot, for every carrier, the average arrival delay per calendar month.
#
# Each trace takes its x values from the carrier's own Month column instead of
# from a fixed 12-entry list, so a carrier that has no flights in some month is
# not drawn with its points shifted onto the wrong months.
dfpandasaverageDepartureArrivalDelayPerMonthPerCarier = averageDepartureArrivalDelayPerMonthPerCarier.toPandas()
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]

# Rows without a month number cannot be placed on the axis; sorting makes the
# per-carrier month order explicit instead of relying on the SQL ORDER BY.
arrivalRowsPerCarrier = (
    dfpandasaverageDepartureArrivalDelayPerMonthPerCarier
    .dropna(subset=['Month'])
    .sort_values(['OP_CARRIER', 'Month'])
)

traces = []
changePerCarrierMap = dict()  # carrier code -> list of monthly averages

for carrier, carrierRows in arrivalRowsPerCarrier.groupby('OP_CARRIER', sort=False):
    changePerCarrierMap[carrier] = carrierRows['AverageArrivalDelayPerMonth'].tolist()
    traces.append(go.Scatter(
        y = changePerCarrierMap[carrier],
        # Month numbers are 1-based, the months list is 0-based.
        x = [months[int(monthNumber) - 1] for monthNumber in carrierRows['Month']],
        mode = 'lines+markers',
        name = carrier))

# categoryorder/categoryarray pin the axis to calendar order even when the
# first trace does not contain every month.
layout = dict(title = 'Evolution of Average Arrival Delay For Each Carrier Month-over-Month Over All Years',
              xaxis= dict(title= 'Month',zeroline= False,tickformat='d',
                          categoryorder= 'array',categoryarray= months),
              yaxis= dict(title= 'Flight Delay (Minutes)',tickformat='d'),
              height= 650,
              width= 900
             )

fig = dict(data = traces, layout = layout)

pyo.iplot(fig)

In [84]:
# Percentage of flights per month number that left / arrived EARLY, i.e. with a
# negative DEP_DELAY / ARR_DELAY. Despite the variable name, this does not
# count delays. The denominator is COUNT(*), so every row of the month is
# included, also rows whose delay column is NULL.
# The result is ordered by month number (1..12).
numberOfDelaysPerMonth = spark.sql("SELECT EXTRACT(month from FL_DATE) AS Month, \
SUM(CASE WHEN DEP_DELAY<0 THEN 1 ELSE 0 END) / COUNT(*)*100 AS DepartureImmediaciesCountPerMonth, \
SUM(CASE WHEN ARR_DELAY<0 THEN 1 ELSE 0 END) / COUNT(*)*100 AS ArrivalImmediaciesCountPerMonth \
FROM airlinedelay GROUP BY Month ORDER BY Month")
numberOfDelaysPerMonth.show()

+-----+---------------------------------+-------------------------------+
|Month|DepartureImmediaciesCountPerMonth|ArrivalImmediaciesCountPerMonth|
+-----+---------------------------------+-------------------------------+
|    1|               58.266653569464964|              61.64927876711674|
|    2|                61.87064481303366|              64.71877013172838|
|    3|                 59.2748903909968|               61.7452234782928|
|    4|               61.248947414982254|              63.16224389828342|
|    5|               59.112596927393724|              61.89778623204434|
|    6|               54.689272282251814|              57.94759765586134|
|    7|               54.390322218506945|              57.75193049843496|
|    8|                 56.1293787432619|              58.35270168028348|
|    9|                  65.109040293353|              66.62749424085305|
|   10|                63.72628024818236|              65.49667075543633|
|   11|                63.164011002643

In [85]:
# Bar chart: percentage of early departures (negative DEP_DELAY) per month.
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]

# Convert once; every toPandas() call re-runs the Spark query.
# Rows without a month number cannot be placed on the axis.
numberOfDelaysPerMonthPandas = numberOfDelaysPerMonth.toPandas().dropna(subset=['Month'])

numberOfDelaysPerMonthBar = go.Bar(
    # Label each bar from its own month number (1-based) instead of assuming the
    # result always has exactly twelve rows in calendar order.
    x = [months[int(monthNumber) - 1] for monthNumber in numberOfDelaysPerMonthPandas['Month']],
    y = numberOfDelaysPerMonthPandas['DepartureImmediaciesCountPerMonth'],
    # The metric is the share of early departures, not of delayed ones.
    name = 'Departure Haste Percentage',
    marker = dict(color='orange'),
    text = numberOfDelaysPerMonthPandas['DepartureImmediaciesCountPerMonth'].round())

data = [numberOfDelaysPerMonthBar]

layout = dict(title = 'Departure Haste Percentage Relative to Total Departures Month-Over-Month',
              xaxis= dict(title= 'Month',ticklen= 5,zeroline= False,tickformat='d'),
              yaxis= dict(title= 'Percentage of Flights'),
              height= 450,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)

In [86]:
# Bar chart: percentage of early arrivals (negative ARR_DELAY) per month.
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]

# Convert once; every toPandas() call re-runs the Spark query.
# Rows without a month number cannot be placed on the axis.
numberOfDelaysPerMonthPandas = numberOfDelaysPerMonth.toPandas().dropna(subset=['Month'])

numberOfDelaysPerMonthBar = go.Bar(
    # Label each bar from its own month number (1-based) instead of assuming the
    # result always has exactly twelve rows in calendar order.
    x = [months[int(monthNumber) - 1] for monthNumber in numberOfDelaysPerMonthPandas['Month']],
    y = numberOfDelaysPerMonthPandas['ArrivalImmediaciesCountPerMonth'],
    # The metric is the share of early arrivals; the old label said
    # "Departure Delay Percentage".
    name = 'Arrival Haste Percentage',
    marker = dict(color='brown'),
    text = numberOfDelaysPerMonthPandas['ArrivalImmediaciesCountPerMonth'].round())

data = [numberOfDelaysPerMonthBar]

layout = dict(title = 'Arrival Haste Percentage Relative to Total Arrivals Month-Over-Month',
              xaxis= dict(title= 'Month',ticklen= 5,zeroline= False,tickformat='d'),
              yaxis= dict(title= 'Percentage of Flights'),
              height= 450,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)

AVERAGE DEPARTURE DELAY PER CARRIER FOR EACH YEAR

In [87]:
# Average departure delay per (carrier, year). IF(DEP_DELAY >= 0, ...) drops
# negative (early) values before averaging. Ordered by carrier, then year.
# A carrier only gets a row for the years in which it has data.
averageDelayPerCarrier = spark.sql("SELECT OP_CARRIER, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerCarrier, EXTRACT(year from FL_DATE) AS Year FROM airlinedelay GROUP BY OP_CARRIER, YEAR ORDER BY OP_CARRIER, YEAR" )

averageDelayPerCarrier.show()

+----------+----------------------+----+
|OP_CARRIER|AverageDelayPerCarrier|Year|
+----------+----------------------+----+
|        9E|     52.29959991640044|2018|
|        AA|    32.083106782345574|2016|
|        AA|    31.191470074234218|2017|
|        AA|    33.104038099407575|2018|
|        AS|    22.391819772528432|2016|
|        AS|    25.650165550996906|2017|
|        AS|    25.775519848771268|2018|
|        B6|     35.79490556718489|2016|
|        B6|     45.86273824773883|2017|
|        B6|     43.72999099548286|2018|
|        DL|    27.463280200008672|2016|
|        DL|    29.959391775236142|2017|
|        DL|     27.67449819095246|2018|
|        EV|    45.038134348244974|2016|
|        EV|     54.49749016415683|2017|
|        EV|     56.60643408878703|2018|
|        F9|     43.09401987941991|2016|
|        F9|     38.60866406760782|2017|
|        F9|     47.33786048460303|2018|
|        G4|     38.84540625521069|2018|
+----------+----------------------+----+
only showing top

In [88]:
# Line chart: average departure delay per carrier, one trace per carrier,
# plotted against the years in which that carrier actually has data.
#
# Convert to pandas exactly once: every toPandas() call re-runs the Spark
# query, and the previous version called it several times per result row.
# Rows without a year cannot be placed on the axis.
averageDelayPerCarrierPandas = (
    averageDelayPerCarrier.toPandas()
    .dropna(subset=['Year'])
    .sort_values(['OP_CARRIER', 'Year'])
)

traces = []
changePerCarrierMap = dict()  # carrier code -> list of yearly averages
years = sorted(int(year) for year in averageDelayPerCarrierPandas['Year'].unique())

for carrier, carrierRows in averageDelayPerCarrierPandas.groupby('OP_CARRIER', sort=False):
    changePerCarrierMap[carrier] = carrierRows['AverageDelayPerCarrier'].tolist()
    traces.append(go.Scatter(
        y = changePerCarrierMap[carrier],
        # Use the carrier's own years. Pairing the values with the global year
        # list drew carriers that only have data for later years at the
        # earliest year on the axis.
        x = carrierRows['Year'].tolist(),
        mode = 'lines+markers',
        name = carrier))

# tickvals restricts the axis ticks to the years that occur in the data.
layout = dict(title = 'Evolution Of Average Departure Delay For Each Carrier Year-over-Year',
              xaxis= dict(title= 'Year',zeroline= False,tickformat='d',tickvals= years),
              yaxis= dict(title= 'Flight Delay (Minutes)',tickformat='d'),
              height= 650,
              width= 900
             )

fig = dict(data = traces, layout = layout)

pyo.iplot(fig)

AVERAGE DEPARTURE/ARRIVAL DELAY PER YEAR

In [89]:
# Average departure and arrival delay per year. IF(x >= 0, x, NULL) drops
# negative (early) values before averaging. Ordered by year, ascending.
averageDepartureArrivalDelayPerYear = spark.sql("SELECT AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDepartureDelay, \
AVG(IF(ARR_DELAY >= 0 , ARR_DELAY, NULL)) AS AverageArrivalDelay, \
EXTRACT(year from FL_DATE) AS Year FROM airlinedelay \
GROUP BY Year ORDER BY Year ASC")

averageDepartureArrivalDelayPerYear.show()

+---------------------+-------------------+----+
|AverageDepartureDelay|AverageArrivalDelay|Year|
+---------------------+-------------------+----+
|   29.574317477001113|  32.79529582415297|2016|
|   32.134094929311296| 34.828257366891144|2017|
|    33.44112500620334|  35.75265236979934|2018|
+---------------------+-------------------+----+



In [90]:
# Bar chart: average departure delay (minutes) per year.
# Convert once; every toPandas() call re-runs the Spark query.
averageDepartureArrivalDelayPerYearPandas = averageDepartureArrivalDelayPerYear.toPandas()

averageDepartureDelayPerYearBar = go.Bar(
    x = averageDepartureArrivalDelayPerYearPandas['Year'],
    y = averageDepartureArrivalDelayPerYearPandas['AverageDepartureDelay'],
    # Values are average minutes, not a percentage.
    name = 'Average Departure Delay',
    marker = dict(color='green'),
    text = averageDepartureArrivalDelayPerYearPandas['AverageDepartureDelay'].round())

data = [averageDepartureDelayPerYearBar]

# The x axis carries years; it was previously titled 'Carrier'.
layout = dict(title = 'Evolution Of Average Departure Delay Year-over-Year',
              xaxis= dict(title= 'Year',ticklen= 5,zeroline= False,tickformat='d'),
              yaxis= dict(title= 'Average Departure Delay(Minutes)'),
              height= 450,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)

In [91]:
# Bar chart: average ARRIVAL delay (minutes) per year.
#
# This cell was copied from the departure chart: the bars still used
# AverageDepartureDelay while the hover text already showed
# AverageArrivalDelay, and the title and axis labels still said "Departure".
# Bars, text and labels now all describe the arrival delay.
# Convert once; every toPandas() call re-runs the Spark query.
averageDepartureArrivalDelayPerYearPandas = averageDepartureArrivalDelayPerYear.toPandas()

# The variable name is kept from the original cell.
averageDepartureDelayPerYearBar = go.Bar(
    x = averageDepartureArrivalDelayPerYearPandas['Year'],
    y = averageDepartureArrivalDelayPerYearPandas['AverageArrivalDelay'],
    name = 'Average Arrival Delay',
    marker = dict(color='purple'),
    text = averageDepartureArrivalDelayPerYearPandas['AverageArrivalDelay'].round())

data = [averageDepartureDelayPerYearBar]

# The x axis carries years; it was previously titled 'Carrier'.
layout = dict(title = 'Evolution Of Average Arrival Delay Year-over-Year',
              xaxis= dict(title= 'Year',ticklen= 5,zeroline= False,tickformat='d'),
              yaxis= dict(title= 'Average Arrival Delay(Minutes)'),
              height= 450,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)

AVERAGE DEPARTURE DELAY PER DEPARTURE LOCATION IN ALL YEARS

In [92]:
# Average departure delay per origin airport, pooled over all years and ordered
# from the largest average to the smallest. IF(DEP_DELAY >= 0, ...) drops
# negative (early) values before averaging.
averageDelayPerDepartureLocationInAllYears = spark.sql("SELECT ORIGIN, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerOrigin FROM airlinedelay  \
GROUP BY ORIGIN ORDER BY AverageDelayPerOrigin DESC")

# Print the 50 origins with the largest average delay.
averageDelayPerDepartureLocationInAllYears.show(50);

+------+---------------------+
|ORIGIN|AverageDelayPerOrigin|
+------+---------------------+
|   ENV|                157.0|
|   PPG|              121.875|
|   SLN|    88.72018348623853|
|   LWB|    87.99009900990099|
|   SHD|    82.14367816091954|
|   CMX|    75.48494983277592|
|   MQT|    74.36679536679537|
|   LBL|    74.19811320754717|
|   MEI|     73.9291553133515|
|   PUB|    72.18181818181819|
|   DVL|    71.73023255813953|
|   RDD|    68.96987253765933|
|   EGE|     68.9584552369807|
|   BGM|     68.3667711598746|
|   MBS|    68.17650834403081|
|   CKB|    67.97377049180328|
|   LCH|    67.80292942743009|
|   RHI|    67.14951456310679|
|   OTH|    66.96222222222222|
|   PIB|    66.46075085324232|
|   CGI|    66.34939759036145|
|   ITH|    66.28314917127072|
|   SAF|     65.7936210131332|
|   ISN|    65.73929471032746|
|   JMS|    64.92836257309942|
|   LRD|    64.62005856515373|
|   SCE|    64.50854700854701|
|   UIN|    64.31189710610933|
|   CLL|    64.19959473150962|
|   MHK|

In [93]:
# Disabled cell: pie chart of AverageDelayPerOrigin per ORIGIN. Nothing below is executed.
# averageDelayPerDepartureLocationInAllYearsGraph = go.Pie(
#     values = averageDelayPerDepartureLocationInAllYears.toPandas()['AverageDelayPerOrigin'],
#     text = averageDelayPerDepartureLocationInAllYears.toPandas()['ORIGIN'])


# data = [averageDelayPerDepartureLocationInAllYearsGraph]

# layout = dict(title = 'AVERAGE DEPARTURE DELAY PER DEPARTURE LOCATION IN ALL YEARS',
#               height= 600,
#               width= 900
#              )
# fig = dict(data = data, layout = layout)

# pyo.iplot(fig)

AVERAGE DEPARTURE DELAY PER DEPARTURE LOCATION EACH YEAR

In [94]:
# Average departure delay per (origin airport, year). IF(DEP_DELAY >= 0, ...)
# drops negative (early) values before averaging.
# Rows are ordered by the average delay, descending - NOT by origin or year, so
# the rows of one origin are neither adjacent nor in chronological order.
averageDelayPerDepartureLocationEachYear = spark.sql("SELECT ORIGIN, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelayPerOrigin,\
 EXTRACT(year from FL_DATE) AS Year FROM airlinedelay GROUP BY ORIGIN, Year \
 ORDER BY AverageDelayPerOrigin DESC")

# Print the 50 (origin, year) pairs with the largest average delay.
averageDelayPerDepartureLocationEachYear.show(50)

+------+---------------------+----+
|ORIGIN|AverageDelayPerOrigin|Year|
+------+---------------------+----+
|   PPG|   201.55555555555554|2016|
|   ENV|                157.0|2016|
|   DVL|    122.1344537815126|2018|
|   PPG|    99.58064516129032|2018|
|   GCK|    99.08108108108108|2016|
|   JMS|    97.18536585365854|2018|
|   CMX|    96.56744186046511|2018|
|   UST|    95.85714285714286|2017|
|   GCK|    92.89655172413794|2017|
|   RHI|    92.57462686567165|2017|
|   SLN|    88.72018348623853|2018|
|   LWB|    87.99009900990099|2018|
|   LCH|    86.76878612716763|2017|
|   MQT|    84.84939759036145|2016|
|   SAF|    82.25088339222614|2018|
|   SHD|    82.14367816091954|2018|
|   MEI|    81.23869346733669|2017|
|   RHI|    80.50381679389314|2018|
|   FSM|     80.1043956043956|2017|
|   HYS|    80.08812260536398|2018|
|   ITH|    79.99586776859505|2018|
|   MHK|    79.28909952606635|2017|
|   BGR|    79.12786885245902|2017|
|   EGE|      78.440329218107|2018|
|   MBS|    78.3832335329341

In [95]:
# Line chart: average departure delay per origin airport, one trace per origin,
# plotted against the years in which that origin actually has data.
#
# Convert to pandas exactly once: every toPandas() call re-runs the Spark
# query, and the previous version called it several times per result row.
# The query result is ordered by delay (descending), so the rows are re-sorted
# by origin and year here; otherwise each origin's values would be drawn in
# delay order rather than chronological order.
# Rows without a year cannot be placed on the axis.
averageDelayPerDepartureLocationEachYearPandas = (
    averageDelayPerDepartureLocationEachYear.toPandas()
    .dropna(subset=['Year'])
    .sort_values(['ORIGIN', 'Year'])
)

traces = []
changePerCarrierMap = dict()  # origin code -> list of yearly averages (name kept from the original cell)
years = sorted(int(year) for year in averageDelayPerDepartureLocationEachYearPandas['Year'].unique())

for origin, originRows in averageDelayPerDepartureLocationEachYearPandas.groupby('ORIGIN', sort=False):
    changePerCarrierMap[origin] = originRows['AverageDelayPerOrigin'].tolist()
    traces.append(go.Scatter(
        y = changePerCarrierMap[origin],
        # Use the origin's own years. Pairing the values with the global year
        # list misplaced every origin that lacks data for some year.
        x = originRows['Year'].tolist(),
        mode = 'lines+markers',
        name = origin))

# tickvals restricts the axis ticks to the years that occur in the data.
layout = dict(title = 'Evolution of AVERAGE DELAY PER DEPARTURE LOCATION IN EACH YEAR',
              xaxis= dict(title= 'Year',zeroline= False,tickformat='d',tickvals= years),
              yaxis= dict(title= 'Flight Delay (Minutes)',tickformat='d'),
              height= 650,
              width= 900
             )

fig = dict(data = traces, layout = layout)

pyo.iplot(fig)

DEPARTURE LOCATION WITH THE LARGEST AVERAGE DELAY

In [96]:
# (Origin, year) pairs ranked by average departure delay, largest first.
# IF(DEP_DELAY >= 0, ...) drops negative (early) values before averaging.
# This runs the same aggregation as averageDelayPerDepartureLocationEachYear;
# only the output column name (AverageDelay) differs.
spark.sql("SELECT ORIGIN, AVG(IF(DEP_DELAY >= 0 , DEP_DELAY, NULL)) AS AverageDelay,EXTRACT(year from FL_DATE) AS Year \
FROM airlinedelay GROUP BY ORIGIN, Year ORDER BY AverageDelay DESC").show()

+------+------------------+----+
|ORIGIN|      AverageDelay|Year|
+------+------------------+----+
|   PPG|201.55555555555554|2016|
|   ENV|             157.0|2016|
|   DVL| 122.1344537815126|2018|
|   PPG| 99.58064516129032|2018|
|   GCK| 99.08108108108108|2016|
|   JMS| 97.18536585365854|2018|
|   CMX| 96.56744186046511|2018|
|   UST| 95.85714285714286|2017|
|   GCK| 92.89655172413794|2017|
|   RHI| 92.57462686567165|2017|
|   SLN| 88.72018348623853|2018|
|   LWB| 87.99009900990099|2018|
|   LCH| 86.76878612716763|2017|
|   MQT| 84.84939759036145|2016|
|   SAF| 82.25088339222614|2018|
|   SHD| 82.14367816091954|2018|
|   MEI| 81.23869346733669|2017|
|   RHI| 80.50381679389314|2018|
|   FSM|  80.1043956043956|2017|
|   HYS| 80.08812260536398|2018|
+------+------------------+----+
only showing top 20 rows



TOTAL NUMBER OF DEPARTURE/ARRIVAL DELAYS PER YEAR

In [97]:
# Number of flights per year whose departure / arrival delay is >= 0, ordered
# by the departure count, descending.
# The output columns are named AverageDepartureDelay / AverageArrivalDelay, but
# they hold COUNTS (SUM of 1/0 flags), not averages. The pie-chart cells read
# these column names, so rename them only together with those cells.
# NOTE(review): ">= 0" also counts flights with a delay of exactly 0 minutes -
# confirm whether those should count as delayed.
totalNumberOfDepartureArrivalDelaysPerYear = spark.sql("SELECT SUM(IF(DEP_DELAY >= 0 , 1, 0)) AS AverageDepartureDelay, SUM(IF(ARR_DELAY >= 0 , 1, 0)) \
AS AverageArrivalDelay, \
EXTRACT(year from FL_DATE) AS Year \
FROM airlinedelay GROUP BY Year ORDER BY AverageDepartureDelay DESC")

totalNumberOfDepartureArrivalDelaysPerYear.show()

+---------------------+-------------------+----+
|AverageDepartureDelay|AverageArrivalDelay|Year|
+---------------------+-------------------+----+
|              2800909|            2655267|2018|
|              2233030|            2056193|2017|
|              2230108|            2000648|2016|
+---------------------+-------------------+----+



In [98]:
# Pie chart: share of each year in the total number of departure delays.
# Convert once; every toPandas() call re-runs the Spark query.
totalNumberOfDepartureArrivalDelaysPerYearPandas = totalNumberOfDepartureArrivalDelaysPerYear.toPandas()

totalNumberOfDeparturelDelaysPerYearTrace = go.Pie(
    # Pass the years as sector labels so the legend shows them; with only
    # `text` set, plotly labels the sectors with their positions instead.
    labels = totalNumberOfDepartureArrivalDelaysPerYearPandas['Year'],
    # The column holds the per-year departure delay count despite its name.
    values = totalNumberOfDepartureArrivalDelaysPerYearPandas['AverageDepartureDelay'],
    # Keep year and percentage written on each slice, as before.
    textinfo = 'label+percent')


data = [totalNumberOfDeparturelDelaysPerYearTrace]

layout = dict(title = 'Total Number of Departure Delays Year-Over-Year',
              height= 600,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)

In [99]:
# Pie chart: share of each year in the total number of arrival delays.
# Convert once; every toPandas() call re-runs the Spark query.
totalNumberOfDepartureArrivalDelaysPerYearPandas = totalNumberOfDepartureArrivalDelaysPerYear.toPandas()

totalNumberOfArrivalDelaysPerYearTrace = go.Pie(
    # Pass the years as sector labels so the legend shows them; with only
    # `text` set, plotly labels the sectors with their positions instead.
    labels = totalNumberOfDepartureArrivalDelaysPerYearPandas['Year'],
    # The column holds the per-year arrival delay count despite its name.
    values = totalNumberOfDepartureArrivalDelaysPerYearPandas['AverageArrivalDelay'],
    # Keep year and percentage written on each slice, as before.
    textinfo = 'label+percent')


data = [totalNumberOfArrivalDelaysPerYearTrace]

layout = dict(title = 'Total Number of Arrival Delays Year-Over-Year',
              height= 600,
              width= 900
             )
fig = dict(data = data, layout = layout)

pyo.iplot(fig)