In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
import time

# Benchmark: for each letter X in letter_list, load every CSV matching the glob
# "[a-X].csv" (so the input grows by one file per iteration), run two analyses
# and record how long each one takes.
letter_list = list("abcdefghijklmnopqrstuv")

# Directory holding the per-letter CSV files.
DATA_DIR = "hdfs://192.168.1.153:9000/team22/data"

# Columns that neither analysis reads; dropped right after loading.
DROPPED_COLUMNS = [
    "CancellationCode", "Distance", "AirTime", "CRSElapsedTime", "Diverted",
    "SecurityDelay", "LateAircraftDelay", "UniqueCarrier", "TaxiIn", "TaxiOut",
    "FlightNum", "TailNum", "ActualElapsedTime", "CarrierDelay",
    "WeatherDelay", "NASDelay",
]

# Elapsed seconds per iteration, in the same order as letter_list.
part_1_times = []
part_2_times = []


def _mean_total_delay_by(flights, key):
    """Average (ArrDelay + DepDelay) per distinct value of ``key``.

    Args:
        flights: DataFrame with string columns ``ArrDelay``, ``DepDelay`` and
            ``key`` (the CSV is loaded without schema inference).
        key: Name of the grouping column; it is cast to integer.

    Returns:
        DataFrame with columns ``key`` and ``avg(TotalDelay)``.
    """
    total_delay = F.col("ArrDelay").cast("float") + F.col("DepDelay").cast("float")
    return (
        flights
        .select(F.col(key).cast(IntegerType()).alias(key),
                total_delay.alias("TotalDelay"))
        .groupBy(key)
        .agg(F.mean("TotalDelay"))
    )


# New API. The session is created once: getOrCreate() would hand back this same
# session on every later call, so building it inside the loop was redundant.
# NOTE(review): with a local[3] master the dynamic-allocation / shuffle-service
# settings and executor.cores=8 presumably have no effect -- confirm whether a
# cluster master was intended.
spark = SparkSession\
        .builder\
        .master("local[3]")\
        .appName("team22")\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","60s")\
        .config("spark.executor.cores",8)\
        .getOrCreate()

# Old API (RDD)
sc = spark.sparkContext

for item in letter_list:
    flight_data = spark\
        .read\
        .format("csv")\
        .option("header","true")\
        .load(DATA_DIR + "/[a-" + item + "].csv")

    flight_data = flight_data.drop(*DROPPED_COLUMNS)

    # NOTE(review): flight_data is never cached, so each show() below presumably
    # re-scans the CSV input; that cost is part of the recorded timings.

    ### Part 1 ###

    # perf_counter() is monotonic, so the interval is immune to clock adjustments.
    start_time = time.perf_counter()

    # Flights from LAX to HNL only.
    airports_Q1 = flight_data.filter((F.col("Origin") == "LAX") & (F.col("Dest") == "HNL"))

    # Average total delay per day of the week, smallest first.
    # (No sort before the aggregation: groupBy does not depend on row order.)
    week_day_delay = _mean_total_delay_by(airports_Q1, "DayOfWeek")
    week_day_delay.orderBy("avg(TotalDelay)").show()

    # Average total delay per day of the month; show the 5 smallest.
    month_delay = _mean_total_delay_by(airports_Q1, "DayOfMonth")
    month_delay.orderBy("avg(TotalDelay)").show(5)

    end_time = time.perf_counter()
    part_1_times.append(end_time-start_time)

    ### Part 2 ###

    start_time = time.perf_counter()

    # Average departure delay per origin airport; show the 5 smallest.
    airports_Q2 = flight_data.select("Origin", "DepDelay")
    airports_Q2 = airports_Q2.groupBy("Origin").agg(F.mean("DepDelay"))
    airports_Q2.orderBy("avg(DepDelay)").show(5)

    end_time = time.perf_counter()
    part_2_times.append(end_time-start_time)

+---------+------------------+
|DayOfWeek|   avg(TotalDelay)|
+---------+------------------+
|        4|31.642201834862384|
|        6| 33.95049504950495|
|        1| 35.73267326732673|
|        2| 35.88421052631579|
|        7| 36.91794871794872|
|        3| 43.15873015873016|
|        5| 49.95169082125604|
+---------+------------------+

+----------+------------------+
|DayOfMonth|   avg(TotalDelay)|
+----------+------------------+
|         8| 8.533333333333333|
|        27| 9.938775510204081|
|         3|10.204081632653061|
|        26|10.434782608695652|
|        10|16.428571428571427|
+----------+------------------+
only showing top 5 rows

+------+-------------------+
|Origin|      avg(DepDelay)|
+------+-------------------+
|   ILG|-2.4074074074074074|
|   FOE|-1.3157894736842106|
|   LIH|               0.25|
|   EAU| 0.8081761006289309|
|   PIE| 0.9166666666666666|
+------+-------------------+
only showing top 5 rows

+---------+------------------+
|DayOfWeek|   avg(TotalDelay

+---------+------------------+
|DayOfWeek|   avg(TotalDelay)|
+---------+------------------+
|        1|17.910399776348896|
|        7|19.805551581055944|
|        2|19.959966168593176|
|        3|22.437748156551333|
|        6|23.611162544636954|
|        5| 24.34025130594381|
|        4| 24.95705782312925|
+---------+------------------+

+----------+------------------+
|DayOfMonth|   avg(TotalDelay)|
+----------+------------------+
|         8| 17.36543209876543|
|        24| 18.46565349544073|
|        10|  18.6139279169212|
|        27| 19.17153284671533|
|        25|19.327063106796118|
+----------+------------------+
only showing top 5 rows

+------+--------------------+
|Origin|       avg(DepDelay)|
+------+--------------------+
|   ILG| -2.4074074074074074|
|   ROP|  -2.062944718117132|
|   EAU| -1.9961089494163424|
|   FOE| -1.3157894736842106|
|   LIH|-0.13544124587069373|
+------+--------------------+
only showing top 5 rows

+---------+------------------+
|DayOfWeek|   avg(T

+------+-------------------+
|Origin|      avg(DepDelay)|
+------+-------------------+
|   MIB| -7.136363636363637|
|   LNY| -4.503623188405797|
|   MKK| -2.966292134831461|
|   GGG|-2.7617543207582234|
|   ILG|-2.4074074074074074|
+------+-------------------+
only showing top 5 rows

+---------+------------------+
|DayOfWeek|   avg(TotalDelay)|
+---------+------------------+
|        2|17.723862238622385|
|        1|18.470440155618277|
|        7|20.333281613653995|
|        3|  20.7716997145282|
|        6| 21.27141932235091|
|        5|24.026133743274404|
|        4|24.346032235675175|
+---------+------------------+

+----------+------------------+
|DayOfMonth|   avg(TotalDelay)|
+----------+------------------+
|        24|15.950678583250578|
|        23|17.142762284196547|
|         8|18.081216577540108|
|        27|18.386574074074073|
|        10|18.477626781571097|
+----------+------------------+
only showing top 5 rows

+------+-------------------+
|Origin|      avg(DepDelay)|
+

In [10]:
# Print the recorded timings: first the Part 1 list, then the Part 2 list.
for timings in (part_1_times, part_2_times):
    print(timings)

[23.193647146224976, 64.46241092681885, 107.31973528862, 158.79668354988098, 196.4667830467224, 240.09272360801697, 286.57261967658997, 325.163724899292, 372.9406177997589, 421.0189034938812, 467.2024419307709, 500.9623785018921, 552.6694476604462, 594.6372780799866, 656.2708122730255, 653.6656014919281, 694.7106463909149, 748.4223835468292, 804.3646903038025, 862.8263945579529, 912.1536905765533, 963.7272493839264]
[7.248520135879517, 19.770646333694458, 34.02608680725098, 48.076728105545044, 60.2615442276001, 73.52413535118103, 86.83222794532776, 100.09618043899536, 115.24827527999878, 129.7475244998932, 139.95707774162292, 152.66200160980225, 167.7023584842682, 180.7385196685791, 203.28664875030518, 198.19814205169678, 215.38269710540771, 229.66506695747375, 249.9467282295227, 265.0937829017639, 283.1196253299713, 297.19683623313904]


In [11]:
# Stop the SparkSession now that the benchmark has finished.
spark.stop()