In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
import time

# DataFrame API entry point: build the session, or reuse one that already exists.
# NOTE(review): the master is "local[1]" while dynamic allocation, the shuffle
# service and 4 executor cores are requested -- confirm these settings have the
# intended effect for this master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("team22")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# RDD API entry point: the SparkContext that backs the session above.
sc = spark.sparkContext

In [2]:
# Read every CSV file under the team directory on HDFS, taking column names
# from the header row, then discard the columns listed in drop().
flight_data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("hdfs://192.168.1.153:9000/team22/data/*.csv")
    .drop(
        "CancellationCode",
        "Distance",
        "AirTime",
        "CRSElapsedTime",
        "Diverted",
        "SecurityDelay",
        "LateAircraftDelay",
        "UniqueCarrier",
        "TaxiIn",
        "TaxiOut",
        "FlightNum",
        "TailNum",
        "ActualElapsedTime",
        "CarrierDelay",
        "WeatherDelay",
        "NASDelay",
    )
)

# Grandma Question

Filter flights by origin (LAX) and destination (HNL), then average the total delay by day of week and by day of month.

In [7]:
def mean_total_delay_by(flights, key):
    """Average total delay of ``flights`` for each value of the column ``key``.

    The total delay of a flight is ``ArrDelay + DepDelay``, each cast to float
    before summing. ``key`` is cast to an integer and keeps its name.

    Args:
        flights: DataFrame with the columns ``key``, ``ArrDelay`` and ``DepDelay``.
        key: Name of the column to group by (e.g. ``"DayOfWeek"``).

    Returns:
        DataFrame with the columns ``key`` and ``avg(TotalDelay)``, unsorted.
    """
    total_delay = F.col("ArrDelay").cast("float") + F.col("DepDelay").cast("float")
    return (
        flights
        .select(
            F.col(key).cast(IntegerType()).alias(key),
            total_delay.alias("TotalDelay"),
        )
        # No sort before grouping: the aggregation does not preserve row order,
        # so ordering is applied only to the aggregated result by the caller.
        .groupBy(key)
        .agg(F.mean("TotalDelay"))
    )


start_time = time.time()

########################

# Keep only flights from LAX to HNL.
airports_Q1 = flight_data.filter(flight_data["Origin"]=="LAX").filter(flight_data["Dest"]=="HNL")

# Average total delay per day of the week, smallest average first.
# NOTE(review): TotalDelay adds arrival and departure delay together -- confirm
# this is the intended metric.
week_day_delay = mean_total_delay_by(airports_Q1, "DayOfWeek")

week_day_delay.orderBy("avg(TotalDelay)").show()

# Average total delay per day of the month (DayOfMonth, not calendar month);
# only the five days with the smallest average are shown.
month_delay = mean_total_delay_by(airports_Q1, "DayOfMonth")

month_delay.orderBy("avg(TotalDelay)").show(5)

########################

end_time = time.time()
print("Time taken:", end_time-start_time)

+---------+------------------+
|DayOfWeek|   avg(TotalDelay)|
+---------+------------------+
|        2| 17.31062849621586|
|        1|18.857736510616192|
|        3| 20.42714399152991|
|        7|20.805879237288135|
|        6|21.071665846456693|
|        5| 23.74595660749507|
|        4| 24.01305174788483|
+---------+------------------+

+----------+------------------+
|DayOfMonth|   avg(TotalDelay)|
+----------+------------------+
|        24| 16.23353124116483|
|         7| 17.85451457684687|
|         8|17.889997150185238|
|        23|18.498298355076574|
|        12|18.641434262948206|
+----------+------------------+
only showing top 5 rows

Time taken: 2054.0958046913147


# Least delays/flights

In [6]:
start_time = time.time()

########################

# Mean departure delay for each origin airport.
airports_Q2 = (
    flight_data
    .select("Origin", "DepDelay")
    .groupBy("Origin")
    .agg(F.mean("DepDelay"))
)

# Print the five origins with the lowest mean departure delay.
airports_Q2.orderBy("avg(DepDelay)").show(5)

########################

end_time = time.time()
elapsed_seconds = end_time - start_time
print("Time taken:", elapsed_seconds)

+------+------------------+
|Origin|     avg(DepDelay)|
+------+------------------+
|   MIB|-7.136363636363637|
|   GLH|              -5.5|
|   LNY|-4.503623188405797|
|   MKK|-2.966292134831461|
|   ROP|-2.062944718117132|
+------+------------------+
only showing top 5 rows

Time taken: 675.8642475605011


Time taken: 2.9802322387695312e-05
