In [15]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("FlightAnalysisApp") \
    .getOrCreate()

In [16]:
DATA = "../ds/flight-2008.csv.bz2"
TARGET = "ArrDelay"

In [17]:
flights_df = spark.read.csv(DATA, header=True, inferSchema=True)

                                                                                

In [18]:
flights_df.columns

['Year',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'UniqueCarrier',
 'FlightNum',
 'TailNum',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiIn',
 'TaxiOut',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

In [19]:
flights_df.head(2)

23/12/21 10:37:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(Year=2008, Month=1, DayofMonth=3, DayOfWeek=4, DepTime='1343', CRSDepTime=1325, ArrTime='1451', CRSArrTime=1435, UniqueCarrier='WN', FlightNum=588, TailNum='N240WN', ActualElapsedTime='68', CRSElapsedTime='70', AirTime='55', ArrDelay='16', DepDelay='18', Origin='HOU', Dest='LIT', Distance=393, TaxiIn='4', TaxiOut='9', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay='16', WeatherDelay='0', NASDelay='0', SecurityDelay='0', LateAircraftDelay='0'),
 Row(Year=2008, Month=1, DayofMonth=3, DayOfWeek=4, DepTime='1125', CRSDepTime=1120, ArrTime='1247', CRSArrTime=1245, UniqueCarrier='WN', FlightNum=1343, TailNum='N523SW', ActualElapsedTime='82', CRSElapsedTime='85', AirTime='71', ArrDelay='2', DepDelay='5', Origin='HOU', Dest='MAF', Distance=441, TaxiIn='3', TaxiOut='8', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA')]

# Preprocesing

In [5]:
features_to_drop = [
    "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn",
    "Diverted", "CarrierDelay", "WeatherDelay", "NASDelay",
    "SecurityDelay", "LateAircraftDelay"
]

flights_df = flights_df.drop(*features_to_drop)

In [6]:
analysis_result = (
    flights_df
    .groupBy("Year", "Month")
    .agg({"ArrDelay": "avg", "DepDelay": "avg", "Distance": "sum"})
    .orderBy("Year", "Month")
)
analysis_result.show()


[Stage 4:====>                                                    (1 + 11) / 12]

+----+-----+------------------+------------------+-------------+
|Year|Month|     avg(DepDelay)|     avg(ArrDelay)|sum(Distance)|
+----+-----+------------------+------------------+-------------+
|2008|    1| 11.47609595943289|10.188855960349496| 4.36306761E8|
|2008|    2|13.706226305045202|13.077836997760205| 4.12502573E8|
|2008|    3| 12.49126948010275| 11.19236458018227| 4.52090528E8|
|2008|    4| 8.201132754082797| 6.807297481094145| 4.34051353E8|
+----+-----+------------------+------------------+-------------+



                                                                                

In [14]:
spark.stop()