## Project Overview

This notebook demonstrates an Apache Spark pipeline for investigating flight delay patterns in US aviation data. The workflow encompasses data preprocessing, feature engineering, exploratory analysis through grouped aggregations, and predictive modeling using logistic regression.

In [None]:
import pyspark

# Bare expression: the cell output shows the version string of the imported PySpark package.
pyspark.__version__

## Environment & Setup

In [None]:
import os
import shutil

# Create dirs for raw data and partitioned data (Parquet)
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/lake", exist_ok=True)

# Move the source CSVs from the working directory into the "raw" data dir.
# A file that is no longer in the working directory but already sits in data/raw
# is left alone, so this cell can be re-run (e.g. Restart Kernel -> Run All)
# without shutil.move failing on the missing source file.
for csv_name in ("flights.csv", "airlines.csv", "airports.csv"):
    raw_path = f"data/raw/{csv_name}"
    if os.path.exists(csv_name):
        shutil.move(csv_name, raw_path)
    elif not os.path.exists(raw_path):
        # Neither location has the file: fail here with a clear message instead of
        # deferring the error to the first Spark read.
        raise FileNotFoundError(
            f"{csv_name} was found neither in the working directory nor in data/raw"
        )

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session: local master "local[*]",
# application name "Flights-Delay", adaptive query execution enabled.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Flights-Delay")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

## Schema & Data Ingestion

Data is read using explicit **schemas** (instead of inference) to ensure robust typing and data quality.

In [None]:
from pyspark.sql import types as T

# Flights schema, given as (column name, Spark type) pairs.
# Every column is declared nullable.
flight_schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("YEAR", T.IntegerType()),
        ("MONTH", T.IntegerType()),
        ("DAY", T.IntegerType()),
        ("DAY_OF_WEEK", T.IntegerType()),
        ("AIRLINE", T.StringType()),
        ("FLIGHT_NUMBER", T.StringType()),
        ("TAIL_NUMBER", T.StringType()),
        ("ORIGIN_AIRPORT", T.StringType()),
        ("DESTINATION_AIRPORT", T.StringType()),
        ("SCHEDULED_DEPARTURE", T.IntegerType()),
        ("DEPARTURE_TIME", T.IntegerType()),
        ("DEPARTURE_DELAY", T.DoubleType()),
        ("TAXI_OUT", T.DoubleType()),
        ("WHEELS_OFF", T.IntegerType()),
        ("SCHEDULED_TIME", T.DoubleType()),
        ("ELAPSED_TIME", T.DoubleType()),
        ("AIR_TIME", T.DoubleType()),
        ("DISTANCE", T.DoubleType()),
        ("WHEELS_ON", T.IntegerType()),
        ("TAXI_IN", T.DoubleType()),
        ("SCHEDULED_ARRIVAL", T.IntegerType()),
        ("ARRIVAL_TIME", T.IntegerType()),
        ("ARRIVAL_DELAY", T.DoubleType()),
        ("DIVERTED", T.IntegerType()),
        ("CANCELLED", T.IntegerType()),
        ("CANCELLATION_REASON", T.StringType()),
        ("AIR_SYSTEM_DELAY", T.DoubleType()),
        ("SECURITY_DELAY", T.DoubleType()),
        ("AIRLINE_DELAY", T.DoubleType()),
        ("LATE_AIRCRAFT_DELAY", T.DoubleType()),
        ("WEATHER_DELAY", T.DoubleType()),
    ]
])

# Read the flights CSV with a header row, applying the schema above instead of inferring types.
fl_raw = (
    spark.read
    .schema(flight_schema)
    .option("header", True)
    .csv("data/raw/flights.csv")
)

# Preview the first 10 rows.
fl_raw.show(10)


In [None]:
# Airlines schema: two nullable string columns (carrier code and carrier name).
airline_schema = (
    T.StructType()
    .add("IATA_CODE", T.StringType(), True)
    .add("AIRLINE", T.StringType(), True)
)

# Read the airlines CSV with a header row, using the explicit schema.
airlines = (
    spark.read
    .schema(airline_schema)
    .option("header", True)
    .csv("data/raw/airlines.csv")
)

airlines.show()

In [None]:
# Airports schema, given as (column name, Spark type) pairs; all columns nullable.
airport_schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("IATA_CODE", T.StringType()),
        ("AIRPORT", T.StringType()),
        ("CITY", T.StringType()),
        ("STATE", T.StringType()),
        ("COUNTRY", T.StringType()),
        ("LATITUDE", T.DoubleType()),
        ("LONGITUDE", T.DoubleType()),
    ]
])

# Read the airports CSV with a header row, using the explicit schema.
airports = (
    spark.read
    .schema(airport_schema)
    .option("header", True)
    .csv("data/raw/airports.csv")
)

airports.show()

## Data Preparation and Parquet Export

Creates a proper date column from separate *year/month/day* fields and saves all datasets in efficient `Parquet` format. Flight data is partitioned by `YEAR` and `MONTH` for optimal query performance.


In [None]:
from pyspark.sql import functions as F

# Add FL_DATE: format YEAR/MONTH/DAY as a zero-padded "YYYY-MM-DD" string and parse it as a date.
fl = fl_raw.withColumn(
    "FL_DATE",
    F.to_date(
        F.format_string(
            "%04d-%02d-%02d",
            F.col("YEAR"),
            F.col("MONTH"),
            F.col("DAY"),
        )
    ),
)

# Write flights as Parquet partitioned by YEAR and MONTH, replacing any previous output.
(
    fl.write
    .partitionBy("YEAR", "MONTH")
    .mode("overwrite")
    .parquet("data/lake/flights_parquet")
)

# The two reference tables are written unpartitioned, also in overwrite mode.
(airlines.write
    .mode("overwrite")
    .parquet("data/lake/airlines_parquet"))
(airports.write
    .mode("overwrite")
    .parquet("data/lake/airports_parquet"))


## Read Parquet

In [None]:
# Re-open the three Parquet datasets written above (reads are lazy);
# `airlines` and `airports` are rebound to the Parquet-backed versions.
flights, airlines, airports = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "data/lake/flights_parquet",
        "data/lake/airlines_parquet",
        "data/lake/airports_parquet",
    )
)

## Transformations and UDFs

Native Spark functions are used wherever possible to derive the analysis features, because they are considerably faster than user-defined functions (UDFs). UDFs are used only where native functions are insufficient or where special business logic is required.

The following code illustrates the use of a combination of native functions for standard transformations and UDFs for more complex transformations, such as mapping cancellation codes to descriptive labels.

In [None]:
@F.udf(returnType=T.IntegerType())
def hhmm_to_hour(x):
    """
    Return the hour component of a clock time encoded as HHMM.

    Args:
        x: Clock time in HHMM form (e.g., 1430 meaning 14:30)

    Returns:
        The hour as an integer in 0-23, or None when the input is None,
        cannot be converted to an integer, is negative, or does not encode
        a valid hour/minute pair. The special value 2400 maps to 0.
        Example: 1430 -> 14, 0630 -> 6, 2400 -> 0
    """
    if x is None:
        return None

    try:
        hhmm = int(x)
    except Exception:
        # Anything that cannot be converted to an integer is treated as invalid.
        return None

    if hhmm == 2400:  # midnight written as 2400
        return 0
    if hhmm < 0:  # negative values are not valid clock times
        return None

    # Leading digits are the hour, the last two digits are the minutes.
    hours, minutes = divmod(hhmm, 100)
    if hours > 23 or minutes > 59:
        return None
    return hours

In [None]:
# Cancellation code -> descriptive label.
code_mapping = {
    "A": "Air Carrier",
    "B": "Extreme Weather",
    "C": "National Aviation System",
    "D": "Security",
}

@F.udf(T.StringType())
def cancel_code_to_label(c):
    """
    Translate a cancellation code into a descriptive label.

    Args:
        c: Cancellation code (A, B, C or D); may be None

    Returns:
        "Not Canceled" when the code is None; otherwise the label from
        `code_mapping` for the trimmed, upper-cased code, or
        "Other/Unknown" when the code is not in the mapping.
    """
    if c is not None:
        normalized_code = str(c).strip().upper()
        return code_mapping.get(normalized_code, "Other/Unknown")
    return "Not Canceled"

In [None]:
# Append four derived columns to the flights data:
#   DEP_HOUR            - hour of the scheduled departure (Python UDF)
#   IS_DELAYED_15       - 1 when ARRIVAL_DELAY >= 15, else 0 (native expression)
#   DOW                 - FL_DATE formatted with pattern "E" (native function)
#   CANCEL_REASON_LABEL - label for the cancellation code (Python UDF)
flights_transformed = flights.select(
    "*",
    hhmm_to_hour("SCHEDULED_DEPARTURE").alias("DEP_HOUR"),
    (F.col("ARRIVAL_DELAY") >= 15).cast("int").alias("IS_DELAYED_15"),
    F.date_format("FL_DATE", "E").alias("DOW"),
    cancel_code_to_label("CANCELLATION_REASON").alias("CANCEL_REASON_LABEL"),
)

# Preview the derived columns next to the columns they were computed from.
flights_transformed.select(
    "FL_DATE",
    "DOW",
    "DAY_OF_WEEK",
    "DEP_HOUR",
    "SCHEDULED_DEPARTURE",
    "IS_DELAYED_15",
    "CANCEL_REASON_LABEL",
).show(10)


## Data Enrichment with Reference Tables

Enriches the flight dataset by joining with airline and airport reference data to replace short codes with human-readable values.

**Broadcast Join Optimization:** The small reference tables (airlines, airports) are broadcast to every executor, which avoids an expensive shuffle of the large flights table.

In [None]:
# Airline lookup table: keep the code column and expose the airline name as CARRIER_NAME,
# so it does not collide with the AIRLINE code column of the flights data.
alN = airlines.select(
    "IATA_CODE",
    F.col("AIRLINE").alias("CARRIER_NAME"),
)

# Left join on the carrier code; the lookup side is marked for broadcast.
carrier_code_matches = flights_transformed["AIRLINE"] == alN["IATA_CODE"]
flights_airlines_enriched = flights_transformed.join(
    F.broadcast(alN),
    on=carrier_code_matches,
    how="left",
)

flights_airlines_enriched.select("CARRIER_NAME", "AIRLINE").show(10)


In [None]:
def _airport_lookup(prefix):
    """Project `airports` to code, name and state columns named <prefix>_CODE, <prefix>_AIRPORT_NAME and <prefix>_STATE."""
    return airports.select(
        F.col("IATA_CODE").alias(f"{prefix}_CODE"),
        F.col("AIRPORT").alias(f"{prefix}_AIRPORT_NAME"),
        F.col("STATE").alias(f"{prefix}_STATE"),
    )


# One lookup view per role an airport can play in a flight.
airports_origin = _airport_lookup("ORIGIN")
airports_dest = _airport_lookup("DEST")

# Left-join both views (each marked for broadcast) on the flight's origin / destination code.
origin_code_matches = flights_airlines_enriched["ORIGIN_AIRPORT"] == F.col("ORIGIN_CODE")
dest_code_matches = flights_airlines_enriched["DESTINATION_AIRPORT"] == F.col("DEST_CODE")

flights_enriched = (
    flights_airlines_enriched
    .join(F.broadcast(airports_origin), on=origin_code_matches, how="left")
    .join(F.broadcast(airports_dest), on=dest_code_matches, how="left")
)

flights_enriched.select("CARRIER_NAME", "ORIGIN_AIRPORT_NAME", "DEST_AIRPORT_NAME").show(10)

In [None]:
# Persist the enriched flights as Parquet, partitioned by YEAR and MONTH;
# overwrite mode replaces any output from an earlier run.
(
    flights_enriched.write
    .partitionBy("YEAR", "MONTH")
    .mode("overwrite")
    .parquet("data/lake/enriched_parquet")
)


## Exploratory Data Analysis through Filtering and Aggregation

This section demonstrates Spark's distributed aggregation capabilities through comprehensive flight delay analysis. We filter the dataset to focus on completed flights (no cancellations or diversions) and perform grouped aggregations across multiple dimensions to identify delay patterns.

In [None]:
# Read the enriched dataset back and keep only flights that were
# neither cancelled nor diverted.
df = (
    spark.read.parquet("data/lake/enriched_parquet")
    .where(F.col("CANCELLED") == 0)
    .where(F.col("DIVERTED") == 0)
)
df.show(5)

In [None]:
# Per-airline summary: flight count, share of flights delayed >= 15 min, and mean arrival delay.
# Airlines with fewer than 500 flights are excluded; highest delay share first.
by_airline = (
    df.groupBy("AIRLINE", "CARRIER_NAME")
    .agg(
        F.count("*").alias("flights"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
        F.avg("ARRIVAL_DELAY").alias("avg_arr_delay"),
    )
    .where(F.col("flights") >= 500)
    .sort(F.desc("p_delay"))
)

# Number of partitions of the input DataFrame.
print(f"Input partitions: {df.rdd.getNumPartitions()}")

by_airline.show(10, truncate=False)

In [None]:
# Print the execution plan of the airline aggregation query, using the "formatted" explain mode
by_airline.explain(mode="formatted")

In [None]:
# Per-origin-airport summary: flight count and share of flights delayed >= 15 min.
# Airports with fewer than 300 flights are excluded; highest delay share first.
by_origin = (
    df.groupBy("ORIGIN_AIRPORT", "ORIGIN_AIRPORT_NAME", "ORIGIN_STATE")
    .agg(
        F.count("*").alias("n"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
    )
    .where(F.col("n") >= 300)
    .sort(F.desc("p_delay"))
)

by_origin.show(10, truncate=False)


In [None]:
# Per-route (origin, destination) summary: flight count, delay share and mean arrival delay.
# Routes with fewer than 200 flights are excluded; highest delay share first.
by_route = (
    df.groupBy("ORIGIN_AIRPORT", "DESTINATION_AIRPORT")
    .agg(
        F.count("*").alias("n"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
        F.avg("ARRIVAL_DELAY").alias("avg_arr_delay"),
    )
    .where(F.col("n") >= 200)
    .sort(F.desc("p_delay"))
)

by_route.show(10, truncate=False)

In [None]:
# Flight count and delay share per scheduled departure hour, in ascending hour order.
by_hour = (
    df.groupBy("DEP_HOUR")
    .agg(
        F.count("*").alias("n"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
    )
    .sort("DEP_HOUR")
)

by_hour.show(24, truncate=False)


In [None]:
# Flight count and delay share per weekday.
# DOW is a text label produced by date_format(..., "E"), so ordering by it sorts the
# weekday names alphabetically (e.g. Fri before Mon). The numeric DAY_OF_WEEK column is
# aggregated as a temporary sort key so rows come out in weekday-number order, and is
# dropped afterwards so the result keeps exactly the columns DOW, n, p_delay.
by_dow = (df.groupBy("DOW")
  .agg(F.count("*").alias("n"),
       F.avg("IS_DELAYED_15").alias("p_delay"),
       F.min("DAY_OF_WEEK").alias("_dow_sort_key"))
  .orderBy("_dow_sort_key")
  .drop("_dow_sort_key"))

by_dow.show(7, truncate=False)


In [None]:
# Season label derived from MONTH: Dec-Feb winter, Mar-May spring, Jun-Aug summer;
# every other value falls through to "autumn".
season = (
    F.when(F.col("MONTH").isin([12, 1, 2]), "winter")
    .when(F.col("MONTH").isin([3, 4, 5]), "spring")
    .when(F.col("MONTH").isin([6, 7, 8]), "summer")
    .otherwise("autumn")
)
df = df.withColumn("SEASON", season)

# Flight count, delay share and mean arrival delay per season, sorted by season label.
by_season = (
    df.groupBy("SEASON")
    .agg(
        F.count("*").alias("n"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
        F.avg("ARRIVAL_DELAY").alias("avg_arr_delay"),
    )
    .sort("SEASON")
)

by_season.show(truncate=False)


In [None]:
# Distance bucket derived from DISTANCE: below 500 is "short", below 1500 is "medium";
# every other value falls through to "long".
dist_bins = (
    F.when(F.col("DISTANCE") < 500, "short")
    .when(F.col("DISTANCE") < 1500, "medium")
    .otherwise("long")
)
df = df.withColumn("DIST_BIN", dist_bins)

# Flight count, delay share and mean arrival delay per distance bucket, sorted by bucket label.
by_dist = (
    df.groupBy("DIST_BIN")
    .agg(
        F.count("*").alias("n"),
        F.avg("IS_DELAYED_15").alias("p_delay"),
        F.avg("ARRIVAL_DELAY").alias("avg_arr_delay"),
    )
    .sort("DIST_BIN")
)

by_dist.show(truncate=False)


## Spark ML

A complete machine learning pipeline is implemented in Apache Spark to predict flight delays. The aim is to use logistic regression as a binary classifier that predicts whether a flight will arrive at least 15 minutes late.

In [None]:
# Modelling table: the label (IS_DELAYED_15 cast to double: delayed = 1.0, on-time = 0.0)
# plus the feature columns; rows containing any null are dropped.
ml_data = df.select(
    F.col("IS_DELAYED_15").cast("double").alias("label"),
    "DEP_HOUR",
    "MONTH",
    "DISTANCE",
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
).dropna()

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Columns treated as categorical features
categorical_columns = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"]

# One StringIndexer per categorical column: <name> -> <name>_indexed.
# handleInvalid="keep" is set on each indexer.
string_indexers = [
    StringIndexer(
        inputCol=name,
        outputCol=name + "_indexed",
        handleInvalid="keep",
    )
    for name in categorical_columns
]

# A single one-hot encoder for all indexed columns: <name>_indexed -> <name>_encoded,
# so the numeric category indices are not interpreted as a ranking.
one_hot_encoder = OneHotEncoder(
    inputCols=[name + "_indexed" for name in categorical_columns],
    outputCols=[name + "_encoded" for name in categorical_columns],
)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assemble one "features" vector: the three numerical columns first,
# followed by the one-hot encoded categorical columns.
feature_assembler = VectorAssembler(
    inputCols=[
        "DEP_HOUR",
        "MONTH",
        "DISTANCE",
        *(f"{name}_encoded" for name in categorical_columns),
    ],
    outputCol="features",
)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator reading the "features" vector and the "label" column,
# limited to 50 iterations.
logistic_regression = LogisticRegression(maxIter=50, labelCol="label", featuresCol="features")

In [None]:
from pyspark.ml import Pipeline

# ML pipeline including the preprocessing defined above:
# string indexers -> one-hot encoder -> feature assembler -> logistic regression
ml_pipeline = Pipeline(
    stages=[
        *string_indexers,
        one_hot_encoder,
        feature_assembler,
        logistic_regression,
    ]
)

# 80/20 train/test split with a fixed seed
training_data, test_data = ml_data.randomSplit([0.8, 0.2], seed=42)

# Fit all pipeline stages on the training split
trained_model = ml_pipeline.fit(training_data)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create predictions for the held-out test split
predictions = trained_model.transform(test_data)

# Evaluator comparing the "prediction" column with "label" using the accuracy metric
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Accuracy on the test split, shown as the cell output
acc = accuracy_evaluator.evaluate(predictions)

acc
