In [0]:
# Dependencies to load modules from this repo
import importlib.util
import sys

# Load the cv module directly from its file path (the workspace path contains
# spaces, so it is loaded by location rather than through a regular import).
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location("cv", cv_path)
if spec is None or spec.loader is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ImportError(f"Could not create an import spec for the cv module at {cv_path}")
cv = importlib.util.module_from_spec(spec)
# Register the module before executing it, as in the importlib "import a source
# file directly" recipe, so code that looks the module up by name
# (e.g. pickling of objects defined in cv) can find it.
sys.modules["cv"] = cv
try:
    spec.loader.exec_module(cv)
except BaseException:
    # Do not leave a half-initialised module registered if execution fails.
    sys.modules.pop("cv", None)
    raise

# Dependencies for graph features
from graphframes import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Other Dependencies
import time

# Path for persistent storage
FOLDER_PATH = "dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed"

## Use Cross Validator Module to Generate Folds

In [0]:
# Instantiate the loader class exposed by the dynamically imported `cv` module
# and call its load() method. What load() reads is defined in cv.py (not shown here).
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

In [0]:
# Fetch the "3M" version of the cross-validation folds from the loader.
# NOTE(review): "3M" presumably identifies a dataset size/time window — confirm in cv.py.
folds = data_loader.get_version("3M")

In [0]:
# Get training data from first fold to build graph.
# The first fold is unpacked as a (train, validation) pair; in the cells shown here
# only train_df is used afterwards (val_df is not referenced).
train_df, val_df = folds[0]

## Generate Graph from One Training Fold

### Graph Construction
Origin Airport --Flight--> Destination Airport

* **Nodes**: Airport Codes
* **Edges**: Flights
  * **Direction**: Origin to Destination
  * **Weight**: Number of Flights

In [0]:
# Edge list of the route graph, built from the training fold:
#   src / dst : origin and destination airport of a route
#   weight    : number of flights observed on that route
# Rows missing either endpoint are discarded before aggregating.
edges = (
    train_df
    .where(F.col("origin").isNotNull() & F.col("dest").isNotNull())
    .select(F.col("origin").alias("src"), F.col("dest").alias("dst"))
    .groupBy("src", "dst")
    .agg(F.count(F.lit(1)).alias("weight"))
)

display(edges.limit(10))

In [0]:
# Persist the aggregated edge list as Parquet so it can be reloaded later
# instead of being recomputed from the training fold. An existing copy is replaced.
edges_path = f"{FOLDER_PATH}/graph_edges.parquet"
edges.write.parquet(edges_path, mode="overwrite")

In [0]:
# If re-running this notebook, start here as edges are checkpointed.
# The path is defined in this cell as well, so that starting here on a fresh
# kernel does not fail with a NameError; only FOLDER_PATH (setup cell) is needed.
edges_path = f"{FOLDER_PATH}/graph_edges.parquet"
edges = spark.read.parquet(edges_path)
# Trigger an action so problems reading the checkpoint surface in this cell.
# Note: this does not cache the DataFrame; later actions re-read the Parquet files.
edges.count()
display(edges.limit(10))

In [0]:
# Vertex set: every airport that appears as an endpoint (source or destination)
# of at least one route. The identifier column is named "id" for GraphFrame.
src_airports = edges.selectExpr("src AS id").distinct()
dst_airports = edges.selectExpr("dst AS id").distinct()
vertices = src_airports.unionByName(dst_airports).dropDuplicates()

print(f"Number of airports (vertices): {vertices.count()}")
print(f"Number of routes (edges): {edges.count()}")

### Generate Graphframes
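**NOTE**: GraphFrames PageRank does NOT use edge weights (HW5 Q5.f). It treats all edges equally (weight = 1), ignoring the "weight" column. For weighted PageRank, we'd need to use the RDD approach from HW5 Q4.

**Correction (unverified)**: It was suggested that GraphFrames automatically detects a column named `weight` in the edges DataFrame and that PageRank uses it by default. The GraphFrames `pageRank` API exposes no weight parameter, so this notebook does not rely on that behavior; weighting is instead simulated with the edge-duplication workaround in the section below.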


In [0]:
# Create GraphFrame
# Combines the vertex DataFrame ("id") and the edge DataFrame ("src", "dst", "weight").
# NOTE(review): the constructor presumably just wraps the lazy DataFrames, so the
# elapsed time printed here likely excludes any real Spark computation — confirm.
start = time.time()
g = GraphFrame(vertices, edges)
print(f"Ran in {time.time() - start:.2f} seconds")

In [0]:
# Verify that the edges DataFrame carries a "weight" column by printing its schema
# and a small sample.
# NOTE(review): whether GraphFrames PageRank actually consumes this column is
# unverified; the weighted run in this notebook uses duplicated edges instead.
print("Edges DataFrame schema:")
edges.printSchema()
print("\nSample edges with weights:")
display(edges.limit(5))

### "Weighted" GraphFrame Workaround

In [0]:
# Simulate edge weights by repetition: an edge with weight w is emitted w times,
# so an algorithm that counts every edge once still sees w parallel edges
# (e.g. weight=3 -> 3 copies of the edge).
start = time.time()

# Edges of the unweighted graph, including their "weight" column
edges_with_weights = g.edges

# Per edge, build the array [0, 1, ..., weight-1] and explode it directly in the
# projection: one output row per array element. The helper column "_" is then dropped.
edges_weighted = (
    edges_with_weights
    .select(
        "src",
        "dst",
        F.explode(F.sequence(F.lit(0), F.col("weight").cast("int") - 1)).alias("_"),
    )
    .drop("_")
)

# Same vertices as before, but with the duplicated edge list
g_weighted = GraphFrame(vertices, edges_weighted)

print(f"Created weighted GraphFrame (duplication workaround) in {time.time() - start:.2f} seconds")
print(f"Original edges: {edges.count()}")
print(f"Weighted edges (after duplication): {edges_weighted.count()}")

## Graph Analysis

In [0]:
# Set checkpoint directory (required for GraphFrames algorithms like connectedComponents)
# The directory is a fixed DBFS temp location; `sc` keeps a handle on the
# SparkContext obtained from the active `spark` session.
sc = spark.sparkContext
sc.setCheckpointDir("dbfs:/tmp/graphframes_checkpoint")

In [0]:
# Connectivity Analysis (referencing HW5 concepts)
# Weakly connected components ("islands") of the directed route graph:
# each vertex gets a "component" label.
connected_components = g.connectedComponents()

# Size of each component, largest first
component_sizes = (
    connected_components
    .groupBy("component")
    .agg(F.count(F.lit(1)).alias("count"))
    .sort(F.col("count").desc())
)

# One row per component, so the row count is the number of distinct components
num_components = component_sizes.count()
print(f"Number of connected components (islands): {num_components}")

print("\nTop 10 largest components:")
display(component_sizes.limit(10))

# Strongly connected components: within one, every node can reach every other
# node along directed edges. A single component means the whole graph is
# strongly connected.
strongly_connected = g.stronglyConnectedComponents(maxIter=10)
num_strong_components = strongly_connected.select("component").dropDuplicates().count()
print(f"\nNumber of strongly connected components: {num_strong_components}")

In [0]:
# Dangling nodes (HW5 Q2.b): airports that receive flights but have no outgoing
# edge, i.e. an out-degree of zero.
out_degree = g.outDegrees
in_degree = g.inDegrees

# Left-join the out-degrees onto all vertices; a vertex with a missing (or zero)
# out-degree after the join is dangling.
dangling_nodes = (
    vertices
    .join(out_degree, on="id", how="left")
    .where(F.coalesce(F.col("outDegree"), F.lit(0)) == 0)
)

num_dangling = dangling_nodes.count()
print(f"Number of dangling nodes (no outlinks): {num_dangling}")

if num_dangling:
    print("\nSample dangling nodes:")
    display(dangling_nodes.limit(10))


In [0]:
# Source nodes: airports with no incoming edge (in-degree missing or zero after
# left-joining the in-degrees onto all vertices).
source_nodes = (
    vertices
    .join(in_degree, on="id", how="left")
    .where(F.coalesce(F.col("inDegree"), F.lit(0)) == 0)
)

num_sources = source_nodes.count()
print(f"Number of source nodes (no inlinks): {num_sources}")

# Recap of the connectivity figures computed in the cells above
total_nodes = vertices.count()
print(
    "\n".join(
        [
            "\n=== Graph Connectivity Summary ===",
            f"Total nodes: {total_nodes}",
            f"Weakly connected components: {num_components}",
            f"Strongly connected components: {num_strong_components}",
            f"Dangling nodes (no outlinks): {num_dangling}",
            f"Source nodes (no inlinks): {num_sources}",
        ]
    )
)

## Run PageRank on Connectivity Graph

In [0]:
start = time.time()

# PageRank on the plain route graph (one edge per origin -> destination pair),
# i.e. **without** edge weights: 10 iterations, reset probability 0.15.
pagerank_results_unweighted = g.pageRank(resetProbability=0.15, maxIter=10)

# The 20 airports with the highest PageRank score
top_airports_unweighted = (
    pagerank_results_unweighted.vertices
    .sort(F.col("pagerank").desc())
    .limit(20)
)
display(top_airports_unweighted)

print(f"Ran in {time.time() - start:.2f} seconds")

In [0]:
# "Weighted" PageRank: run on g_weighted, whose edges were duplicated once per
# flight, so the number of flights acts as the edge weight. Same settings as the
# unweighted run (10 iterations, reset probability 0.15).
pagerank_results_weighted = g_weighted.pageRank(resetProbability=0.15, maxIter=10)

# The 20 airports with the highest PageRank score
top_airports_weighted = (
    pagerank_results_weighted.vertices
    .sort(F.col("pagerank").desc())
    .limit(20)
)
display(top_airports_weighted)