# Spark Streaming et fenêtres

Use case : Flight tracking with OpenSky

## Imports

In [None]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col, count, desc
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType, TimestampType, FloatType
from pyspark.sql.functions import col, desc, sum, mean, min, max, lit, coalesce, bucket, col, window, avg
from matplotlib.pylab import mean

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as md

import networkx as nx

from bokeh.io import output_notebook, output_file, show
from bokeh.plotting import figure, from_networkx
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256

## Conf spark

In [None]:
# Maven coordinates handed to spark.jars.packages, joined into one comma-separated string.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3",
    "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
])

# Application name, master URL, extra packages and number of shuffle partitions.
conf = (
    SparkConf()
    .setAppName('SparkApp')
    .setMaster('spark://spark:7077')
    .set("spark.jars.packages", spark_packages)
    .set("spark.sql.shuffle.partitions", "10")
)

# Reuse the running SparkContext if there is one, otherwise start it with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
# Checkpoint directory; presumably needed by the GraphFrames algorithms run later - TODO confirm.
sc.setCheckpointDir("/tmp/graphframes-checkpoint")

# SQL entry point used by the rest of the notebook (`.read` and `.sql`).
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of SparkSession.
sql_context = SQLContext(sc)

In [None]:

from graphframes import GraphFrame

## Configuration Kafka et Spark (non obligatoire si elle est définie ailleurs)

In [None]:
# Batch (non-streaming) read of the whole "opensky-flights" topic,
# from the earliest to the latest available offsets.
kafka_reader = sql_context.read.format("kafka")
for option_name, option_value in (
    ("kafka.bootstrap.servers", "kafka1:9092"),
    ("subscribe", "opensky-flights"),
    ("startingOffsets", "earliest"),
    ("endingOffsets", "latest"),
):
    kafka_reader = kafka_reader.option(option_name, option_value)
flights_df = kafka_reader.load()

In [None]:
# Schema of the JSON payload carried in each Kafka record value.
# Three fields are strings; every other field is a long. All are nullable.
_string_columns = {"icao24", "estDepartureAirport", "estArrivalAirport"}
_column_order = [
    "icao24",
    "firstSeen",
    "estDepartureAirport",
    "lastSeen",
    "estArrivalAirport",
    "estDepartureAirportHorizDistance",
    "estDepartureAirportVertDistance",
    "estArrivalAirportHorizDistance",
    "estArrivalAirportVertDistance",
    "departureAirportCandidatesCount",
    "arrivalAirportCandidatesCount",
]
schema = StructType([
    StructField(name, StringType() if name in _string_columns else LongType(), True)
    for name in _column_order
])

# Decode the Kafka value as JSON and flatten the resulting struct into top-level columns.
payload = from_json(col("value").cast("string"), schema).alias("data")
flights_json = flights_df.select(payload).select("data.*")


## Requêtes en mode batch, avec état.
### Mode batch avec fenêtre

In [None]:
# Decode the Kafka values and keep only the arrival timestamp and the arrival airport.
raw_messages = flights_df.selectExpr("CAST(value AS STRING) AS message")
decoded = raw_messages.select(from_json(col("message"), schema).alias("data"))
parsed_stream = decoded.select(
    col("data.lastSeen").cast(TimestampType()).alias("ArrivalTime"),
    col("data.estArrivalAirport").alias("Airport"),
)

# Count arrivals per 60-minute window of ArrivalTime, ordered chronologically.
hourly_window = window(col("ArrivalTime"), "60 minutes")
rolling_numbers = (
    parsed_stream
    .groupBy(hourly_window)
    .count()
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("count").alias("num_arrivals"),
    )
    .orderBy("window_start")
)

# Bring the aggregated result to the driver as a pandas DataFrame.
pandas_df = rolling_numbers.toPandas()
pandas_df

# Line chart of arrivals per window, with day/month and hour:minute tick labels.
g = sns.lineplot(data=pandas_df, x="window_start", y="num_arrivals")
tick_format = md.DateFormatter('%d/%m\n%H:%M')
g.xaxis.set_major_formatter(tick_format)
g.set_title("Nombre d'arrivés sur une heure")

### Mode batch sans fenêtre

In [None]:
# Number of flights observed for each (departure, arrival) airport pair.
flight_counts = (
    flights_json
    .groupBy("estDepartureAirport", "estArrivalAirport")
    .count()
    .withColumnRenamed("count", "flight_count")
)

# Edge list (one row per route) used by the route ranking in a later cell.
nbVols = flight_counts.select(
    col("estDepartureAirport").alias("src"),
    col("estArrivalAirport").alias("dst"),
    col("flight_count"),
)

# Only the 15 busiest airports are kept.
top_n = 15

# Rank airports by traffic, departures and arrivals combined.
# - flight_count is carried through the union and summed, so an airport is ranked
#   by its number of flights rather than by its number of distinct routes.
# - NULL airport codes are dropped so that "unknown" cannot take one of the top_n
#   slots (and does not inflate len(top_airport_codes) afterwards).
# - The airport code is a secondary sort key so ties are resolved deterministically.
# union() keeps the column names of the first DataFrame, hence "estDepartureAirport".
top_airports = (
    flight_counts.select("estDepartureAirport", "flight_count")
    .union(flight_counts.select("estArrivalAirport", "flight_count"))
    .filter(col("estDepartureAirport").isNotNull())
    .groupBy("estDepartureAirport")
    .sum("flight_count")
    .withColumnRenamed("sum(flight_count)", "total_flights")
    .orderBy(desc("total_flights"), "estDepartureAirport")
    .limit(top_n)
    .select("estDepartureAirport")
    .collect()
)
top_airport_codes = [row['estDepartureAirport'] for row in top_airports]

# Keep only the routes whose both endpoints are among the busiest airports.
filtered_counts = flight_counts.filter(
    col("estDepartureAirport").isin(top_airport_codes)
    & col("estArrivalAirport").isin(top_airport_codes)
)

filtered_counts

In [None]:
# Drop routes whose departure or arrival airport is unknown (NULL).
filtered_nbVols = nbVols.filter(col("src").isNotNull() & col("dst").isNotNull())

# The top_n routes with the most flights.
top_routes = filtered_nbVols.orderBy(desc("flight_count")).limit(top_n)
top_routes.show()

# Convert to pandas for plotting with Seaborn.
top_routes_pd = top_routes.toPandas()

# One "SRC-DST" label per route, built with vectorised string concatenation.
route_labels = top_routes_pd["src"] + "-" + top_routes_pd["dst"]

# A single chart, so it gets the whole figure: no subplot grid is created,
# which would otherwise leave an empty panel in the saved image.
plt.figure(figsize=(14, 8))
sns.barplot(x="flight_count", y=route_labels, data=top_routes_pd, orient="h")
plt.title("Routes les plus fréquentées")
plt.tight_layout()

# Save before show(): the file is written to the current working directory.
plt.savefig("aviation_network_analysis.png")
plt.show()
plt.close()

In [None]:
# Bring the top-airport route counts to the driver for pivoting.
matrix_data = filtered_counts.toPandas()

# Origin x destination matrix of flight counts; pairs with no flight become 0.
flight_matrix = matrix_data.pivot(
    index='estDepartureAirport',
    columns='estArrivalAirport',
    values='flight_count',
).fillna(0)

# Annotated heatmap of the matrix.
plt.figure(figsize=(16, 12))
heatmap_options = dict(
    annot=True,                            # write the value in each cell
    fmt='g',                               # general number format
    cmap='YlGnBu',                         # yellow-green-blue palette
    linewidths=0.5,                        # thin separators between cells
    cbar_kws={'label': 'Nombre de vols'},
)
ax = sns.heatmap(flight_matrix, **heatmap_options)

# Title and axis labels.
plt.title('Nombre de vols entre les principaux aéroports', fontsize=16)
plt.xlabel('Destinations', fontsize=12)
plt.ylabel('Origines', fontsize=12)

# Tilt the destination labels for readability; keep the origin labels horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()

# Write the figure to disk and release it (it is not shown inline).
plt.savefig("/tmp/flight_heatmap_origin_destination.png", dpi=300)
plt.close()

# Summary statistics: how many of the possible origin/destination pairs are served.
nonzero_routes = (flight_matrix > 0).sum().sum()
total_possible_routes = len(top_airport_codes) ** 2
connectivity_ratio = nonzero_routes / total_possible_routes

print(f"Parmi les {top_n} principaux aéroports:")
print(f"Nombre total de routes possibles: {total_possible_routes}")
print(f"Nombre de routes effectivement desservies: {nonzero_routes}")
print(f"Ratio de connectivité: {connectivity_ratio:.2%}")
print(f"Nombre moyen de vols par route active: {flight_matrix.sum().sum() / nonzero_routes:.1f}")

## Requête avec SparkSQL

In [None]:
# Register the parsed flights DataFrame as a temporary SQL view named "flights"
flights_json.createOrReplaceTempView("flights")

# Top 5 most frequent (departure, arrival) airport pairs, ignoring rows with a
# missing endpoint and flights that depart from and arrive at the same airport
query = """
SELECT estDepartureAirport, estArrivalAirport, COUNT(*) as count
FROM flights
WHERE estDepartureAirport IS NOT NULL AND estArrivalAirport IS NOT NULL AND estDepartureAirport != estArrivalAirport
GROUP BY estDepartureAirport, estArrivalAirport
ORDER BY count DESC
LIMIT 5
"""

top_5_couples = sql_context.sql(query)

# Display the results
top_5_couples.show()

## Requête en mode batch et résultats sous forme de graphiques

In [None]:
# Show a raw sample: the first 5 Kafka values cast to string, without truncation
flights_df.selectExpr("CAST(value AS STRING)").show(5, False)

In [None]:
# Keep only flights for which both the departure and the arrival airport are known.
has_both_endpoints = (
    col("estDepartureAirport").isNotNull() & col("estArrivalAirport").isNotNull()
)
filtered_flights = flights_json.filter(has_both_endpoints)

# Departures per airport, busiest first.
outbound = (
    filtered_flights
    .groupBy("estDepartureAirport")
    .agg(count("*").alias("vols_sortants"))
    .orderBy(desc("vols_sortants"))
)

# Arrivals per airport, busiest first.
inbound = (
    filtered_flights
    .groupBy("estArrivalAirport")
    .agg(count("*").alias("vols_entrants"))
    .orderBy(desc("vols_entrants"))
)


In [None]:
# Ten busiest airports in each direction, collected to pandas for plotting.
outbound_pd = outbound.limit(10).toPandas()
inbound_pd = inbound.limit(10).toPandas()

# Seaborn theme shared by both charts.
sns.set_theme(style="whitegrid", palette="muted")

# One horizontal bar chart per direction: (data, x column, y column, palette, title, y label).
chart_specs = [
    (outbound_pd, "vols_sortants", "estDepartureAirport", "Blues_r",
     "Top 10 Aéroports - Vols sortants ", "Aéroport (Départ)"),
    (inbound_pd, "vols_entrants", "estArrivalAirport", "Oranges_r",
     "Top 10 Aéroports - Vols entrants ", "Aéroport (Arrivée)"),
]
for frame, x_col, y_col, colors, chart_title, y_label in chart_specs:
    plt.figure(figsize=(10, 5))
    sns.barplot(x=x_col, y=y_col, data=frame, palette=colors)
    plt.title(chart_title, fontsize=14, fontweight='bold')
    plt.xlabel("Nombre de vols")
    plt.ylabel(y_label)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

## GraphX with GraphFrames

> Installation de Bokeh : cette étape n'est pas nécessaire s'il est déjà installé via le fichier requirements.

In [None]:
!pip install bokeh

### Définition du schéma de données Kafka, lecture du feed et parsing

In [None]:
# Reduced schema of the Kafka JSON payload: only the five fields listed here.
_graph_fields = [
    ("icao24", StringType()),
    ("firstSeen", LongType()),
    ("estDepartureAirport", StringType()),
    ("lastSeen", LongType()),
    ("estArrivalAirport", StringType()),
]
schema = StructType([
    StructField(field_name, field_type, True) for field_name, field_type in _graph_fields
])

# Batch read of the whole "opensky-flights" topic (earliest to latest offsets).
kafka_reader = sql_context.read.format("kafka")
for option_name, option_value in (
    ("kafka.bootstrap.servers", "kafka1:9092"),
    ("subscribe", "opensky-flights"),
    ("startingOffsets", "earliest"),
    ("endingOffsets", "latest"),
):
    kafka_reader = kafka_reader.option(option_name, option_value)
flights_df = kafka_reader.load()


# Decode the JSON carried in the Kafka value and flatten it into columns.
payload = from_json(col("value").cast("string"), schema).alias("data")
flights_json = flights_df.select(payload).select("data.*")


### Définition des sommets, des arêtes et du graphe

In [None]:
# Vertex DataFrame: one row per airport code seen as a departure or an arrival.
departure_ids = flights_json.select(col("estDepartureAirport").alias("id")).distinct()
arrival_ids = flights_json.select(col("estArrivalAirport").alias("id")).distinct()

# Merge both sides, drop unknown (NULL) airports, then remove the duplicates
# created by airports that appear on both sides.
vertices = (
    departure_ids
    .union(arrival_ids)
    .filter(col("id").isNotNull())
    .distinct()
)


In [None]:
# Edge DataFrame: one row per (src, dst) airport pair, weighted by its number of flights.
routes = flights_json.select(
    col("estDepartureAirport").alias("src"),
    col("estArrivalAirport").alias("dst"),
)
known_routes = routes.filter(col("src").isNotNull() & col("dst").isNotNull())
edges = known_routes.groupBy("src", "dst").agg(count("*").alias("weight"))


In [None]:
# Build the GraphFrame from the airport vertices ("id") and the route edges ("src", "dst", "weight")
graph = GraphFrame(vertices, edges)


### Actions sur Graphe

#### Page Rank

In [None]:
# Run PageRank for 10 iterations with a reset probability of 0.15,
# then print up to 100 (id, pagerank) rows
pagerank_results = graph.pageRank(resetProbability=0.15, maxIter=10)
pagerank_results.vertices.select("id", "pagerank").show(100)

#### Connected Components (attention : peut provoquer une erreur de mémoire heap)

In [None]:
# Compute connected components and show the vertices ordered by component id
# NOTE(review): the section heading reports a heap memory error for this cell
cc_results = graph.connectedComponents()
cc_results.select("id", "component").orderBy("component").show()

#### Strongly Connected Components

In [None]:
# Compute strongly connected components (at most 10 iterations) and show each vertex's component
scc_results = graph.stronglyConnectedComponents(maxIter=10)
scc_results.select("id", "component").show()

#### Triangle Count

In [None]:
# Run the triangle count and show the "count" column for each vertex id
triangle_results = graph.triangleCount()
triangle_results.select("id", "count").show()

### Filtrer les données pour représentation

In [None]:
# Collect the PageRank scores and the weighted edges to pandas.
pagerank_df = pagerank_results.vertices.toPandas()
edges_df = edges.toPandas()

# Airport id -> PageRank score lookup.
pagerank_dict = dict(zip(pagerank_df["id"], pagerank_df["pagerank"]))

# Threshold at the 90th percentile: only the top 10% of airports count as major hubs.
pagerank_threshold = pagerank_df["pagerank"].quantile(0.90)

# Hubs, and the edges whose both endpoints are hubs.
is_hub = pagerank_df["pagerank"] >= pagerank_threshold
important_nodes = pagerank_df[is_hub]["id"].tolist()
links_two_hubs = edges_df["src"].isin(important_nodes) & edges_df["dst"].isin(important_nodes)
edges_filtered = edges_df[links_two_hubs]

In [None]:
# Directed NetworkX graph restricted to the hub airports.
G_filtered = nx.DiGraph()

# Every hub becomes a node carrying its PageRank score as an attribute.
for hub in important_nodes:
    G_filtered.add_node(hub, pagerank=pagerank_dict[hub])

# Every hub-to-hub route becomes a directed edge (no weight attribute is stored).
for origin, destination in zip(edges_filtered["src"], edges_filtered["dst"]):
    G_filtered.add_edge(origin, destination)

### Représentations graphiques

#### Bokeh

In [None]:
# output_notebook()  # alternative: render the plot inline in the notebook

output_file("air_traffic_graph_kawai.html") # Write the plot to a standalone HTML page


In [None]:
# ForceAtlas2 layout of the hub graph
pos_atlas = nx.forceatlas2_layout(G_filtered) # author's note: useful for large graphs and avoids overlap, but slow


In [None]:
# Spring layout of the hub graph (no seed given here)
pos_spring = nx.spring_layout(G_filtered) # author's note: fast, but nodes overlap on large graphs


In [None]:
# Kamada-Kawai layout of the hub graph; these are the positions passed to from_networkx for the Bokeh plot
pos_kawai = nx.kamada_kawai_layout(G_filtered) # author's note: well-spaced and interesting to present, but very slow

In [None]:
# Fruchterman-Reingold layout of the hub graph
pos_fr = nx.fruchterman_reingold_layout(G_filtered) # author's note: force-directed, for large graphs; isolated nodes are easier to see


In [None]:
import builtins  # min/max are also imported from pyspark.sql.functions in this notebook; use the real built-ins

# Raw marker size per node: PageRank score (as a plain float) times a scaling factor of 200.
node_sizes = [float(pagerank_dict[node]) * 200 for node in G_filtered.nodes()]

# Smallest and largest raw size, with fallback bounds when the graph has no node.
if node_sizes:
    min_size, max_size = builtins.min(node_sizes), builtins.max(node_sizes)
else:
    min_size, max_size = 5.0, 50.0


In [None]:
# Min-max normalise the raw sizes into the 5..35 range for display.
# When all sizes are equal (or there is none) every node gets the constant size 10.
if max_size > min_size:
    size_span = max_size - min_size
    scaled_sizes = [((size - min_size) / size_span) * 30 + 5 for size in node_sizes]
else:
    scaled_sizes = [10] * len(node_sizes)

In [None]:

# Per-node columns for Bokeh: the node id ("index") and its display size.
node_columns = {
    "index": list(G_filtered.nodes()),
    "size": scaled_sizes,
}
node_data = ColumnDataSource(data=node_columns)


# Empty figure with fixed axis ranges; the graph renderer is attached in a later cell.
plot = figure(
    title="Air Traffic Graph (Bokeh)",
    x_range=(-1.5, 1.5),
    y_range=(-1.5, 1.5),
    tools="pan,wheel_zoom,reset,save",
    width=1920,
    height=1080,
)

In [None]:

# Build the Bokeh graph renderer from the NetworkX graph and the Kamada-Kawai positions.
graph_renderer = from_networkx(G_filtered, pos_kawai, scale=1, center=(0, 0))

# Swap in the data source that carries the "size" column and bind the marker size to it.
node_renderer = graph_renderer.node_renderer
node_renderer.data_source = node_data
node_glyph = node_renderer.glyph
node_glyph.size = "size"

# Colour the nodes along the Viridis palette according to the same "size" column.
smallest = builtins.min(scaled_sizes)
largest = builtins.max(scaled_sizes)
color_mapper = linear_cmap(field_name="size", palette=Viridis256, low=smallest, high=largest)
node_glyph.fill_color = color_mapper

# Semi-transparent edges for readability.
graph_renderer.edge_renderer.glyph.line_alpha = 0.3

In [None]:

# The node data source only carries the rescaled marker size, so a tooltip bound to
# "@size" would show that display size under the "PageRank" label. Attach the real
# PageRank score as its own column, in the same order as the node ids.
node_source = graph_renderer.node_renderer.data_source
node_source.data["pagerank"] = [
    float(pagerank_dict[node]) for node in node_source.data["index"]
]

# Hover tool showing the airport code and its actual PageRank score.
tooltips = HoverTool(tooltips=[("Aéroport", "@index"), ("PageRank", "@pagerank")])
plot.add_tools(tooltips)

# Attach the graph to the figure.
plot.renderers.append(graph_renderer)

# Render the interactive graph.
show(plot)

#### Matplotlib (favoriser Bokeh, car interactif et plus propre)

In [None]:
# Matplotlib node size per hub: PageRank score times 3000, in G_filtered node order
node_sizes_filtered = [pagerank_dict[node] * 3000 for node in G_filtered.nodes()]

In [None]:
# Draw the hub graph with a ForceAtlas2 layout on a 120 x 70 inch figure
plt.figure(figsize=(12*10, 7*10))
pos = nx.forceatlas2_layout(G_filtered)
nx.draw(G_filtered, pos, with_labels=True, node_size=node_sizes_filtered, alpha=0.7)
plt.title("Graphe des Connexions Aériennes (Top 10% PageRank)")
plt.show()

In [None]:
# Draw the hub graph with a spring layout (fixed seed, so the layout is reproducible)
plt.figure(figsize=(12*10, 7*10))
pos = nx.spring_layout(G_filtered, seed=42)
nx.draw(G_filtered, pos, with_labels=True, node_size=node_sizes_filtered, alpha=0.7)
plt.title("Graphe des Connexions Aériennes (Top 10% PageRank)")
plt.show()

In [None]:
# Threshold at the 80th percentile of edge weights: keeps the 20% strongest connections.
weight_threshold = edges_df["weight"].quantile(0.80)
is_strong = edges_df["weight"] >= weight_threshold
edges_strong = edges_df[is_strong]

# Directed sub-graph made only of the strong connections, with the flight count as edge weight.
G_strong = nx.DiGraph()
for origin, destination, flights in zip(
    edges_strong["src"], edges_strong["dst"], edges_strong["weight"]
):
    G_strong.add_edge(origin, destination, weight=flights)

# Node size proportional to the PageRank score.
node_sizes_strong = [pagerank_dict[airport] * 3000 for airport in G_strong.nodes()]

In [None]:
# Draw the strong-connection graph with a seeded spring layout on a 60 x 35 inch figure
plt.figure(figsize=(12*5, 7*5))
pos = nx.spring_layout(G_strong, seed=42)
nx.draw(G_strong, pos, with_labels=True, node_size=node_sizes_strong, alpha=0.7)
plt.title("Graphe des Connexions Aériennes (Routes les plus fréquentées)")
plt.show()