# Imports

In [1]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
!cat /home/jovyan/.env

cat: /home/jovyan/.env: No such file or directory


In [9]:
import requests
from kafka import KafkaProducer
import json
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import time

In [10]:
# OpenSky API endpoint queried by the producer functions.
OPENSKY_URL = "https://opensky-network.org/api/states/all"

# Load environment variables from the .env file copied into the container.
load_dotenv('/home/jovyan/.env')

# OpenSky credentials, read from the environment (None when a variable is unset).
USERNAME = os.getenv('OPENSKY_USERNAME')
PASSWORD = os.getenv('OPENSKY_PASSWORD')

# Fonction pour envoyer les données OpenSky à Kafka

In [11]:

# Kafka connection settings, kept in one place so the broker address is not
# repeated where the producer is built.
kafka_config = {
    'bootstrap_servers': 'kafka1:9092',  # Update with your Kafka broker
}

# Producer shared by the sender functions in this notebook. Every value is
# serialized as UTF-8 encoded JSON before it is sent.
producer = KafkaProducer(
    **kafka_config,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)

def send_opensky_to_kafka(topic, url, username, password, timeout=10):
    """Fetch state vectors from OpenSky and publish each one to a Kafka topic.

    Args:
        topic: Kafka topic that receives one message per state vector.
        url: Endpoint to query with an HTTP GET.
        username: User name for basic auth. When it or ``password`` is empty,
            the request is sent without credentials.
        password: Password for basic auth.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # The credentials come from environment lookups that yield None when unset;
    # (None, None) is not a meaningful credential pair, so omit auth instead.
    auth = (username, password) if username and password else None

    try:
        response = requests.get(url, auth=auth, timeout=timeout)
    except requests.RequestException as exc:
        # Report network errors the same way as a bad status code.
        print(f"Failed to fetch data: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return

    # The payload may contain "states": null, in which case dict.get's default
    # is not used and None would reach the loop below.
    states = response.json().get("states") or []

    # Publish each state to Kafka
    for state in states:
        producer.send(topic, value=state)

    # Ensure all messages are sent
    producer.flush()
    print(f"Sent {len(states)} records.")

In [12]:
# Identity mapping of state field names: every key maps to its own name.
states = {
    field: field
    for field in (
        "icao24",
        "callsign",
        "origin_country",
        "time_position",
        "last_contact",
        "longitude",
        "latitude",
        "baro_altitude",
        "on_ground",
        "velocity",
        "true_track",
        "vertical_rate",
        "sensors",
        "geo_altitude",
        "squawk",
        "spi",
        "position_source",
        "category",
    )
}

# Start of the first retrieval window.
initial_date_str = "2025-01-25 16:00:00"
# Length of one retrieval window, in minutes.
step = 60

# Parse the start and derive the end of the first window.
date_format = "%Y-%m-%d %H:%M:%S"
initial_date = datetime.strptime(initial_date_str, date_format)
end_date = initial_date + timedelta(minutes=step)

# String forms of the window bounds, formatted with date_format.
start = initial_date.strftime(date_format)
end = end_date.strftime(date_format)




In [None]:
# Send OpenSky data to Kafka

N_POLLS = 10            # number of fetch/publish rounds
POLL_PAUSE_SECONDS = 5  # pause after each round

for poll_index in range(N_POLLS):
    # Derive the window from the loop index instead of overwriting
    # initial_date/end_date, so re-running this cell starts from the same window.
    window_start = initial_date + timedelta(minutes=step * poll_index)
    window_end = window_start + timedelta(minutes=step)
    # The window is only a label in the log line; it is not passed to the request.
    print(f"{window_start.strftime(date_format)}-{window_end.strftime(date_format)}")
    send_opensky_to_kafka("opensky-flights", OPENSKY_URL, USERNAME, PASSWORD)
    time.sleep(POLL_PAUSE_SECONDS)

In [30]:
# Kafka topic names.
TOPIC_ALL_FLIGHTS = "opensky_all_flights"
TOPIC_FILTERED_FLIGHTS = "opensky_filtered_flights"

# Simulated clock as a Unix timestamp (float seconds), initialised to "now".
# send_filtered_flights_data reads it and advances it after each successful fetch.
current_datetime = datetime.now().timestamp()

def _state_to_message(state):
    """Turn one raw state vector (a list) into a message dict, or None to skip it."""
    # Basic check: the state must exist and carry all 17 positional fields.
    if not state or len(state) < 17:
        return None

    icao24 = state[0]
    # Callsigns may be padded with whitespace; anything that is not a
    # non-empty string counts as missing.
    callsign = state[1].strip() if state[1] and isinstance(state[1], str) else None
    origin_country = state[2]
    time_position = state[3]
    altitude = state[7]

    # Skip the record if any essential field is missing or invalid.
    if not icao24 or not callsign or not origin_country or time_position is None or altitude is None:
        return None

    return {
        "icao24": icao24,
        "callsign": callsign,
        "origin_country": origin_country,
        "time_position": time_position,
        "last_contact": state[4],
        "longitude": state[5],
        "latitude": state[6],
        "altitude": altitude,
        "on_ground": state[8],
        "velocity": state[9],
        "heading": state[10],
        "vertical_rate": state[11],
        "sensors": state[12],
        "geo_altitude": state[13],
        "squawk": state[14],
        "spi": state[15],
        "position_source": state[16]
    }


def send_filtered_flights_data(url, username, password, timeout=10):
    """Fetch flights for a specific timestamp and send only filtered flights to Kafka.

    The timestamp is taken from the module-level ``current_datetime`` clock,
    which is advanced by 20 seconds after every successful fetch.

    Args:
        url: Endpoint to query with an HTTP GET.
        username: User name for basic auth. When it or ``password`` is empty,
            the request is sent without credentials.
        password: Password for basic auth.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    global current_datetime  # Keep track of the simulated time

    # The clock may hold a float; send whole seconds as the "time" parameter.
    params = {"time": int(current_datetime)}

    # (None, None) is not a meaningful credential pair, so omit auth instead.
    auth = (username, password) if username and password else None

    try:
        response = requests.get(url, auth=auth, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network errors the same way as a bad status code.
        print(f"Failed to fetch data: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return

    # The payload may contain "states": null, in which case dict.get's default
    # is not used and None would reach the loop below.
    states = response.json().get("states") or []

    sent_count = 0
    for state in states:
        message = _state_to_message(state)
        if message is None:
            continue

        producer.send(TOPIC_FILTERED_FLIGHTS, value=message)
        print(f"Sent to FILTERED_FLIGHTS: {message}")
        sent_count += 1

    producer.flush()
    # Report what was actually published, not the size of the raw response.
    print(f"Sent {sent_count} records.")
    current_datetime += 20  # Increment simulated time

## Stop Kafka thread

In [28]:
def stop_kafka_producer():
    """Stop the background Kafka producer thread, if one exists.

    Looks up the module-level ``stop_event`` and ``producer_thread``. When
    either is missing (for example on a fresh kernel where no producer thread
    was started), a message is printed and nothing else happens. Otherwise
    the stop flag is set and the call blocks until the thread has finished.
    """
    namespace = globals()
    event = namespace.get("stop_event")
    thread = namespace.get("producer_thread")

    # Neither name is defined in the visible notebook cells, so guard against
    # a NameError instead of assuming an earlier cell created them.
    if event is None or thread is None:
        print("No Kafka producer thread is running; nothing to stop.")
        return

    print("Stopping Kafka producer...")
    event.set()  # Set the stop flag
    thread.join()  # Wait for the thread to finish
    print("Kafka producer stopped.")

# Call this function when you want to stop the producer
stop_kafka_producer()

Stopping Kafka producer...
Kafka producer stopped.


# Spark streaming 

## Spark Session & Configurations

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window, current_timestamp, when
from pyspark.sql.types import StructType, StringType, DoubleType, BooleanType

# Build the Spark session (or reuse a running one) and request the Kafka
# connector package.
# NOTE(review): when a session already exists, Spark warns that only runtime
# SQL configurations take effect, so the package setting may be ignored -
# confirm the connector is actually loaded.
spark = (
    SparkSession.builder
    .appName("OpenSkySparkStreaming")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3")
    .getOrCreate()
)

# Show which packages the active Spark context was configured with.
print("Kafka packages:", spark.sparkContext.getConf().get("spark.jars.packages"))

# Schema matching the JSON structure of the flight messages. Fields are added
# to the StructType in the order listed.
schema = StructType()
for field_name, field_type in (
    ("icao24", StringType()),
    ("callsign", StringType()),
    ("origin_country", StringType()),
    ("time_position", DoubleType()),
    ("last_contact", DoubleType()),
    ("longitude", DoubleType()),
    ("latitude", DoubleType()),
    ("altitude", DoubleType()),
    ("on_ground", BooleanType()),
    ("velocity", DoubleType()),
    ("heading", DoubleType()),
    ("vertical_rate", DoubleType()),
    ("sensors", StringType()),
    ("geo_altitude", DoubleType()),
    ("squawk", StringType()),
    ("spi", BooleanType()),
    ("position_source", StringType()),
):
    schema.add(field_name, field_type)

Kafka packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3


25/02/20 09:55:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Define Streaming DataFrames and Aggregations

In [33]:

# Subscribe to the Kafka topic 'opensky_filtered_flights', starting from the
# earliest available offsets.
kafka_raw_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9092")
    .option("subscribe", "opensky_filtered_flights")
    .option("startingOffsets", "earliest")
    .load()
)

# Decode the Kafka value as a JSON string, parse it with the flight schema,
# flatten the parsed struct into columns, and stamp each row for windowing.
# NOTE(review): event_time is current_timestamp(), i.e. the time the row is
# processed, not a time carried in the message - confirm this is intended.
df_filtered_flights = (
    kafka_raw_stream
    .selectExpr("CAST(value AS STRING) as json_value")
    .select(from_json(col("json_value"), schema).alias("data"))
    .select("data.*")
    .withColumn("event_time", current_timestamp())
)

# Query 1: number of flights per 'origin_country' in each 1-minute window.
df_filtered_windowed = (
    df_filtered_flights
    .groupBy(window(col("event_time"), "1 minute"), col("origin_country"))
    .count()
)

# Query 2: label every flight with an altitude bucket, then count the flights
# per bucket in each 1-minute window. Rows matching neither condition
# (including a null altitude) fall through to "High".
altitude_col = col("altitude")
altitude_bucket = (
    when(altitude_col < 5000, "Low")
    .when((altitude_col >= 5000) & (altitude_col < 15000), "Medium")
    .otherwise("High")
)
df_bucketed = df_filtered_flights.withColumn("altitude_range", altitude_bucket)

df_bucket_count = (
    df_bucketed
    .groupBy(window(col("event_time"), "1 minute"), col("altitude_range"))
    .count()
)

## Start Streaming Queries and Display Output

In [34]:

# Start the altitude-bucket count query, writing to the console in
# "complete" output mode.
query_bucket = (
    df_bucket_count.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# Start the per-country aggregation query, writing to the console in
# "update" output mode.
query_filtered = (
    df_filtered_windowed.writeStream
    .outputMode("update")
    .format("console")
    .start()
)


# Uncomment the following lines if you wish to block execution:
# query_bucket.awaitTermination()
# query_filtered.awaitTermination()

25/02/20 09:55:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a399a853-7bfa-45a3-b4fc-20872babf31d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/02/20 09:55:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/02/20 09:55:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a5b7806b-e578-45cc-b042-bc6720e59562. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/02/20 09:55:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

25/02/20 09:55:53 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+--------------------+-----+
|              window|      origin_country|count|
+--------------------+--------------------+-----+
|{2025-02-20 09:55...|      United Kingdom|16918|
|{2025-02-20 09:55...|              Canada| 4652|
|{2025-02-20 09:55...|               Spain| 6544|
|{2025-02-20 09:55...|             Germany| 9454|
|{2025-02-20 09:55...|            Portugal| 2632|
|{2025-02-20 09:55...|             Finland| 1037|
|{2025-02-20 09:55...|              Greece| 2060|
|{2025-02-20 09:55...|              Norway| 1638|
|{2025-02-20 09:55...|Libyan Arab Jamah...|  190|
|{2025-02-20 09:55...|         Philippines|  297|
|{2025-02-20 09:55...|               Japan| 5537|
|{2025-02-20 09:55...|          Bangladesh|  160|
|{2025-02-20 09:55...|        Saudi Arabia| 2186|
|{2025-02-20 09:55...|             Lebanon|   99|
|{2025-02-20 09:55...|          Seychelles|   99|
|{2

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+--------------+------+
|              window|altitude_range| count|
+--------------------+--------------+------+
|{2025-02-20 09:55...|        Medium|207148|
|{2025-02-20 09:55...|          High|   183|
+--------------------+--------------+------+

