In [2]:
import requests
import json
import time
from pyspark.sql import SparkSession, functions as F, types as T

start = time.time()

with open('/home/gesser/air-traffic-data-pipeline/credentials.json') as f:
    creds = json.load(f)

client_id = creds.get("clientId")
client_secret = creds.get("clientSecret")

if not client_id or not client_secret:
    raise ValueError("Set CLIENT_ID and CLIENT_SECRET environment variables before running.")

token_url = "https://auth.opensky-network.org/auth/realms/opensky-network/protocol/openid-connect/token"

payload = {
    "grant_type": "client_credentials",
    "client_id": client_id,
    "client_secret": client_secret
}

headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}

response = requests.post(token_url, data=payload, headers=headers)
response.raise_for_status()

access_token = response.json().get("access_token")

tok = time.time()

print(f"time taken to get token: {tok-start:.2f}")

#print(f"Access token: {access_token}")

url = "https://opensky-network.org/api/states/all"
params = {
    "lamin": 47.001917,
    "lomin": -1.919083, 
    "lamax": 47.340556, 
    "lomax": -1.181750  
}
headers = {
    "Authorization": f"Bearer {access_token}"
}

response = requests.get(url, headers=headers, params=params)
response.raise_for_status()  # raise error if request failed

data = response.json()

laoddata = time.time()

print(f"time taken to load data: {laoddata-tok:.2f}")

if response.status_code == 200:

    start = time.time()

    data_json = response.text

    spark = SparkSession.builder.appName("flight-data-pipeline").getOrCreate()

    sparkinit = time.time()
    print(f"time taken to initialize spark: {sparkinit-start:.2f}")


    # Create RDD from JSON string (split by lines if multiline JSON)
    rdd = spark.sparkContext.parallelize([data_json])

    # Read JSON from RDD
    df = spark.read.json(rdd)

    df_states = df.withColumn("state", F.explode("states"))

    schema_def = [
    (0,  "icao24",          T.StringType()),
    (1,  "callsign",        T.StringType()),
    (2,  "origin_country",  T.StringType()),
    (3,  "time_position",   T.LongType()),
    (4,  "last_contact",    T.LongType()),
    (5,  "longitude",       T.DoubleType()),
    (6,  "latitude",        T.DoubleType()),
    (7,  "baro_altitude",   T.DoubleType()),
    (8,  "on_ground",       T.BooleanType()),
    (9,  "velocity",        T.DoubleType()),
    (10, "true_track",      T.DoubleType()),
    (11, "vertical_rate",   T.DoubleType()),
    (12, "sensors",         T.ArrayType(T.IntegerType())),
    (13, "geo_altitude",    T.DoubleType()),
    (14, "squawk",          T.StringType()),
    (15, "spi",             T.BooleanType()),
    (16, "position_source", T.IntegerType())
    ]

    cols = []

    for idx, name, dtype in schema_def:
        c = F.col("state")[idx]

        # sensors comes in as a JSON-style string like "[1,2,3]"
        if name == "sensors":
            c = F.when(
                    c.isNull(), None
                ).otherwise(
                    F.split(                    
                        F.regexp_replace(c, r'[\[\]\s]', ''),
                        ','
                    ).cast(dtype)              
                )
        else:
            c = c.cast(dtype)

        cols.append(c.alias(name))

    
    df_typed = df_states.select(*cols)
    print(df_typed.count())

    print(f"time taken to treat dataset: {time.time()-sparkinit:.2f}")

    df_typed.show()
    
    spark.stop()
else:
    print(f"Error: {response.status_code} - {response.text}")


time taken to get token: 0.15
time taken to load data: 0.16
time taken to initialize spark: 0.06


                                                                                

3
time taken to treat dataset: 1.20
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|icao24|callsign|origin_country|time_position|last_contact|longitude|latitude|baro_altitude|on_ground|velocity|true_track|vertical_rate|sensors|geo_altitude|squawk|  spi|position_source|
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|40807a|TOM5385 |United Kingdom|   1751451336|  1751451336|   -1.205|  47.313|      11582.4|    false|  248.69|    343.29|          0.0|   NULL|    12131.04|  1045|false|              0|
|4ca63a|EIN5GV  |       Ireland|   1751451336|  1751451336|  -1.8086| 47.1274|     11285.22|    false|  223.45|    159.94|          0.0|   NULL|    11833.86|  6362|false|              0|
|4cabdc|DLH81E  |       Irela

#Collecting coordiantes

In [4]:
import math

coordinates = []

coLat = 47.1542
coLon = -1.6044

num_steps = 25 * 1000 / 50

total_points = (num_steps*2)**2

print(f"for a grid of 50 km centered arround Nantes Atlantique airport, we have {num_steps} steps each direction, and {total_points} points")

coLat_rad = math.radians(coLat)
coLon_rad = math.radians(coLon)

d = 50              # distance in meters (steps)
R = 6371000         # Earth radius in meters

for a grid of 50 km centered arround Nantes Atlantique airport, we have 500.0 steps each direction, and 1000000.0 points


In [5]:
# Moving north
bearing_north = math.radians(0)

lat2_north = math.asin(math.sin(coLat_rad) * math.cos(d / R) +
                       math.cos(coLat_rad) * math.sin(d / R) * math.cos(bearing_north))

lon2_north = coLon_rad + math.atan2(math.sin(bearing_north) * math.sin(d / R) * math.cos(coLat_rad),
                               math.cos(d / R) - math.sin(coLat_rad) * math.sin(lat2_north))

# Convert back to degrees
lat2_north_deg = math.degrees(lat2_north)
lon2_north_deg = math.degrees(lon2_north)

print(f"lon: {lon2_north_deg}, long: {lat2_north_deg}")

lon: -1.6044, long: 47.15464966080296


In [6]:
def stepslat(lat, lon, d, R, num):
    coLat_rad = math.radians(lat)
    coLon_rad = math.radians(lon)

    bearing_north = math.radians(0)
    bearing_south = math.radians(180)

    coords = [coLat_rad]

    temp1, temp2 = coLat_rad, coLat_rad
    for i in range(0,num):
        temp1 = math.asin(math.sin(temp1) * math.cos(d / R) +
                            math.cos(temp1) * math.sin(d / R) * math.cos(bearing_north))
        temp2 = math.asin(math.sin(temp2) * math.cos(d / R) +
                            math.cos(temp2) * math.sin(d / R) * math.cos(bearing_south))
        
        coords.append(temp1)
        coords.append(temp2)

    coords.sort()
    return [math.degrees(i) for i in coords]

In [7]:
def stepslong(lat, lon, d, R, num_steps):
    lat_rad = math.radians(lat)
    lon_rad = math.radians(lon)
    
    bearing_east = math.radians(90)
    bearing_west = math.radians(270)

    coords = [lon_rad]
    
    temp_east = lon_rad
    temp_west = lon_rad
    
    for i in range(num_steps):
        delta = math.atan2(
            math.sin(bearing_east) * math.sin(d / R) * math.cos(lat_rad),
            math.cos(d / R) - math.sin(lat_rad) * math.sin(lat_rad)
        )
        temp_east += delta
        coords.append(temp_east)
        
        delta = math.atan2(
            math.sin(bearing_west) * math.sin(d / R) * math.cos(lat_rad),
            math.cos(d / R) - math.sin(lat_rad) * math.sin(lat_rad)
        )
        temp_west += delta
        coords.append(temp_west)

    #coords = list(set(coords))  # optional: remove duplicates
    coords.sort()
    return [math.degrees(i) for i in coords]

In [None]:
from itertools import product

def genCoords(airportLat, airportLon, stepDist, stepNumber): 
    earthRad = 6371000
    lats = stepslat(airportLat, airportLon, stepDist, earthRad, stepNumber)
    lons = stepslong(airportLat, airportLon, stepDist, earthRad, stepNumber)
    return product(lats, lons)

coords = {}
for i, coord in enumerate(genCoords(coLat, coLon, d, int(num_steps))):
    if i%100 == 0:
        coords[coord] = True

print(coords)
        

# The elevation is a bit complicated to collect, since either APIs cannot be automated or can be pricey...
# So I'll use the altitude of Nantes airport: 27 m


{(46.92936959852044, -1.9350195231138896): True, (46.92936959852044, -1.8688956184911005): True, (46.92936959852044, -1.8027717138683115): True, (46.92936959852044, -1.7366478092455384): True, (46.92936959852044, -1.6705239046227693): True, (46.92936959852044, -1.6044): True, (46.92936959852044, -1.5382760953772308): True, (46.92936959852044, -1.4721521907544617): True, (46.92936959852044, -1.4060282861316924): True, (46.92936959852044, -1.339904381508923): True, (46.92936959852044, -1.273780476886154): True, (46.92981925932339, -1.8695568575373285): True, (46.92981925932339, -1.8034329529145394): True, (46.92981925932339, -1.7373090482917661): True, (46.92981925932339, -1.671185143668997): True, (46.92981925932339, -1.6050612390462278): True, (46.92981925932339, -1.5389373344234585): True, (46.92981925932339, -1.4728134298006894): True, (46.92981925932339, -1.4066895251779201): True, (46.92981925932339, -1.3405656205551508): True, (46.92981925932339, -1.2744417159323818): True, (46.93

test plane: 
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|icao24|callsign|origin_country|time_position|last_contact|longitude|latitude|baro_altitude|on_ground|velocity|true_track|vertical_rate|sensors|geo_altitude|squawk|  spi|position_source|
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|4cabdc|DLH81E  |       Ireland|   1751451336|  1751451336|  -1.3939| 47.1038|       2286.0|    false|  136.09|    229.45|         -5.2|   NULL|     2461.26|  1000|false|              0|

Calculating 3D distance between two coordinates, considering also the altitude of both points

In [None]:
def distance(grLat, grLon, plLat, plLon):
    R = 6371000

    # Calculating horizental distance using the haversine_distance
    radCoLat, radPlLat= math.radians(grLat), math.radians(plLat)
    deltaLat = math.radians(grLat - plLat)
    deltaLon = math.radians(grLon - plLon)

    a = math.sin(deltaLat/2) ** 2
    b = math.cos(radCoLat) * math.cos(radPlLat) * (math.sin(deltaLon/2) ** 2)
    horDist = 2 * R * math.asin(math.sqrt(a+b))

    # Calculating vertical distance, simple because variables in meters not degrees
    verDist = abs(2461.26 - 27)

    # Pythagores
    return math.sqrt((horDist**2)+(verDist**2))

print(distance(coLat, coLon, 47.1038, -1.3939))

17056.57358351798


testing with all coordinates

In [None]:
coords = {}
for coord in genCoords(coLat, coLon, d, int(num_steps)):

    # Pre-filter by bounding box (~10km) (or we think about other methods)
    if abs(coord[0] - 47.1038) > 0.09 or abs(coord[1] - (-1.3939)) > 0.135:
        continue

    dist = distance(coord[0], coord[1], 47.1038, -1.3939)
    if dist<=10000:
        coords.setdefault(coord, []).append(dist)

print(len(coords))

117057


calculating decibels, depending if the plane is taking off, landing or cruising

also, since I cannot get the details for each airplane, I'll define a fixed value for each phase

In [None]:
# formula used: General Decibel Attenuation Formula
# L = L0 - 20 * log10(d)
# With:
# L: decibels on ground level
# L0: decibels at 1m from source (variable depending on state)
# d: distance (!!! should not be 0 or negative at all cost because of the log10 !!!)