In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

tqdm.pandas()

from geopy import distance

## Data from the Zenodo archive

https://zenodo.org/record/7923702

In [2]:
# Directory holding the monthly OpenSky flight lists downloaded from the Zenodo
# archive. Change this single path to point at your local copy.
OPENSKY_DIR = "D:/a.salgas/Documents/opensky_2022"

# One file per calendar month of 2022, named
# flightlist_<first day>_<last day>.csv with dates written as YYYYMMDD.
months_2022 = pd.period_range("2022-01", "2022-12", freq="M")

# Read every month with identical options instead of twelve copy-pasted calls.
monthly_flightlists = [
    pd.read_csv(
        "{}/flightlist_{}_{}.csv".format(
            OPENSKY_DIR,
            month.start_time.strftime("%Y%m%d"),
            month.end_time.strftime("%Y%m%d"),
        ),
        # The column at position 1 is read with the pandas "string" dtype.
        dtype={1: "string"},
    )
    for month in months_2022
]

# Keep the per-month names bound, in case a cell refers to a single month.
(
    january,
    february,
    march,
    april,
    may,
    june,
    july,
    august,
    september,
    october,
    november,
    december,
) = monthly_flightlists

# Stack the months; reset_index() keeps the per-file row number as an "index"
# column and gives the combined frame a fresh 0..n-1 index.
opensky_extract = pd.concat(monthly_flightlists, axis=0).reset_index()

In [3]:
# Work on a copy so the concatenated raw extract stays untouched.
# NOTE(review): the variable name says 2019, but the flight lists read in this
# notebook are the 2022 files.
opensky_2019 = opensky_extract.copy()

In [4]:
# When True the flight counts are split by month and year; when False they are
# aggregated over the whole period.
keep_time = False

# Only these four columns are needed downstream.
wanted_columns = ["day", "destination", "origin", "icao24"]
opensky_2019 = opensky_2019[wanted_columns]

# Flights without a known origin/destination are currently kept: the dropna
# that would remove them is disabled, so both sizes below are equal and the
# report prints 0.0%.
size_before_drops = len(opensky_2019)
size_after_drops = len(opensky_2019)
share_deleted = (size_before_drops - size_after_drops) / size_before_drops * 100
report_template = (
    "{}% of flights deleted after removing "
    "flights with no origin and destination. {} Flights in the dataset"
)
print(report_template.format(share_deleted, size_after_drops))

# OpenSky flight lists only carry the aircraft transponder code (icao24), not
# its type or operator, so they are joined with the OpenSky aircraft database.
# Only the four columns used below are read, and they are read as text so that
# codes are never subject to type inference (the recorded run emitted a
# warning on this read).
ac_ref_columns = ["icao24", "typecode", "operator_os", "operatoricao"]
ac_ref = pd.read_csv(
    "../04_bis_opensky_standalone/data/aircraft-database-complete-2022-05.csv",
    sep=";",
    usecols=ac_ref_columns,
    dtype=str,
)

# Some icao24 codes appear several times in the database. Keep the last entry
# of each so the left merge below cannot multiply flights.
ac_ref = ac_ref[ac_ref_columns].drop_duplicates(subset="icao24", keep="last")
opensky_2019 = opensky_2019.merge(ac_ref, on="icao24", how="left", sort=False)

# The operator code is the airline ICAO code, but the column is named
# "airline_iata" for compatibility with the rest of the pipeline.
opensky_2019 = opensky_2019.rename(
    columns={"operatoricao": "airline_iata", "typecode": "aircraft_type"}
)

# Assign the filled columns back explicitly. Calling fillna(inplace=True) on
# an attribute-accessed column is chained assignment, which newer pandas warns
# about and which does not modify the frame under copy-on-write.
# The "Unkwown" spelling is kept because the value ends up in the output data.
opensky_2019["aircraft_type"] = opensky_2019["aircraft_type"].fillna("Unkwown")
opensky_2019["airline_iata"] = opensky_2019["airline_iata"].fillna("Unkwown")

# Count flights per airline / route / aircraft type, optionally per month.
flight_group_keys = ["airline_iata", "origin", "destination", "aircraft_type"]
if keep_time:
    # Derive the calendar year and month from the flight day in one vectorised
    # pass. The helper previously called here (get_month_year) is not defined
    # in this notebook, so this branch raised NameError.
    # Assumes "day" holds timestamps that pd.to_datetime can parse — TODO confirm.
    flight_day = pd.to_datetime(opensky_2019["day"], utc=True)
    opensky_2019["year"] = flight_day.dt.year
    opensky_2019["month"] = flight_day.dt.month
    flight_group_keys = flight_group_keys + ["month", "year"]

# Without the temporal keys the aggregation shrinks the data considerably.
# dropna=False keeps flights whose origin or destination is missing.
opensky_2019 = opensky_2019.groupby(
    flight_group_keys, as_index=False, dropna=False
).size()

opensky_2019 = opensky_2019.rename(columns={"size": "n_flights"})
print(
    "Size of df after_grouping: {}; number of flights: {}".format(
        len(opensky_2019.index), opensky_2019.n_flights.sum()
    )
)


# Attach the aircraft classification (ICAO type, class and fuel-model proxy)
# matching each OpenSky type code.
ac_cla = pd.read_csv(
    "../03_routes_schedule/data/open_sky/aircraft_classification.csv", sep=";"
)
opensky_2019 = opensky_2019.merge(
    ac_cla, left_on="aircraft_type", right_on="aircraft_osky", how="left"
).drop(columns=["aircraft_osky"])

# Fill unmatched aircraft with placeholders. A single DataFrame.fillna call
# replaces the chained `column.fillna(..., inplace=True)` pattern, which newer
# pandas warns about and which does not modify the frame under copy-on-write.
opensky_2019 = opensky_2019.fillna(
    {"acft_icao": "Unknown", "acft_class": "Unknown", "seymour_proxy": "zzz"}
)

print(
    "Size of df after ac info: {}; number of flights {}".format(
        len(opensky_2019.index), opensky_2019.n_flights.sum()
    )
)

# Merge an airport database to get each airport's IATA designator, GPS
# coordinates, country and continent.
# Only empty strings are treated as missing, so literal codes such as "NA"
# are kept as text.
arpt_ref = pd.read_csv(
    "../03_routes_schedule/data/ourairports.csv",
    sep=";",
    keep_default_na=False,
    na_values="",
)

# Airport columns brought into the flight table ("ident" is the join key).
AIRPORT_COLUMNS = [
    "ident",
    "iata_code",
    "longitude_deg",
    "latitude_deg",
    "iso_country",
    "continent",
]


def _attach_airport_info(flights, airports, key, prefix, missing_label):
    """Left-join airport attributes onto ``flights`` for one end of the route.

    Parameters
    ----------
    flights : pandas.DataFrame
        Flight table holding the airport identifier column ``key``.
    airports : pandas.DataFrame
        Airport reference table containing ``AIRPORT_COLUMNS``.
    key : str
        Column of ``flights`` matched against the airports' ``ident``.
    prefix : str
        Prefix of the added columns (``<prefix>_iata``, ``<prefix>_lon``,
        ``<prefix>_lat``, ``<prefix>_country``, ``<prefix>_continent``).
    missing_label : str
        Placeholder written wherever an added value is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with the five prefixed columns appended.
    """
    renamed = {
        "iata_code": prefix + "_iata",
        "longitude_deg": prefix + "_lon",
        "latitude_deg": prefix + "_lat",
        "iso_country": prefix + "_country",
        "continent": prefix + "_continent",
    }
    flights = (
        flights.merge(
            airports[AIRPORT_COLUMNS],
            left_on=key,
            right_on="ident",
            how="left",
            sort=False,
        )
        # The join key copy is not needed once the merge is done.
        .drop(columns="ident")
        .rename(columns=renamed)
    )
    added_columns = list(renamed.values())
    # The placeholder is a string, so the coordinate columns become
    # mixed-type (floats and strings) from here on.
    flights[added_columns] = flights[added_columns].fillna(missing_label)
    return flights


opensky_2019 = _attach_airport_info(
    opensky_2019, arpt_ref, key="origin", prefix="origin", missing_label="Unknown Origin"
)
opensky_2019 = _attach_airport_info(
    opensky_2019,
    arpt_ref,
    key="destination",
    prefix="dest",
    missing_label="Unknown Destination",
)
opensky_2019 = opensky_2019.rename(columns={"destination": "dest"})

# No dropna on the coordinates here: every missing coordinate has just been
# replaced by a placeholder, so such a drop could not remove any row.

print(
    "Size of df after arpt info: {}, number of flights: {}".format(
        len(opensky_2019.index), opensky_2019.n_flights.sum()
    )
)

0.0% of flights deleted after removing flights with no origin and destination. 34901942 Flights in the dataset


  ac_ref=pd.read_csv('../04_bis_opensky_standalone/data/aircraft-database-complete-2022-05.csv', sep=';')


Size of df after_grouping: 3112906; number of flights: 34901942
Size of df after ac info: 3112906; number of flights 34901942
Size of df after arpt info: 3112906, number of flights: 34901942


In [5]:
# Continue on a working copy so the enriched flight table stays available.
os_df = opensky_2019.copy()

# Display the working table (the notebook renders a truncated preview).
os_df

Unnamed: 0,airline_iata,origin,dest,aircraft_type,n_flights,acft_icao,acft_class,seymour_proxy,origin_iata,origin_lon,origin_lat,origin_country,origin_continent,dest_iata,dest_lon,dest_lat,dest_country,dest_continent
0,AA1,EG64,EGCF,AA1,2,Unknown,Unknown,zzz,Unknown Origin,-1.185575,53.94829,GB,EU,Unknown Destination,-0.858333,53.5597,GB,EU
1,AA1,EG64,EGCJ,AA1,4,Unknown,Unknown,zzz,Unknown Origin,-1.185575,53.94829,GB,EU,Unknown Destination,-1.216877,53.788458,GB,EU
2,AA1,EG64,EGLM,AA1,1,Unknown,Unknown,zzz,Unknown Origin,-1.185575,53.94829,GB,EU,Unknown Destination,-0.774444,51.500801,GB,EU
3,AA1,EG64,EGNV,AA1,1,Unknown,Unknown,zzz,Unknown Origin,-1.185575,53.94829,GB,EU,MME,-1.42941,54.509201,GB,EU
4,AA1,EGBJ,EGML,AA1,1,Unknown,Unknown,zzz,GLO,-2.16722,51.894199,GB,EU,Unknown Destination,0.245556,51.528599,GB,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112901,bfx,,LGRP,PRM1,1,PRM1,PJ,zzz,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,RHO,28.086201,36.405399,GR,EU
3112902,bfx,,LOKG,PRM1,1,PRM1,PJ,zzz,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Destination,14.330833,46.532778,AT,EU
3112903,bfx,,LSZB,PRM1,1,PRM1,PJ,zzz,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,BRN,7.499747,46.913419,CH,EU
3112904,bfx,,LSZH,PRM1,1,PRM1,PJ,zzz,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,ZRH,8.548056,47.458056,CH,EU


In [6]:
# Load the fleet spreadsheet and discard rows whose seat count is zero or
# missing (zeros are first turned into NaN so that one dropna removes both).
fleet = (
    pd.read_excel("data/planespotters_fleet.xlsx")
    .assign(**{"Seat Total": lambda df: df["Seat Total"].replace(0, np.nan)})
    .dropna(subset=["Seat Total"])
)

# Table matching the fleet's aircraft names to ICAO type codes. The right
# join keeps every row of the match table.
match = pd.read_csv("data/fleet_match.csv", sep=";")
fleet = pd.merge(
    fleet, match, how="right", left_on="Aircraft Type", right_on="spotter_name"
).drop(columns="spotter_name")

# Mean seat count per (aircraft name, ICAO code) pair; only the ICAO code and
# the mean are kept.
# NOTE(review): if two aircraft names mapped to the same ICAO code, ICAO_AC
# would repeat here and the later left merge on it would duplicate flight rows.
mean_seats = (
    fleet.groupby(["Aircraft Type", "ICAO_AC"])["Seat Total"].mean().reset_index()
)
aircraft_data = mean_seats.drop(columns="Aircraft Type")

In [7]:
# Preview the mean seat count per ICAO aircraft code.
aircraft_data.head()

Unnamed: 0,ICAO_AC,Seat Total
0,AT43,46.897119
1,AT45,46.897119
2,AT72,70.13758
3,AT75,70.13758
4,AT76,70.13758


In [8]:
# Columns identifying one route / airline / aircraft combination. The raw
# OpenSky aircraft_type is deliberately absent, so rows that share the same
# ICAO type are merged together.
route_keys = [
    "origin",
    "dest",
    "origin_iata",
    "origin_lon",
    "origin_lat",
    "origin_country",
    "origin_continent",
    "dest_iata",
    "dest_lon",
    "dest_lat",
    "dest_country",
    "dest_continent",
    "airline_iata",
    "acft_icao",
    "acft_class",
    "seymour_proxy",
]

# Attach the mean seat count of each aircraft type (NaN when unknown).
os_df = pd.merge(
    os_df, aircraft_data, how="left", left_on="acft_icao", right_on="ICAO_AC"
)

# Turn seats per flight into total seats offered by the row's flights.
os_df["Seat Total"] = os_df["n_flights"] * os_df["Seat Total"]

# Sum seats and flights per route key; dropna=False keeps rows whose key
# columns contain missing values (e.g. an unknown origin).
seat_totals = os_df.groupby(route_keys, dropna=False)[["Seat Total", "n_flights"]].sum()
os_df = seat_totals.reset_index().rename(columns={"Seat Total": "seats"})

In [9]:
# Sanity check: total number of flights after the seat merge and aggregation.
os_df.n_flights.sum()

34901942

In [10]:
# Rename the origin_*/dest_* columns to the departure/arrival naming.
renamed_fields = ("lon", "lat", "country", "continent")
column_mapping = {"origin_iata": "iata_departure", "dest_iata": "iata_arrival"}
column_mapping.update(
    {"origin_" + field: "departure_" + field for field in renamed_fields}
)
column_mapping.update(
    {"dest_" + field: "arrival_" + field for field in renamed_fields}
)

# Tag every row with the name of the data source.
os_df = os_df.rename(columns=column_mapping).assign(source="OpenSky")

In [11]:
# Airline code conversion table (one IATA code per ICAO code).
# Sorting by IATA in descending order puts missing IATA codes last, so the
# first row kept for each ICAO code has an IATA code whenever one exists.
airline_codes = pd.read_csv("data/iata_icao_airline.csv", sep=";")
airline_codes = airline_codes.sort_values(by=["IATA"], ascending=False)
iata_icao_convert = airline_codes.drop_duplicates(subset="ICAO", keep="first")

In [12]:
# Look up the IATA code of each airline. At this point "airline_iata" still
# holds the operator's ICAO code, hence the match on the table's ICAO column.
airline_lookup = iata_icao_convert[["IATA", "ICAO"]]
os_df = pd.merge(
    os_df, airline_lookup, how="left", left_on="airline_iata", right_on="ICAO"
)

In [13]:
# Prefer the IATA code; fall back to the previous airline code when the
# conversion table has no IATA entry for it.
resolved_airline = os_df["IATA"].fillna(os_df["airline_iata"])

# Replace the three helper columns with the single resolved code, which is
# appended as the last column under the name "airline_iata".
os_df = os_df.drop(columns=["airline_iata", "ICAO", "IATA"])
os_df["airline_iata"] = resolved_airline

In [14]:
def _route_distance_km(row):
    """Return the distance in km between the two airports of one row.

    Returns 0 when any of the four coordinates is missing or still holds the
    "Unknown Origin" / "Unknown Destination" placeholder.
    """
    coordinate_checks = (
        (row.departure_lat, "Unknown Origin"),
        (row.departure_lon, "Unknown Origin"),
        (row.arrival_lon, "Unknown Destination"),
        (row.arrival_lat, "Unknown Destination"),
    )
    for value, placeholder in coordinate_checks:
        if pd.isna(value) or value == placeholder:
            return 0

    # geopy expects (latitude, longitude) pairs.
    departure_point = (float(row.departure_lat), float(row.departure_lon))
    arrival_point = (float(row.arrival_lat), float(row.arrival_lon))
    return distance.distance(departure_point, arrival_point).km


# Row-wise computation with a tqdm progress bar.
os_df.loc[:, "distance_km"] = os_df.progress_apply(_route_distance_km, axis=1)

  0%|          | 0/3083329 [00:00<?, ?it/s]

In [15]:
# Passenger load factor used to derive RPK from ASK (see the source cited
# below this cell).
# NOTE(review): the cited source is a 2019 report while the flight lists are
# from 2022 — confirm the value is still appropriate.
LOAD_FACTOR = 0.824

# Available seat kilometres: distance flown times seats offered.
os_df["ask"] = os_df["distance_km"] * os_df["seats"]
# Revenue passenger kilometres: ASK scaled by the load factor.
os_df["rpk"] = os_df["ask"] * LOAD_FACTOR

Load factor source: https://www.icao.int/annual-report-2019/Pages/the-world-of-air-transport-in-2019.aspx

In [16]:
# Total number of seats offered over the whole dataset.
os_df["seats"].sum()

2988588216.835884

## Fuel burn computation process

Problem: we do not have a Seymour proxy for every aircraft type, nor do we have the seat count associated with each aircraft as in the AeroSCOPE core usage. 
One could extend the proxy list, but not without increasing the risk of errors.  
Helicopters are particularly often in this situation, as shown below, followed by piston-engine private aircraft, private jets and a few turboprops.  
Fortunately, this is not the majority of the flights.

Interested readers who want to extend the proxy list for a particular use case are free to do so ;) 

In [17]:
# Flights per aircraft class for rows without a fuel-model proxy ("zzz").
without_proxy = os_df["seymour_proxy"] == "zzz"
os_df.loc[without_proxy].groupby("acft_class")["n_flights"].sum().nlargest(50)

acft_class
Unknown    10742417
HE          1075163
TP           422671
PP           414040
PJ           237334
OTHER        141732
EL               92
Name: n_flights, dtype: int64

In [18]:
# Flights per aircraft class for rows that do have a fuel-model proxy.
with_proxy = os_df["seymour_proxy"] != "zzz"
os_df.loc[with_proxy].groupby("acft_class")["n_flights"].sum().nlargest(50)

acft_class
NB       12348436
RJ        2735037
WB        2222971
TP        2165138
PJ        1846506
PP         523708
OTHER       26697
Name: n_flights, dtype: int64

Now, let's compute the fuel burn using the surrogate.

In [19]:
# Surrogate fuel model: one row of polynomial coefficients per aircraft code.
fuel_surrogate = pd.read_csv("data/FuelSurrogate.csv", sep=";")

# Nested lookup {aircraft code: {coefficient name: value}}.
coefficient_columns = ["reduced_fuel_intercept", "reduced_fuel_a1", "reduced_fuel_a2"]
coefficients_by_type = fuel_surrogate.set_index("ac_code_icao")[coefficient_columns]
fuel_surrogate_dict = coefficients_by_type.T.to_dict()

# Aircraft codes covered by the surrogate.
ac_list = fuel_surrogate["ac_code_icao"].unique()


def compute_fuel_seymour(fuel_surrogate_dict, icao_type, distance):
    """Estimate the fuel burn of one flight with the quadratic surrogate.

    Parameters
    ----------
    fuel_surrogate_dict : dict
        Maps an aircraft code to a dict holding the keys
        ``reduced_fuel_intercept``, ``reduced_fuel_a1`` and
        ``reduced_fuel_a2``.
    icao_type : str
        Aircraft code to look up in ``fuel_surrogate_dict``.
    distance : float
        Flight distance (kilometres in this notebook's call).

    Returns
    -------
    float
        ``0.0`` when ``distance`` is below 1, the polynomial value when the
        aircraft code is known, ``nan`` otherwise.
    """
    # Flights shorter than 1 are counted as burning nothing, whatever the type.
    if distance < 1:
        return 0.0

    # Look the type up in the mapping that was passed in rather than in a
    # module-level array: constant-time, and the function is self-contained.
    coefficients = fuel_surrogate_dict.get(icao_type)
    if coefficients is None:
        return float(np.nan)

    # A distance of exactly 1 is evaluated here as well; it previously matched
    # neither comparison and produced nan for known aircraft.
    # NOTE(review): a2 multiplies the linear term and a1 the quadratic one —
    # confirm this matches the convention of the surrogate file.
    fuel_burn = (
        coefficients["reduced_fuel_intercept"]
        + coefficients["reduced_fuel_a2"] * distance
        + coefficients["reduced_fuel_a1"] * distance**2
    )
    return float(fuel_burn)


def _row_fuel_burn(row):
    """Fuel burn of a single flight on this row's route and aircraft proxy."""
    return compute_fuel_seymour(
        fuel_surrogate_dict, row["seymour_proxy"], row["distance_km"]
    )


# Row-wise evaluation with a tqdm progress bar.
os_df.loc[:, "fuel_burn_seymour"] = os_df.progress_apply(_row_fuel_burn, axis=1)

  0%|          | 0/3083329 [00:00<?, ?it/s]

In [20]:
# Display the table with the per-flight fuel burn column added.
os_df

Unnamed: 0,origin,dest,iata_departure,departure_lon,departure_lat,departure_country,departure_continent,iata_arrival,arrival_lon,arrival_lat,...,acft_class,seymour_proxy,seats,n_flights,source,airline_iata,distance_km,ask,rpk,fuel_burn_seymour
0,00AK,00AK,Unknown Origin,-151.692524,59.947733,US,,Unknown Destination,-151.692524,59.947733,...,HE,zzz,0.000000,2,OpenSky,Unkwown,0.000000,0.0,0.0,0.000000
1,00AK,00AK,Unknown Origin,-151.692524,59.947733,US,,Unknown Destination,-151.692524,59.947733,...,HE,zzz,0.000000,1,OpenSky,Unkwown,0.000000,0.0,0.0,0.000000
2,00AK,00AK,Unknown Origin,-151.692524,59.947733,US,,Unknown Destination,-151.692524,59.947733,...,Unknown,zzz,0.000000,7,OpenSky,Unkwown,0.000000,0.0,0.0,0.000000
3,00AK,0AK7,Unknown Origin,-151.692524,59.947733,US,,Unknown Destination,-150.963285,59.776005,...,Unknown,zzz,0.000000,1,OpenSky,Unkwown,45.118036,0.0,0.0,
4,00AK,3AK4,Unknown Origin,-151.692524,59.947733,US,,Unknown Destination,-151.337657,60.632522,...,PP,DHC3,0.000000,1,OpenSky,Unkwown,78.781228,0.0,0.0,29.269134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3083324,,,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Destination,Unknown Destination,Unknown Destination,...,TP,zzz,0.000000,22,OpenSky,YOG,0.000000,0.0,0.0,0.000000
3083325,,,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Destination,Unknown Destination,Unknown Destination,...,NB,B738,16834.327654,96,OpenSky,Y8,0.000000,0.0,0.0,0.000000
3083326,,,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Destination,Unknown Destination,Unknown Destination,...,WB,B744,1913.338369,5,OpenSky,Y8,0.000000,0.0,0.0,0.000000
3083327,,,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Origin,Unknown Destination,Unknown Destination,Unknown Destination,...,OTHER,IL96,0.000000,32,OpenSky,ZAV,0.000000,0.0,0.0,0.000000


In [21]:
# Emission factor applied to the fuel burn.
# Presumably kg of CO2 per kg of jet fuel — TODO confirm the unit and source.
CO2_PER_FUEL = 3.16

# fuel_burn_seymour is the burn of one flight; scale it by the flight count.
os_df["fuel_burn"] = os_df["fuel_burn_seymour"] * os_df["n_flights"]
os_df["co2"] = os_df["fuel_burn"] * CO2_PER_FUEL

# 1 when departure and arrival countries are equal, else 0. Rows whose
# countries are the "Unknown Origin" / "Unknown Destination" placeholders never
# compare equal, so they are flagged 0.
os_df["domestic"] = (os_df["departure_country"] == os_df["arrival_country"]).astype(
    int
)

In [22]:
# Save the final table. No `index` argument is passed, so to_csv also writes
# the DataFrame index as the first column.
os_df.to_csv("data/os_alone_test_22.csv")

In [23]:
# Final sanity check: total number of flights in the exported table.
os_df.n_flights.sum()

34901942