In [None]:
from pyspark.sql.functions import col
from IPython.core.display import HTML
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.animation import FuncAnimation
from scipy import interpolate

%matplotlib inline
plt.rcParams['animation.embed_limit'] = 2**128
plt.rcParams["animation.html"] = "jshtml"
display(HTML("<style>pre { white-space: pre !important; }</style>"))
sc.setLogLevel("ERROR")

# Description of the dataset

One file per month is provided as a csv file with the following
features:

- **callsign**: the identifier of the flight displayed on ATC screens
  (usually the first three letters are reserved for an airline: AFR
  for Air France, DLH for Lufthansa, etc.);
- **number**: the commercial number of the flight, when available (the
  matching with the callsign comes from a public open API);
- **icao24**: the transponder unique identification number;
- **registration**: the aircraft tail number (when available);
- **typecode**: the aircraft model type (when available);
- **origin**: a four-letter code for the origin airport of the flight
  (when available);
- **destination**: a four-letter code for the destination airport of
  the flight (when available);
- **firstseen**: the UTC timestamp of the first message received by
  the OpenSky Network;
- **lastseen**: the UTC timestamp of the last message received by the
  OpenSky Network;
- **day**: the UTC day of the last message received by the OpenSky
  Network.

# Simple Data Separation and Exploration

In [None]:
# Read every monthly CSV file into a single Spark DataFrame; the first line of
# each file holds the column names. No schema is inferred, so all columns are strings.
flights_glob = "/user/s1919377/flights/*"
df = spark.read.csv(flights_glob, header=True)

# Preview the first 20 rows as a pandas frame for rich display.
df.limit(20).toPandas()

## Two Days With and Without COVID-19

In [None]:
# Keep only the flights whose `day` is one of the two reference days in August 2019.
two_days_non_covid = df.where(
    col('day').isin("2019-08-01 00:00:00+00:00", "2019-08-02 00:00:00+00:00")
)
# Collect the selection to the driver as a pandas DataFrame.
two_days_non_covid_pandas = two_days_non_covid.toPandas()

In [None]:
# Convert both timestamp columns from strings to pandas datetimes, in place.
# NOTE(review): the format has no UTC-offset directive, while the `day` values
# used for filtering carry a "+00:00" suffix - confirm it matches these columns.
for _ts_col in ('firstseen', 'lastseen'):
    two_days_non_covid_pandas[_ts_col] = pd.to_datetime(
        two_days_non_covid_pandas[_ts_col], format="%Y-%m-%d %H:%M:%S"
    )
two_days_non_covid_pandas.head()

In [None]:
# Keep only the flights whose `day` is one of the two reference days in April 2020.
two_days_covid = df.where(
    col('day').isin("2020-04-01 00:00:00+00:00", "2020-04-02 00:00:00+00:00")
)
# Collect the selection to the driver as a pandas DataFrame.
two_days_covid_pandas = two_days_covid.toPandas()

In [None]:
# Convert both timestamp columns from strings to pandas datetimes, in place.
# NOTE(review): the format has no UTC-offset directive, while the `day` values
# used for filtering carry a "+00:00" suffix - confirm it matches these columns.
for _ts_col in ('firstseen', 'lastseen'):
    two_days_covid_pandas[_ts_col] = pd.to_datetime(
        two_days_covid_pandas[_ts_col], format="%Y-%m-%d %H:%M:%S"
    )
two_days_covid_pandas.head()

# Data Visualization

In [None]:
def make_animation(dataframe, steps, interval, bounds=[[-180, -90], [180, 90]]):
    """Animate flights as dots moving in a straight line between two positions.

    Each row of ``dataframe`` is drawn as one dot on a map. The dot sits at
    (``longitude_1``, ``latitude_1``) until the row's ``firstseen`` time, moves
    linearly towards (``longitude_2``, ``latitude_2``) and stays there from
    ``lastseen`` onwards. The time window from the earliest ``firstseen`` to
    the latest ``lastseen`` is split into ``steps`` animation frames.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``firstseen`` and ``lastseen`` (convertible
        to 64-bit integers, e.g. datetimes) and ``longitude_1``,
        ``latitude_1``, ``longitude_2``, ``latitude_2`` (convertible to float).
    steps : int
        Number of animation frames; must be positive.
    interval : int
        Delay between frames, passed straight to ``FuncAnimation``.
    bounds : sequence
        Map extent as ``[[lon_min, lat_min], [lon_max, lat_max]]``. The default
        list is only read, never modified.

    Returns
    -------
    matplotlib.animation.FuncAnimation

    Raises
    ------
    ValueError
        If ``steps`` is not positive.

    Notes
    -----
    Positions are interpolated per flight in plain longitude/latitude space,
    so a flight crossing the antimeridian is drawn the long way round.
    """
    if steps <= 0:
        raise ValueError("steps must be a positive number of frames")

    # The np.int / np.float aliases were removed in NumPy 1.24, so use
    # explicit dtypes instead.
    times = dataframe[['firstseen', 'lastseen']].astype(np.int64).to_numpy()
    values_lon = dataframe[['longitude_1', 'longitude_2']].astype(float).to_numpy()
    values_lat = dataframe[['latitude_1', 'latitude_2']].astype(float).to_numpy()

    # Time window covered by the animation (an empty frame has no window).
    if len(times):
        min_time = times[:, 0].min()
        max_time = times[:, 1].max()
    else:
        min_time = max_time = 0
    time_span = max_time - min_time

    # Frame index at which every flight starts and ends.
    if time_span > 0:
        time_per_step = time_span / steps
        starts_ends = np.round((times - min_time) / time_per_step).astype(np.int64)
    else:
        # All timestamps coincide: every flight starts and ends at frame 0.
        starts_ends = np.zeros(times.shape, dtype=np.int64)
    start_step = starts_ends[:, 0]
    end_step = starts_ends[:, 1]
    # Flights shorter than one frame would divide by zero below; give them a
    # nominal length of one frame (they jump to the end position at end_step).
    flight_steps = np.maximum(end_step - start_step, 1)

    lon_from, lon_to = values_lon[:, 0], values_lon[:, 1]
    lat_from, lat_to = values_lat[:, 0], values_lat[:, 1]

    def positions_at(frame):
        """Return an (n_flights, 2) array of lon/lat positions for ``frame``."""
        # Each flight is interpolated on its own, so no position can leak
        # from a neighbouring row and memory stays O(n_flights).
        progress = np.clip((frame - start_step) / flight_steps, 0.0, 1.0)
        progress = np.where(frame >= end_step, 1.0, progress)
        return np.column_stack((
            lon_from + progress * (lon_to - lon_from),
            lat_from + progress * (lat_to - lat_from),
        ))

    fig, ax = plt.subplots(figsize=(20, 10))

    # MAP CODE
    m = Basemap(ax=ax, llcrnrlon=bounds[0][0], llcrnrlat=bounds[0][1], urcrnrlon=bounds[1][0], urcrnrlat=bounds[1][1])
    m.fillcontinents(color="#FFDDCC", lake_color='#DDEEFF')
    m.drawmapboundary(fill_color="#DDEEFF")
    m.drawcoastlines()

    # THE PLOT ITSELF
    initial = positions_at(0)
    scatter = ax.scatter(initial[:, 0], initial[:, 1], color='red', s=0.5, zorder=3)

    def animate(i):
        """Move every dot to its position at frame ``i``."""
        scatter.set_offsets(positions_at(i))
        # FuncAnimation expects an iterable of artists (required for blitting).
        return (scatter,)

    return FuncAnimation(fig, animate, interval=interval, frames=steps)

## Flights Before COVID-19

In [None]:
# World-wide animation of the two August 2019 days: 200 frames, 50 between-frame interval.
anim_non_covid = make_animation(two_days_non_covid_pandas, steps=200, interval=50)
anim_non_covid

## Flights During COVID-19

In [None]:
# World-wide animation of the two April 2020 days: 200 frames, 50 between-frame interval.
anim_covid = make_animation(two_days_covid_pandas, steps=200, interval=50)
anim_covid

In [None]:
# Bounding box as [[lon_min, lat_min], [lon_max, lat_max]].
europe_bounds = [
    [-24.0, 34.41],
    [49.98, 71.28]
]
(_lon_min, _lat_min), (_lon_max, _lat_max) = europe_bounds

# A flight is kept when at least one of its two recorded positions lies
# inside the box (both edges inclusive).
_first_point_in_box = (
    col('longitude_1').between(_lon_min, _lon_max)
    & col('latitude_1').between(_lat_min, _lat_max)
)
_second_point_in_box = (
    col('longitude_2').between(_lon_min, _lon_max)
    & col('latitude_2').between(_lat_min, _lat_max)
)
europe_flights_non_covid = two_days_non_covid.where(_first_point_in_box | _second_point_in_box)

# Collect the selection to the driver as a pandas DataFrame.
europe_flights_non_covid_pd = europe_flights_non_covid.toPandas()

In [None]:
# Convert both timestamp columns from strings to pandas datetimes, in place.
# NOTE(review): the format has no UTC-offset directive, while the `day` values
# used for filtering carry a "+00:00" suffix - confirm it matches these columns.
for _ts_col in ('firstseen', 'lastseen'):
    europe_flights_non_covid_pd[_ts_col] = pd.to_datetime(
        europe_flights_non_covid_pd[_ts_col], format="%Y-%m-%d %H:%M:%S"
    )
europe_flights_non_covid_pd.head()

In [None]:
# Same animation settings as above, with the map restricted to the Europe bounding box.
europe_non_covid_anim = make_animation(
    europe_flights_non_covid_pd, steps=200, interval=50, bounds=europe_bounds
)
europe_non_covid_anim

In [None]:
# Number of rows per `origin` value over the whole dataset (lazy Spark
# DataFrame with the columns `origin` and `count`).
df_origins_count = df.groupBy(col('origin')).count()
