In [None]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import scipy.stats as stats
from pyspark.sql.functions import col, to_timestamp
import datetime

# Switch matplotlib to the PGF backend, then apply the LaTeX-related rc overrides
# (pdflatex as the TeX system, serif family, usetex on, rc fonts off).
matplotlib.use("pgf")
pgf_rc_overrides = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(pgf_rc_overrides)

# Aggregate Data by Top 100 Airports

## Aggregated By Day

In [None]:
# Read the top-100 per-airport aggregates, cast the two numeric columns to float,
# and pull the result into pandas for the statistics cells below.
spark_top_agg_df = (
    spark.read.csv("/user/s1919377/top_100_airport_aggregates.csv", header=True)
    .withColumn("lag", col("lag").cast("float"))
    .withColumn("correlation", col("correlation").cast("float"))
)
top_agg = spark_top_agg_df.toPandas()
top_agg.head(10)

In [None]:
# Summarise the top-100 lag column and draw the normal density with the same
# mean and standard deviation over mean +/- 3 standard deviations.
top_lag_values = top_agg["lag"]
lag_mean, lag_std = np.mean(top_lag_values), np.std(top_lag_values)
print("Lag Mean: ", lag_mean)
print("Lag Std Deviation: ", lag_std)
lag_normal_x = np.linspace(lag_mean - 3*lag_std, lag_mean + 3*lag_std, num=100)
lag_normal_density = stats.norm.pdf(lag_normal_x, loc=lag_mean, scale=lag_std)
plt.plot(lag_normal_x, lag_normal_density)
plt.show()

In [None]:
# Summarise the top-100 correlation column and draw the matching normal density
# over mean +/- 3 standard deviations.
top_corr_values = top_agg["correlation"]
corr_mean = np.mean(top_corr_values)
corr_std = np.std(top_corr_values)
print("Correlation Mean: ", corr_mean)
print("Correlation Std Deviation: ", corr_std)
corr_normal_x = np.linspace(corr_mean - 3*corr_std, corr_mean + 3*corr_std, num=100)
corr_normal_density = stats.norm.pdf(corr_normal_x, loc=corr_mean, scale=corr_std)
plt.plot(corr_normal_x, corr_normal_density)
plt.show()

## Aggregated By Period

In [None]:
# Read the top-100 per-period aggregates, cast lag/correlation to float,
# and convert to pandas.
spark_top_agg_period_df = (
    spark.read.csv("/user/s1919377/top_100_airport_aggregates_by_period.csv", header=True)
    .withColumn("lag", col("lag").cast("float"))
    .withColumn("correlation", col("correlation").cast("float"))
)
top_agg_period = spark_top_agg_period_df.toPandas()
top_agg_period.head(10)

In [None]:
# Mean / standard deviation of the per-period lag for the top 100 airports,
# plus the normal density with those parameters.
top_period_lag_values = top_agg_period["lag"]
period_lag_mean, period_lag_std = np.mean(top_period_lag_values), np.std(top_period_lag_values)
print("Period Lag Mean: ", period_lag_mean)
print("Period Lag Std Deviation: ", period_lag_std)
period_lag_normal_x = np.linspace(
    period_lag_mean - 3*period_lag_std,
    period_lag_mean + 3*period_lag_std,
    num=100,
)
period_lag_density = stats.norm.pdf(period_lag_normal_x, loc=period_lag_mean, scale=period_lag_std)
plt.plot(period_lag_normal_x, period_lag_density)
plt.show()

In [None]:
# Mean / standard deviation of the per-period correlation for the top 100 airports,
# plus the normal density with those parameters.
top_period_corr_values = top_agg_period["correlation"]
period_corr_mean = np.mean(top_period_corr_values)
period_corr_std = np.std(top_period_corr_values)
print("Period Correlation Mean: ", period_corr_mean)
print("Period Correlation Std Deviation: ", period_corr_std)
period_corr_normal_x = np.linspace(
    period_corr_mean - 3*period_corr_std,
    period_corr_mean + 3*period_corr_std,
    num=100,
)
period_corr_density = stats.norm.pdf(period_corr_normal_x, loc=period_corr_mean, scale=period_corr_std)
plt.plot(period_corr_normal_x, period_corr_density)
plt.show()

# Aggregate Data by Mean of Top 100 Airports

In [None]:
# Read the per-day means for the top 100 airports: numeric columns become floats,
# "day" becomes a timestamp, rows are sorted by day, and the pandas frame is indexed by day.
spark_top_date_agg_df = (
    spark.read.csv("/user/s1919377/top_100_date_aggregates.csv", header=True)
    .withColumn("mean_count", col("mean_count").cast("float"))
    .withColumn("mean_cases", col("mean_cases").cast("float"))
    .withColumn("day", to_timestamp("day"))
    .sort(col("day").asc())
)
top_date_agg = spark_top_date_agg_df.toPandas().set_index("day")
top_date_agg.head(10)

In [None]:
# Read the per-period means for the top 100 airports: numeric columns become floats,
# the range bounds become timestamps, and rows are sorted by the "period" column.
spark_top_period_agg_df = (
    spark.read.csv("/user/s1919377/top_100_date_aggregates_by_period.csv", header=True)
    .withColumn("mean_count", col("mean_count").cast("float"))
    .withColumn("mean_cases", col("mean_cases").cast("float"))
    .withColumn("range_start", to_timestamp("range_start"))
    .withColumn("range_end", to_timestamp("range_end"))
    .sort(col("period").asc())
)
top_period_agg = spark_top_period_agg_df.toPandas()
top_period_agg.head(10)

In [None]:
# 20-row simple moving average of both series, and a 40-row rolling correlation
# between mean flights and mean cases, for the top 100 airports.
top_flights_and_cases = top_date_agg[["mean_count", "mean_cases"]]
rolling_avg_agg = top_flights_and_cases.rolling(window=20).mean()
rolling_corr_agg = (
    top_date_agg["mean_count"]
    .rolling(window=40)
    .corr(top_date_agg["mean_cases"])
)

In [None]:
# Moving-average flights and cases for the top 100 airports, saved as top_means.pgf.
x = top_date_agg.index.values

# Draw on an explicitly created figure. The cells above plot onto pyplot's implicit
# current figure and only call plt.show(), which does not close figures under the
# non-GUI pgf backend, so without a fresh figure their curves could end up in the saved file.
fig, ax = plt.subplots()
ax.plot(x, rolling_avg_agg["mean_count"], label="Mean Incoming flights")
ax.plot(x, rolling_avg_agg["mean_cases"], label="Mean Cases per 1m people")

# Vertical markers at fixed dates, alternating dashed and solid grey lines.
marker_dates = [
    datetime.datetime(2020, 5, 31),
    datetime.datetime(2020, 8, 31),
    datetime.datetime(2020, 11, 30),
    datetime.datetime(2021, 2, 28),
    datetime.datetime(2021, 5, 31),
    datetime.datetime(2021, 8, 31),
]
for marker_index, marker_date in enumerate(marker_dates):
    marker_style = {"linestyle": "dashed"} if marker_index % 2 == 0 else {}
    ax.axvline(marker_date, color="grey", **marker_style)

ax.set_title("Flights and Cases of Top 100 Airports (Simple Moving Average)")
ax.legend(loc="upper left")
fig.autofmt_xdate()
fig.savefig('top_means.pgf')

In [None]:
# Rolling correlation between mean flights and mean cases for the top 100 airports,
# saved as corrs.pgf.
#
# This cell previously plotted bot_rolling_corr_agg, which is only created in the
# "Bottom 100" section further down, so it raised NameError on Restart & Run All.
# It now plots rolling_corr_agg, the series computed for this section.
# NOTE(review): if corrs.pgf is meant to hold the bottom-100 curve, move that plot
# below the cell that defines bot_rolling_corr_agg instead.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(rolling_corr_agg.index.values, rolling_corr_agg, label="Top 100 Correlation")
ax.axhline(0, color='black')

# Vertical markers at fixed dates, alternating dashed and solid grey lines.
marker_dates = [
    datetime.datetime(2020, 5, 31),
    datetime.datetime(2020, 8, 31),
    datetime.datetime(2020, 11, 30),
    datetime.datetime(2021, 2, 28),
    datetime.datetime(2021, 5, 31),
    datetime.datetime(2021, 8, 31),
]
for marker_index, marker_date in enumerate(marker_dates):
    marker_style = {"linestyle": "dashed"} if marker_index % 2 == 0 else {}
    ax.axvline(marker_date, color="grey", **marker_style)

ax.set_title("Correlation of Flights and Cases Top 100 Airports")
fig.autofmt_xdate()
fig.savefig('corrs.pgf')

# Aggregate Data by Bottom 100 Airports

## Aggregated By Day

In [None]:
# Read the bottom-100 per-airport aggregates, cast lag/correlation to float,
# and convert to pandas.
spark_bottom_agg_df = (
    spark.read.csv("/user/s1919377/bottom_100_airport_aggregates.csv", header=True)
    .withColumn("lag", col("lag").cast("float"))
    .withColumn("correlation", col("correlation").cast("float"))
)
bot_agg = spark_bottom_agg_df.toPandas()
bot_agg.head(10)

In [None]:
# Summarise the bottom-100 lag column and draw the normal density with the same
# mean and standard deviation over mean +/- 3 standard deviations.
bot_lag_values = bot_agg["lag"]
bot_lag_mean, bot_lag_std = np.mean(bot_lag_values), np.std(bot_lag_values)
print("Lag Mean: ", bot_lag_mean)
print("Lag Std Deviation: ", bot_lag_std)
bot_lag_normal_x = np.linspace(bot_lag_mean - 3*bot_lag_std, bot_lag_mean + 3*bot_lag_std, num=100)
bot_lag_density = stats.norm.pdf(bot_lag_normal_x, loc=bot_lag_mean, scale=bot_lag_std)
plt.plot(bot_lag_normal_x, bot_lag_density)
plt.show()

In [None]:
# Summarise the bottom-100 correlation column and draw the matching normal density
# over mean +/- 3 standard deviations.
bot_corr_values = bot_agg["correlation"]
bot_corr_mean = np.mean(bot_corr_values)
bot_corr_std = np.std(bot_corr_values)
print("Correlation Mean: ", bot_corr_mean)
print("Correlation Std Deviation: ", bot_corr_std)
bot_corr_normal_x = np.linspace(bot_corr_mean - 3*bot_corr_std, bot_corr_mean + 3*bot_corr_std, num=100)
bot_corr_density = stats.norm.pdf(bot_corr_normal_x, loc=bot_corr_mean, scale=bot_corr_std)
plt.plot(bot_corr_normal_x, bot_corr_density)
plt.show()

## Aggregated By Period

In [None]:
# Read the bottom-100 per-period aggregates, cast lag/correlation to float,
# and convert to pandas.
spark_bottom_agg_period_df = (
    spark.read.csv("/user/s1919377/bottom_100_airport_aggregates_by_period.csv", header=True)
    .withColumn("lag", col("lag").cast("float"))
    .withColumn("correlation", col("correlation").cast("float"))
)
bot_agg_period = spark_bottom_agg_period_df.toPandas()
bot_agg_period.head(10)

In [None]:
# Mean / standard deviation of the per-period lag for the bottom 100 airports,
# plus the normal density with those parameters.
bot_period_lag_values = bot_agg_period["lag"]
bot_period_lag_mean = np.mean(bot_period_lag_values)
bot_period_lag_std = np.std(bot_period_lag_values)
print("Period Lag Mean: ", bot_period_lag_mean)
print("Period Lag Std Deviation: ", bot_period_lag_std)
bot_period_lag_normal_x = np.linspace(
    bot_period_lag_mean - 3*bot_period_lag_std,
    bot_period_lag_mean + 3*bot_period_lag_std,
    num=100,
)
bot_period_lag_density = stats.norm.pdf(
    bot_period_lag_normal_x, loc=bot_period_lag_mean, scale=bot_period_lag_std
)
plt.plot(bot_period_lag_normal_x, bot_period_lag_density)
plt.show()

In [None]:
# Mean / standard deviation of the per-period correlation for the bottom 100 airports,
# plus the normal density with those parameters.
bot_period_corr_mean = np.mean(bot_agg_period["correlation"])
bot_period_corr_std = np.std(bot_agg_period["correlation"])
print("Period Correlation Mean: ", bot_period_corr_mean)
# Report the bottom-100 value; this line used to print period_corr_std from the top-100 section.
print("Period Correlation Std Deviation: ", bot_period_corr_std)
bot_period_corr_normal_x = np.linspace(bot_period_corr_mean - 3*bot_period_corr_std, bot_period_corr_mean + 3*bot_period_corr_std, 100)
plt.plot(bot_period_corr_normal_x, stats.norm.pdf(bot_period_corr_normal_x, bot_period_corr_mean, bot_period_corr_std))
plt.show()

# Aggregate Data by Mean of Bottom 100 Airports

In [None]:
# Read the per-day means for the bottom 100 airports: numeric columns become floats,
# "day" becomes a timestamp, rows are sorted by day, and the pandas frame is indexed by day.
spark_bottom_date_agg_df = (
    spark.read.csv("/user/s1919377/bottom_100_date_aggregates.csv", header=True)
    .withColumn("mean_count", col("mean_count").cast("float"))
    .withColumn("mean_cases", col("mean_cases").cast("float"))
    .withColumn("day", to_timestamp("day"))
    .sort(col("day").asc())
)
bot_date_agg = spark_bottom_date_agg_df.toPandas().set_index("day")
bot_date_agg.head(10)

In [None]:
# Per-period means for the bottom 100 airports.
# The CSV is read without schema inference, so every numeric column is cast explicitly.
# mean_correlation is cast as well: later cells display and plot it as a number, and
# without the cast it would reach pandas as strings.
# NOTE(review): "period" is not cast, so the sort uses the raw CSV value — confirm the
# resulting order is the intended chronological one.
spark_bot_period_agg_df = spark.read.csv("/user/s1919377/bottom_100_date_aggregates_by_period.csv", header=True) \
    .withColumn("mean_count", col("mean_count").cast("float")) \
    .withColumn("mean_cases", col("mean_cases").cast("float")) \
    .withColumn("mean_correlation", col("mean_correlation").cast("float")) \
    .withColumn("range_start", to_timestamp("range_start")) \
    .withColumn("range_end", to_timestamp("range_end")) \
    .sort(col("period").asc())
bot_period_agg = spark_bot_period_agg_df \
    .toPandas()
bot_period_agg.head(10)

In [None]:
# 20-row simple moving average of both series, and a 40-row rolling correlation
# between mean flights and mean cases, for the bottom 100 airports.
bot_rolling_avg_agg = bot_date_agg[["mean_count", "mean_cases"]].rolling(20).mean()
# Both operands come from bot_date_agg; the cases series used to be taken from
# top_date_agg, which correlated bottom-100 flights against top-100 cases.
bot_rolling_corr_agg = bot_date_agg["mean_count"].rolling(40).corr(bot_date_agg["mean_cases"])

In [None]:
# Moving-average flights and cases for the bottom 100 airports, saved as bot_means.pgf.
# The x axis comes from bot_date_agg (the frame the plotted series were derived from);
# it used to be taken from top_date_agg, which only works if both frames share the same days.
x = bot_date_agg.index.values

# Draw on an explicitly created figure so that anything left on pyplot's implicit
# current figure by earlier cells cannot end up in the saved file.
fig, ax = plt.subplots()
ax.plot(x, bot_rolling_avg_agg["mean_count"], label="Mean Incoming flights")
# NOTE(review): the series is divided by 50 while the label says "per 50m people";
# confirm the unit of mean_cases and that the label matches the scaling.
ax.plot(x, bot_rolling_avg_agg["mean_cases"] / 50, label="Mean Cases per 50m people")

# Vertical markers at fixed dates, alternating dashed and solid grey lines.
marker_dates = [
    datetime.datetime(2020, 5, 31),
    datetime.datetime(2020, 8, 31),
    datetime.datetime(2020, 11, 30),
    datetime.datetime(2021, 2, 28),
    datetime.datetime(2021, 5, 31),
    datetime.datetime(2021, 8, 31),
]
for marker_index, marker_date in enumerate(marker_dates):
    marker_style = {"linestyle": "dashed"} if marker_index % 2 == 0 else {}
    ax.axvline(marker_date, color="grey", **marker_style)

fig.autofmt_xdate()
ax.legend(loc="upper left")
ax.set_title("Flights and Cases of Bottom 100 Airports (Simple Moving Average)")
fig.savefig('bot_means.pgf')

In [None]:
# Display the per-period mean correlation column of the bottom-100 frame.
bot_period_agg.loc[:, "mean_correlation"]

In [None]:
# Rolling correlation for the bottom 100 airports, with the per-period mean
# correlation overlaid as a step line.
corr_dates = bot_rolling_corr_agg.index.values

# Each period's value is repeated once per day of its range.
period_days = (bot_period_agg['range_end'] - bot_period_agg['range_start']).dt.days
# Convert to float before plotting: the column is not cast when the CSV is loaded,
# so it may still hold strings here. astype(float) is a no-op if it is already numeric.
period_corr = bot_period_agg['mean_correlation'].astype(float)
# NOTE(review): the 3 leading zeros presumably align the step line with the first
# plotted day — confirm that the padded length matches the number of plotted days.
period_corr_daily = np.pad(np.repeat(period_corr.to_numpy(), period_days.to_numpy()), (3, 0))

plt.figure(figsize=(20,10))
plt.plot(corr_dates, bot_rolling_corr_agg, label="Average Correlation (Rolling)")
# The label now names what is drawn; it used to read "Average Incoming Flights in Period".
plt.plot(corr_dates, period_corr_daily, label="Mean Correlation by Period")
plt.axhline(0, color='black')

# Vertical markers at fixed dates, alternating dashed and solid grey lines.
marker_dates = [
    datetime.datetime(2020, 5, 31),
    datetime.datetime(2020, 8, 31),
    datetime.datetime(2020, 11, 30),
    datetime.datetime(2021, 2, 28),
    datetime.datetime(2021, 5, 31),
    datetime.datetime(2021, 8, 31),
]
for marker_index, marker_date in enumerate(marker_dates):
    marker_style = {"linestyle": "dashed"} if marker_index % 2 == 0 else {}
    plt.axvline(marker_date, color="grey", **marker_style)

plt.legend(loc="upper left")
plt.show()

In [None]:
def read_airport_aggregates(csv_path):
    """Read an aggregates CSV with a header row and cast lag/correlation to float.

    Returns the Spark DataFrame; callers convert it to pandas themselves.
    """
    return (
        spark.read.csv(csv_path, header=True)
        .withColumn("lag", col("lag").cast("float"))
        .withColumn("correlation", col("correlation").cast("float"))
    )


# Per-airport aggregates for the two groups and the three individual airports.
spark_top_agg_df = read_airport_aggregates("/user/s1919377/top_100_airport_aggregates.csv")
top_agg = spark_top_agg_df.toPandas()

spark_bottom_agg_df = read_airport_aggregates("/user/s1919377/bottom_100_airport_aggregates.csv")
bot_agg = spark_bottom_agg_df.toPandas()

spark_eham_agg_df = read_airport_aggregates("/user/s1919377/eham_airport_aggregates.csv")
eham_agg = spark_eham_agg_df.toPandas()

spark_klax_agg_df = read_airport_aggregates("/user/s1919377/klax_airport_aggregates.csv")
klax_agg = spark_klax_agg_df.toPandas()

spark_omdb_agg_df = read_airport_aggregates("/user/s1919377/omdb_airport_aggregates.csv")
omdb_agg = spark_omdb_agg_df.toPandas()

In [None]:
def read_period_aggregates(csv_path):
    """Read a by-period aggregates CSV with a header row and cast lag/correlation to float.

    Returns the Spark DataFrame; callers convert it to pandas themselves.
    """
    return (
        spark.read.csv(csv_path, header=True)
        .withColumn("lag", col("lag").cast("float"))
        .withColumn("correlation", col("correlation").cast("float"))
    )


# Per-period aggregates for the two groups and the three individual airports.
spark_top_agg_period_df = read_period_aggregates("/user/s1919377/top_100_airport_aggregates_by_period.csv")
top_agg_period = spark_top_agg_period_df.toPandas()

spark_bottom_agg_period_df = read_period_aggregates("/user/s1919377/bottom_100_airport_aggregates_by_period.csv")
bot_agg_period = spark_bottom_agg_period_df.toPandas()

spark_eham_agg_period_df = read_period_aggregates("/user/s1919377/eham_airport_aggregates_by_period.csv")
eham_agg_period = spark_eham_agg_period_df.toPandas()

spark_klax_agg_period_df = read_period_aggregates("/user/s1919377/klax_airport_aggregates_by_period.csv")
klax_agg_period = spark_klax_agg_period_df.toPandas()

spark_omdb_agg_period_df = read_period_aggregates("/user/s1919377/omdb_airport_aggregates_by_period.csv")
omdb_agg_period = spark_omdb_agg_period_df.toPandas()

In [None]:
# Mean lag of the top 100 airports (per day and per period), each with a 95%
# Student-t interval built from the sample size and the standard error of the mean.
for mean_label, interval_label, lag_sample in (
    ("Top Lag Mean: ", "Top Lag Mean Confidence Interval: ", top_agg["lag"]),
    ("Top Lag Mean Period: ", "Top Lag Mean Period Confidence Interval: ", top_agg_period["lag"]),
):
    lag_sample_mean = np.mean(lag_sample)
    print(mean_label, lag_sample_mean)
    print(
        interval_label,
        stats.t.interval(0.95, len(lag_sample) - 1, loc=lag_sample_mean, scale=stats.sem(lag_sample)),
    )

In [None]:
# Mean correlation of the top 100 airports (per day and per period), each with a
# 95% Student-t interval around the mean.
top_corr_sample = top_agg["correlation"]
top_corr_interval = stats.t.interval(
    0.95,
    len(top_corr_sample) - 1,
    loc=np.mean(top_corr_sample),
    scale=stats.sem(top_corr_sample),
)
print("Top Correlation Mean: ", np.mean(top_corr_sample))
print("Top Correlation Mean Confidence Interval: ", top_corr_interval)

top_period_corr_sample = top_agg_period["correlation"]
top_period_corr_interval = stats.t.interval(
    0.95,
    len(top_period_corr_sample) - 1,
    loc=np.mean(top_period_corr_sample),
    scale=stats.sem(top_period_corr_sample),
)
print("Top Correlation Mean Period: ", np.mean(top_period_corr_sample))
print("Top Correlation Mean Period Confidence Interval: ", top_period_corr_interval)

In [None]:
# Mean lag of the bottom 100 airports (per day and per period), each with a 95%
# Student-t interval built from the sample size and the standard error of the mean.
for mean_label, interval_label, lag_sample in (
    ("Bottom Lag Mean: ", "Bottom Lag Mean Confidence Interval: ", bot_agg["lag"]),
    ("Bottom Lag Mean Period: ", "Bottom Lag Mean Period Confidence Interval: ", bot_agg_period["lag"]),
):
    lag_sample_mean = np.mean(lag_sample)
    print(mean_label, lag_sample_mean)
    print(
        interval_label,
        stats.t.interval(0.95, len(lag_sample) - 1, loc=lag_sample_mean, scale=stats.sem(lag_sample)),
    )

In [None]:
# Mean correlation of the bottom 100 airports with 95% Student-t intervals.
bot_corr_sample = bot_agg["correlation"]
print("Bottom Correlation Mean: ", np.mean(bot_corr_sample))
print(
    "Bottom Correlation Mean Confidence Interval: ",
    stats.t.interval(
        0.95,
        len(bot_corr_sample) - 1,
        loc=np.mean(bot_corr_sample),
        scale=stats.sem(bot_corr_sample),
    ),
)

# For the per-period figures, NaN correlations are masked out before every statistic.
bot_period_corr_all = bot_agg_period["correlation"]
bot_period_corr_valid = bot_period_corr_all[~np.isnan(bot_period_corr_all)]
print("Bottom Correlation Mean Period: ", np.mean(bot_period_corr_valid))
print(
    "Bottom Correlation Mean Period Confidence Interval: ",
    stats.t.interval(
        0.95,
        len(bot_period_corr_valid) - 1,
        loc=np.mean(bot_period_corr_valid),
        scale=stats.sem(bot_period_corr_valid),
    ),
)

In [None]:
# EHAM: overall mean, per-period mean, and a 95% Student-t interval on the
# per-period values, first for lag and then for correlation.
for column, mean_label, period_label, interval_label in (
    ("lag", "EHAM Lag Mean: ", "EHAM Lag Mean Period: ", "EHAM Lag Mean Period Confidence Interval: "),
    (
        "correlation",
        "EHAM Correlation Mean: ",
        "EHAM Correlation Mean Period: ",
        "EHAM Correlation Mean Period Confidence Interval: ",
    ),
):
    eham_period_sample = eham_agg_period[column]
    eham_period_mean = np.mean(eham_period_sample)
    print(mean_label, np.mean(eham_agg[column]))
    print(period_label, eham_period_mean)
    print(
        interval_label,
        stats.t.interval(
            0.95,
            len(eham_period_sample) - 1,
            loc=eham_period_mean,
            scale=stats.sem(eham_period_sample),
        ),
    )

In [None]:
# KLAX: overall mean, per-period mean, and a 95% Student-t interval on the
# per-period values, first for lag and then for correlation.
for column, mean_label, period_label, interval_label in (
    ("lag", "KLAX Lag Mean: ", "KLAX Lag Mean Period: ", "KLAX Lag Mean Period Confidence Interval: "),
    (
        "correlation",
        "KLAX Correlation Mean: ",
        "KLAX Correlation Mean Period: ",
        "KLAX Correlation Mean Period Confidence Interval: ",
    ),
):
    klax_period_sample = klax_agg_period[column]
    klax_period_mean = np.mean(klax_period_sample)
    print(mean_label, np.mean(klax_agg[column]))
    print(period_label, klax_period_mean)
    print(
        interval_label,
        stats.t.interval(
            0.95,
            len(klax_period_sample) - 1,
            loc=klax_period_mean,
            scale=stats.sem(klax_period_sample),
        ),
    )

In [None]:
# OMDB: overall mean, per-period mean, and a 95% Student-t interval on the
# per-period values, first for lag and then for correlation.
for column, mean_label, period_label, interval_label in (
    ("lag", "OMDB Lag Mean: ", "OMDB Lag Mean Period: ", "OMDB Lag Mean Period Confidence Interval: "),
    (
        "correlation",
        "OMDB Correlation Mean: ",
        "OMDB Correlation Mean Period: ",
        "OMDB Correlation Mean Period Confidence Interval: ",
    ),
):
    omdb_period_sample = omdb_agg_period[column]
    omdb_period_mean = np.mean(omdb_period_sample)
    print(mean_label, np.mean(omdb_agg[column]))
    print(period_label, omdb_period_mean)
    print(
        interval_label,
        stats.t.interval(
            0.95,
            len(omdb_period_sample) - 1,
            loc=omdb_period_mean,
            scale=stats.sem(omdb_period_sample),
        ),
    )