In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import powerlaw
import numpy as np
import seaborn as sns
from collections import Counter
from operator import itemgetter


## Scale Free Analysis

In [None]:
# read stored data from data_preparation notebook
# @formatter:off
%store -r df_airports
%store -r airports_dict
%store -r df_merged
# @formatter:on
GG = nx.read_gml('Graphs/airlines.gml')


    Produce a power-law fit of the in-degree distribution

In [None]:
# TODO: latexify the graphs

# In-degree of every node, listed in the iteration order of GG.nodes().
# (Swap in GG.out_degree or GG.degree to analyse out-degree / total degree.)
GG_in_degree_dict = dict(GG.in_degree)
G_DEGS = [GG_in_degree_dict[i] for i in GG.nodes()]
# A power law is only defined for k > 0, so nodes without incoming edges are
# excluded from both the fit and the empirical PDF below.
G_DEGS_NO_ZERO = [x for x in G_DEGS if x > 0]

# Fit on the same zero-free data that is plotted, so the fitted curve and the
# data points describe the same sample.
# NOTE(review): powerlaw.Fit appears to discard non-positive values on its own
# (with a warning), so the fitted parameters should be unchanged - confirm.
fit = powerlaw.Fit(G_DEGS_NO_ZERO, discrete=True)
alpha = fit.power_law.alpha
sigma = fit.power_law.sigma

# Degree range of the network; not used by the cells in this section but kept
# in case a later cell reads it.
xx = np.arange(min(G_DEGS), max(G_DEGS))

fig, ax = plt.subplots(dpi=300)
ax.set_title("Powerlaw Fit (discrete) for Air Travel Network")
ax.set_xlabel("In Degree k")
ax.set_ylabel("P(k)")
# Raw f-string: "\g" in "$\gamma$" is not a valid escape sequence otherwise.
fit.power_law.plot_pdf(color='r', ax=ax, label=rf'Power Law Fit ($\gamma$={alpha:.2f})')
# Pass ax explicitly so both curves are guaranteed to land on the same axes.
powerlaw.plot_pdf(G_DEGS_NO_ZERO, ax=ax, color="b", marker="o", label="data points (binned)")
ax.legend()
plt.show()

print("\n\nalpha (gamma in slides) :=", alpha)
print("sigma (error) :=", sigma)

# A roughly straight, downward-sloping line on log-log axes indicates a power law
# -> https://en.wikipedia.org/wiki/Scale-free_network
# -> certainly, better sources exist or dig through the sources used on wiki
# Also, gamma/alpha is typically between 2 and 3 for scale-free networks; ours is not
#   -> sort of indicates it might not be scale-free after all?


## BA Null Model
###  Compare degree distribution of air traffic with BA null model
#### Construct undirected BA

In [None]:
# Parameters of the BA null model, taken from the air traffic graph.
# N := number of nodes
N = GG.number_of_nodes()
# m := number of edges per node (average degree), truncated to an integer
m = int(GG.number_of_edges() / N)

In [None]:
# Fixed seed so the null model (and every figure derived from it) is
# reproducible under Restart & Run All.
BA_SEED = 42
ba_rng = np.random.default_rng(BA_SEED)

# Undirected Barabasi-Albert graph with N nodes, each new node attaching m edges.
G_BA = nx.barabasi_albert_graph(N, m, seed=BA_SEED)

# convert to directed
GD_BA = nx.DiGraph()

# Add the BA graph's own nodes. The edges below refer to G_BA's node ids, so
# adding GG's node labels instead would leave those as extra isolated nodes
# (in-degree 0) and add_edge would silently create the BA ids on top of them,
# roughly doubling the node count and skewing the degree distribution.
GD_BA.add_nodes_from(G_BA.nodes)

# add edges -> the direction of every undirected edge is drawn at random, with
# weights that take the degrees of its two end nodes into account
g_degrees = nx.degree(G_BA)
g_degrees_dict = dict(g_degrees)
for node_a, node_b in G_BA.edges:
    deg_na = g_degrees_dict[node_a]
    deg_nb = g_degrees_dict[node_b]

    # probability proportional to degree comparison:
    # p(b->a) = deg_na/deg_total, p(a->b) = deg_nb/deg_total
    # i.e. the unit interval is split into two sections proportional to the
    # degrees, so the edge more likely points at the higher-degree node
    deg_total = deg_na + deg_nb
    rn = ba_rng.uniform(0, 1)
    if rn <= (deg_na / deg_total):
        # b -> a
        GD_BA.add_edge(node_b, node_a)
    else:
        # a -> b
        GD_BA.add_edge(node_a, node_b)


In [None]:
# In-degree sequence of the directed BA null model, used by the plots below.
# (Build the dict from GD_BA.out_degree or GD_BA.degree for the other degree notions.)
GG_BA_in_degree_dict = dict(GD_BA.in_degree)
# The dict is filled in GD_BA's node order, so its values are the per-node
# in-degrees in that same order.
GD_BA_DEGS = list(GG_BA_in_degree_dict.values())
# Number of nodes observed for each in-degree value.
GD_BA_degree_counts = Counter(GD_BA_DEGS)

In [None]:
# Number of nodes per in-degree value for the air traffic network; the BA
# counterpart (GD_BA_degree_counts) comes from the previous cell.
degree_counts = Counter(G_DEGS)
x, y = zip(*degree_counts.items())
x_BA, y_BA = zip(*GD_BA_degree_counts.items())

fig3 = plt.figure(dpi=300)
axes3 = fig3.add_subplot(111)

# Both axes are logarithmic; no explicit axis limits are set.
axes3.set_xlabel('in degree $k$')
axes3.set_xscale('log')
axes3.set_ylabel('counts of $k$')
axes3.set_yscale('log')

# One scatter series per network, drawn in the same order as the legend.
for ks, counts, series_label in (
    (x, y, 'Air Traffic Network'),
    (x_BA, y_BA, 'directed BA null model'),
):
    axes3.scatter(ks, counts, marker='.', label=series_label, alpha=0.5)

axes3.legend()
plt.show()

In [None]:
# Kernel density estimates of both in-degree samples on one shared axes.
fig4, ax = plt.subplots(dpi=300)
sns.kdeplot(GD_BA_DEGS, ax=ax, label="directed BA null model")
sns.kdeplot(G_DEGS, ax=ax, label="Air Traffic network")
ax.set_title("comparison of PDFs")
ax.legend()
# Set the axis labels after plotting so they are the ones shown on the figure.
ax.set_ylabel("$p(k)$")
ax.set_xlabel("in degree $k$")
plt.show()

### Interpretation

<b>Assumption</b>: in-degree is more important than out-degree (or just "regular" degree). That is because we assume that, as a
small-ish airport, you still want to connect to large (hub-like) airports, but they don't necessarily connect back to you.
For the journey back you may have to take an additional stop somewhere else <i>[there may be a source here, Fäbe mentioned
he saw it somewhere]</i>.

<b>Power Law Fit</b>: gamma (= alpha) is somewhat smaller than the range for scale-free networks as defined on Wikipedia (2 < gamma < 3),
but we could say it is still sufficiently close. Additionally, the power-law fit is quite good.
<i>-> maybe don't go into the gamma range? not entirely sure what it means</i>

<b>in degree counts</b>: The null model, which we built using Barabasi-Albert as a basis and then assigning directions based on chance
influenced by the degrees of the nodes, shows data that is similar to what we observe in our air traffic network. Because
we know that the BA model produces scale-free networks, we can thus deduce that our air traffic network is scale-free as well.

<b>in degree PDF comparison</b>: similarly, the comparison of the PDFs shows that the null model has a slightly narrower
(and consequently higher) curve than the air traffic network, but the location of the distribution as well as the
overall characteristics in terms of shape are definitely comparable, further hinting at the fact that the air-traffic
network is scale-free.

<b>applied to airports context</b>: applying the findings to the context of airports, we can now say that there appears
to be some form of preferential attachment, where airports which are already central (i.e., act as the main hubs -> highest
degrees) are more likely to see higher amounts of incoming flights (i.e., in degrees) than less central airports.

One could, for example, see this coming into play when travelling longer distances internationally, when the start and end
airports are small provincial airports. It is likely that in such a scenario (in both directions) one would first have to make
a stop at a large "central" airport, from where one would fly to another large "central" airport close to the provincial
airport one would like to ultimately travel to.
<i>could also be central->central->provincial <b>or</b> provincial->central->central</i>
