In [None]:
# Move two directories up; the relative paths used below ("data/...", "results/...") are resolved from there.
cd ../..

In [None]:
import json
import glob
from pprint import pprint
import matplotlib.pyplot as plt


In [None]:
import seaborn as sns

In [None]:
# Apply seaborn's "whitegrid" style to the figures drawn afterwards.
sns.set_style("whitegrid")

In [None]:
import numpy as np

In [None]:
# Seed NumPy's global RNG so the np.random.choice draws in this notebook are repeatable.
np.random.seed(42)

In [None]:
# Every JSON file directly under data/INFN-T1, in lexicographic path order.
files = glob.glob("data/INFN-T1/*.json")
files.sort()


In [None]:
# Draw 1000 paths at random from `files`. np.random.choice samples with replacement by default,
# so the same path can appear more than once.
# NOTE(review): `selected_files` is not referenced by any later cell visible here — confirm it is still needed.
selected_files = np.random.choice(files, 1000)

In [None]:
def load_json(file):
    """Read one JSON document from disk.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale-dependent default encoding.
    with open(file, encoding="utf-8") as f:
        return json.load(f)


In [None]:
# Number of JSON files matched by the glob.
len(files)

In [None]:
# Parse every matched file, keeping the same order as `files`.
loaded_files = list(map(load_json, files))


In [None]:
import copy

# Take a deep copy of the first record so the key-by-key exploration in the next cells
# (which deletes keys from `file`) does not modify loaded_files[0].
file = copy.deepcopy(loaded_files[0])

In [None]:
# Gather the endpoint metadata of the sample record: 'src', 'dest' and every key
# prefixed with 'src_' or 'dest_' (the per-hop lists are excluded explicitly).
subfields = {}
for k, v in file.items():
    if k in ['asns', 'hops', 'rtts', 'ttls']:
        continue
    if k in ('src', 'dest') or k.startswith(('src_', 'dest_')):
        subfields[k] = v

# Print the fields in key order and remove each from `file` once shown.
for k in sorted(subfields):
    v = subfields[k]
    print(k, v)
    del file[k]

In [None]:
# Print the four per-hop lists side by side (one line per hop), then drop them from `file`.
per_hop_keys = ('asns', 'hops', 'rtts', 'ttls')
print(*per_hop_keys, sep=' ; ')
for items in zip(*(file[key] for key in per_hop_keys)):
    print(*items, sep=' ; ')
for key in per_hop_keys:
    del file[key]

In [None]:
# Show the timestamp and remove it from the sample record in one step.
print("timestamp", file.pop('timestamp'))

In [None]:
# Show each remaining flag field and remove it from the sample record.
# ('push' is kept last; the original author marked it as "maybe useless".)
for flag in ("path_complete", "destination_reached", "looping", "ipv6", "push"):
    print(flag, file.pop(flag))

In [None]:
# Whatever is left in the sample record after the deletions above.
pprint(file)

In [None]:
# Names of the keys still present in the sample record.
file.keys()

In [None]:
# One entry per loaded traceroute in each of these parallel lists.
rtts, tss, mean_rtts, sum_rtts = [], [], [], []
n_hops, sites, max_ttl = [], [], []

for file in loaded_files:
    hop_rtts = file['rtts']
    has_rtts = bool(hop_rtts)
    # An empty RTT list contributes 0 to max / mean / sum.
    rtts.append(max(hop_rtts) if has_rtts else 0)
    tss.append(file['timestamp'])
    mean_rtts.append(np.mean(hop_rtts) if has_rtts else 0)
    sum_rtts.append(np.sum(hop_rtts) if has_rtts else 0)
    n_hops.append(len(file['hops']))
    # Records without a source site get an empty string.
    sites.append(file.get('src_site', ''))
    hop_ttls = file['ttls']
    max_ttl.append(max(hop_ttls) if hop_ttls else 0)

In [None]:
# Number of parsed traceroute records.
len(loaded_files)

In [None]:
import pandas as pd

In [None]:
# Per-traceroute summary table, one row per loaded file.
# NOTE(review): the 'n_hops' column is filled from `max_ttl`, not from the `n_hops`
# list built in the same loop — confirm this is intended.
df = pd.DataFrame(dict(
    max_rtt=rtts,
    timestamp=tss,
    mean_rtt=mean_rtts,
    sum_rtt=sum_rtts,
    n_hops=max_ttl,
    site=sites,
))

In [None]:
# Interpret the raw timestamps as milliseconds since the Unix epoch.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

In [None]:
# Use the timestamp as the row index.
df = df.set_index('timestamp')

In [None]:
# Put the rows in chronological order.
df = df.sort_index()


In [None]:
# Replace the column names with display labels. This is positional: it relies on the column
# order used when the frame was built (max_rtt, mean_rtt, sum_rtt, n_hops, site).
df.columns = ['Max RTT (ms)', 'Mean RTT (ms)', 'Total RTT (ms)', "Number of hops", "Site"]


In [None]:
# Scatter of df.columns[2] against time, coloured by df.columns[0]
# (the total-RTT and max-RTT columns after the rename above).
fig = plt.figure(figsize=(10, 6))
colour_col, duration_col = df.columns[0], df.columns[2]
t = df.reset_index().plot(
    kind='scatter',
    x='timestamp',
    y=duration_col,
    c=colour_col,
    ax=fig.gca(),
    legend=False,
    alpha=0.3,
)
plt.xlabel('Date')
plt.title('Duration of traceroutes over time')
fig.savefig('results/figures/max_rtt_over_time.png', dpi=350)

plt.show()

In [None]:
# Keep the rows in chronological order.
df = df.sort_index()

In [None]:
# (df
# # .loc['2023-02-01 00:00:00':'2023-02-16 00:00:00']
# ).sort_values(by='Site').plot(
#     kind='scatter', backend='plotly', y=df.columns[2], x=df.columns[3],
#     color="Site"
#     )


In [None]:
# df.loc['2023-02-01 00:00:00':'2023-02-16 00:00:00'].plot(
#     kind='scatter', backend='plotly', y=df.columns[2], x=df.columns[3],
#     color="Site"
#     )


In [None]:
from collections import defaultdict

In [None]:
# Counter-style mapping; missing keys start at 0.
destinations = defaultdict(int)

In [None]:
# Tally traceroutes per 'src_site'; records without that key are ignored.
for file in loaded_files:
    if 'src_site' in file:
        destinations[file['src_site']] += 1


In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud whose word sizes come from the per-site traceroute counts in `destinations`.
wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
wc.generate_from_frequencies(destinations)

In [None]:
# Render the word cloud as an image without axes.
fig_wc, ax_wc = plt.subplots()
ax_wc.imshow(wc, interpolation="bilinear")
ax_wc.axis("off")
plt.show()

In [None]:
# Rank the sites by traceroute count (largest first) and split the ranking
# into parallel `keys` / `values` lists for the frame below.
ranked = sorted(destinations.items(), key=lambda pair: pair[1], reverse=True)
keys = [site_name for site_name, _ in ranked]
values = [count for _, count in ranked]
df = pd.DataFrame({'keys': keys, 'values': values})


In [None]:
# Colour each site by substring match on its lower-cased name. Later rules override
# earlier ones: default blue, then T1 red, then T2 green, then 'lcg2' yellow.
# NOTE(review): the 'color' column is not passed to the bar plot in the visible cells — confirm whether it should be.
lowered = df['keys'].str.lower()
is_tier1 = lowered.str.contains('t1') | lowered.str.contains('tier1')
is_tier2 = lowered.str.contains('t2') | lowered.str.contains('tier2')
is_lcg2 = lowered.str.contains('lcg2')

df['color'] = 'blue'
df['keys_lower'] = lowered
df.loc[is_tier1, 'color'] = 'red'
df.loc[is_tier2, 'color'] = 'green'
df.loc[is_lcg2, 'color'] = 'yellow'

In [None]:
# Store the site names as a pandas categorical column.
df['keys'] = df['keys'].astype('category')

In [None]:
# df['values'] /= df['values'].sum()

In [None]:
# Number of JSON files matched by the glob (same check as earlier in the notebook).
len(files)

In [None]:
# Bar chart of traceroute counts per site, in the ranked order of `df`.
fig = plt.figure(figsize=(15, 4))
ax_sites = fig.gca()
df.plot(
    kind='bar',
    x='keys',
    y='values',
    ax=ax_sites,
    title='Number of incoming traceroutes per site',
    label='Number of incoming traceroutes',
)
ax_sites.set_xlabel('Site')
fig.savefig('results/figures/number_of_incoming_traceroutes_per_site.svg')

In [None]:
# Number of distinct source sites.
len(keys)

In [None]:
# Timestamp, source site and hop count for every record that names a source site.
tss, site, nhops = [], [], []
for file in loaded_files:
    if 'src_site' in file:
        tss.append(file['timestamp'])
        site.append(file['src_site'])
        nhops.append(len(file['hops']))

In [None]:
# Frame indexed by measurement time (raw timestamps are read as epoch milliseconds).
df = pd.DataFrame({'timestamp': tss, 'site': site, 'nhops': nhops})
df = df.assign(timestamp=pd.to_datetime(df['timestamp'], unit='ms')).set_index('timestamp')

In [None]:
# Display the per-site frame built above.
df

In [None]:
# Per-site averages: mean gap between consecutive traceroutes (in hours) and
# mean absolute change in hop count between consecutive traceroutes.
site = []
delta_t = []
hops = []

# Sort chronologically first so diff() compares neighbouring measurements in time.
# Group by the column name rather than a one-element list: with a list, recent
# pandas versions yield 1-tuples as group keys instead of the plain site string.
chronological = df.reset_index().sort_values('timestamp')
for site_name, group in chronological.groupby('site'):
    site.append(site_name)
    gaps = group['timestamp'].diff().dropna()
    # total_seconds() keeps the whole gap; `.dt.seconds` only returns the
    # seconds component and silently drops full days.
    delta_t.append(gaps.dt.total_seconds().abs().mean() / 3600)
    hops.append(group['nhops'].diff().dropna().abs().mean())

In [None]:
# One row per site with its mean inter-measurement gap and mean hop-count change.
df2 = pd.DataFrame(dict(site=site, delta_t=delta_t, mean_hops=hops))

In [None]:
# Ask pandas whether the measurement timestamps follow a regular frequency.
pd.infer_freq(df.index)


In [None]:
# Pull three per-record fields, substituting a default when the key is absent.
max_rtts, n_hops, destination_reached = (
    [record.get(field, fallback) for record in loaded_files]
    for field, fallback in (
        ('max_rtt', 0),
        ('n_hops', 0),
        ('destination_reached', False),
    )
)

In [None]:

# Histogram of the per-traceroute RTT sums with a logarithmic count axis.
# NOTE(review): the figure is written to "max_rtts.svg" although it shows the sum of RTTs — confirm the file name.
fig, ax = plt.subplots(figsize=(8, 3))
ax.hist(sum_rtts, log=True)
ax.set_title("Histogram of sum of RTT values (log scale)")
ax.set_xlabel("Sum of RTT (ms)")
ax.set_ylabel("log of Count")
fig.savefig("results/figures/max_rtts.svg")
plt.show()


In [None]:
# Histogram of the per-record 'n_hops' values, 25 bins.
fig, ax = plt.subplots(figsize=(8, 3))
ax.hist(n_hops, bins=25)
ax.set_title("Histogram of number of hops")
ax.set_xlabel("Number of hops")
ax.set_ylabel("Count")
fig.savefig("results/figures/n_hops.svg")
plt.show()


In [None]:
# Histogram of the largest TTL seen in each traceroute, 25 bins.
# NOTE(review): title and axis label are the same as for the n_hops histogram — confirm that is intended.
fig, ax = plt.subplots(figsize=(8, 3))
ax.hist(max_ttl, bins=25)
ax.set_title("Histogram of number of hops")
ax.set_xlabel("Number of hops")
ax.set_ylabel("Count")
fig.savefig("results/figures/max_ttl.svg")
plt.show()


In [None]:
import pandas as pd

In [None]:
# Per-traceroute frame used by the grouping, heatmap and scatter cells below.
# All four boolean flags are included here because the cells that immediately
# follow group by 'ipv6', 'Looping' and 'Path Complete'; building the frame
# without them makes those cells fail with KeyError on a fresh top-to-bottom run.
df = pd.DataFrame({
    'max_rtt': max_rtts,
    'max_ttl': max_ttl,
    'Destination Reached': destination_reached,
    'Path Complete': [record.get('path_complete', False) for record in loaded_files],
    'Looping': [record.get('looping', False) for record in loaded_files],
    'ipv6': [record.get('ipv6', False) for record in loaded_files],
})



In [None]:
# Summary statistics of max RTT and max TTL, split by the 'ipv6' flag
# (requires `df` to contain an 'ipv6' column).
df.groupby(by="ipv6")[['max_rtt', 'max_ttl']].describe()

In [None]:
# Count every combination of the four flags; 'Path Complete' (the last grouping level)
# becomes the columns. Then express each cell as a share of all rows.
flag_order = ["Destination Reached", "Looping", "ipv6", "Path Complete"]
counts = df.groupby(by=flag_order).size().unstack(fill_value=0)
normalized_counts = counts / counts.to_numpy().sum()


In [None]:
# Heatmap of the flag-combination shares, annotated as percentages.
fig = plt.figure(figsize=(8, 4))
ax_heat = fig.gca()
sns.heatmap(normalized_counts, ax=ax_heat, annot=True, fmt=".3%", cmap="YlGnBu", cbar=False)

ax_heat.set_xlabel("Path Complete")
ax_heat.set_ylabel("Destination Reached - Looping - IPv6")
fig.savefig("results/figures/heatmap-counts.svg")
plt.show()


In [None]:
import seaborn as sns

In [None]:
# make three subplots sharing the same x axis in one column
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, figsize=(8, 8));


In [None]:
# Panel 1: max RTT against max TTL, coloured by 'Destination Reached'.
sns.scatterplot(x='max_ttl', y='max_rtt', hue='Destination Reached', data=df, ax=ax1, alpha=0.25)
ax1.set_ylabel('Max RTT (ms)')


In [None]:
# 'path_complete' flag of every record; False when the key is missing.
path_complete = [x.get('path_complete', False) for x in loaded_files]

In [None]:
# Attach the flag as a column (positional: same order as loaded_files).
df['Path Complete'] = path_complete

In [None]:
# Panel 2: max RTT against max TTL, coloured by 'Path Complete'.
sns.scatterplot(x='max_ttl', y='max_rtt', hue='Path Complete', data=df, ax=ax2, alpha=0.25)
ax2.set_ylabel('Max RTT (ms)')


In [None]:
# 'looping' flag of every record (False when the key is missing), attached as a column.
looping = [x.get('looping', False) for x in loaded_files]
df['Looping'] = looping

In [None]:
# Panel 3: max RTT against max TTL, coloured by 'Looping'.
sns.scatterplot(x='max_ttl', y='max_rtt', hue='Looping', data=df, ax=ax3, alpha=0.25)
ax3.set_ylabel('Max RTT (ms)')

In [None]:
# 'ipv6' flag of every record (False when the key is missing), attached as a column.
ipv6 = [x.get('ipv6', False) for x in loaded_files]
df['ipv6'] = ipv6

In [None]:
# Panel 4: max RTT against max TTL, coloured by 'ipv6'.
sns.scatterplot(x='max_ttl', y='max_rtt', hue='ipv6', data=df, ax=ax4, alpha=0.25)
ax4.set_ylabel('Max RTT (ms)')

In [None]:
# The figure has four stacked panels sharing one x axis, so the bottom panel is ax4
# (not ax3). Label the shared axis there and blank the x labels of the panels above it,
# which would otherwise each repeat the raw column name.
for upper_ax in (ax1, ax2, ax3):
    upper_ax.set_xlabel("")
ax4.set_xlabel("Number of hops")

In [None]:
# NOTE(review): this changes the working directory mid-notebook, so the relative
# "results/figures/..." paths used after this point resolve under masters-thesis/ —
# confirm this is intended and that the directory exists on a fresh top-to-bottom run.
cd masters-thesis

In [None]:
# Title the four-panel figure and write it to disk.
f.suptitle("Max RTT vs Number of hops")
f.savefig("results/figures/max_rtt_vs_n_hops.png", dpi=350)

In [None]:
# Display the finished four-panel figure.
f

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# (rows, columns) of the subset where path complete, destination reached and looping are all True.
df[df['Path Complete'] & df['Destination Reached'] & df['Looping']].shape #.plot(kind='scatter', x='max_ttl', y='max_rtt', alpha=0.25)

In [None]:
# (rows, columns) of the subset where path complete and destination reached are True and looping is False.
df[df['Path Complete'] & df['Destination Reached'] & ~df['Looping']].shape #.plot(kind='scatter', x='max_ttl', y='max_rtt', alpha=0.25)

In [None]:
# (rows, columns) of the subset where path complete, destination reached and looping are all True.
# NOTE(review): this filter is identical to the one two cells above — presumably a
# different flag combination was meant here; confirm.
df[df['Path Complete'] & df['Destination Reached'] & df['Looping']].shape #.plot(kind='scatter', x='max_ttl', y='max_rtt', alpha=0.25)

In [None]:
# df.plot(kind='scatter', x='max_ttl', y='max_rtt', color='Looping', backend='plotly')

In [None]:
# df[df['Looping']].plot(kind='scatter', x='max_ttl', y='max_rtt', color='red', backend='plotly')

In [None]:
# Flatten the per-hop TTL and RTT lists of all traceroutes into two long lists.
# NOTE(review): assumes each record's 'ttls' and 'rtts' have equal length, otherwise
# the two lists drift out of step — confirm against the data.
ttls, rtts = [], []
for file in loaded_files:
    ttls += file['ttls']
    rtts += file['rtts']


In [None]:
# One row per hop: its RTT and its TTL.
df3 = pd.DataFrame({'rtt': rtts, 'ttl': ttls})

In [None]:
# Spearman rank correlation between per-hop RTT and TTL.
df3.corr(method='spearman')

In [None]:
# Max TTL of each traceroute against its 'max_rtt' field.
fig, ax = plt.subplots()
ax.scatter(max_ttl, max_rtts, c='r', alpha=0.5)
ax.set_title("Number of hops vs Max RTT")
ax.set_xlabel("Number of hops")
ax.set_ylabel("Max RTT (ms)")
fig.savefig("results/figures/n_hops_vs_max_rtts.png", dpi=350)
plt.show()

In [None]:
from collections import defaultdict

In [None]:
# Four-level nested counter: counts[a][b][c][d] is an int that starts at 0.
# Note that this rebinds `counts`, replacing the DataFrame of the same name built earlier.
counts = defaultdict(lambda: defaultdict(lambda : defaultdict(lambda: defaultdict(int))))

In [None]:
# Peek at one branch of the nested counter; with a defaultdict this lookup also creates the intermediate entries.
counts[True][True][True]

In [None]:
# Tally each row's combination of the four flags in the nested counter.
flag_columns = ['Destination Reached', 'Path Complete', 'Looping', 'ipv6']
for reached, complete, loops, is_v6 in df[flag_columns].to_numpy().tolist():
    counts[reached][complete][loops][is_v6] += 1


In [None]:
# Print every flag combination with its share of all rows, as a percentage rounded to two decimals.
n_rows = len(df)
for x in (False, True):
    for y in (False, True):
        for z in (False, True):
            for a in (False, True):
                share = np.round(100 * counts[x][y][z][a] / n_rows, 2)
                print(x, y, z, a, share)
        

In [None]:
# Number of rows for each combination of the three flags.
# The previous version tried to increment the nested counter with `+=` inside a
# lambda, which is a SyntaxError (an augmented assignment is a statement, not an
# expression). It also used axis=0, which walks columns rather than rows. The
# tally is computed here without touching `counts`, so the cell is safe to re-run.
df.groupby(['Destination Reached', 'Path Complete', 'Looping']).size()


In [None]:
# Display the flag frame.
df

In [None]:
from collections import Counter

# Wrap the per-site tallies in a Counter so most_common() can be used on them.
ctr = Counter(destinations)

In [None]:
# Names of the eight sites with the most traceroutes, most frequent first.
sites = [site_name for site_name, _count in ctr.most_common(8)]

In [None]:
# "tab10" is matplotlib's ten-colour qualitative palette

cmap = plt.get_cmap("tab10")
# sample 8 colours from it at evenly spaced positions in [0, 1] — 8 matches the
# number of sites requested with most_common(8)
colors = cmap(np.linspace(0, 1, 8))


In [None]:
# colors[:,3] = 1

In [None]:
# Pair each top site with one sampled colour. `colors` is rebound from the
# colour array to a dict here, so this cell must not be re-run on its own.
colors = dict(zip(sites, colors))

In [None]:
# RTT-vs-TTL profile of a random sample of traceroutes, restricted to the top
# sites in `sites` and coloured per source site.
fig = plt.figure(figsize=(10, 4))
ax = plt.gca()
seen = set()
# np.random.choice samples with replacement, so a traceroute may be drawn twice.
sample = np.random.choice(loaded_files, 6000)
for item in sample:
    src_site = item.get('src_site', None)
    if src_site not in sites:
        continue
    # Skip records without per-hop data explicitly instead of hiding every
    # KeyError behind a blanket `except: pass`.
    if 'ttls' not in item or 'rtts' not in item:
        continue

    # Only the first labelled line of each site contributes a legend entry.
    # The site is marked as seen only once a label could actually be built, so
    # a record lacking 'dest_site' no longer costs the site its legend entry.
    label = None
    if src_site not in seen and 'dest_site' in item:
        label = f"{src_site} -> {item['dest_site']}"
        seen.add(src_site)

    ax.plot(item['ttls'], item['rtts'], label=label, alpha=.5, c=colors[src_site])

ax.set_ylabel("RTT (ms)")
ax.set_xlabel("TTL (hops)")
plt.title("RTT and TTL for each traceroute ")
plt.legend()

fig.savefig("results/figures/rtt_vs_ttl.svg")

plt.show()

In [None]:
# Display the parsed records.
# NOTE(review): this renders the whole list; a slice such as loaded_files[:3] would keep the output small.
loaded_files

In [None]:
# Scratch check: np.diff of consecutive integers is all ones.
np.diff(range(5))

In [None]:
# For every traceroute, measure the gaps in its TTL sequence.
#   pctg      - per traceroute: total number of skipped TTL values divided by the largest TTL
#   ttl_diffs - every non-zero gap size, pooled over all traceroutes
ttl_diffs = []

pctg = []

for file in loaded_files:
    if 'ttls' not in file:
        continue

    steps = np.diff(file['ttls'])
    # Nothing to measure for fewer than two hops (empty diff) or all-zero steps.
    if not steps.any():
        continue

    # A step of 1 between neighbouring TTLs means no hop was skipped, so
    # subtracting 1 leaves the number of missing TTL values per step.
    gaps = (steps - 1).tolist()
    pctg.append(sum(gaps) / max(file['ttls']))
    # Collect the non-zero gaps: the Counter and histogram built from
    # `ttl_diffs` further down are empty unless this list is populated.
    ttl_diffs.extend(gap for gap in gaps if gap)

In [None]:
# Average of the per-traceroute gap ratios collected in `pctg`.
np.mean(pctg)

In [None]:
from collections import Counter

In [None]:
# Frequency of each gap size in `ttl_diffs` (rebinds `ctr`, which previously held the site counts).
ctr = Counter(ttl_diffs)

In [None]:
# Share (in percent, three decimals) of the seven most frequent gap sizes.
total_gaps = sum(ctr.values())
{gap_size: round(100 * occurrences / total_gaps, 3) for gap_size, occurrences in ctr.most_common(7)}

In [None]:
# Convert the Counter to a plain dict (after this, Counter methods such as most_common are no longer available on `ctr`).
ctr = dict(ctr)

In [None]:
# create a histogram from the ctr
fig = plt.figure(figsize=(10, 4))
plt.bar(ctr.keys(), np.log(list(ctr.values())), width=1)