In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
#import matplotlib.pyplot as plt
import gzip
import json
#from tabulate import tabulate
import bz2
#import os
from tqdm import tqdm

  from pandas.core.computation.check import NUMEXPR_INSTALLED


# Load the Data

## Helper functions and variables

In [2]:
def delta_date(x):
    return (str_to_date(x['end_date']) - str_to_date(x['start_date'])).days + 1

In [3]:
def str_to_date(d):
    return datetime.strptime(d, "%Y-%m-%d")

In [4]:
datapath="irrs/"

In [5]:
def overlap_db(db):
    timeless_overlap = db.merge(bgp, left_on='route', right_on='prefix', suffixes=('_route', '_prefix'))
    timeless_overlap['end_date'] = pd.to_datetime(timeless_overlap['end_date'], format="%Y-%m-%d")
    timeless_overlap['start_date'] = pd.to_datetime(timeless_overlap['start_date'], format="%Y-%m-%d")
    timeless_overlap['start']=pd.to_datetime(timeless_overlap['start'], unit='s').dt.strftime('%Y-%m-%d')
    timeless_overlap['end']=pd.to_datetime(timeless_overlap['end'], unit='s').dt.strftime('%Y-%m-%d')
    #timeless_overlap['origin']=timeless_overlap['origin'].str.replace('AS',"")
    actual_overlap = timeless_overlap[(timeless_overlap.end_date > timeless_overlap.start)&(timeless_overlap.start_date < timeless_overlap.end)]
    actual_overlap['asn']='AS'+actual_overlap['asn'].astype(str)
    return actual_overlap

## Load BGP

In [6]:
bgp = pd.read_csv('pfx2as_2021_2023_merged.csv.gz', names=['prefix', 'asn', 'start', 'end'], delimiter=' ',low_memory=False)
bgp['duration'] = bgp['end'] - bgp['start']
bgpagg = bgp.groupby(['prefix', 'asn']).agg({'duration':'sum'}).reset_index()
bgpagg['asn']='AS'+bgpagg['asn'].astype(str)

## Load Altdb

In [7]:
altdb = pd.read_json(datapath+'altdb/altdb.route.json.gz', lines=True)
altdb['lifetime'] = altdb.apply(delta_date, axis=1)
altdb_overlap=overlap_db(altdb)
altdbagg = altdb_overlap.groupby(['route', 'origin']).agg({'lifetime':'sum'}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_overlap['asn']='AS'+actual_overlap['asn'].astype(str)


# Section 7: Irregular Route Objects

## Helper functions

### as2org

In [8]:
orgs=pd.read_csv('as2org_clean.csv')
orgs['aut']=orgs['aut'].map(str)
orgs_ases_gt=list(orgs['aut'].unique())

### AS relationships

In [9]:
with bz2.open('20230401.as-rel.txt.bz2','rt') as f:
    content=f.read()
lines=content.split('\n')
clean_lines=[]
ases=[]
mypeers={}
mycustomers={}
myproviders={}
#the above dictionaries are organized s.t. mypeers[AS1]=AS2 and mypeers[AS2]=AS1
for l in tqdm(lines):
    if not l.startswith('#'):
        clean_lines.append(l)
        if len(l.split('|'))==3:
            if l.split('|')[2]=='0':
                if l.split('|')[0] in mypeers:
                    mypeers[l.split('|')[0]].append(l.split('|')[1])
                else:
                    mypeers[l.split('|')[0]]=[(l.split('|')[1])]
                if l.split('|')[1] in mypeers:
                    mypeers[l.split('|')[1]].append(l.split('|')[0])
                else:
                    mypeers[l.split('|')[1]]=[(l.split('|')[0])]
            if l.split('|')[2]=='-1': #AS1 is a provider of AS2, AS2 is a customer of AS1 AS1|AS2|-1
                if l.split('|')[1] in myproviders:
                    myproviders[l.split('|')[1]].append(l.split('|')[0])
                else:
                    myproviders[l.split('|')[1]]=[(l.split('|')[0])]
                if l.split('|')[0] in mycustomers:
                    mycustomers[l.split('|')[0]].append(l.split('|')[1])
                else:
                    mycustomers[l.split('|')[0]]=[(l.split('|')[1])]

100%|██████████| 494684/494684 [00:02<00:00, 233095.63it/s]


## Section 7.2: ALTDB Analysis

In [10]:
altdb_cover=pd.read_csv('altdb_cover.csv')

In [11]:
altdb_inconsistent=list(altdb_cover.nonauth.unique())

In [12]:
len(altdb_inconsistent)

1206

In [13]:
altdb[altdb['route'].isin(altdb_inconsistent)]['route'].nunique()

1206

### Appear in BGP and inconsistent

In [14]:
bgp_inconsistent_altdb=altdb_overlap[altdb_overlap['prefix'].isin(altdb_inconsistent)]
#using altdb_overlap to get overlapping time for both altdb and bgp

In [15]:
print(bgp_inconsistent_altdb.prefix.nunique(),"appear in BGP and are inconsistent.")

619 appear in BGP and are inconsistent.


### Full overlap

In [16]:
in_b_a=bgp_inconsistent_altdb.prefix.unique()

In [17]:
len(in_b_a)

619

In [18]:
df_in_b_a=altdb_overlap[altdb_overlap['route'].isin(in_b_a)]

In [19]:
df_in_b_a.route.nunique()

619

In [20]:
route_a_b=df_in_b_a.groupby('route').agg({'asn':set,'origin':set}).reset_index()
#route {asn set from altdb} {origin set from bgp}

In [21]:
route_a_b['related']=False #this is True when the ASes are the same or related (i.e. not irregular)

In [22]:
route_a_b['Fullmatch']=(route_a_b['asn']==route_a_b['origin']) #python discards order when evaluating set equality

In [23]:
full_overlap=route_a_b[route_a_b['Fullmatch']]['route'].nunique() 
#this variable will be added onto for the full overlap calculation

In [24]:
fully_matched=route_a_b[route_a_b['Fullmatch']]['route'].unique() 

In [25]:
for i in list(route_a_b.index[route_a_b['route'].isin(fully_matched)]):
    route_a_b.at[i,'related']=True

In [26]:
print(route_a_b[route_a_b['related']]['route'].nunique(),"prefixes have an exact match AS from ALTDB and BGP.")


130 prefixes have an exact match AS from ALTDB and BGP.


In [27]:
notfullmatch=route_a_b[~route_a_b['Fullmatch']]
print(notfullmatch['route'].nunique(),"prefixes do not have an exact match AS from ALTDB and BGP.")

489 prefixes do not have an exact match AS from ALTDB and BGP.


### Partial Overlap

Now we filter down the non exact match prefixes from above.

In [28]:
route_a_b['intersec'] = route_a_b.apply(lambda x: set(x['asn']).intersection(set(x['origin'])), axis=1)
route_a_b['partial_match']=(route_a_b['intersec']!=set())
route_a_b['all_ases']=route_a_b.apply(lambda x: (x['asn'] | x['origin']), axis=1)

In [29]:
partial=route_a_b[(~route_a_b['Fullmatch']) & (route_a_b['partial_match'])]

In [30]:
print(partial['route'].nunique(),"prefixes have a partial overlap.")
#i.e. if Altdb says AS1,AS2 and BGP says AS2,AS3

177 prefixes have a partial overlap.


#### Do the ASes belong to the same org?

In [31]:
pa_mis_rs=partial['route'].unique() #list of prefixes with partial overlap ASes from ALTDB and BGP
r2orgspa={}
for r in tqdm(pa_mis_rs):
    r2orgspa[r]=[]
    ases_to_check=list(partial[partial['route']==str(r)]['all_ases'])[0]
    clean=[i.replace("'","") for i in ases_to_check]
    clean=[i.replace("AS","") for i in ases_to_check]
    for a in clean:
        if a in orgs_ases_gt:
            check_org=list(orgs[orgs['aut']==a]['org_id'].unique())[0]
            r2orgspa[r].append(check_org)
        if a not in orgs_ases_gt: #if not in as 2 org file, then remove from org analysis
            r2orgspa[r].append('missing')
for r in r2orgspa:
    r2orgspa[r]=set(r2orgspa[r])

100%|██████████| 177/177 [00:05<00:00, 33.85it/s]


In [32]:
as_mismatch_same_org=[]
for r in r2orgspa:
    if (len(r2orgspa[r])==1) and (r2orgspa[r]!='missing'):
        as_mismatch_same_org.append(r)

In [33]:
for i in list(route_a_b.index[route_a_b['route'].isin(as_mismatch_same_org)]):
    route_a_b.at[i,'related']=True

In [34]:
full_overlap+=len(as_mismatch_same_org)

In [35]:
print(len(as_mismatch_same_org),"prefixes belong to the same org.")

0 prefixes belong to the same org.


#### If not, is there a customer or peer relationship?

In [36]:
partial[~partial['related']]['route'].nunique()

177

In [37]:
check_rel=partial[~partial['related']]['route'].unique()
related=[]
unrelated=[]
for r in tqdm(check_rel):
    check_ases=list(list(route_a_b[route_a_b['route']==str(r)]['all_ases'])[0])
    #use first AS to check relationships
    token_as=check_ases[0].lower().split('as')[1]
    #check if the rest of the ases are peers, customers, or providers
    token_peers=[]
    token_providers=[]
    token_customers=[]
    if token_as in myproviders:
        token_providers=myproviders[token_as]
    if token_as in mycustomers:
        token_customers=mycustomers[token_as]
    if token_as in mypeers:
        token_peers=mypeers[token_as]
    for a in check_ases:
        clean=a.lower().split('as')[1]
        if clean==token_as:
            continue
        if clean in token_peers:
            related.append(r)
        if clean in token_customers:
            related.append(r)
        if clean in token_providers:
            related.append(r)
    if r not in related:
        unrelated.append(r)
related=set(related)
unrelated=set(unrelated)

100%|██████████| 177/177 [00:00<00:00, 1470.16it/s]


In [38]:
len(related)+len(unrelated)

177

In [39]:
full_overlap+=len(related)

In [40]:
partial_overlap=len(unrelated)

In [41]:
for i in list(route_a_b.index[route_a_b['route'].isin(related)]):
    route_a_b.at[i,'related']=True

In [42]:
print(len(related),"routes have a customer or provider or peer relationship.")

15 routes have a customer or provider or peer relationship.


In [43]:
print(len(unrelated),"routes do not have a customer or provider or peer relationship.")

162 routes do not have a customer or provider or peer relationship.


### No Overlap

In [44]:
print(route_a_b[~route_a_b['partial_match']]['route'].nunique(),"total mismatches.")

312 total mismatches.


In [45]:
total=route_a_b[~route_a_b['partial_match']]

#### Do the ASes belong to the same org?

In [46]:
tot_mis_rs=total['route'].unique()
r2orgs={}
for r in tqdm(tot_mis_rs):
    r2orgs[r]=[]
    ases_to_check=list(total[total['route']==str(r)]['all_ases'])[0]
    clean=[i.replace("'","") for i in ases_to_check]
    clean=[i.replace("AS","") for i in ases_to_check]
    for a in clean:
        if a in orgs_ases_gt:
            check_org=list(orgs[orgs['aut']==a]['org_id'].unique())[0]
            r2orgs[r].append(check_org)
        if a not in orgs_ases_gt:
            r2orgs[r].append('missing')
for r in r2orgs:
    r2orgs[r]=set(r2orgs[r])

100%|██████████| 312/312 [00:11<00:00, 27.14it/s]


In [47]:
as_mismatch_same_org=[]
for r in r2orgs:
    if (len(r2orgs[r])==1) and (r2orgs[r]!='missing'):
        as_mismatch_same_org.append(r)

In [66]:
full_overlap+=len(as_mismatch_same_org)

In [48]:
print(len(as_mismatch_same_org),"prefixes belong to the same org.")

1 prefixes belong to the same org.


In [49]:
print(len(tot_mis_rs)-len(as_mismatch_same_org),"prefixes do not belong to the same org.")

311 prefixes do not belong to the same org.


In [50]:
route_a_b['same_org']=False
total['same_org']=False
for i in list(route_a_b.index[route_a_b['route'].isin(as_mismatch_same_org)]):
    route_a_b.at[i,'same_org']=True
    route_a_b.at[i,'related']=True
    total.at[i,'same_org']=True
    total.at[i,'related']=True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total['same_org']=False


#### If not, is there a customer or peer relationship?

In [51]:
check_rel=total[~total['related']]['route'].unique()
related=[]
unrelated=[]
for r in tqdm(check_rel):
    check_ases=list(list(route_a_b[route_a_b['route']==str(r)]['all_ases'])[0])
    #use first AS to check relationships
    token_as=check_ases[0].split('AS')[1]
    #check if the rest of the ases are peers, customers, or providers
    token_peers=[]
    token_providers=[]
    token_customers=[]
    if token_as in myproviders:
        token_providers=myproviders[token_as]
    if token_as in mycustomers:
        token_customers=mycustomers[token_as]
    if token_as in mypeers:
        token_peers=mypeers[token_as]
    for a in check_ases:
        clean=a.split('AS')[1]
        if clean==token_as:
            continue
        if clean in token_peers:
            related.append(r)
        if clean in token_customers:
            related.append(r)
        if clean in token_providers:
            related.append(r)
    if r not in related:
        unrelated.append(r)
related=set(related)
unrelated=set(unrelated)

100%|██████████| 311/311 [00:00<00:00, 1499.76it/s]


In [52]:
len(related)+len(unrelated)

311

In [53]:
full_overlap+=len(related)

In [54]:
no_overlap=len(unrelated)

In [55]:
for i in list(route_a_b.index[route_a_b['route'].isin(related)]):
    route_a_b.at[i,'related']=True

In [56]:
print(len(related),"routes have a customer or provider or peer relationship.")

13 routes have a customer or provider or peer relationship.


In [57]:
print(len(unrelated),"routes do not have a customer or provider or peer relationship.")

298 routes do not have a customer or provider or peer relationship.


### Final numbers for ALTDB

In [67]:
perc=str(round(bgp_inconsistent_altdb.prefix.nunique()*100/len(altdb_inconsistent),2))+"% "
fraction="("+str(bgp_inconsistent_altdb.prefix.nunique())+"/"+str(len(altdb_inconsistent))+")"
print(perc+fraction+" Appear in BGP and inconsistent.")

51.33% (619/1206) Appear in BGP and inconsistent.


In [68]:
perc=str(round(no_overlap*100/bgp_inconsistent_altdb.prefix.nunique(),2))+"% "
fraction="("+str(no_overlap)+"/"+str(bgp_inconsistent_altdb.prefix.nunique())+")"
print(perc+fraction+" No overlap.")
#(i.e., no org rel, no cust/peer/prov rel, and no overlapping ASes.")

48.14% (298/619) No overlap.


In [69]:
perc=str(round(full_overlap*100/bgp_inconsistent_altdb.prefix.nunique(),2))+"% "
fraction="("+str(full_overlap)+"/"+str(bgp_inconsistent_altdb.prefix.nunique())+")"
print(perc+fraction+" Full overlap.")
#(i.e., completely overlapping ASes, ASes belong to the same org, ASes have cust/peer/prov rel.")


25.69% (159/619) Full overlap.


In [70]:
perc=str(round(partial_overlap*100/bgp_inconsistent_altdb.prefix.nunique(),2))+"% "
fraction="("+str(partial_overlap)+"/"+str(bgp_inconsistent_altdb.prefix.nunique())+")"
print(perc+fraction+" Partial overlap.") 
#(i.e., some ASes overlap, others do not and the ones that do not have no org or cust/peer rel.)")


26.17% (162/619) Partial overlap.
