This software is Copyright © 2024 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting:

Office of Innovation and Commercialization

9500 Gilman Drive, Mail Code 0910

University of California

La Jolla, CA 92093-0910

(858) 534-5815

invent@ucsd.edu

This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied “as is”, without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason.

IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN “AS IS” BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

In [None]:
import pandas as pd
import gzip
from collections import OrderedDict, defaultdict
import ipaddress
import math
import radix
import re
import numpy as np
import matplotlib.pyplot as plt
import bz2
from fuzzywuzzy import fuzz
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [None]:
pd.set_option('display.max_columns', 500)

In [None]:
# Splits a whois "key: value" line into (key, value). The key group is lazy, so the
# split happens at the first colon that has at least one character before it, and
# any whitespace directly after that colon is not part of the value.
split = re.compile(r'(.+?):\s*(.*)')

class Entry(OrderedDict):
    """Ordered attribute-name -> value mapping holding a single whois entry."""

    def __repr__(self):
        """Render one ``key:<TAB>value`` line per attribute, in insertion order."""
        return '\n'.join(
            '{}:\t{}'.format(name, val) for name, val in self.items()
        )

    @property
    def date(self):
        """Date string derived from the ``changed`` or ``last-modified`` attribute.

        * ``changed`` present: its second whitespace-separated token, or
          ``'17000101'`` when it has fewer than two tokens.
        * otherwise: ``last-modified`` with every ``-`` removed, or
          ``'16000101'`` when that attribute is missing as well.
        """
        stamp = self.get('changed')
        if stamp is None:
            if 'last-modified' not in self:
                return '16000101'
            return self['last-modified'].replace('-', '')

        tokens = stamp.split()
        return tokens[1] if len(tokens) > 1 else '17000101'
    
    
def parse_whois(filename):
    """Parse a gzip-compressed whois dump into a list of ``Entry`` objects.

    The file is read as latin-1 text. A line consisting of just a newline ends
    the current record. Lines starting with ``#`` (after stripping) and
    whitespace-only lines other than a bare newline are ignored. A
    ``key: value`` line sets an attribute; a repeated key (other than
    ``origin``, whose value is overwritten) is concatenated to the existing
    value with a newline. Lines whose first character is not alphanumeric, and
    lines without a ``key:`` part, are appended to the most recently parsed key.

    Only records containing at least one of ``organisation``, ``inetnum`` or
    ``aut-num`` are returned.

    Args:
        filename: Path to the gzip-compressed whois file.

    Returns:
        list of ``Entry`` objects in file order.
    """
    wanted = {'organisation', 'inetnum', 'aut-num'}
    continuation = re.compile(r'^[^a-zA-Z0-9]')

    items = []  # finished records
    item = Entry()  # record currently being assembled
    # Most recently parsed key. It is deliberately NOT reset between records,
    # matching the historical behaviour of this parser.
    # NOTE(review): a key carried over from the previous record can receive an
    # orphan line of the next record -- confirm whether that is intended.
    k = None

    with gzip.open(filename, 'rt', encoding='latin-1') as f:
        for ol in f:  # ol: the original, unstripped line
            line = ol.strip()

            if not line:
                # Only a bare newline terminates a record; any other
                # whitespace-only line is skipped.
                if ol == '\n' and item:
                    if wanted.intersection(item):
                        items.append(item)
                    item = Entry()
                continue

            if line[0] == '#':
                continue

            # Original line starts with a non-alphanumeric character
            # (e.g. whitespace): treat it as a continuation of the last value.
            if continuation.match(ol):
                try:
                    item[k] += '\n' + line
                except KeyError:
                    # No such key in the current record; drop the line.
                    pass
                continue

            m = split.match(line)
            if m:
                k, v = m.groups()
                if k in item and k != 'origin':
                    item[k] += '\n' + v
                else:
                    item[k] = v
            elif k is not None:
                # No "key:" part: this is a value with line breaks.
                try:
                    item[k] += '\n' + line
                except KeyError:
                    item[k] = line
            # else: no key has been seen yet, so there is nothing to attach the
            # line to (this used to raise NameError).

    # Flush the final record: a file that does not end with a blank line would
    # otherwise silently lose its last entry.
    if item and wanted.intersection(item):
        items.append(item)
    return items

In [None]:
def build_as_rel(path='YOUR AS RELATIONSHIP DATASET HERE'):
    """Load a bz2-compressed AS relationship file into a nested lookup table.

    Each non-comment line is expected to be ``AS1|AS2|rel[|...]``:

    * ``rel == '0'``  -> AS1 and AS2 are recorded as each other's ``peer``.
    * ``rel == '1'``  -> AS2 is added to AS1's ``provider`` set and AS1 to
      AS2's ``customer`` set.
    * ``rel == '-1'`` -> AS2 is added to AS1's ``customer`` set and AS1 to
      AS2's ``provider`` set.

    Lines starting with ``#`` and lines with fewer than three ``|``-separated
    fields (e.g. blank lines) are skipped.

    Args:
        path: Path to your AS Relationship dataset (bz2-compressed text).

    Returns:
        ``defaultdict`` mapping ``'AS<n>'`` to a ``defaultdict(set)`` keyed by
        ``'peer'``, ``'provider'`` and ``'customer'``.
    """
    as_rel = defaultdict(lambda: defaultdict(set))
    with bz2.open(path, 'rt') as file:
        for line in file:
            if line.startswith('#'):
                continue
            data = line.strip().split('|')
            if len(data) < 3:
                # Blank or truncated line: nothing to record.
                continue
            AS1 = 'AS' + data[0]
            AS2 = 'AS' + data[1]
            rel = data[2]
            if rel == '0':
                as_rel[AS1]['peer'].add(AS2)
                as_rel[AS2]['peer'].add(AS1)
            elif rel == '1':
                as_rel[AS1]['provider'].add(AS2)
                as_rel[AS2]['customer'].add(AS1)
            elif rel == '-1':
                as_rel[AS1]['customer'].add(AS2)
                as_rel[AS2]['provider'].add(AS1)
    return as_rel


def check_asrel(left, right):
    """Return True if any ASN in ``left`` is a recorded customer of any ASN in ``right``.

    Reads the module-level ``as_rel`` table. Lookups use ``.get`` so that
    querying an unknown ASN does not insert an empty entry into the table.

    Args:
        left: Iterable of ``'AS<n>'`` strings (the candidate customers).
        right: Iterable of ``'AS<n>'`` strings (the candidate providers).

    Returns:
        True when a customer relation is found; False otherwise, including
        when either argument is not iterable (e.g. ``None`` or a float NaN).
    """
    try:
        for i in left:
            for j in right:
                if i in as_rel.get(j, {}).get('customer', ()):
                    return True
        return False
    except TypeError:
        # Non-iterable or unhashable input: treat as "no relation".
        return False

def build_as2org(path='YOUR AS ORGANIZATIONS DATASET HERE'):
    """Load a gzip-compressed AS-to-organization file into an ASN -> name dict.

    Every line is split on ``|``:

    * if the first field is all digits, the line is taken as an AS record and
      its fourth field is used as the organization id;
    * otherwise, if the line has exactly five fields, it is taken as an
      organization record mapping the first field (id) to the third (name).

    Args:
        path: Path to your AS Organizations dataset (gzip-compressed).

    Returns:
        dict mapping ``'AS<n>'`` to the organization name.

    Raises:
        KeyError: if an AS record refers to an organization id for which no
            organization record was seen.
        IndexError: if an AS record has fewer than four fields.
    """
    org_of_as = {}  # 'AS<n>' -> organization id
    org_names = {}  # organization id -> organization name
    # Use a context manager so the gzip handle is always closed.
    with gzip.open(path) as fh:
        for line in fh:
            fields = line.decode().strip('\n').split('|')
            if fields[0].isdigit():
                org_of_as['AS' + fields[0]] = fields[3]
            elif len(fields) == 5:
                org_names[fields[0]] = fields[2]
    return {asn: org_names[org_id] for asn, org_id in org_of_as.items()}

def check_as2org(left, right):
    """Return True if any ASN in ``left`` maps to the same organization as any ASN in ``right``.

    Reads the module-level ``as2org`` mapping. ASNs that are missing from the
    mapping are ignored individually; previously a single unknown ASN aborted
    the whole comparison and returned False even when a later pair matched.

    Args:
        left: Iterable of ``'AS<n>'`` strings.
        right: Iterable of ``'AS<n>'`` strings.

    Returns:
        True when a shared organization is found; False otherwise, including
        when either argument is not iterable (e.g. ``None`` or a float NaN).
    """
    try:
        right_orgs = {as2org[j] for j in right if j in as2org}
        return any(as2org[i] in right_orgs for i in left if i in as2org)
    except TypeError:
        # Non-iterable or unhashable input: treat as "not siblings".
        return False

# Load both lookup tables once at module level; check_asrel, check_as2org and
# to_org read these globals.
as_rel = build_as_rel()
as2org = build_as2org()

def to_org(asn):
    """Return the organization name for ``asn`` from ``as2org``, or None if unknown.

    Unhashable input also yields None.
    """
    try:
        return as2org[asn]
    except (KeyError, TypeError):
        return None

# Load RIPE DB

In [None]:
# Path to your RIPE WHOIS database file (gzip-compressed, as read by parse_whois).
# parse_whois requires the path argument; calling it with none raises TypeError.
ripe = parse_whois('YOUR RIPE WHOIS DATABASE HERE')
len(ripe)

In [None]:
ripeaddr = pd.DataFrame([i for i in ripe if 'inetnum' in i])

In [None]:
ripeorg = pd.DataFrame([i for i in ripe if 'organisation' in i])

In [None]:
ripeorg['organisation'] = ripeorg['organisation'].apply(lambda x: x.upper())

In [None]:
ripeas = pd.DataFrame([i for i in ripe if 'aut-num' in i])

In [None]:
print(len(ripeaddr), len(ripeorg), len(ripeas))

In [None]:
ripeaddr

# Prep for inetnum tree construction

In [None]:
def range_to_pfx(inetnum):
    """Convert an ``'a.b.c.d - e.f.g.h'`` IPv4 range into a list of CIDR strings.

    Returns None when the range holds fewer than 256 addresses, or when any of
    the CIDR blocks summarizing it is longer than a /24.
    """
    first_text, last_text = inetnum.strip().split('-')
    first = ipaddress.IPv4Address(first_text.strip())
    last = ipaddress.IPv4Address(last_text.strip())

    address_count = int(last) - int(first) + 1
    if address_count < 256:
        return None

    blocks = list(ipaddress.summarize_address_range(first, last))
    if any(net.prefixlen > 24 for net in blocks):
        return None
    return [str(net) for net in blocks]

In [None]:
ripeaddr['prefix'] = ripeaddr['inetnum'].apply(range_to_pfx)

In [None]:
tree_candidate = ripeaddr.dropna(subset=['prefix'])

In [None]:
len(tree_candidate)

In [None]:
len(ripeaddr) - len(tree_candidate)

In [None]:
tree_candidate['blocks_in_chunk'] = tree_candidate['prefix'].apply(len)

In [None]:
tree = tree_candidate.explode('prefix')

In [None]:
tree

In [None]:
# Index every exploded prefix in a radix tree; is_leaf and find_root query this
# tree for covered prefixes and parent nodes.
rtree = radix.Radix()
for i in tree.prefix:
    rtree.add(i)

In [None]:
tree.prefix.count()

In [None]:
tree.prefix.nunique()

In [None]:
tree.status.value_counts()

# match allocated PA to ASN and orgs

In [None]:
# Take an explicit copy: the next cell assigns a column on this frame, and
# assigning into a filtered slice of `tree` triggers pandas' SettingWithCopyWarning.
allocated = tree[tree.status == 'ALLOCATED PA'].copy()

In [None]:
allocated['org'] = allocated['org'].apply(lambda x: x.upper())

In [None]:
allocated = allocated.merge(ripeorg[['organisation', 'org-name']], left_on='org', right_on='organisation', how='left')

In [None]:
ripeasgroups1 = ripeas.groupby('org').agg({'aut-num':list}).reset_index()

In [None]:
ripeasgroups2 = ripeas.groupby('sponsoring-org').agg({'aut-num':list}).reset_index().rename(columns={'sponsoring-org':'org'})

In [None]:
ripeasgroups = ripeasgroups1.merge(ripeasgroups2, on='org', how='outer')

In [None]:
def combine_autnums(x):
    """Merge the ``aut-num_x`` and ``aut-num_y`` values of a row.

    A float value (i.e. NaN from the outer merge) means that side is missing,
    in which case the other side is returned unchanged. When both sides are
    lists, their de-duplicated union is returned (order not guaranteed).
    """
    own = x['aut-num_x']
    sponsored = x['aut-num_y']
    if isinstance(own, float):
        return sponsored
    if isinstance(sponsored, float):
        return own
    return list(set(own + sponsored))

In [None]:
ripeasgroups['aut-num'] = ripeasgroups.apply(combine_autnums, axis=1)

In [None]:
allocated = allocated.merge(ripeasgroups[['org', 'aut-num']], how='left')

In [None]:
allocated

# find leaf nodes, suballocated PA or assigned PA

In [None]:
def is_leaf(prefix):
    """Return True when ``rtree.search_covered(prefix)`` yields exactly one node."""
    covered = rtree.search_covered(prefix)
    return len(covered) == 1

In [None]:
tree['leaf'] = tree['prefix'].apply(is_leaf)

In [None]:
tree['leaf'].value_counts()

In [None]:
# Take an explicit copy: later cells add columns to `leafs`, and assigning into
# a filtered slice of `tree` triggers pandas' SettingWithCopyWarning.
leafs = tree[(tree.leaf == True) & tree.status.isin(['SUB-ALLOCATED PA', 'ASSIGNED PA'])].copy()

In [None]:
len(leafs)

# find root of leaf

In [None]:
def find_root(prefix):
    """Walk up from ``prefix`` in ``rtree`` and return the top-most ancestor's prefix.

    The walk stops at the node that has no parent, or whose parent is
    ``0.0.0.0/0``.
    """
    node = rtree.search_exact(prefix)
    while True:
        above = node.parent
        if not above or above.prefix == '0.0.0.0/0':
            return node.prefix
        node = above

In [None]:
leafs['root'] = leafs['prefix'].apply(find_root)

In [None]:
allocated_descr = dict(zip(allocated['prefix'], allocated['descr']))

In [None]:
leafs

# find BGP origin of leaf prefix

In [None]:
bgptree = radix.Radix()

# Path to your prefix2as dataset.
# Two whitespace-separated layouts are accepted per line:
#   <prefix/len> <origins>            (two columns)
#   <network> <prefix length> <origins>   (three columns, no '/' in the first)
# <origins> may list several ASNs separated by '_' and/or ','.
with gzip.open('YOUR BGP DATASET HERE', 'rt') as file:
    for line in file:
        data = line.strip().split()
        if len(data) < 2:
            # Blank or truncated line: nothing to index.
            continue
        if len(data) >= 3 and '/' not in data[0]:
            # Three-column layout: rebuild the CIDR from network + length;
            # otherwise the length column would be read as the origin ASN.
            pfx = '{}/{}'.format(data[0], data[1])
            origins = data[2]
        else:
            pfx = data[0]
            origins = data[1]
        asns = []
        for i in origins.split('_'):
            for j in i.split(','):
                asns.append('AS'+j)
        node = bgptree.add(pfx)
        node.data['asn'] = asns

In [None]:
def find_exact_origin(prefix):
    """Return the ASN list stored in ``bgptree`` for exactly ``prefix``, or None if absent."""
    hit = bgptree.search_exact(prefix)
    return hit.data['asn'] if hit else None

In [None]:
leafs['exact_origin'] = leafs['prefix'].apply(find_exact_origin)

In [None]:
print(len(leafs[~pd.isnull(leafs.exact_origin)]), len(leafs))

In [None]:
leafs['root_origin'] = leafs['root'].apply(find_exact_origin)

In [None]:
leafs

# Group 3: Child in BGP, parent not in BGP

In [None]:
c1 = leafs[(~pd.isnull(leafs.exact_origin)) & (pd.isnull(leafs.root_origin))]

In [None]:
c1

In [None]:
c1 = c1.merge(allocated[['prefix', 'org-name', 'aut-num']].rename(columns={'prefix': 'root'}))

## check if parent assigned ASN is related to child BGP origin AS

In [None]:
c1['sibling'] = c1.apply(lambda x: check_as2org(x['exact_origin'], x['aut-num']), axis=1)

In [None]:
c1['cp'] = c1.apply(lambda x: check_asrel(x['exact_origin'], x['aut-num']), axis=1)

In [None]:
c1infer = c1[(c1.sibling == False) & (c1.cp == False)]

In [None]:
c1infer

# Group 4: Both in BGP, unrelated

In [None]:
c2 = leafs[(~pd.isnull(leafs.exact_origin)) & (~pd.isnull(leafs.root_origin))]
c2 = c2.merge(allocated[['prefix', 'org-name', 'aut-num']].rename(columns={'prefix': 'root'}))

In [None]:
c2['aut_sibling'] = c2.apply(lambda x: check_as2org(x['exact_origin'], x['aut-num']), axis=1)
c2['aut_cp'] = c2.apply(lambda x: check_asrel(x['exact_origin'], x['aut-num']), axis=1)

In [None]:
c2['bgp_sibling'] = c2.apply(lambda x: check_as2org(x['exact_origin'], x['root_origin']), axis=1)
c2['bgp_cp'] = c2.apply(lambda x: check_asrel(x['exact_origin'], x['root_origin']), axis=1)

In [None]:
c2infer = c2[(c2.aut_sibling == False) & (c2.aut_cp == False) & (c2.bgp_sibling == False) & (c2.bgp_cp == False)]

In [None]:
c2infer

# Group 1: Neither in BGP

In [None]:
c3 = leafs[(pd.isnull(leafs.exact_origin)) & (pd.isnull(leafs.root_origin))]

In [None]:
c3

# Group 2: child not in BGP, parent in BGP

In [None]:
c4 = leafs[(pd.isnull(leafs.exact_origin)) & (~pd.isnull(leafs.root_origin))]

In [None]:
c4

# All categories: stats describe

In [None]:
print('Total:', len(leafs))
print('Group 1: Unused:', len(c3))
print('Group 2: Aggregated Customer:', len(c4))
print('Group 3: ISP customer', len(c1) - len(c1infer), 'leased prefixes:', len(c1infer))
print('Group 4: Delegated customer', len(c2) - len(c2infer), 'leased prefixes:', len(c2infer))

# Inferred leases

In [None]:
ripe_inferred_lease = pd.concat([c1infer, c2infer])

In [None]:
leased_prefix = set(ripe_inferred_lease['prefix'])

In [None]:
ripe_inferred_lease['country'].value_counts()[:20]

In [None]:
ripe_inferred_lease = ripe_inferred_lease.merge(allocated[['prefix', 'country']], left_on=['root'], right_on=['prefix'], suffixes=('', '_root'))

In [None]:
ripe_inferred_lease[ripe_inferred_lease['org-name'] == 'Resilans AB']['country'].nunique()

In [None]:
ripe_inferred_lease['out_of_country'] = ripe_inferred_lease['country'] != ripe_inferred_lease['country_root']

In [None]:
ripe_inferred_lease['out_of_country'].value_counts()

In [None]:
ripe_inferred_lease[ripe_inferred_lease.out_of_country]['country'].value_counts()[:20]

## Top IP Holders

In [None]:
ripe_inferred_lease['org-name'].value_counts()[:10]

## Top Originator

In [None]:
originator = pd.DataFrame(ripe_inferred_lease['exact_origin'].explode().value_counts().reset_index())

In [None]:
originator['org'] = originator['exact_origin'].apply(to_org)

In [None]:
originator[:20]

## Top maintainer

In [None]:
mnts = ripe_inferred_lease['mnt-by'].apply(lambda x: x.split('\n')).explode().value_counts().reset_index()

In [None]:
# Map each maintainer name to the set of org-names that list it in either
# mnt-ref or mnt-by, skipping the two RIPE NCC maintainers.
orgdict = defaultdict(set)
ncc_mntners = ('RIPE-NCC-RIS-MNT', 'RIPE-NCC-HM-MNT')
for org_name, mnt_ref, mnt_by in zip(ripeorg['org-name'], ripeorg['mnt-ref'], ripeorg['mnt-by']):
    for mnt_field in (mnt_ref, mnt_by):
        for mntner in mnt_field.split('\n'):
            if mntner not in ncc_mntners:
                orgdict[mntner].add(org_name)

In [None]:
mnts['org'] = mnts['mnt-by'].apply(lambda x: orgdict[x])

In [None]:
mnts[:20]

# Reference Dataset: RIPE Broker Leased Prefixes

In [None]:
# One broker name per line of the file, normalised to upper case with double
# quotes removed.
brokers = []
with open('recognized_brokers_ripe.txt', 'rt') as f:
    brokers.extend(line.strip().upper().replace('\"', '') for line in f)

In [None]:
ripeorg['org-name'] = ripeorg['org-name'].apply(lambda x: x.upper().replace('\"', ''))

In [None]:
ripeorg[ripeorg['org-name'].isin(brokers)]['org-name'].nunique()

In [None]:
org_to_check = set(brokers) - set(ripeorg[ripeorg['org-name'].isin(brokers)]['org-name'].unique())

In [None]:
# Additional broker org-names, appended to `brokers` in a later cell.
# The list is kept entry-for-entry as originally written, including the
# repeated 'XTOM GMBH' and 'XTOM PTY LTD' items.
orgnames = [
    'ADDREX, INC.',
    'CLAUS PLACHETKA TRADING AS AIPI E. K.',
    'ALFA TELECOM S.R.O.',
    'ALFA TELECOM CJSC',
    'ALFA TELECOM LTD.',
    'ALPHA INFOLAB, INC.',
    'ASR-E DANESH AFZAR COMPANY (PRIVATE J.S.)',
    'AZERONLINE LTD JOINT ENTERPRISE',
    'BLUE NETWORKS TECHNOLOGIES SARL',
    'BRANDER GROUP INC.',
    'E-MONEY NET DEVELOPERS 24 COMPANY PRIVATE JOINT STOCK',
    'FIBERCLI PROYECTOS E INNOVACION S.L.',
    'IP COM LTD',
    'GCX US LLC',
    'I7 LLC',
    'INANA GROUP LLC',
    'INTELLIGENT ADVANCED SOLUTIONS S.R.O',
    'IPWAY LLC',
    'NATIONWIDE COMPUTER SYSTEMS, INC. TRADING AS IPTRADING.COM',
    'IT-SERVICE LLC',
    'KONNGRUENT MANAGEMENT S.R.L.',
    'VLADIMIR KORABLEV',
    'LEADERTELECOM LTD.',
    'LEADERTELECOM B.V.',
    'LLC LIR UKRAINE',
    'NETERRA LTD.',
    'NITRONET SP. Z O.O.',
    'PARSUN NETWORK SOLUTIONS PTY LTD',
    'PREFIX BROKER BV',
    'RIOTCLOUDS, INC.',
    'SEVEN NETWORKING UK LTD',
    'THERECOM LTD',
    'VOLDETA UAB',
    'ULF KIEBER',
    'WINDMILL TELECOM OU',
    'XTOM OU',
    'XTOM GMBH',
    'XTOM GLOBAL TELECOM INC.',
    'XTOM PTY LTD',
    'XTOM PTY LTD',
    'XTOM GMBH',
    'XTOM JAPAN CO., LTD.',
]

In [None]:
brokers += orgnames

In [None]:
ripeorg[ripeorg['org-name'].isin(brokers)]['org-name'].nunique()

In [None]:
# Collect every maintainer named in the mnt-by or mnt-ref attribute of a broker
# organisation, leaving out RIPE-NCC-HM-MNT.
brokermnt = set()
broker_rows = ripeorg[ripeorg['org-name'].isin(brokers)]
for column in ('mnt-by', 'mnt-ref'):
    for raw_value in broker_rows[column]:
        for piece in raw_value.strip().split('\n'):
            mntner = piece.strip()
            if mntner != 'RIPE-NCC-HM-MNT':
                brokermnt.add(mntner)

In [None]:
len(brokermnt)

In [None]:
# Alternation pattern for str.contains (which treats its argument as a regex).
# Names are escaped so they match literally, and empty names are dropped since
# an empty alternative would match every row. Sorting keeps the pattern stable.
brokerstr = '|'.join(re.escape(m) for m in sorted(brokermnt) if m)

In [None]:
tree[tree['mnt-by'].str.contains(brokerstr) & (tree.status == 'LEGACY')]

In [None]:
def check_mntner(x):
    """Return True if any newline-separated maintainer in ``x`` is in ``brokermnt``."""
    names = {part.strip() for part in x.strip().split('\n')}
    return not names.isdisjoint(brokermnt)

In [None]:
leafs['true_label'] = leafs['mnt-by'].apply(check_mntner)

In [None]:
# Broker-maintained leafs (true_label), minus every prefix that landed in c1 or
# c2 but was NOT kept in c1infer / c2infer -- i.e. prefixes for which a sibling
# or customer relation was found.
tp = leafs[(leafs.true_label == True) & ~leafs.prefix.isin(c1[~c1.prefix.isin(c1infer.prefix)].prefix) & ~leafs.prefix.isin(c2[~c2.prefix.isin(c2infer.prefix)].prefix)]

## Caution: Not all broker prefixes are leased. Some brokers are also ISPs

In [None]:
tp