In [1]:
# Written by Constantijn Bicker Caarten
# Last updated: 07-07-2017
#
#
# This code gathers data on TLDs in the DNS. This data
# consists of A, AAAA, DNSKEY, DS and NS records,
# as well as TCP and UDP support, response time and
# anycast support.

# Imports

In [2]:
import copy
import datetime
import json
import os
import socket
import time
from socket import error as socket_error
from urllib.request import urlopen
from uuid import uuid4

import matplotlib as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dns.query import udp, tcp
from dns.resolver import Resolver, NXDOMAIN, query
from dns.resolver import dns
from tqdm import tqdm

# Constants

In [3]:
# DNSKEY flag values used to tell zone-signing keys from key-signing keys.
ZSK = 256
KSK = 257

# RIPE Atlas credentials. They are read from the environment so that secrets
# do not have to be pasted into (and possibly committed with) the notebook.
# Both fall back to '' when the variable is not set.
ATLAS_API_KEY = os.environ.get('ATLAS_API_KEY', '') # Your Atlas API key
ATLAS_BILL_TO = os.environ.get('ATLAS_BILL_TO', '') # Your Atlas account email

# Endpoints for creating and for listing DNS measurements (currently the same URL).
URL_DNS_MEASUREMENT_CREATE = 'https://atlas.ripe.net:443/api/v2/measurements/dns/'
URL_DNS_MEASUREMENT_GET = 'https://atlas.ripe.net:443/api/v2/measurements/dns/'

# HTTP headers sent with every measurement-creation request.
HEADERS = {'Content-type': 'application/json', 'Accept': 'text/plain'}

# Line terminator used by the list file helpers.
NEWLINE = '\n'

In [4]:
# Skeleton of a measurement request; 'definitions' and 'probes' are filled in
# further down before the request is sent.
payload = dict(
    bill_to=ATLAS_BILL_TO,
    is_oneoff=True,
    definitions=[],
    probes=[],
)

# Template for a single DNS measurement definition. The query type, query
# argument and description are set on a copy for every measurement.
definition = dict(
    af=4,
    query_class='IN',
    query_type='',
    query_argument='',
    description='',
    use_probe_resolver=True,
    resolve_on_probe=False,
    set_nsid_bit=True,
    protocol='UDP',
    udp_payload_size=512,
    retry=0,
    skip_dns_check=False,
    include_qbuf=False,
    include_abuf=True,
    prepend_probe_id=False,
    set_rd_bit=False,
    set_do_bit=False,
    set_cd_bit=False,
    type='dns',
    is_public=True,
)

# Functions

In [5]:
def write_list(fn, data):
    '''Overwrites the file fn with the items of data, one item per line.'''
    with open(fn, 'w') as out:
        out.writelines(item + NEWLINE for item in data)
        
def append_list(fn, data):
    '''Adds the items of data to the end of the file fn, one item per line.'''
    with open(fn, 'a') as out:
        out.writelines(item + NEWLINE for item in data)
    
def read_list(fn):
    '''Reads the file fn and returns its lines as a list,
    with the newline characters stripped from each line'''
    entries = []

    with open(fn, 'r') as src:
        for raw_line in src:
            entries.append(raw_line.strip(NEWLINE))

    return entries
    
def write_json(fn, data):
    '''Serialises data as JSON and writes it to the file fn,
    replacing whatever the file held before'''
    with open(fn, 'w') as out:
        serialised = json.dumps(data)
        out.write(serialised)
            
def read_json(fn):
    '''Parses the JSON file fn and returns the decoded value'''
    with open(fn, 'r') as src:
        content = src.read()

    return json.loads(content)

In [6]:
def write_data(fn, data):
    """Backs up the previous version of the data if it exists and writes the new data to a file.

    fn:   base name (without directory or extension); the data ends up in
          data/<fn>.json and the backup in data/backup/<fn>_<timestamp>.json.
    data: any JSON-serialisable value.

    A failing backup never prevents the new data from being written.
    """
    target = "data/{}.json".format(fn)
    now = datetime.datetime.now().strftime('%H:%M-%d-%m-%Y')
    # The backup name used to end in a stray space ("....json "); that is fixed here.
    backup = "data/backup/{}_{}.json".format(fn, now)

    try:
        write_json(backup, read_json(target))
    except FileNotFoundError:
        # No previous version (or no backup directory): nothing to back up.
        pass
    except (OSError, ValueError) as err:
        # Unreadable or corrupt previous file: report it instead of hiding it.
        print("Could not back up {}: {}".format(target, err))

    write_json(target, data)

In [7]:
def find(lst, key, value):
    '''Returns the index of the first dictionary in lst
    whose entry for key equals value, or None when there
    is no such dictionary'''
    matches = (position for position, entry in enumerate(lst) if entry[key] == value)
    return next(matches, None)

In [8]:
def ns_ips(fn):
    '''Maps every name server in a dig answer file to its IP addresses.

    fn: path of a file with one answer record per line, made of five
        whitespace-separated fields of which the first is the name server
        (with a trailing dot) and the last its address. Lines starting
        with ';;' and blank lines are ignored.

    Returns a dictionary {name server: [ip, ...]} without duplicate
    addresses, in order of first appearance.
    '''
    addresses = {}

    with open(fn, 'r') as f:
        for line in f:
            if line.startswith(';;') or not line.strip():
                continue

            ns, _, _, _, ip = line.split()
            ns = ns[:-1]  # drop the trailing dot of the fully qualified name

            # A repeated address must not reset the list collected so far.
            known = addresses.setdefault(ns, [])
            if ip not in known:
                known.append(ip)

    return addresses

In [9]:
class CustomDNSException(Exception):
    '''Signals a DNS response that arrived but is not a usable answer.'''
    pass

def test_tcp_udp(data, timeout = 5):
    '''Tests for every entry whether its name server answers over UDP and TCP.

    An SOA query for the entry's TLD is sent to the entry's IP address. A
    protocol counts as supported when the response has RCODE 0 and a
    non-empty answer section.

    data:    list of dictionaries with at least the keys 'tld' and 'ip'.
             Entries may already hold 'udp'/'tcp' results from an earlier
             run; only protocols that are missing or False are tested, so
             the function can be called again to retry failures.
    timeout: seconds to wait for each response.

    Returns a deep copy of data with the boolean keys 'udp' and 'tcp' set.
    '''
    data_copy = copy.deepcopy(data)

    pbar = tqdm(total=len(data_copy))

    for datum in data_copy:
        # The result key is the function name ('udp' or 'tcp'). The previous
        # conditions required the keys to exist already and crossed the two
        # protocols, so a first run never tested anything.
        protocols = [p for p in (udp, tcp) if not datum.get(p.__name__)]

        for p in protocols:
            # Create SOA query
            m = dns.message.make_query(datum['tld'], dns.rdatatype.SOA)

            try:
                a = p(m, datum['ip'], timeout = timeout)

                # We expect NOERROR RCODE (0) and an answer
                if a.rcode() == 0 and len(a.answer) > 0:
                    datum[p.__name__] = True
                else:
                    raise CustomDNSException('failed')

            except (dns.exception.Timeout, socket_error, CustomDNSException):
                datum[p.__name__] = False

        pbar.update(1)
    pbar.close()

    return data_copy

In [10]:
def find_nxdomain(tld, max_tries = 3):
    '''Looks for a domain under tld that does not exist.

    Up to max_tries random names of the form <uuid4>.<tld> are resolved;
    the first one that raises NXDOMAIN is returned. Returns None when no
    attempt ends in NXDOMAIN (the name resolved, or the lookup failed in
    another way).
    '''
    for _ in range(max_tries):
        domain = '{}.{}'.format(str(uuid4()), tld)

        try:
            query(domain)
        except NXDOMAIN:
            return domain
        except Exception:
            # Any other resolver failure just means: try another name.
            # Exception (not a bare except) keeps Ctrl-C working.
            continue

    return None

def find_nxdomain_wildcard(tld, max_tries = 3):
    '''Looks for a test domain under tld using dig instead of the resolver.

    Up to max_tries random names of the form <uuid4>.<tld> are queried for
    their SOA record with dig, printing only the authority section. The
    first name for which the first output line starts with tld is
    returned; None is returned when no attempt matches.
    '''
    for _ in range(max_tries):
        domain = '{}.{}'.format(str(uuid4()), tld)

        # IPython shell capture: response is the list of output lines of dig.
        response = !dig soa +noall +authority +noidn {domain}

        # NOTE(review): startswith is case-sensitive; confirm that dig prints
        # the owner name in the same case as the entries of the TLD list.
        if len(response) > 0 and response[0].startswith(tld):
            return domain
    
    return None

# Init

In [None]:
# First time run: create the directory tree the notebook writes into.
# os.makedirs with exist_ok=True keeps this cell safe to re-run.
for directory in [
    'data',
    'data/dig',
    'data/lists',
    'data/cymru',
    'data/whois',
    'data/atlas',
    'data/atlas/ns',
    'data/atlas/soa',
    'data/atlas/anycast',  # the anycast results are written here
    'data/backup',
]:
    os.makedirs(directory, exist_ok=True)

# Top-Level Domains

In [11]:
# Download the IANA list of TLDs into data/lists/tlds; the first line of the
# download is a header and is deleted so that only TLDs remain.
!wget https://data.iana.org/TLD/tlds-alpha-by-domain.txt -O data/lists/tlds
!sed -i '1d' data/lists/tlds # remove header

--2017-07-12 20:37:59--  https://data.iana.org/TLD/tlds-alpha-by-domain.txt
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving data.iana.org... 72.21.81.189, 2606:2800:11f:bb5:f27:227f:1bbf:a0e
Connecting to data.iana.org|72.21.81.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10433 (10K) [text/plain]
Saving to: ‘data/lists/tlds’


2017-07-12 20:37:59 (189 MB/s) - ‘data/lists/tlds’ saved [10433/10433]



In [12]:
# Load the TLD list from disk: one TLD per line.
tlds = read_list('data/lists/tlds')

In [None]:
# Gathers WHOIS records for each TLD.
# Every TLD is looked up at whois.iana.org and the response is stored in
# data/whois/<TLD>.

pbar = tqdm(total=len(tlds))

for tld in tlds:
    !whois -h whois.iana.org:43 {tld} > data/whois/{tld}  
        
    pbar.update(1)
pbar.close()

In [13]:
# Gathers empty or missing WHOIS records.

# Directory with one WHOIS response file per TLD; the file name is the TLD.
indir = 'data/whois/'

for root, dirs, filenames in os.walk(indir):
    print("Gathering empty WHOIS records.")
    for f in filenames:
        stat = os.stat(indir + f)
            
        # A zero-byte file means the lookup produced no output: query again.
        if stat.st_size == 0:
            !whois -h whois.iana.org:43 {f} > data/whois/{f}
    
    print("Gathering missing WHOIS records.")
    for tld in tlds:
        # No file exists for this TLD yet: query it now.
        if tld not in filenames:
            !whois -h whois.iana.org:43 {tld} > data/whois/{tld}

Gathering empty WHOIS records.
Gathering missing WHOIS records.


In [14]:
# Extracts the creation date and organisations for each TLD from the WHOIS record.

data_tlds = [{'tld': tld, 'organisations': []} for tld in tlds]

for root, dirs, filenames in os.walk(indir):
    for fn in filenames:
        index = find(data_tlds, 'tld', fn)

        # A file that does not belong to a TLD of the list (e.g. a stale
        # file) used to crash with a TypeError; it is skipped instead.
        if index is None:
            continue

        with open(indir + fn, 'r') as f:
            for line in f:
                if line.startswith('created'):
                    # The date is the last field of the line.
                    data_tlds[index]['creation_date'] = line.split()[-1]
                elif line.startswith('organisation'):
                    _, org = line.split('rganisation:')
                    # strip() also removes the padding between the colon and
                    # the name, not only the trailing newline.
                    data_tlds[index]['organisations'].append(org.strip())

In [15]:
# Gets the type of each TLD listed in the table of the url.

url = "https://www.iana.org/domains/root/db/"
html = urlopen(url)
soup = BeautifulSoup(html, 'html5lib')

# Cells of the table with class "iana-table" (the last one if there are
# several). Starts empty so that a page without such a table simply yields
# no types instead of a NameError.
values = []

for table in soup.find_all(attrs={'class': 'iana-table'}):
    values = table.find_all('td')

# The cells are consumed three at a time: the first holds the link to the
# TLD page, the second the type of the TLD.
for i in range(0, len(values), 3):
    # The TLD is cut out of the rendered link tag: everything before
    # '.html', minus the first 26 characters (presumably the
    # '<a href="/domains/root/db/' prefix - verify if the page changes).
    tld = str(values[i].findAll('a', href=True)[0]).split('.html')[0][26:].upper()
    index = find(data_tlds, 'tld', tld)

    if index is not None:
        data_tlds[index]['type'] = values[i + 1].get_text(strip = True)

In [16]:
# Store the TLD metadata in data/data_tlds.json (a previous version is backed up first).
write_data('data_tlds', data_tlds)

In [17]:
# TLDs whose label is also a record type or class; these do not work in some
# bulk operations, so they are left out of the list file and handled one by one.
special_tlds = ['CH', 'IN', 'MD', 'MG', 'MR', 'MX']
regular_tlds = [tld for tld in tlds if tld not in special_tlds]
write_list('data/lists/tlds', regular_tlds)

# Name Servers

In [None]:
# Gathers the name servers of every TLD using dig.
print('Gathering name servers.')
# Batch lookup of the NS records for all names in data/lists/tlds; only the
# answer section is written to data/dig/tld_nss.
!dig +noall +answer +noidn -t NS -f data/lists/tlds > data/dig/tld_nss

# The special TLDs are not in the list file; their answers are appended one by one.
for tld in special_tlds:
    !dig +noall +answer +noidn -t NS {tld} >> data/dig/tld_nss

print('Done.')

In [None]:
# Parses the NS records.

data_ns = []

with open('data/dig/tld_nss', 'r') as f:
    for line in f:
        # Skip records of the root zone, dig diagnostics (lines starting
        # with ';') and blank lines; the latter two cannot be unpacked into
        # five fields.
        if line.startswith(('.', ';')) or not line.strip():
            continue

        tld, _, _, _, ns = line.split()
        # Both names lose their trailing dot; the name server is lower-cased.
        data_ns.append({'tld': tld[:-1], 'ns': ns.lower()[:-1]})

write_data('data_ns', data_ns)

# IP Addresses

In [None]:
# Write every distinct name server once to data/lists/nss.
write_list('data/lists/nss', set([datum['ns'] for datum in data_ns]))

In [None]:
# Batch lookup of the A records of all name servers in data/lists/nss.
print('Gathering IPv4 addresses.')
!dig +noall +answer +noidn A -f data/lists/nss > data/dig/ns_ipv4s
print('Done.')

# Same lookup for the AAAA records.
print('Gathering IPv6 addresses.')
!dig +noall +answer +noidn AAAA -f data/lists/nss > data/dig/ns_ipv6s
print('Done.')

In [None]:
ns_ipv4s = ns_ips('data/dig/ns_ipv4s')
ns_ipv6s = ns_ips('data/dig/ns_ipv6s')

data_ips = []

# Every (TLD, name server) entry is copied once per IP address of the name
# server and the address is added to the copy. A name server with several
# addresses, or with both IPv4 and IPv6 addresses, therefore yields several
# entries (IPv4 addresses first).
for datum in data_ns:
    for addresses_by_ns in (ns_ipv4s, ns_ipv6s):
        for ip in addresses_by_ns.get(datum['ns'], []):
            new_datum = copy.deepcopy(datum)
            new_datum['ip'] = ip
            data_ips.append(new_datum)

In [None]:
# Store the per-address entries in data/data_ips.json (a previous version is backed up first).
write_data('data_ips', data_ips)

# Autonomous System Number

In [None]:
# Build the input file for the bulk whois lookup: the distinct IP addresses,
# one per line, wrapped in a 'begin' and an 'end' line.
write_list('data/lists/ips', ['begin'])
append_list('data/lists/ips', set([datum['ip'] for datum in data_ips]))
append_list('data/lists/ips', ['end'])

In [None]:
# Send the IP list to whois.cymru.com (port 43) and store the sorted response
# in data/cymru/ip_asns.
!netcat whois.cymru.com 43 < data/lists/ips | sort -n > data/cymru/ip_asns

In [None]:
# Makes a dictionary with the IP addresses as key and a list of ASNs as value.

ip_asns = {}

with open('data/cymru/ip_asns', 'r') as f:
    for line in f:
        # Skip the 'Bulk ...' header, 'NA' lines and blank lines.
        if line.startswith('Bulk') or line.startswith('NA') or not line.strip():
            continue

        # Each record has three '|'-separated fields: ASN, IP, organisation.
        asn, ip, org = [value.strip() for value in line.split('|')]

        # A repeated ASN must not reset the list collected so far.
        asns = ip_asns.setdefault(ip, [])
        if asn not in asns:
            asns.append(asn)

In [None]:
# One entry per (TLD, name server, IP, ASN) combination; entries whose IP
# address has no known ASN are left out.
data_asns = []

for datum in data_ips:
    for asn in ip_asns.get(datum['ip'], []):
        new_datum = copy.deepcopy(datum)
        new_datum['asn'] = asn
        data_asns.append(new_datum)

In [None]:
# Store the entries with ASNs in data/data_asns.json (a previous version is backed up first).
write_data('data_asns', data_asns)

# Reachability

In [None]:
# Reload the per-address entries from disk so this section can start from the stored data.
data_ips = read_json('data/data_ips.json')

In [None]:
# First pass of the TCP/UDP test with the default timeout.
data_reach = test_tcp_udp(data_ips)

In [None]:
# Retries testing TCP or UDP on the results of the first pass, this time
# with a timeout of 15 seconds.
data_reach = test_tcp_udp(data_reach, timeout = 15)

In [None]:
# Store the reachability results in data/data_reach.json (a previous version is backed up first).
write_data('data_reach', data_reach)

# Credibility

In [20]:
# Batch lookup of the DNSKEY records for all names in data/lists/tlds.
print('Gathering DNSKEY records.')
!dig +noall +answer +noidn -t DNSKEY -f data/lists/tlds > data/dig/tld_dnskeys
print('Done')

# Batch lookup of the DS records for the same names.
print('Gathering DS records.')
!dig +noall +answer +noidn -t DS -f data/lists/tlds > data/dig/tld_dss
print('Done.')

# The special TLDs are not in the list file; their answers are appended one by one.
print('Gathering DNSKEY and DS records for special TLDs.')
for tld in special_tlds:
    !dig +noall +answer +noidn -t DNSKEY {tld} >> data/dig/tld_dnskeys
    !dig +noall +answer +noidn -t DS {tld} >> data/dig/tld_dss
print('Done')

Gathering DNSKEY records.
Done
Gathering DS records.
Done.
Gathering DNSKEY and DS records for special TLDs.
Done


In [None]:
# Parses the DNSKEY records.

data_cred = [{'tld': tld, 'ds': False, 'dnskey': False, 'jsj': None} for tld in tlds]

for answer in read_list('data/dig/tld_dnskeys'):
    answer_fields = answer.split()

    # Blank lines carry no record.
    if not answer_fields:
        continue

    # Owner name without its trailing dot, upper-cased like the TLD list.
    tld = answer_fields[0][:-1].upper()
    index = find(data_cred, 'tld', tld)

    # Owner names that are not in the TLD list are reported, as before.
    if index is None:
        print(tld)
        continue

    data_cred[index]['dnskey'] = True

    try:
        # Field 4 holds the DNSKEY flags; field 6 is stored per key kind.
        flags = int(answer_fields[4])

        if flags == KSK:
            data_cred[index]['ksk'] = answer_fields[6]
        elif flags == ZSK:
            data_cred[index]['zsk'] = answer_fields[6]
    except (IndexError, ValueError):
        # Line without the expected DNSKEY fields: report the TLD.
        print(tld)

In [None]:
# Parses the DS records.

for answer in read_list('data/dig/tld_dss'):
    v = answer.split()

    # Blank lines carry no record.
    if not v:
        continue

    # Owner name without its trailing dot, upper-cased like the TLD list.
    tld = v[0][:-1].upper()

    index = find(data_cred, 'tld', tld)

    if index is None:
        # Owner names that are not in the TLD list are reported, as before.
        print(tld)
    else:
        data_cred[index]['ds'] = True

In [None]:
# Store the DNSSEC results in data/data_cred.json (a previous version is backed up first).
write_data('data_cred', data_cred)

# Performance

In [22]:
# For each TLD, look for a domain that gives a NXDOMAIN response; 'domain'
# is None when none was found.
data_test_perf = []

for tld in tqdm(tlds):
    data_test_perf.append({'tld': tld, 'domain': find_nxdomain(tld)})

100%|██████████| 1547/1547 [01:51<00:00, 13.85it/s]


In [23]:
# Second attempt, via dig, for the TLDs that did not get a test domain in
# the previous step (TLDs that use wildcards).
without_domain = [datum for datum in data_test_perf if not datum['domain']]

for datum in tqdm(without_domain):
    datum['domain'] = find_nxdomain_wildcard(datum['tld'])

100%|██████████| 38/38 [00:05<00:00,  3.85it/s]


In [24]:
# Ids of the probes that carry out the measurements.
probe_ids = [
    10262, 10287, 11040, 11429, 12515, 12873, 12956, 13623, 13728, 13769,
    13788, 13799, 13804, 13805, 13810, 14237, 26057, 14564, 15156, 14691,
    15594, 15799, 4205, 18131, 18195, 18691, 19326, 19740, 20111, 20353,
    20493, 20531, 20621, 21003, 21035, 21122, 21251, 21345, 21703, 22286,
    22695, 23031, 23085, 28240, 27972, 23697, 24807, 25011, 25148, 25323,
    26936, 26378, 26627, 4155, 26823, 28355, 30676, 4829, 29006, 29183,
    29405, 30225, 30324, 31201, 19306, 19634, 6025, 11660, 22388, 25182,
    4123, 3812, 20923, 14384, 12389,
]

# Probe selection for the request: the ids as one comma-separated string,
# and the number of probes asked for.
probes = [
    {
        "value": ', '.join(str(probe_id) for probe_id in probe_ids),
        "type": "probes",
        "requested": len(probe_ids)
    }
]

In [28]:
payloads = []
payload_size = 100 # max 100
# Two definitions are created per TLD, so each payload covers half as many TLDs.
step_size = int(payload_size / 2)

# A TLD for which no test domain was found ('domain' is None) would yield a
# definition with query_argument None, which is presumably rejected by the
# API; such TLDs are left out.
measurable = [datum for datum in data_test_perf if datum['domain']]

for i in range(0, len(measurable), step_size):
    batch_definitions = []

    for datum in measurable[i:i + step_size]:
        # Create caching measurement
        definition_caching = definition.copy()
        definition_caching['query_type'] = "NS"
        definition_caching['query_argument'] = datum['tld']
        definition_caching['description'] = "caching " + datum['tld']
        batch_definitions.append(definition_caching)

        # Create response time measurement
        definition_measuring = definition.copy()
        definition_measuring['query_type'] = "SOA"
        definition_measuring['query_argument'] = datum['domain']
        definition_measuring['description'] = "measuring " + datum['tld']
        batch_definitions.append(definition_measuring)

    # Shallow copy of the template; both list-valued keys are replaced.
    new_payload = payload.copy()
    new_payload['probes'] = probes
    new_payload['definitions'] = batch_definitions

    payloads.append(new_payload)

In [None]:
# Store the prepared payloads in data/payloads.json (a previous version is backed up first).
write_data('payloads', payloads)

In [None]:
# Ids of the created measurements; filled by the submission cell.
measurement_ids = []
# NOTE(review): measurement_responses is not used in the visible cells.
measurement_responses = []
# Creation endpoint with the API key as query parameter.
url = URL_DNS_MEASUREMENT_CREATE + '?key=' + ATLAS_API_KEY

In [None]:
# Only 10 payloads can be sent per day.
# Manually set this slice each day.
# The loop variable is deliberately not called 'payload': that would rebind
# the request template that other cells still use.
for daily_payload in payloads[:10]:
    body = json.dumps(daily_payload)

    request = requests.post(url, data = body, headers = HEADERS)
    print(request.status_code)

    # Keep retrying while the request is rejected with status 400, waiting
    # five minutes before (not after) every new attempt.
    while request.status_code == 400:
        print(request.json())
        time.sleep(300)
        request = requests.post(url, data = body, headers = HEADERS)
        print(request.status_code)

    created = request.json()

    # NOTE(review): the response may be an object of the form
    # {'measurements': [...]} rather than a bare list; both are accepted.
    if isinstance(created, dict):
        created = created.get('measurements', [])

    # Only the lowest and the highest id are kept; they delimit the id range
    # used to retrieve the results.
    measurement_ids = measurement_ids + list(created)

    if measurement_ids:
        measurement_ids = [min(measurement_ids), max(measurement_ids)]

In [None]:
# Retrieves the result of each measurement and writes it to a file.

# Output file per kind of measurement, taken from the first word of the description.
result_files = {
    'measuring': 'data/atlas/soa/{}.json',
    'caching': 'data/atlas/ns/{}.json',
}

url = '{}?id__gte={}&id__lte={}&mine=true'.format(URL_DNS_MEASUREMENT_GET, min(measurement_ids), max(measurement_ids))
measurements = requests.get(url).json()
pbar = tqdm(total=len(tlds))
next_result = True

while next_result:
    for result in measurements['results']:
        words = result['description'].split()

        # Only descriptions of the form "<kind> <tld>" are of interest.
        if len(words) != 2:
            continue

        measurement_type, tld = words
        # The 'result' field of the listing is a URL; download what it points to.
        request = requests.get(result['result']).json()

        if measurement_type in result_files:
            with open(result_files[measurement_type].format(tld.upper()), 'w') as f:
                # Store the measurement together with its downloaded results.
                temp = copy.deepcopy(result)
                temp['result'] = request

                f.write(json.dumps(temp))

    # Follow the pagination link until there is no next page.
    if measurements['next']:
        measurements = requests.get(measurements['next']).json()
    else:
        next_result = False

    pbar.update(1)
pbar.close()

In [None]:
# Extracts the response time from the measurement results.

indir = 'data/atlas/soa/'

data_perf = []

for root, dirs, filenames in os.walk(indir):
    for f in filenames:
        # The file name is "<TLD>.json".
        tld, _ = f.split('.')

        response_times = []
        timeouts = 0

        with open(indir + f, 'r') as handle:
            tld_results = json.loads(handle.read())

            for probe in tld_results['result']:
                for result in probe['resultset']:
                    if 'result' in result:
                        response_times.append(result['result']['rt'])
                    elif 'error' in result and 'timeout' in result['error']:
                        timeouts += 1

        # Per TLD: the mean response time and the number of timeouts.
        data_perf.append({'tld': tld, 'rt': np.mean(response_times), 'timeouts': timeouts})

In [None]:
# Store the response-time results in data/data_perf.json (a previous version is backed up first).
write_data('data_perf', data_perf)

# Anycast

In [None]:
# Name servers used for the anycast test: the address that is queried
# ('target'), the name that is asked for ('argument') and the host name of
# the name server ('ns').
test_set = [
    dict(target=target, argument=argument, ns=ns)
    for target, argument, ns in [
        ('185.49.140.60', 'nlnetlabs.nl', 'ns.nlnetlabs.nl'),
        ('192.16.197.229', 'nlnet.nl', 'mcvax.nlnet.nl'),
        ('194.0.28.53', 'nl', 'ns5.dns.nl'),
        ('204.61.216.4', 'nlnetlabs.nl', 'anyns.pch.net'),
    ]
]

In [None]:
definitions = []
n = 3 # number of measurements

for test in test_set:
    for i in range(1, n + 1):
        new_definition = copy.deepcopy(definition)
        # SOA is the DNS query type. The 'type' key is the measurement type
        # and has to stay 'dns' as in the template; it used to be
        # overwritten with 'SOA' while 'query_type' stayed empty.
        new_definition['query_type'] = 'SOA'
        new_definition['query_argument'] = test['argument']
        # Query the given target directly instead of the probe's resolver.
        new_definition['use_probe_resolver'] = False
        new_definition['target'] = test['target']
        new_definition['description'] = 'anycast {} {}'.format(i, test['ns'])

        definitions.append(new_definition)

# Build a new request dictionary instead of assigning into the current one:
# 'payload' may at this point be bound to an element of 'payloads' (it is
# used as a loop variable elsewhere), which must not be modified.
payload = dict(payload, definitions=definitions)

In [None]:
# Probes used for the anycast test, grouped by region: id, country code and city.
anycast_probes = [
    {'id': probe_id, 'country-code': country_code, 'city': city}
    for probe_id, country_code, city in [
        # North America
        (22447, 'US', 'San Francisco'),
        (14233, 'US', 'Denver'),
        (25081, 'US', 'Washington'),
        # South America
        (31450, 'CR', 'San Jose'),
        (30185, 'BR', 'Sao Paulo'),
        (30123, 'CL', 'Santiago'),
        # Europe
        (32669, 'GR', 'Athens'),
        (31479, 'RU', 'Moscow'),
        (29762, 'ES', 'Madrid'),
        (26610, 'NL', 'Utrecht'),
        # Africa
        (22458, 'ZA', 'Cape Town'),
        (13258, 'AE', 'Dubai'),
        (28493, 'SN', 'Dakar'),
        # Asia
        (28819, 'JP', 'Tokyo'),
        (28964, 'KR', 'Seoul'),
        (6107, 'IN', 'Mumbai'),
        (25047, 'HK', 'Hong Kong'),
        (26378, 'KG', 'Bishkek'),
        # Oceania
        (25208, 'AU', 'Sydney'),
        (28226, 'NZ', 'Welington'),
    ]
]

# Probe selection for the request: the ids as one comma-separated string,
# and the number of probes asked for.
probes = [
    {
        "value": ', '.join(str(probe['id']) for probe in anycast_probes),
        "type": "probes",
        "requested": len(anycast_probes)
    }
]

payload['probes'] = probes

In [None]:
# Create the anycast measurements. The API key constant is ATLAS_API_KEY;
# the lower-case name used before does not exist and raised a NameError.
url = '{}?key={}'.format(URL_DNS_MEASUREMENT_CREATE, ATLAS_API_KEY)
request = requests.post(url, data = json.dumps(payload), headers = HEADERS)

In [None]:
# Ids of the created anycast measurements.
# NOTE(review): the response may be an object of the form
# {'measurements': [...]} rather than a bare list; both are accepted so that
# measurement_ids is always a list.
measurement_ids = request.json()

if isinstance(measurement_ids, dict):
    measurement_ids = measurement_ids.get('measurements', [])

In [None]:
# Only 10 payloads can be sent per day.
# Manually set this slice each day.
# NOTE(review): this cell repeats the submission loop of the Performance
# section inside the Anycast section; confirm that it is still needed.
# The loop variable is deliberately not called 'payload': that would rebind
# the request dictionary that other cells still use.
for daily_payload in payloads[:10]:
    body = json.dumps(daily_payload)

    request = requests.post(url, data = body, headers = HEADERS)
    print(request.status_code)

    # Keep retrying while the request is rejected with status 400, waiting
    # five minutes before (not after) every new attempt.
    while request.status_code == 400:
        print(request.json())
        time.sleep(300)
        request = requests.post(url, data = body, headers = HEADERS)
        print(request.status_code)

    created = request.json()

    # NOTE(review): the response may be an object of the form
    # {'measurements': [...]} rather than a bare list; both are accepted.
    if isinstance(created, dict):
        created = created.get('measurements', [])

    # Add only the new ids; 'a += a + new' used to double the list on every
    # iteration.
    measurement_ids = measurement_ids + list(created)

In [None]:
# Retrieves the result of each anycast measurement and writes it to a file.

# Make sure the output directory exists before anything is written to it.
os.makedirs('data/atlas/anycast', exist_ok=True)

next_result = True
url = '{}?id__in={}&mine=true'.format(URL_DNS_MEASUREMENT_GET, str(measurement_ids)[1:-1])
measurements = requests.get(url).json()

while next_result:
    for result in measurements['results']:
        # Anycast descriptions have the form "anycast <i> <name server>".
        if len(result['description'].split()) == 3:
            measurement_type, i, ns = result['description'].split()
            # The 'result' field of the listing is a URL; download what it points to.
            request = requests.get(result['result']).json()

            if measurement_type == 'anycast':
                # Store the measurement together with its downloaded results.
                temp = copy.deepcopy(result)
                temp['result'] = request

                write_json('data/atlas/anycast/{}.{}.json'.format(ns, i), temp)

    # Follow the pagination link until there is no next page.
    if measurements['next']:
        measurements = requests.get(measurements['next']).json()
    else:
        next_result = False

In [None]:
# Collects, per name server and per probe, the response times of the anycast measurements.

indir = 'data/atlas/anycast/'
# One empty list of response times per probe id.
probe_data = {probe['id']: [] for probe in anycast_probes}
data_ac = [{'ns': test['ns'], 'probes': copy.deepcopy(probe_data)} for test in test_set]

for root, dirs, filenames in os.walk(indir):
    for f in filenames:
        result = read_json(indir + f)
        # The description has the form "anycast <i> <name server>".
        _, _, ns = result['description'].split()
        index = find(data_ac, 'ns', ns)

        # A result for a name server outside the test set cannot be stored.
        if index is None:
            print('Skipping {}: unknown name server {}'.format(f, ns))
            continue

        for probe in result['result']:
            try:
                data_ac[index]['probes'][probe['prb_id']].append(probe['result']['rt'])
            except (KeyError, TypeError):
                # No response time in this entry, or a probe that is not in
                # the probe list.
                print('No response time for probe {} in {}'.format(probe.get('prb_id'), f))

In [None]:
# Store the anycast results in data/data_ac.json (a previous version is backed up first).
write_data('data_ac', data_ac)