In [1]:
# Constantijn Bicker Caarten
# Last updated: 13-06-2017

In [2]:
from socket import error as socket_error
from urllib.request import urlopen
from dns.query import udp, tcp
from bs4 import BeautifulSoup
from dns.resolver import dns
from uuid import uuid4
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib as plt

import subprocess
import socket
import copy
import time
import json
import os

%matplotlib inline

In [76]:
# First time run
!mkdir data
!mkdir data/whois
!mkdir data/cymru
!mkdir data/dig
!mkdir data/backup
!mkdir data/lists

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/whois’: File exists
mkdir: cannot create directory ‘data/dig’: File exists
mkdir: cannot create directory ‘data/backup’: File exists
mkdir: cannot create directory ‘data/lists’: File exists


In [199]:
ZSK = 256
KSK = 257

pie = (6, 6)

newline = '\n'

# Functions

In [157]:
def write_list(fn, data):
    '''Writes a list to a file with each value on a new line'''
    with open(fn, 'w') as f:
        for datum in data:
            f.write(datum + newline)
        
def append_list(fn, data):
    '''Appends a list to a file with each value on a new line'''
    with open(fn, 'a') as f:
        for datum in data:
            f.write(datum + newline)
    
def read_list(fn):
    '''Reads a file and '''
    with open(fn, 'r') as f:
        return [line.strip(newline) for line in f]
    
def write_json(fn, data):
    with open(fn, 'w') as f:
        f.write(json.dumps(data))    
            
def read_json(fn):
    '''Read a json file (fn) and returns it as a dictionary'''
    with open(fn, 'r') as f:
        return json.dumps(f.read())

In [134]:
def write_data(fn, data):
    """Backs up the previous version of the data if it exists and writes the new data to a file."""
    # Backs up the previous data if it exists.
    try:
        write_json("data/backup/{}.json ".format(fn) + time.ctime().replace(' ', '-'), 
                   read_json("data/{}.json".format(fn)))
    except:
        pass

    write_json("data/{}.json".format(fn), data)

In [6]:
def find(lst, key, value):
    for i, dic in enumerate(lst):
        if dic[key] == value:
            return i
    return None

def sort_dict_list(data, x):
    return sorted(data, key=lambda k: k[x]) 

In [7]:
def ns_ips(fn):
    ns_ips = {}

    with open(fn, 'r') as f:
        for line in f:
            if not line.startswith(';;'):
                ns, _, _, _, ip = line.split()
                ns = ns[:-1]

                if ns in ns_ips and ip not in ns_ips[ns]:
                    ns_ips[ns].append(ip)
                else:
                    ns_ips[ns] = [ip]
                
    return ns_ips

In [119]:
class CustomDNSException(Exception):
    pass

def test_tcp_udp(data, timeout = 5):
    pbar = tqdm(total=len(data))

    for datum in data:
        for p in (udp, tcp):
            # Create SOA query
            m = dns.message.make_query(datum['tld'], dns.rdatatype.SOA)
            try: 
                a = p(m, datum['ip'], timeout = timeout)
                # We expect NOERROR RCODE (0) and an answer
                if a.rcode() == 0 and len(a.answer) > 0:
                    datum[p.__name__] = True

                else:
                    raise CustomDNSException('failed')
            except (dns.exception.Timeout, socket_error, CustomDNSException):
                datum[p.__name__] = False

        pbar.update(1)
    pbar.close()

# Top-Level Domains

In [23]:
!wget https://data.iana.org/TLD/tlds-alpha-by-domain.txt -O data/lists/tlds
!sed -i '1d' data/lists/tlds # remove header

--2017-06-13 13:09:02--  https://data.iana.org/TLD/tlds-alpha-by-domain.txt
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving data.iana.org... 2606:2800:11f:bb5:f27:227f:1bbf:a0e, 72.21.81.189
Connecting to data.iana.org|2606:2800:11f:bb5:f27:227f:1bbf:a0e|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10295 (10K) [text/plain]
Saving to: ‘data/lists/tlds’


2017-06-13 13:09:08 (128 MB/s) - ‘data/lists/tlds’ saved [10295/10295]



In [24]:
tlds = read_list('data/lists/tlds')

In [None]:
pbar = tqdm(total=len(tlds))

for tld in tlds:
    !whois -h whois.iana.org:43 {tld} > data/whois/{tld}  
        
    pbar.update(1)
pbar.close()

In [92]:
indir = 'data/whois/'

for root, dirs, filenames in os.walk(indir):
    print("Gathering empty WHOIS records.")
    for f in filenames:
        stat = os.stat(indir + f)
            
        if stat.st_size == 0:
            !whois -h whois.iana.org:43 {f} > data/whois/{f}
    
    print("Gathering missing WHOIS records.")
    for tld in tlds:
        if tld not in filenames:
            !whois -h whois.iana.org:43 {tld} > data/whois/{tld}

Gathering empty WHOIS records.
Gathering missing WHOIS records.


In [126]:
data_tlds = [{'tld': tld, 'organisations': []} for tld in tlds]

for root, dirs, filenames in os.walk(indir):
    for fn in filenames:
        with open(indir + fn, 'r') as f:
            for line in f:
                if line.startswith('created'):                    
                    index = find(data_tlds, 'tld', fn)
                    data_tlds[index]['creation_date'] = line.split()[-1]
                elif line.startswith('organisation'):
                    _, org = line.split('rganisation:')
                    index = find(data_tlds, 'tld', fn)
                    data_tlds[index]['organisations'].append(org.strip(newline))

In [136]:
write_data('data_tlds', data_tlds)

In [152]:
# Special TLDs are the same as record types or classes which do not work in bulk.
special_tlds = ['CH', 'IN', 'MD', 'MG', 'MR', 'MX']
write_list('data/lists/tlds', [tld for tld in tlds if tld not in special_tlds])

# Name Servers

In [18]:
# Gathers the name servers of every TLD using dig.
print('Gathering name servers.')
!dig +noall +answer +noidn -t NS -f data/lists/tlds > data/dig/tld_nss

for tld in special_tlds:
    !dig +noall +answer +noidn -t NS {tld} >> data/dig/tld_nss

print('Done.')

Gathering name servers.
Done.


In [139]:
data_ns = []

# Parses the answers of dig.
with open('data/dig/tld_nss', 'r') as f:
    for line in f:
        if not line.startswith('.'):
            tld, _, _, _, ns = line.split()
            data_ns.append({'tld': tld[:-1], 'ns': ns.lower()[:-1]})
            
write_data('data_ns', data_ns)

# IP Addresses

In [20]:
write_list('data/lists/nss', set([datum['ns'] for datum in data]))

In [21]:
print('Gathering IPv4 addresses.')
!dig +noall +answer +noidn A -f data/lists/nss > data/dig/ns_ipv4s
print('Done.')

print('Gathering IPv6 addresses.')
!dig +noall +answer +noidn AAAA -f data/lists/nss > data/dig/ns_ipv6s
print('Done.')

Gathering IPv4 addresses.
Done.
Gathering IPv6 addresses.
Done.


In [143]:
ns_ipv4s = ns_ips('data/dig/ns_ipv4s')
ns_ipv6s = ns_ips('data/dig/ns_ipv6s')

data_ips = []

for datum in data:
    if datum['ns'] in ns_ipv4s:
        for ip in ns_ipv4s[datum['ns']]:
            new_datum = copy.deepcopy(datum)
            new_datum['ip'] = ip
            data_ips.append(new_datum)
    
    if datum['ns'] in ns_ipv6s:
        for ip in ns_ipv6s[datum['ns']]:
            new_datum = copy.deepcopy(datum)
            new_datum['ip'] = ip
            data_ips.append(new_datum)

In [144]:
write_data('data_ips', data_ips)

# Autonomous System Number

In [73]:
write_list('data/lists/ips', ['begin'])
append_list('data/lists/ips', set([datum['ip'] for datum in data_ips]))
append_list('data/lists/ips', ['end'])

In [77]:
!netcat whois.cymru.com 43 < data/lists/ips | sort -n > data/cymru/ip_asns

In [145]:
ip_asns = {}

with open('data/cymru/ip_asns', 'r') as f:
    for line in f:
        if not line.startswith('Bulk') and not line.startswith('NA'):
            
            asn, ip, org = [value.strip() for value in line.split('|')]
            
            if ip in ip_asns and asn not in ip_asns[ip]:
                ip_asns[ip].append(asn)
            else:
                ip_asns[ip] = [asn]

In [146]:
for datum in data_ips:
    if datum['ip'] in ip_asns:
        datum['asn'] = ip_asns[datum['ip']]

In [147]:
data_asns = []

for datum in data_ips:
    if datum['ip'] in ip_asns:
        for asn in ip_asns[datum['ip']]:
            new_datum = copy.deepcopy(datum)
            new_datum['asn'] = asn
            data_asns.append(new_datum)

In [148]:
write_data('data_asns', data_asns)

# Reachability

In [149]:
test_tcp_udp(data_ips)

In [None]:
data_no_tcp_and_udp = [datum for datum in data_ips if not datum['tcp'] and not datum['udp']]

test_tcp_udp(data_no_tcp_and_udp, timeout = 10)

In [None]:
len([datum for datum in data_ips if not datum['tcp'] and not datum['udp']]), len(data_ips)

In [142]:
write_data('data_prot', data_ips)

In [None]:
len(set([datum['tld'] for datum in data_ips if not datum['tcp'] and not datum['udp']]))

In [None]:
with open('data/backup/data_tcp_udp', 'w') as f:
    f.write(json.dumps(data_ips))

In [None]:
# with open('data/backup/data_tcp_udp', 'r') as f:
#     data_ips = json.loads(f.read())

In [None]:
df = pd.DataFrame(data_ips)

In [None]:
xtlds = []
xnss = []
xips = []
xasns = []
xtcp = []
xudp = []

for datum in data_ips:
    xtlds.append(datum['tld'])
    xnss.append(datum['ns'])
    xips.append(datum['ip'])
    xtcp.append(datum['tcp'])
    xudp.append(datum['udp'])
    if 'asn' in datum:
        xasns.append(datum['asn'])
    else:
        xasns.append([])
        
print(len(data_ips), len(xtlds), len(xnss), len(xips))
    
ix = pd.MultiIndex.from_arrays([xtlds, xnss, xips], names=['tld', 'ns', 'ip'])
dg = pd.DataFrame({'asn': xasns, 'tcp': xtcp, 'udp': xudp}, index = ix)
# dg.head(10)

In [None]:
# dage = {}

# for datum in data_age:
#     dage[datum['tld']] = datum['age']
    
# for datum in data_ips:
#     if datum['tld'][:-1].upper() in dage:
#         if dage[datum['tld'][:-1].upper()] == 'new':
#             datum['age'] = 'new'
#         else:
#             datum['age'] = 'old'
#     else:
#         datum['age'] = None

# xtlds = []
# xnss = []
# xips = []
# xasns = []
# xtcp = []
# xudp = []

# for datum in data_ips:
#     if datum['age'] == 'old':
    
#         xtlds.append(datum['tld'])
#         xnss.append(datum['ns'])
#         xips.append(datum['ip'])
#         xtcp.append(datum['tcp'])
#         xudp.append(datum['udp'])
#         if 'asn' in datum:
#             xasns.append(datum['asn'])
#         else:
#             xasns.append([])
        
# print(len(data_ips), len(xtlds), len(xnss), len(xips))
    
# ix = pd.MultiIndex.from_arrays([xtlds, xnss, xips], names=['tld', 'ns', 'ip'])
# dg = pd.DataFrame({'asn': xasns, 'tcp': xtcp, 'udp': xudp}, index = ix)
# # dg.head(10)

In [None]:
# data_ips[0]

In [None]:
ns_tcp_udp = [{'ns': datum['ns'], 'tcp': False, 'udp': False} for datum in data_ips if datum['tld'] in dtype and dtype[datum['tld']] == 'country-code']

for ns in ns_tcp_udp:
    for datum in data_ips:
        if datum['ns'] == ns['ns']:
            if datum['tcp']:
                ns['tcp'] = True
                
            if datum['udp']:
                ns['udp'] = True       

In [None]:
ns_tcp_udp = [{'ns': datum['ns'], 'tcp': False, 'udp': False} for datum in data_ips]

for ns in ns_tcp_udp:
    for datum in data_ips:
        if datum['ns'] == ns['ns']:
            if datum['tcp']:
                ns['tcp'] = True
                
            if datum['udp']:
                ns['udp'] = True       

In [None]:
ip = ':'

ns_tcp_udp = [{'ns': datum['ns'], 'tcp': False, 'udp': False} for datum in data_ips if ip in datum['ip']]

for ns in ns_tcp_udp:
    for datum in data_ips:
        if datum['ns'] == ns['ns'] and ip in datum['ip']:
            if datum['tcp']:
                ns['tcp'] = True
                
            if datum['udp']:
                ns['udp'] = True

In [None]:
dg = pd.DataFrame(ns_tcp_udp)

In [None]:
ax = dg.tcp.value_counts().plot.pie(autopct=lambda p : '{:.2f}% ({:.0f})'.format(p, p * dg.count().udp / 100), figsize = pie_size)
# ax = dg.tcp.plot.bar()
ax.set_ylabel('')
fig = ax.get_figure()
fig.savefig("imgs/tcp.pdf")

In [None]:
ax = dg.udp.value_counts().plot.pie(autopct=lambda p : '{:.2f}% ({:.0f})'.format(p, p * dg.count().udp / 100), 
                                    figsize = pie_size)
ax.set_ylabel('')
# ax.set_title('UDP')
fig = ax.get_figure()
fig.savefig("imgs/udp.pdf")

In [None]:
df_ftcp = dg.loc[dg.tcp == False]
df_fudp = dg.loc[dg.udp == False]
df_ttcp = dg.loc[dg.tcp == True]

ff = df_ftcp.loc[df_ftcp.udp == False].count().tcp
ft = df_ftcp.loc[df_ftcp.udp == True].count().tcp
tf = df_fudp.loc[df_fudp.tcp == True].count().tcp
tt = df_ttcp.loc[df_ttcp.udp == True].count().tcp

print(ff, ft, tf, tt)

ut_data = [{'name': 'none', 'count': ff},       
           {'name': 'tcp', 'count': tf},     
           {'name': 'udp', 'count': ft},           
           {'name': 'tcp + udp', 'count': tt}
          ]

total = ff + ft + tf + tt

dfgh = pd.DataFrame(ut_data)
dfgh.index = dfgh['name']
del dfgh['name']
# ax = dfgh.plot.pie('count',
# #                    autopct='s(%.2f)',
#                    autopct=lambda p : '{:.2f}% ({:.0f})'.format(p, p * total / 100),
# #                    radius = 2.5,
# #                    pctdistance=1.2,
# #                    labeldistance=1.2,
# #                    explode = True,
#                    figsize = pie_size, 
#                    legend=False, 
#                    labels=['','','',''])

ax = dfgh.plot.barh()
# ax.set_xlim([0,10000])

ax.legend(loc='best', labels=dfgh.index)
ax.set_xlabel('Number of name servers')
ax.set_ylabel('Protocol(s) supported')
# ax.set_title('name server udp/tcp support')
ax.legend_.remove()
ax.set_xscale('log')
fig = ax.get_figure()
fig.tight_layout()
fig.savefig("imgs/tcp_udp_generic.pdf")

In [None]:
dfgh.index

In [None]:
df.loc[df.tld == 'actor.']

In [None]:
# def write_to_file(fn, indir, content):
#     with open(indir + fn, 'w') as f:
#         f.write(content)

In [None]:
# write_to_file('tcp_udp_not_working', 'data/temp/', df_ftcp.loc[df_ftcp.udp == False].to_csv())

In [None]:
# df_ftcp.loc[df_ftcp.udp == False]

In [None]:
# df.loc[df.ip.str.contains(':')].count().tcp #ipv6
# df[~df["ip"].str.contains(":")].count().tcp #ipv4

In [None]:
# ax = df[~df["ip"].str.contains(":")].tcp.value_counts().plot.pie(autopct='%.2f', figsize = pie_size)
# ax.set_ylabel('')
# fig = ax.get_figure()
# fig.savefig("imgs/tcp_ipv4.pdf")

In [None]:
# ax = df[~df["ip"].str.contains(":")].udp.value_counts().plot.pie(autopct='%.2f', figsize = pie_size)
# ax.set_ylabel('')
# fig = ax.get_figure()
# fig.savefig("imgs/udp_ipv4.pdf")

In [None]:
# ax = df.loc[df.ip.str.contains(':')].tcp.value_counts().plot.pie(autopct='%.2f', figsize = pie_size)
# ax.set_ylabel('')
# fig = ax.get_figure()
# fig.savefig("imgs/tcp_ipv6.pdf")

In [None]:
# ax = df.loc[df.ip.str.contains(':')].udp.value_counts().plot.pie(autopct='%.2f', figsize = pie_size)
# ax.set_ylabel('')
# fig = ax.get_figure()
# fig.savefig("imgs/udp_ipv6.pdf")

In [None]:
df.head()

# Credibility

In [201]:
!dig +noall +answer +noidn -t DNSKEY -f data/lists/tlds > data/dig/tld_dnskeys
!dig +noall +answer +noidn -t DS -f data/lists/tlds > data/dig/tld_dss

for tld in special_tlds:
    !dig +noall +answer +noidn -t DNSKEY {tld} >> data/dig/tld_dnskeys
    !dig +noall +answer +noidn -t DS {tld} >> data/dig/tld_dss

















In [235]:
data_cred = [{'tld': tld, 'ds': False, 'dnskey': False, 'algorithm': None} for tld in tlds]

temp = []

for answer in read_list('data/dig/tld_dnskeys'):
    v = answer.split()
    tld = v[0][:-1].upper()
    index = find(data_cred, 'tld', tld)
    
    try:
        data_cred[index]['dnskey'] = True
        data_cred[index]['algorithm'] = v[6]
    except:
        print(tld)

In [236]:
for answer in read_list('data/dig/tld_dss'):
    v = answer.split()
    tld = v[0][:-1].upper()
    
    index = find(data_cred, 'tld', tld)
    
    try:
        data_cred[index]['ds'] = True
    except:
        print(tld)

In [237]:
write_data('data_cred', data_cred)

# Organisations per TLD

In [None]:
# tld_orgs

In [None]:
df_orgs = pd.DataFrame(tld_orgs)
df_orgs.index = df_orgs['tld']
del df_orgs['tld']
df_orgs.head()

In [None]:
df_orgs.reset_index(inplace=True)
rows = []
_ = df_orgs.apply(lambda row: [rows.append([row['tld'], nn]) 
                         for nn in row.organisations], axis=1)
df_orgs_new = pd.DataFrame(rows, columns=df_orgs.columns).set_index(['tld'])

df_orgs_new.head()

In [None]:
df_orgs_new.organisations.value_counts(ascending=False).head(80).plot.barh(figsize = (10,20))

In [None]:
bins = df_orgs_new.organisations.value_counts().nunique() - 1
ax = df_orgs_new.organisations.value_counts().hist(bins = bins)
ax.set_yscale('log')

In [None]:
df_orgs_new.organisations.value_counts().value_counts().plot.pie()

In [None]:
# df_orgs_new.organisations.value_counts()

# df2[df2['rr_quality'] > 0]].groupby([df2.index.hour,'sleep_summary_id')

In [None]:
ax = df_tld_orgs.organisation.value_counts().head(20).plot.barh(figsize = bar_size, fontsize=12)
ax.set_xlabel('Number of TLDs', fontsize = 16)
ax.set_ylabel('Organisation',fontsize = 16)
fig = ax.get_figure()
fig.savefig("imgs/orgs.png")

In [None]:
ax = df_tld_orgs.type.value_counts().plot.pie(figsize = pie_size, legend=True)
ax.set_ylabel('')
fig = ax.get_figure()
fig.savefig("imgs/types.png")

# Growth

In [None]:
data_age = []
dage = {}

for datum in tld_creation:
    y, m, d = datum['date_created'].split('-')
    if y in ['2014', '2015', '2016', '2017'] or y == '2013' and int(m) >= 10:
        data_age.append({'tld': datum['tld'], 'age': 'new'})
        dage[datum['tld']] = 'new'
    else:
        data_age.append({'tld': datum['tld'], 'age': 'old'})
        dage[datum['tld']] = 'old'