# Clustering FQDNs

Now that we have the data from DNSDB, we'd like to see if we can repeat the clustering, but on the fqdns. 

There's two ways to do this clustering: 
 1. Cluster fqdns like they're documents, clustering each one as a separate document
 1. Group subdomains together by their base registrable domain, cluster the registrable domains together by the pattern in their subdomains. 

In [26]:
import io
import time
import json
import re
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import idna
from collections import defaultdict
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

## Attempt 1 - cluster fqdns together.

## Step 1 - load the clusters & make the map

In [2]:
domain_clusters = dict()
cluster_ids = list()
with open("disney-clusters.json") as infile:
    data = json.load(infile)
    for cluster_id in data:
        if cluster_id == -1:
            # skip the grab bad
            continue
        cluster_text = f"cluster_id-{cluster_id}"
        cluster_ids.append(cluster_text)
        domains = data[cluster_id]
        for domain in domains:
            domain_clusters[domain] = cluster_text
        #if cluster_id != -1:
        #    domains = data[cluster_id]
        #    domain_id = domains[0]
        #    for domain in domains:
        #        domain_clusters[domain] = domain_id
        #else:
        #    # these are ones that failed to cluster, leave them as their originals
        #    for broken in data[cluster_id]:
        #        domain_clusters[broken] = broken
stop_words = list(ENGLISH_STOP_WORDS.union(cluster_ids))
cluster_stop_words = list(cluster_ids)

In [3]:
list(domain_clusters.keys())[:10]

['26uy6.top',
 'appchasses6wqwb9nzykac95ng.com',
 'apple-unlocked.com',
 'appsuptmpjubxee3gcvimqe6mq3rp.com',
 'bridgeaccess.us',
 'cashappmvb-sec.com',
 'cashappsalert-sec.com',
 'commercialservlces-mtb.com',
 'cprapid.com',
 'designestylelab.com']

## Step 2 - load the FQDNs from dnsdb & replace their base registrable domain

In [4]:
fqdns = list()
def decode_domain(entry):
    try:
        decoded = idna.decode(entry)
        if decoded.endswith("."):
            decoded = decoded.rstrip(".")
        return decoded
    except (idna.InvalidCodepoint, idna.IDNAError):
        return None

with open("dnsdb_resolutions.json") as infile:
    data = json.load(infile)
    for base_domain in data:
        apex_domains = set()
        # apex_domains = {decode_domain(entry['domain']) for entry in data[base_domain]['apex']}
        subdomains = {decode_domain(entry['domain']) for entry in data[base_domain]['subdomains']}
        all_domains = apex_domains.union(subdomains)
        if len(all_domains) > 20:
            continue
        if None in all_domains:
            all_domains.remove(None)
        fqdns.extend(list(all_domains))

In [5]:
fixed_fqdns = list()
keys = list(domain_clusters.keys())
for entry in fqdns:
    found = False
    counter = 0
    while not found and counter < len(domain_clusters):
        key = keys[counter]
        if entry.endswith("." + key):
            # key is the registrable domain
            fixed_fqdn = entry.replace(key, domain_clusters[key])
            fixed_fqdns.append(fixed_fqdn)
            found = True
        elif entry == key:
            # skip these. only interested in ones with subdomains
            # fixed_fqdns.append(domain_clusters[entry])
            found = True
        counter += 1
    if not found:
        # means we made it through without doing a substitution, which is surprising
        print(f"didn't find {entry}")
        fixed_fqdns.append(entry)
        

In [6]:
fixed_fqdns[-10:]

['singlepoint.cluster_id-12',
 'singlepoint.cluster_id-12',
 'singlepoint.cluster_id-12',
 'www.cluster_id-12',
 'www.cluster_id-12',
 'singlepoint.cluster_id-12',
 'mail.cluster_id--1',
 'www.cluster_id--1',
 'www.cluster_id-56',
 'www.cluster_id-56']

### step 2.5: remove duplicate domains after normalization like this (otherwise they'll all just cluster with themselves.

In [7]:
fixed_fqdns = set(fixed_fqdns)

## Step 3 - break them apart into mini documents

In [8]:
documents = [" ".join(re.split("[.]", entry)) for entry in fixed_fqdns]

In [9]:
# other idea for the documents: make one big document per cluster-id, see if we find commonalities that way.
big_docs = defaultdict(list)
for fqdn in fixed_fqdns:
    minidoc = re.split("[.]", fqdn)
    cluster = fqdn[-1]
    big_docs[cluster].extend(minidoc)
big_documents = [" ".join(entry) for entry in big_docs.values()]

## Step 4 - do normal clustering on them like they were documents

In [27]:
vectorizer = TfidfVectorizer(stop_words = cluster_stop_words)
X_tfidf = vectorizer.fit_transform(big_documents)

In [11]:
hash_vectorizer = HashingVectorizer()
X_hash = hash_vectorizer.fit_transform(documents)

In [12]:
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(documents)

In [13]:
db = DBSCAN(eps=0.7, min_samples=2).fit(X_tfidf)

In [14]:
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 0
Estimated number of noise points: 10


In [15]:
translated = defaultdict(list)
for label, entry in zip(labels, documents):
    translated[int(label)].append(entry)

In [16]:
for key in translated:
    print(f"cluster {key}")
    for entry in sorted(translated[key]):
        print(f"\t{entry}")

cluster -1
	banking cluster_id-47
	bfwkoiy cluster_id-28
	cpanel cluster_id--1
	cpcontacts cluster_id-8
	help cluster_id-0
	ro cluster_id-46
	webdisk cluster_id-14
	webmail cluster_id-14
	www cluster_id-8
	www treasury cluster_id-53


Okay, that's not doing what I want. 

 

# Attempt 2 - cluster registrable domains by their subdomain words

Let's try this a different way. Let's take the domains from the dnsdb_resolutions, and simply pull all the subdomains found for a given registrable domain, and make them all one document. So, [`www.test.com`, `mail.test.com`, `secure.test.com`, `login.test.com`] would collapse to a document like `www mail secure login test.com`. That should allow us to cluster the domains together by their subdomains to see if there are common patterns in the subdomain registration.

Notes about the document creation here: 
 1. I'm going to leave in the original registrable domain for the sake of interpretability
 1. I'm going to defang that registrable domain by replacing the "." with "_". If I don't, some of the tfidf parsing code will split the domain name up, and treat "com" as a word also. Don't want that. 
 1. This shouldn't matter since we're treating each registerable domain separately, but we don't want to cluster on the registrable domains themselves, just the word patterns underneath them, so I'm also adding the registrable domain itself as a stop word.

In [17]:
fqdn_docs = list()
# start with "www" as a stop word. Don't feel like I learn anything from a domain having that as a subdomain
stop_domains = ["www"]
with open("dnsdb_resolutions.json") as infile:
    data = json.load(infile)
    for base_domain in data:
        if base_domain.endswith("."):
            substitute_domain = base_domain.rstrip(".")
            stop_domain = base_domain.rstrip(".").replace(".", "_")
        else:
            stop_domain = base_domain.replace(".", "_")
            substitute_domain = base_domain
        stop_domains.append(stop_domain)
        apex_domains = set()
        # apex_domains = {decode_domain(entry['domain']) for entry in data[base_domain]['apex']}
        subdomains = {decode_domain(entry['domain']) for entry in data[base_domain]['subdomains']}
        all_domains = apex_domains.union(subdomains)
        if len(all_domains) > 20:
            continue
        if None in all_domains:
            all_domains.remove(None)
        words = set()
        for entry in all_domains:
            subdomains = entry.replace(substitute_domain, "")
            if not subdomains: 
                continue
            words.update([word for word in subdomains.split(".") if word])
        if not words:
            continue
        words.add(stop_domain)
        fqdn_docs.append(" ".join(words))

In [18]:
fqdn_docs[:3]

['images whm vb blog i2 ww sandbox www vb5 53vb_com',
 'www 53vl_com m ww1',
 'www 53vz_com']

I'm concerned that the documents that are nothing but stop words (just the registrable domain, or "www" + the domain) are causing problems for the clustering app. Let's try removing those.

In [19]:
"www" in stop_domains

True

In [20]:
fqdn_docs[2]

'www 53vz_com'

In [21]:
for word in fqdn_docs[2].split(" "):
    if word not in stop_domains:
        print(word)

In [22]:
cleaned_fqdn_docs = list()
for doc in fqdn_docs:
    # test if all of the words for the doc are in stop_words    
    for word in doc.split(" "):
        if word not in stop_domains:
            cleaned_fqdn_docs.append(doc)
            break

In [23]:
cleaned_fqdn_docs[:3]

['images whm vb blog i2 ww sandbox www vb5 53vb_com',
 'www 53vl_com m ww1',
 'cpanel wordpress remote app www 53xa_com test']

In [28]:
vectorizer = TfidfVectorizer(stop_words=stop_domains)
X_tfidf = vectorizer.fit_transform(cleaned_fqdn_docs)

In [29]:
hash_vectorizer = HashingVectorizer()
X_hash = hash_vectorizer.fit_transform(fqdn_docs)

In [30]:
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(documents)

In [31]:
db = DBSCAN(eps=0.6, min_samples=2, metric="cosine").fit(X_tfidf)

In [32]:
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 30
Estimated number of noise points: 23


In [33]:
translated = defaultdict(list)
for label, entry in zip(labels, cleaned_fqdn_docs):
    translated[int(label)].append(entry)

In [34]:
key = 3
translated[key]

['apple-unlocked_com cpcalendars cpanel mail cpcontacts webdisk webmail ns2 www ns1',
 'cpcalendars cpanel webdisk cpcontacts cashapp-mvf_com webmail www',
 'cpcalendars webdisk cpcontacts webmail www cashapp-mvn_com',
 'mx cpcalendars cpanel git web webdisk cpcontacts secure crm webdav cashapp-uid_com webmail www portal',
 'cpcalendars cpanel cashappmvb-sec_com webdisk cpcontacts webmail www',
 'cpcalendars cpanel cashapps-cid_com qa cpcontacts webdisk webmail www exchange',
 'cpcalendars cpanel webdisk cpcontacts webmail cashapps-pid_com www poczta',
 'cpcalendars cpanel webdisk cpcontacts webmail www cashappsalert-sec_com',
 'cpcalendars cpanel mail webdisk cpcontacts dnb-no_com webmail www',
 'cpcalendars cpanel webdisk cpcontacts mail webmail www eeukbtgroupjshzuayd7syz_com',
 'wap cpcalendars notify-sms_com cpanel webdisk mail demo cpcontacts webmail www',
 'psql02 mysql04 cpcalendars cpanel remote webdisk cpcontacts mail api recoverycash-cid_com share webmail www',
 'cpcalendars

In [35]:
key = 12
translated[key]

['secure eqbanb_com',
 'secure eqhanh_com',
 'www secure eqhank_com webmail',
 'secure roaylbamh_com',
 'secure roaylbamk_com',
 'royalhamh_com secure',
 'secure royalhanh_com',
 'secure royalkamb_com',
 'royalkamh_com secure',
 'royalkamk_com secure',
 'www secure royalkanb_com',
 'secure royalkanh_com',
 'secure royalkank_com www1',
 'seblv_com webmail ibanka',
 'șẹb-lv_com ibanka',
 'șẹb_com ibanka',
 'șẹblv_com ibanka',
 'șeb_com ibanka',
 'șeb-lv_com ibanka',
 'șeblv_com ibanka',
 'eq-bạņks_com secure',
 'www wp secure gȩtinhank_com',
 'www secure gẹtinhạnk_com',
 'www gȩtinhạnk_com secure',
 'www wp kẹynaviqator-kẹy_com',
 'sẹb-lv_com ibanka']

Not bad. Not perfect...there's some weirdness about similar things ending up in different clusters. But I like where it's going with this. Going to pass this over to Clay to see what he thinks.

In [36]:
with open("subdomain_clusters.json", "w") as outfile:
    json.dump(translated, outfile)

Now, having these clusters, need to unwind the word-collection documents back to the fqdns that they came from. The way to do that, I *think*, is to walk through every subdomain, make a set of words from that subdomain (including changing the apex to the underscore version above), and check to see if that word is a subset of any of the entries in a cluster. If it's not, skip it. 

Things that will speed this up: make the "translated" defaultdict a defaultdict of sets, rather than list, and make the document a set also, rather than a list. This may just be a slog, though. 

In [37]:
translated_as_set = defaultdict(set)
for key in translated:
    translated_as_set[key] = [set(entry.split(" ")) for entry in translated[key]]

now we re-do the walk through the dnsdb_resolutions, but don't add hte results to a document, instead write them to a new file with their correlations

In [38]:
def find_cluster_id(words, translated_as_set):
    for key in translated_as_set:
        for entry in translated_as_set[key]:
            if words.issubset(entry):
                return key
    return None

In [39]:
test = {"pnchạnbs_com", "www", "treasury"}
find_cluster_id(test, translated_as_set)

28

In [40]:
with open("subdomain_clusters.csv", "w") as outfile:
    outfile.write(f"cluster_id,subdomain,apex_domain\n")
    with open("dnsdb_resolutions.json") as infile:
        data = json.load(infile)
        for base_domain in data:
            if base_domain.endswith("."):
                substitute_domain = base_domain.rstrip(".")
            else:
                substitute_domain = base_domain
            subdomains = set()
            for entry in data[base_domain]['subdomains']:
                subdomains.add(entry['domain'])
            if len(subdomains) > 20:
                continue
            decoded_subdomains = set()
            for entry in subdomains:
                decoded_domain = decode_domain(entry)
                if decoded_domain is None:
                    continue
                decoded_subdomains.add(decoded_domain)
            underscore = substitute_domain.replace(".", "_")
            for entry in decoded_subdomains:
                replaced = entry.replace(substitute_domain, underscore)
                parts = replaced.split(".")
                parts_set = set(parts)
                cluster_id = find_cluster_id(parts_set, translated_as_set)
                if cluster_id is not None:
                    outfile.write(f"{cluster_id},{entry},{substitute_domain}\n")