# Traceroute Modeling

In [1]:
# Notebook containing preliminary exploration of cybersecurity-related data pulled from both benign and malicious sources
# with the goal of identifying suspicious websites using various machine learning models.

import json
import re
import time
import pprint
import operator
import os

from scipy.stats import chisquare
from ipwhois import IPWhois
import matplotlib.pyplot as plt
from collections import defaultdict
from bisect import bisect_left
import socket, struct

import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics

## Convert Raw Data

In [2]:
# Enumerate the file names of all JSON documents for straightforward iteration.

date_path = '/data/data/2019-06-03/'
ingest = date_path + 'ingest/'
enrichment = date_path + 'enrichment/'
stream = 'traceroute'

# index maps every entry of the enrichment folder for this date to the list
# of file names found inside it.
index = {
    folder: os.listdir(enrichment + folder)
    for folder in os.listdir(enrichment)
}

In [3]:
# Per-IP counters (missing keys start at 0), kept separately for benign and
# malicious routes: how many times a route crosses a certain IP.
frequency_counts_malicious = defaultdict(int)
frequency_counts_benign = defaultdict(int)

# Accumulators that are filled while iterating over the JSON files.
totalDest = set()      # set of node addresses collected during iteration
totalRoutes = []       # every route
totalBRoutes = []      # routes labelled "benign"
totalMRoutes = []      # routes labelled "malicious"
frames = []            # one complete DataFrame per JSON file
populated = True       # flag: set to False when a JSON turns out to be empty

In [None]:
# Merge the partial subnet directories into one lookup table. Later files
# override earlier ones on duplicate keys, exactly like {**a, **b, ...}.
# Each file is closed as soon as it has been read.
consolidated = {}
for directoryFile in ('subnetDirectory.json', 'subnetDirectory1.json',
                      'subnetDirectory2.json', 'subnetDirectory3.json'):
    with open(directoryFile) as handle:
        consolidated.update(json.load(handle))

#Manually insert certain IPs
# (network address -> number of mask bits, as consumed by oneHotEncode).
# The former '1.208.0.0 ' entry with a trailing space was dropped: it could
# never equal a computed network address and duplicated '1.208.0.0'.
consolidated.update({
    '58.120.0.0': 13,
    '203.234.128.0': 16,
    '0.0.0.0': 32,
    '211.206.0.0': 14,
    '103.70.240.0': 22,
    '183.96.0.0': 14,
    '112.160.0.0': 11,
    '113.216.0.0': 15,
    '220.70.0.0': 15,
    '218.152.0.0': 14,
    '111.118.0.0': 17,
    '39.112.0.0': 12,
    '52.88.0.0': 13,
    '99.78.128.0': 17,
    '99.79.0.0': 16,
    '99.80.0.0': 15,
    '99.82.0.0': 17,
    '99.82.128.0': 18,
    '23.192.0.0': 11,
    '104.16.0.0': 12,
    '152.176.0.0': 12,
    '152.192.0.0': 13,
    '183.111.0.0': 16,
    '110.76.140.0': 22,
    '203.246.160.0': 21,
    '1.208.0.0': 12,
    '125.128.0.0': 11,
    '121.128.0.0': 11,
})

# Persist the merged table. The original serialised an undefined name
# (`con`), which raised NameError before anything was written.
jsonWrite = json.dumps(consolidated)
with open("subnetDirectory.json", "w") as f:
    f.write(jsonWrite)


# Sorted list of every known network address; its positions define the
# slots of the one-hot vectors.
k = sorted(consolidated)

In [None]:
def parseCIDR(alpha):
    """Split a CIDR string such as '10.0.0.0/8' into (address, prefix length).

    The text before the first '/' is returned unchanged; the text after it
    is converted with int().
    """
    slash = alpha.find('/')
    address = alpha[:slash]
    prefix = int(alpha[slash + 1:])
    return address, prefix
    
def dottedQuadToNum(ip):
    """Convert a dotted-quad IPv4 string to its unsigned 32-bit integer value."""
    packed = socket.inet_aton(ip)
    # '!L' = network byte order, unsigned 32-bit; unpack yields a 1-tuple.
    (value,) = struct.unpack('!L', packed)
    return value

def numToDottedQuad(n):
    """Convert an unsigned 32-bit integer to its dotted-quad IPv4 string."""
    # '!L' = network byte order, unsigned 32-bit.
    packed = struct.pack('!L', n)
    return socket.inet_ntoa(packed)
      
def makeMask(n):
    """Return an integer whose low (32 - n) bits are set.

    For an n-bit network prefix this is the mask that selects the host part
    of a 32-bit address.
    """
    hostBits = 32 - n
    return (1 << hostBits) - 1

def ipToNetAndMask(ip):
    """Return (network address, mask bits) for a CIDR string.

    ip: text such as '10.1.2.3/8'. If several CIDRs are joined with commas,
        only the first one is used; surrounding whitespace is ignored.

    Returns a tuple of the dotted-quad network address (host bits cleared)
    and the integer prefix length.
    """
    # Splitting on ',' is safe whether or not a comma is present. The old
    # slice on find(',') dropped the last character when an over-long string
    # contained no comma (find returned -1).
    ip = ip.split(',')[0].strip()
    network, mask = parseCIDR(ip)
    n = dottedQuadToNum(network)

    # Subtracting the host part leaves only the network bits.
    host = n & makeMask(mask)
    net = n - host

    return numToDottedQuad(net), mask

def toNet(network,maskbits):
    """Return the dotted-quad network address of `network` for a prefix of
    `maskbits` bits, i.e. the address with its host bits cleared."""
    address = dottedQuadToNum(network)
    hostPart = address & makeMask(maskbits)
    return numToDottedQuad(address - hostPart)

registered=0     # number of hops successfully mapped to a known subnet
allFailures=[]   # hops for which no known subnet was found
def oneHotEncode(hops):
    """Encode a route as a 0/1 vector over the known subnets.

    hops: iterable of dotted-quad IP strings; '***' entries (unanswered
          hops) are skipped.

    Returns a numpy array of length len(k) + 1 in which slot p is 1 when
    some hop belongs to the subnet k[p]. The extra trailing slot is kept for
    compatibility with the original vector length and is never set.

    Side effects: every hop that matches no subnet is appended to the
    module-level list `allFailures`; `registered` is incremented for every
    hop that does match.
    """
    global allFailures
    global registered
    ret=np.zeros(len(k)+1)
    for point in hops:
        if point == '***':
            continue

        # Try the longest prefix first so the most specific subnet wins.
        for bits in range(32, 0, -1):
            ref = toNet(point, bits)
            if consolidated.get(ref) != bits:
                continue
            # k is the sorted list of consolidated's keys, so a binary
            # search yields the slot of this subnet. The original indexed
            # the vector by prefix length, which collapsed all subnets into
            # at most 32 slots.
            pos = bisect_left(k, ref)
            if pos < len(k) and k[pos] == ref:
                ret[pos] = 1
                registered += 1
                break
        else:
            # No prefix length produced a known subnet.
            allFailures.append(point)

    return ret

In [None]:
#For each json in the "stream" folder for a particular date...

# Compiled once as raw strings: the patterns are applied to every line of
# every traceroute, and raw strings avoid invalid-escape warnings.
_hopIpPattern = re.compile(r"([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])")
_pingPattern = re.compile(r"\s\d+[.]\d{3}\s")


def _parseTraceroute(text, isBenign):
    """Parse one raw traceroute string.

    Returns (hops, pings, dest, timeouts):
      hops     - one entry per line: the first parenthesised IP on the line
                 (parentheses removed) or '***' when the line has none
      pings    - one list per line with the matched latency strings
      dest     - first parenthesised IP of the first line, parentheses
                 included, or '***' when the first line has none
                 (NOTE(review): unlike `hops`, the parentheses are kept, as
                 in the original code - confirm this is wanted)
      timeouts - number of lines without any IP

    Side effects: every IP found is added to `totalDest` and counted in the
    benign or malicious frequency table.
    """
    counts = frequency_counts_benign if isBenign else frequency_counts_malicious
    hops = []
    pings = []
    timeouts = 0
    dest = "***"
    for lineNumber, line in enumerate(text.splitlines()):
        pings.append(_pingPattern.findall(line))
        addresses = _hopIpPattern.findall(line)
        if not addresses:
            hops.append("***")
            timeouts += 1
            continue
        if lineNumber == 0:
            dest = addresses[0]
        # Only the first address of a line is used.
        ip = addresses[0][1:-1]
        totalDest.add(ip)
        hops.append(ip)
        counts[ip] += 1
    return hops, pings, dest, timeouts


def _summarisePings(pings, respondingHops):
    """Return (average, weighted average) of the best ping per line.

    The best ping is the numerically smallest sample of a line (the original
    took min() over the strings, which compares text, e.g. ' 10.123 ' sorted
    before ' 9.456 '). The weighted sum multiplies the j-th line that has
    samples by j*j. Both values are NaN when `respondingHops` is not
    positive, instead of raising ZeroDivisionError.
    """
    overall = 0.0
    weighted = 0.0
    rank = 1
    for trio in pings:
        if trio:
            best = min(float(sample) for sample in trio)
            overall += best
            weighted += best * rank * rank
            rank += 1
    if respondingHops <= 0:
        return float("nan"), float("nan")
    return overall / respondingHops, weighted / respondingHops


def _buildRouteFrame(w):
    """Load the traceroute JSON named `w` and return its enriched DataFrame.

    Returns None when the file's "data" is not a non-empty list or when a
    record is a plain string; such files contribute no frame. Routes parsed
    before a string record still update the module-level accumulators, as
    in the original loop.
    """
    with open(enrichment+stream+'/'+w) as handle:
        records = json.load(handle)["data"]

    # The original popped the previous file's frame in the non-list case,
    # and failed on the column inserts for an empty list.
    if not isinstance(records, list) or not records:
        return None

    # Determine benign or malicious feature set: file names shorter than 10
    # characters are treated as benign.
    isBenign = len(w) < 10
    routeLog = totalBRoutes if isBenign else totalMRoutes

    all_trace = []
    all_ping = []
    all_benign = []
    all_dest = []
    all_route_lengths = []
    all_avg_ping = []
    all_timeouts = []
    expanded_route = []

    for route in records:
        #Error handling
        if isinstance(route, str):
            return None

        hops, pings, dest, timeouts = _parseTraceroute(route["traceroute"], isBenign)
        numHops = len(hops) - 1  # every line yields one hop; line 0 is the header

        all_benign.append(isBenign)
        all_dest.append(dest)
        all_route_lengths.append(numHops)
        all_timeouts.append(timeouts)

        #Append each IP node from this indicator to the lists containing all routes for all jsons on this day
        all_trace.append(hops[1:])
        totalRoutes.append(hops)
        routeLog.append(hops)

        # One-hot encoding
        expanded_route.append(pd.DataFrame([oneHotEncode(hops)]))

        all_ping.append(pings[1:])
        # The weighted average is computed but, as before, not stored.
        averagePing, _weightedPing = _summarisePings(pings, numHops - timeouts)
        all_avg_ping.append(averagePing)

    frame = pd.DataFrame(records)
    # Insert positions and order are kept exactly as in the original so the
    # resulting column layout is unchanged.
    frame.insert(2,"Route",all_trace)
    frame.insert(3,"Ping",all_ping)
    frame.insert(1,"Benign",all_benign)
    frame.insert(2,"Dest",all_dest)
    frame.insert(2,"NumHops",all_route_lengths)
    frame.insert(2,"AveragePing",all_avg_ping)
    frame.insert(2,"Timeouts",all_timeouts)
    return pd.concat([frame, pd.concat(expanded_route).reset_index()], axis=1)


for w in index[stream]:
    routeFrame = _buildRouteFrame(w)
    if routeFrame is not None:
        frames.append(routeFrame)

In [None]:
# Report how many hop addresses could not be matched to a known subnet
# (oneHotEncode appends every unmatched hop to allFailures).
print(len(allFailures))
#pprint.pprint(allFailures)

In [None]:
# Look up the registered network of every hop that could not be matched and
# collect the results as {network address: mask bits}.
upset={}
iterCount=0
# Each distinct address is queried once; order of first appearance is kept.
pending=list(dict.fromkeys(allFailures))
for iterCount, test in enumerate(pending, start=1):
    if (iterCount % 250 == 0):
        # Fraction of the lookups completed so far.
        print (iterCount/len(pending))

    try:
        tmp=IPWhois(test,allow_permutations=True).lookup_rdap()['network']['cidr']
        # The field may hold several comma-separated CIDRs. split() also
        # handles the single-CIDR case, so no comma check is needed (the old
        # `if tmp.find(',')` was truthy for -1 and never skipped anything).
        for annoy in tmp.split(','):
            newSubnet,newMask=ipToNetAndMask(annoy.strip())
            upset[newSubnet]=newMask

    except Exception as err:
        # Best effort: report the address and carry on. Catching Exception
        # rather than everything keeps Ctrl-C / kernel interrupt working.
        print("Error:", test, err)

    #break

In [None]:
#Spare regex expressions 


#print(time.time()-t1)           

#regDNS=re.findall("(\s[\w\-._~:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=]+\s)([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
#reg1=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])(\s+\d+[.]\d{3}\sms)+",x)
#reg1=re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]\s+\d+[.]\d{3}\s",x)

#q=-1
#for x in s:
    #q=q+1
    #Parse ip adresses and latency
    #print(q, re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]",x), re.findall("\s\d+[.]\d{3}\s",x))
    #print(x)

In [None]:
# Stack the per-file frames into a single table and release the list.
df=pd.concat(frames)
del frames
df=df.fillna("X")
# DataFrame.drop returns a new frame; the result has to be assigned or the
# columns stay in df.
df=df.drop(columns=['traceroute','index'])
df

## Supervised Classification Models

### Decision Tree Classifier

In [None]:
# Shuffle the rows and renumber them so label-based slices pick random rows.
df=df.sample(frac=1).reset_index(drop=True)
X=df[['Timeouts','AveragePing','NumHops','success']]
Y=df['Benign']
# Fit on rows 0..3000. The evaluation cells below need `clf`, which was
# never defined while these lines were commented out. The classifier is
# reached through the imported `tree` module (`sklearn` itself is not an
# imported name in this notebook).
clf=tree.DecisionTreeClassifier()
clf=clf.fit(X.loc[0:3000,:],Y.loc[0:3000])

In [None]:
# Score the classifier on every row of X / Y.
predictedAll = clf.predict(X)
reportAll = metrics.classification_report(Y, predictedAll)
matrixAll = metrics.confusion_matrix(Y, predictedAll, labels=[False,True])
print("Classification report for classifier %s:\n%s\n"
      % (clf, reportAll))
print("Confusion matrix:\n%s" % matrixAll)

In [None]:
# Score the classifier on the rows labelled 3001..6000 only.
Y_slice = Y.loc[3001:6000]
predictedSlice = clf.predict(X.loc[3001:6000,:])
reportSlice = metrics.classification_report(Y_slice, predictedSlice)
matrixSlice = metrics.confusion_matrix(Y_slice, predictedSlice, labels=[False,True])
print("Classification report for classifier %s:\n%s\n"
      % (clf, reportSlice))
print("Confusion matrix:\n%s" % matrixSlice)

In [None]:
# Turn the collected addresses into an ascending list.
totalDest = list(totalDest)
totalDest.sort()

### Custom One Hot Encoding Function

In [None]:
#print(totalDest)
from bisect import bisect_left

# `hops` is not available at module level here, so the timing sample uses
# the most recently parsed route instead (empty when nothing was parsed).
sample_route = totalRoutes[-1] if totalRoutes else []

sample_onehot=[0]*len(totalDest)
t0=time.time()
for x in sample_route:
    # Unanswered hops are not addresses; bisect would place them at slot 0.
    if x == "***":
        continue
    pos = bisect_left(totalDest, x)
    # bisect_left only returns an insertion point; confirm an exact match.
    if pos < len(totalDest) and totalDest[pos] == x:
        sample_onehot[pos]=1
print(time.time()-t0)

## Chi-Square Analysis of IP Frequencies

In [None]:
# Rank the IPs of each class by how often routes crossed them (most frequent
# first) and show the two rankings side by side.
benignRanked = sorted(frequency_counts_benign.items(),
                      key=lambda pair: pair[1], reverse=True)
maliciousRanked = sorted(frequency_counts_malicious.items(),
                         key=lambda pair: pair[1], reverse=True)

srt_benign = pd.DataFrame(benignRanked, columns=["BenignIP","BenignFreq"])
srt_malicious = pd.DataFrame(maliciousRanked, columns=["MalIP","MalFreq"])
srt = pd.concat([srt_benign, srt_malicious], axis=1)
srt

In [None]:
# Restrict the comparison to IPs that occur in both frequency tables.
sharedIPs = [ip for ip in frequency_counts_benign
             if frequency_counts_malicious.get(ip) is not None]

freqB = [frequency_counts_benign.get(ip) for ip in sharedIPs]
# NOTE(review): 5.4 is a fixed scaling constant - presumably the ratio of
# malicious to benign routes for this date; confirm before reusing.
freqM = [frequency_counts_malicious.get(ip)/5.4 for ip in sharedIPs]

print(pd.Series(freqB).sum(),pd.Series(freqM).sum())
chisquare(freqM, f_exp=freqB)

In [None]:
# Show how many benign and how many malicious routes were collected.
print(len(totalBRoutes), len(totalMRoutes))

## Network Graph of IP Addresses

In [None]:
# Stretch the notebook container to the full browser width.
# display/HTML are imported from the public IPython.display module;
# IPython.core.display is a deprecated import location for them.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
G = nx.Graph()

# Add every route to the graph.
for route in totalRoutes:
    # Unanswered hops ("***") carry no address and are left out.
    nodes = [hop for hop in route if hop != "***"]
    G.add_nodes_from(nodes)
    if not nodes:
        continue

    # Link consecutive hops, starting with the second entry; `stop` records
    # the position of the edge's first node within the filtered route.
    for stop, (here, there) in enumerate(zip(nodes[1:], nodes[2:]), start=1):
        G.add_edge(here, there, stop=stop)

    # Add edges and connect last node to the final destination (the first
    # entry of the filtered route).
    G.add_edge(nodes[-1], nodes[0])

In [None]:
#Color destination nodes
#TODO: Vary destination color by benign or malicious designation

# The first entry of every route is the address parsed from the traceroute's
# first line, i.e. its destination. totalDest cannot be used for this: it
# holds every hop address, so every node would be coloured as a destination.
destinations = {route[0] for route in totalRoutes
                if route and route[0] != "***"}

# One colour per node, in the graph's node order (set lookup is O(1)).
colorCode = ['blue' if node in destinations else 'black'
             for node in G.nodes()]

# The original also printed `cnt`, a name that is never defined.
print(len(G.nodes()), len(colorCode))

In [None]:
# Drawing parameters: tiny nodes and hairline grey edges.
options = dict(
    node_color=colorCode,
    node_size=1,
    edge_color='grey',
    linewidths=0,
    width=0.1,
)

plt.figure(figsize=(20, 10), dpi=1000)
nx.draw(G, **options)
plt.show()

In [None]:
# The figure drawn in the previous cell has already been shown, so calling
# plt.savefig on its own would write an empty canvas. Draw the graph again
# on a fresh figure and save that figure explicitly.
saveFig = plt.figure(figsize=(20,10),dpi=1000)
nx.draw(G, **options)
saveFig.savefig('network_graph.jpg', dpi=1000)
plt.close(saveFig)