In [1]:
# Notebook containing preliminary exploration of cybersecuirty related data pulled from both benign and malicious sources
# with the goal of identifying suspicious websites using various machine learning models.

import json
import os
import pprint
import numpy
import pandas as pd
import tensorflow as tf
import re
import time
from collections import defaultdict
import operator
import networkx as nx
import matplotlib.pyplot as plt
from scipy.stats import chisquare
from ipwhois import IPWhois


#Create a table for easy access to information spread over many json files

date_path = '/data/data/2019-06-07/'
enrichment=date_path+'enrichment/'
ingest=date_path+'ingest/'
index=dict.fromkeys(os.listdir(enrichment))
for x in index:
    index[x]=os.listdir(enrichment+x)
#pprint.pprint(index["traceroute"])

In [None]:
#Count the number of times each route crosses a certian IP
stream='traceroute'
subnets=defaultdict(int)
count = 0 
numEmpty=0
#For each json in the "stream" folder for a particular date...
for w in index[stream]:
    temp_json=json.load(open(enrichment+stream+'/'+w))
    if type(temp_json["data"]) == list:
        split=route["traceroute"].splitlines()
        for x in split:
            regIP=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
            print(count)
            for q in regIP:
                count+=1
                obj=IPWhois(q[1:len(q)-1])
                results=obj.lookup_rdap(depth=1)
                s=results["asn_cidr"]
                if (not s):
                    subnets[s[:s.rfind('/')]]=s[]
                else:
                    numEmpty+=1
                break

 

In [6]:
stream='traceroute'
#Count the number of times each route crosses a certian IP
frequency_counts_benign=defaultdict(int)
frequency_counts_malicious=defaultdict(int)
t1=time.time()
frames=[]
alternate=[]
populated=True
totalRoutes=[]
totalBRoutes=[]
totalMRoutes=[]
totalDest=set()

#For each json in the "stream" folder for a particular date...
for w in index[stream]:
    temp_json=json.load(open(enrichment+stream+'/'+w))
    if type(temp_json["data"]) == list:
        frames.append(pd.DataFrame(temp_json["data"]))  
    
    #For each element in the data list from the traceroute json...
    all_trace=[]
    all_ping=[]
    all_benign=[]
    all_dest=[]
    all_route_lengths=[]
    all_avg_ping=[]
    all_timeouts=[]
    all_weighted_ping=[]
    expanded_route=[]
    
    for route in temp_json["data"]:
        if type(route) is str:
            populated=False
            break
        if (len(w)<10):
            all_benign.append(True)
        else:
            all_benign.append(False)
            
        split=route["traceroute"].splitlines()
        all_route_lengths.append(len(split)-1)
        hops=[]
        pings=[]
        #For each line in the traceroute for a given indicator
        count=0
        timeouts=0
        for x in split:
            regIP=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
            regPing=re.findall("\s\d+[.]\d{3}\s",x)
            pings.append(regPing)
            for ip in regIP:
                if (count ==0):
                    all_dest.append(ip)
                    count=1
                #ip=ip[1:ip.rfind(".")]
                ip=ip[1:8]
                totalDest.add(ip)
                hops.append(ip)
                if (len(w)<10): 
                    frequency_counts_benign[ip]+=1
                    break
                else:
                    frequency_counts_malicious[ip]+=1
                    break
            if (count == 0):
                all_dest.append("***")
                count=1
                
            if (len(regIP) == 0):
                hops.append("***")
                timeouts=timeouts+1
                
        #Append each IP node from this indicator to a list of lists containing all routes for all jsons on this day
        all_trace.append(hops[1:])
        totalRoutes.append(hops)
        if (len(w)<10):
            totalBRoutes.append(hops)
        else:
            totalMRoutes.append(hops)
            
        expanded_route.append(pd.DataFrame([hops]))
        all_ping.append(pings[1:])
        all_timeouts.append(timeouts)
        overallPing=0
        weighted_ping=0
        idx=1
        for trio in pings:
            if (len(trio) != 0):
                overallPing=overallPing+float(min(trio))
                weighted_ping=weighted_ping+float(min(trio))*idx*idx
                idx+=1
                
            
        all_avg_ping.append(overallPing/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        all_weighted_ping.append(weighted_ping/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        
    #print(frames[len(frames)-1].shape)
    #print(len(all_trace))
    #print(w)
    if (populated):
        
        frames[len(frames)-1].insert(2,"Route",all_trace)
        frames[len(frames)-1].insert(3,"Ping",all_ping)
        frames[len(frames)-1].insert(1,"Benign",all_benign)
        frames[len(frames)-1].insert(2,"Dest",all_dest)
        frames[len(frames)-1].insert(2,"NumHops",all_route_lengths)
        frames[len(frames)-1].insert(2,"AveragePing",all_avg_ping)
        frames[len(frames)-1].insert(2,"Timeouts",all_timeouts)
        frames[len(frames)-1]=pd.concat([frames[len(frames)-1], pd.concat(expanded_route).reset_index()], axis=1)
        #frames[len(frames)-1]=pd.concat([frames[len(frames)-1],pd.concat(expanded_route)],axis=1,join_axes=[frames[len(frames)-1].index])
        #frames[len(frames)-1]
    else:
        populated=True
        frames.pop()
#print(time.time()-t1)           

#regDNS=re.findall("(\s[\w\-._~:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=]+\s)([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
#reg1=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])(\s+\d+[.]\d{3}\sms)+",x)
#reg1=re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]\s+\d+[.]\d{3}\s",x)

#q=-1
#for x in s:
    #q=q+1
    #Parse ip adresses and latency
    #print(q, re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]",x), re.findall("\s\d+[.]\d{3}\s",x))
    #print(x)

In [3]:
df=pd.concat(frames)
df=df.fillna("X")
df.drop(columns=['traceroute','index'])

Unnamed: 0,AveragePing,Benign,Dest,NumHops,Ping,Route,Timeouts,indicator,success,0,...,21,22,23,24,25,26,27,28,29,30
0,13.003133,True,(184.86.106.177),30,"[[ 11.543 , 11.537 , 20.112 ], [ 12.623 , 1...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",15,discover.com,True,184.86.,...,***,***,***,***,***,***,***,***,***,***
1,58.374421,True,(122.147.53.67),30,"[[ 94.456 , 19.733 , 20.396 ], [ 12.471 , 2...","[96.127., 100.66., 100.66., 100.65., 205.251, ...",11,104.com.tw,True,122.147,...,***,***,***,***,***,***,***,***,***,***
2,9.213786,True,(192.124.249.10),30,"[[ 17.406 , 17.234 , 17.488 ], [ 17.496 , 1...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",16,factaholics.com,True,192.124,...,***,***,***,***,***,***,***,***,***,***
3,33.535833,True,(198.134.112.243),30,"[[ 20.920 , 20.615 , 20.902 ], [ 11.241 , 1...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",6,nfkd2ug8d9.com,True,198.134,...,154.54.,154.54.,38.142.,173.239,173.239,***,***,***,***,***
4,15.830600,True,(93.93.51.223),30,"[[ 21.381 , 15.167 , 11.589 ], [ 21.726 , 1...","[96.127., 100.66., 100.66., 100.65., 205.251, ...",20,crptentry.com,True,93.93.5,...,***,***,***,***,***,***,***,***,***,***
5,34.360850,True,(45.55.99.72),30,"[[ 16.101 , 16.085 , 18.969 ], [ 19.708 , 2...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",10,python.org,True,45.55.9,...,66.110.,138.197,***,***,***,***,***,***,***,***
6,6.507750,True,(151.101.1.28),30,"[[ 12.622 , 18.824 , 12.594 ], [ 19.492 , 1...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",18,bandcamp.com,True,151.101,...,***,***,***,***,***,***,***,***,***,***
7,13.104385,True,(198.35.26.96),30,"[[ 17.448 , 17.914 , 17.902 ], [ 21.975 , 1...","[96.127., 100.66., 100.66., 100.65., 52.94.1, ...",17,wikimedia.org,True,198.35.,...,***,***,***,***,***,***,***,***,***,***
8,29.294933,True,(88.208.29.19),30,"[[ 13.209 , 12.331 , 21.396 ], [ 13.998 , 1...","[96.127., 100.66., 100.66., 100.65., 205.251, ...",15,xhamsterlive.com,True,88.208.,...,***,***,***,***,***,***,***,***,***,***
9,77.911273,True,(120.55.226.24),30,"[[ 15.524 , 20.294 , 15.506 ], [ 14.093 , 1...","[96.127., 100.66., 100.66., 100.65., 205.251, ...",8,oschina.net,True,120.55.,...,202.97.,220.191,122.224,42.120.,119.38.,***,***,***,***,***


In [10]:
totalDest=sorted(totalDest)



['0.0.0.0', '1.1.1.1', '1.1.10.', '1.160.5', '1.169.5', '1.175.1', '1.175.8', '1.179.1', '1.180.1', '1.190.1', '1.190.2', '1.193.1', '1.196.1', '1.20.22', '1.201.1', '1.202.2', '1.206.2', '1.206.7', '1.208.1', '1.213.1', '1.213.8', '1.215.7', '1.221.2', '1.232.7', '1.233.2', '1.255.2', '1.255.7', '1.28.12', '1.29.23', '1.30.21', '1.31.13', '1.34.88', '1.37.18', '1.49.23', '1.54.20', '1.56.1.', '1.58.10', '1.58.14', '1.58.81', '1.59.62', '1.59.67', '1.62.15', '1.82.11', '1.82.26', '1.86.55', '1.9.239', '1.9.241', '100.1.9', '100.2.8', '100.24.', '100.27.', '100.41.', '100.42.', '100.64.', '100.65.', '100.66.', '101.0.1', '101.108', '101.109', '101.110', '101.132', '101.167', '101.198', '101.200', '101.201', '101.226', '101.227', '101.230', '101.249', '101.254', '101.255', '101.28.', '101.37.', '101.4.1', '101.50.', '101.72.', '101.78.', '101.87.', '101.95.', '101.99.', '102.101', '102.103', '102.136', '102.165', '103.1.1', '103.1.2', '103.1.7', '103.10.', '103.100', '103.104', '103.108'

In [19]:
#print(totalDest)
from bisect import bisect_left
sample_onehot=[0]*len(totalDest)
t0=time.time()
for x in hops:
    sample_onehot[bisect_left(totalDest,x)]=1
print(time.time()-t0)

9.918212890625e-05


In [5]:
srt_benign=pd.DataFrame(sorted(frequency_counts_benign.items(), key=operator.itemgetter(1), reverse=True), columns=["BenignIP","BenignFreq"])
srt_malicious=pd.DataFrame(sorted(frequency_counts_malicious.items(), key=operator.itemgetter(1), reverse=True),columns=["MalIP","MalFreq"])
srt=pd.concat([srt_benign,srt_malicious], axis=1)
srt

Unnamed: 0,BenignIP,BenignFreq,MalIP,MalFreq
0,52.93.1,2815.0,52.93.1,15933
1,100.66.,1960.0,100.66.,10026
2,52.95.5,1444.0,54.239.,7188
3,54.239.,1428.0,52.95.5,5978
4,100.65.,1090.0,100.65.,5126
5,96.127.,981.0,96.127.,5016
6,52.93.2,690.0,202.97.,3595
7,205.251,611.0,62.115.,3419
8,52.94.1,481.0,52.93.2,3385
9,129.250,352.0,205.251,3059


In [None]:
freqB=[]
freqM=[]
for x in frequency_counts_benign.keys():
    if frequency_counts_malicious.get(x) != None:
        freqB.append(frequency_counts_benign.get(x))
        freqM.append(frequency_counts_malicious.get(x)/5.4)

print(pd.Series(freqB).sum(),pd.Series(freqM).sum())
chisquare(freqM, f_exp=freqB)

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
print(len(totalBRoutes), len(totalMRoutes))

In [None]:
G=nx.Graph()
c=0
for x in totalRoutes:
    x=list(filter(lambda a:a!="***",x))
    G.add_nodes_from(x)
    #if (df.iloc[c][2]):
        #colorCode=colorCode+['blue']*(len(x)-1)
    #else:
         #colorCode=colorCode+['red']*(len(x)-1)
    if len(x) != 0:
        for i in range (1,len(x)-1):
            G.add_edge(x[i],x[i+1],stop=i)
        G.add_edge(x[len(x)-1],x[0])

In [None]:
colorCode=[]
cnt=0
for x in G.nodes():
    if (x in totalDest):
        colorCode.append('blue')
        cnt+=1
    else:
        colorCode.append('black')
        
print(len(G.nodes()), len(colorCode), cnt)

In [None]:
options = {
        'node_color': colorCode,
        'node_size': 1,
        'edge_color': 'grey',
        'linewidths': 0,
        'width': 0.1,
    }
plt.figure(figsize=(20,10),dpi=1000)
nx.draw(G, **options)
plt.show()

In [None]:
plt.savefig('network_graph.jpg', dpi=1000)

number of timeouts, number of hops total