In [42]:
# Notebook containing preliminary exploration of cybersecuirty related data pulled from both benign and malicious sources
# with the goal of identifying suspicious websites using various machine learning models.

import json

import pprint
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import time
from collections import defaultdict
import operator
import networkx as nx
import matplotlib.pyplot as plt
from scipy.stats import chisquare
from ipwhois import IPWhois
import os
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics


#Create a table for easy access to information spread over many json files

date_path = '/data/data/2019-06-07/'
enrichment=date_path+'enrichment/'
ingest=date_path+'ingest/'
index=dict.fromkeys(os.listdir(enrichment))
for x in index:
    index[x]=os.listdir(enrichment+x)
#pprint.pprint(index["traceroute"])

In [6]:
stream='traceroute'
#Count the number of times each route crosses a certian IP
frequency_counts_benign=defaultdict(int)
frequency_counts_malicious=defaultdict(int)
t1=time.time()
frames=[]
alternate=[]
populated=True
totalRoutes=[]
totalBRoutes=[]
totalMRoutes=[]
totalDest=set()

#For each json in the "stream" folder for a particular date...
for w in index[stream]:
    temp_json=json.load(open(enrichment+stream+'/'+w))
    if type(temp_json["data"]) == list:
        frames.append(pd.DataFrame(temp_json["data"]))  
    
    #For each element in the data list from the traceroute json...
    all_trace=[]
    all_ping=[]
    all_benign=[]
    all_dest=[]
    all_route_lengths=[]
    all_avg_ping=[]
    all_timeouts=[]
    all_weighted_ping=[]
    expanded_route=[]
    
    for route in temp_json["data"]:
        if type(route) is str:
            populated=False
            break
        if (len(w)<10):
            all_benign.append(True)
        else:
            all_benign.append(False)
            
        split=route["traceroute"].splitlines()
        all_route_lengths.append(len(split)-1)
        hops=[]
        pings=[]
        #For each line in the traceroute for a given indicator
        count=0
        timeouts=0
        for x in split:
            regIP=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
            regPing=re.findall("\s\d+[.]\d{3}\s",x)
            pings.append(regPing)
            for ip in regIP:
                if (count ==0):
                    all_dest.append(ip)
                    count=1
                ip=ip[1:ip.rfind(".")]
                totalDest.add(ip)
                hops.append(ip)
                if (len(w)<10): 
                    frequency_counts_benign[ip]+=1
                    break
                else:
                    frequency_counts_malicious[ip]+=1
                    break
            if (count == 0):
                all_dest.append("***")
                count=1
                
            if (len(regIP) == 0):
                hops.append("***")
                timeouts=timeouts+1
                
        #Append each IP node from this indicator to a list of lists containing all routes for all jsons on this day
        all_trace.append(hops[1:])
        totalRoutes.append(hops)
        if (len(w)<10):
            totalBRoutes.append(hops)
        else:
            totalMRoutes.append(hops)
            
        expanded_route.append(pd.DataFrame(np.random.rand(1,1000)))
        all_ping.append(pings[1:])
        all_timeouts.append(timeouts)
        overallPing=0
        weighted_ping=0
        idx=1
        for trio in pings:
            if (len(trio) != 0):
                overallPing=overallPing+float(min(trio))
                weighted_ping=weighted_ping+float(min(trio))*idx*idx
                idx+=1
                
            
        all_avg_ping.append(overallPing/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        all_weighted_ping.append(weighted_ping/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        
    #print(frames[len(frames)-1].shape)
    #print(len(all_trace))
    #print(w)
    if (populated):
        
        frames[len(frames)-1].insert(2,"Route",all_trace)
        frames[len(frames)-1].insert(3,"Ping",all_ping)
        frames[len(frames)-1].insert(1,"Benign",all_benign)
        frames[len(frames)-1].insert(2,"Dest",all_dest)
        frames[len(frames)-1].insert(2,"NumHops",all_route_lengths)
        frames[len(frames)-1].insert(2,"AveragePing",all_avg_ping)
        frames[len(frames)-1].insert(2,"Timeouts",all_timeouts)
        frames[len(frames)-1]=pd.concat([frames[len(frames)-1], pd.concat(expanded_route).reset_index()], axis=1)
        #frames[len(frames)-1]=pd.concat([frames[len(frames)-1],pd.concat(expanded_route)],axis=1,join_axes=[frames[len(frames)-1].index])
        #frames[len(frames)-1]
    else:
        populated=True
        frames.pop()
#print(time.time()-t1)           

#regDNS=re.findall("(\s[\w\-._~:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=]+\s)([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
#reg1=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])(\s+\d+[.]\d{3}\sms)+",x)
#reg1=re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]\s+\d+[.]\d{3}\s",x)

#q=-1
#for x in s:
    #q=q+1
    #Parse ip adresses and latency
    #print(q, re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]",x), re.findall("\s\d+[.]\d{3}\s",x))
    #print(x)

In [7]:
df=pd.concat(frames)
df=df.fillna("X")
df.drop(columns=['traceroute','index'])

Unnamed: 0,indicator,Benign,Timeouts,AveragePing,NumHops,Dest,success,Route,Ping,0,...,990,991,992,993,994,995,996,997,998,999
0,discover.com,True,15,13.003133,30,(184.86.106.177),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 52.94...","[[ 11.543 , 11.537 , 20.112 ], [ 12.623 , 1...",0.034877,...,0.622415,0.794753,0.807831,0.953567,0.556796,0.831396,0.343531,0.650332,0.328885,0.571564
1,104.com.tw,True,11,58.374421,30,(122.147.53.67),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 205.2...","[[ 94.456 , 19.733 , 20.396 ], [ 12.471 , 2...",0.220740,...,0.042385,0.257027,0.094961,0.983654,0.477972,0.973663,0.485147,0.333102,0.042966,0.741228
2,factaholics.com,True,16,9.213786,30,(192.124.249.10),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 52.94...","[[ 17.406 , 17.234 , 17.488 ], [ 17.496 , 1...",0.124211,...,0.707075,0.875071,0.194887,0.881857,0.965989,0.330298,0.535006,0.886209,0.689876,0.300103
3,nfkd2ug8d9.com,True,6,33.535833,30,(198.134.112.243),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 52.94...","[[ 20.920 , 20.615 , 20.902 ], [ 11.241 , 1...",0.065818,...,0.896390,0.790672,0.280978,0.798094,0.351794,0.937843,0.412413,0.140564,0.121470,0.079969
4,crptentry.com,True,20,15.830600,30,(93.93.51.223),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 205.2...","[[ 21.381 , 15.167 , 11.589 ], [ 21.726 , 1...",0.640057,...,0.101108,0.630026,0.996972,0.508899,0.945759,0.719399,0.518235,0.814540,0.121328,0.844334
5,python.org,True,10,34.360850,30,(45.55.99.72),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.0, 52.94...","[[ 16.101 , 16.085 , 18.969 ], [ 19.708 , 2...",0.926614,...,0.767163,0.356677,0.434063,0.425132,0.731806,0.955269,0.943972,0.530342,0.817153,0.842064
6,bandcamp.com,True,18,6.507750,30,(151.101.1.28),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.1, 52.94...","[[ 12.622 , 18.824 , 12.594 ], [ 19.492 , 1...",0.695433,...,0.781695,0.923419,0.557048,0.031409,0.007949,0.269933,0.251965,0.297249,0.478820,0.365014
7,wikimedia.org,True,17,13.104385,30,(198.35.26.96),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.0, 52.94...","[[ 17.448 , 17.914 , 17.902 ], [ 21.975 , 1...",0.305186,...,0.088786,0.119210,0.845569,0.942030,0.062231,0.170883,0.741538,0.323069,0.134901,0.306602
8,xhamsterlive.com,True,15,29.294933,30,(88.208.29.19),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.0, 205.2...","[[ 13.209 , 12.331 , 21.396 ], [ 13.998 , 1...",0.859509,...,0.586311,0.761413,0.018897,0.564348,0.110838,0.663332,0.775908,0.831433,0.702416,0.250633
9,oschina.net,True,8,77.911273,30,(120.55.226.24),True,"[96.127.0, 100.66.0, 100.66.0, 100.65.0, 205.2...","[[ 15.524 , 20.294 , 15.506 ], [ 14.093 , 1...",0.281649,...,0.765084,0.063769,0.768129,0.640470,0.042836,0.987550,0.373744,0.221374,0.088263,0.316790


In [76]:
df=df.sample(frac=1).reset_index(drop=True)
X=df[['Timeouts','AveragePing','NumHops','success']]
Y=df['Benign']
clf=sklearn.tree.DecisionTreeClassifier()
clf = clf.fit(X.loc[0:3000,:],Y.loc[0:3000])

In [77]:
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(Y.loc[3001:6000], clf.predict(X.loc[3001:6000,:]))))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y.loc[3001:6000], clf.predict(X.loc[3001:6000,:]),labels=[False,True]))

Classification report for classifier DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best'):
              precision    recall  f1-score   support

       False       0.88      0.89      0.89      2505
        True       0.42      0.40      0.41       495

    accuracy                           0.81      3000
   macro avg       0.65      0.64      0.65      3000
weighted avg       0.81      0.81      0.81      3000


Confusion matrix:
[[2237  268]
 [ 299  196]]


In [None]:
totalDest=sorted(totalDest)

In [None]:
#print(totalDest)
from bisect import bisect_left
sample_onehot=[0]*len(totalDest)
t0=time.time()
for x in hops:
    sample_onehot[bisect_left(totalDest,x)]=1
print(time.time()-t0)

In [None]:
srt_benign=pd.DataFrame(sorted(frequency_counts_benign.items(), key=operator.itemgetter(1), reverse=True), columns=["BenignIP","BenignFreq"])
srt_malicious=pd.DataFrame(sorted(frequency_counts_malicious.items(), key=operator.itemgetter(1), reverse=True),columns=["MalIP","MalFreq"])
srt=pd.concat([srt_benign,srt_malicious], axis=1)
srt

In [None]:
freqB=[]
freqM=[]
for x in frequency_counts_benign.keys():
    if frequency_counts_malicious.get(x) != None:
        freqB.append(frequency_counts_benign.get(x))
        freqM.append(frequency_counts_malicious.get(x)/5.4)

print(pd.Series(freqB).sum(),pd.Series(freqM).sum())
chisquare(freqM, f_exp=freqB)

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
print(len(totalBRoutes), len(totalMRoutes))

In [None]:
G=nx.Graph()
c=0
for x in totalRoutes:
    x=list(filter(lambda a:a!="***",x))
    G.add_nodes_from(x)
    #if (df.iloc[c][2]):
        #colorCode=colorCode+['blue']*(len(x)-1)
    #else:
         #colorCode=colorCode+['red']*(len(x)-1)
    if len(x) != 0:
        for i in range (1,len(x)-1):
            G.add_edge(x[i],x[i+1],stop=i)
        G.add_edge(x[len(x)-1],x[0])

In [None]:
colorCode=[]
cnt=0
for x in G.nodes():
    if (x in totalDest):
        colorCode.append('blue')
        cnt+=1
    else:
        colorCode.append('black')
        
print(len(G.nodes()), len(colorCode), cnt)

In [None]:
options = {
        'node_color': colorCode,
        'node_size': 1,
        'edge_color': 'grey',
        'linewidths': 0,
        'width': 0.1,
    }
plt.figure(figsize=(20,10),dpi=1000)
nx.draw(G, **options)
plt.show()

In [None]:
plt.savefig('network_graph.jpg', dpi=1000)