#### Run the "Subnet Identifier" notebook first, then this code in its entirety

# Traceroute Modeling

In [12]:
# Notebook containing preliminary exploration of cybersecuirty related data pulled from both benign and malicious sources
# with the goal of identifying suspicious websites using various machine learning models.

import json
import re
import time
import pprint
import operator
import random
import os
import pickle

from scipy.stats import chisquare
from ipwhois import IPWhois
import matplotlib.pyplot as plt
from collections import defaultdict
from bisect import bisect_left
import socket, struct

import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import tree
from sklearn import metrics
from sklearn import datasets, cluster
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Convert Raw Data

In [2]:
#Enumerate filenames of all json documents for stragithforward iteration

date_paths = '/data/data/'
everyFile=os.listdir(date_paths)
everyFile=['/data/data/'+x+'/enrichment/traceroute/' for x in everyFile]

In [3]:
everyFile=everyFile[31:]
everyFile.remove('/data/data/2019-04-13/enrichment/traceroute/')
everyFile.remove('/data/data/2019-05-13/enrichment/traceroute/')
everyFile.remove('/data/data/2019-05-14/enrichment/traceroute/')
everyFile.remove('/data/data/canyouseethisfile.txt/enrichment/traceroute/')

In [4]:
tempEvery=[]
for x in everyFile:
    for s in os.listdir(x):
        if (len(s) > 10):
            tempEvery.append(x+s)

for s in os.listdir('/data/data/newbenigndata/enrichment/traceroute/'):
        tempEvery.append('/data/data/newbenigndata/enrichment/traceroute/'+s)
        
for s in os.listdir('/data/data/newNONTORbenigndata/enrichment/traceroute/'):
        tempEvery.append('/data/data/newNONTORbenigndata/enrichment/traceroute/'+s)

In [14]:
print(len(tempEvery), "total traceroute files")

13501 total traceroute files


### Encode IPs by Subnet

In [6]:
# Json containing all subnets in the dataset
consolidated=json.load(open('SerializedResources/reducedConsolidated.json'))
k=list(consolidated.keys())
k=sorted(k)
print(len(k))

2874


In [7]:
def parseCIDR(alpha):
    ip=alpha[:alpha.find('/')]
    mask=alpha[alpha.find('/')+1:]
    return (ip, int(mask))
    
def dottedQuadToNum(ip):
    "convert decimal dotted quad string to long integer"
    return struct.unpack('!L',socket.inet_aton(ip))[0]

def numToDottedQuad(n):
    "convert long int to dotted quad string"
    return socket.inet_ntoa(struct.pack('!L',n))
      
def makeMask(n):
    "return a mask of n bits as a long integer"
    return (1 << 32-n)-1

def ipToNetAndMask(ip):
    "returns tuple (network, host) dotted-quad addresses given IP and mask size"
    if (len(ip) > 18):
        ip=ip[:ip.find(',')]
    network,mask = parseCIDR(ip)
    n = dottedQuadToNum(network)
    m = makeMask(mask)

    host = n & m
    net = n - host

    return numToDottedQuad(net), mask

def toNet(network,maskbits):
    "returns tuple (network, host) dotted-quad addresses given IP and mask size"
    n = dottedQuadToNum(network)
    m = makeMask(maskbits)

    host = n & m
    net = n - host

    return numToDottedQuad(net)


unique_ips=set()
def oneHotEncode(hops):
    unregistered=0
    #global unique_ips
    ret={}
    order=0
    for point in hops:
        #unique_ips.add(point)
        if (point != '***'):
            for i in range (32,0,-1):
                ref=toNet(point,i)
                if (consolidated.get(ref,False)):
                    if (consolidated[ref] == i):
                        ret[order]=bisect_left(k,ref)
                        break

            
        order+=1
    return ret

In [8]:
frames=[]
expanded_route=[]
numEmpties=0
s=-1
for w in tempEvery:
    s+=1
    #w=tempEvery[random.randint(0,len(tempEvery))]
    #tempEvery.remove(w)
    if (s%135 == 0):
        print(str(s/135)+'%')

    try:
        temp_json=json.load(open(w))
    except:
        continue
        
    if type(temp_json["data"]) == list and len(temp_json["data"]) > 0:
        frames.append(pd.DataFrame(temp_json["data"]))  
    else:
        continue
    #For each element in the data list from the traceroute json...
    #all_trace=[]
    #all_ping=[]
    all_benign=[]
    all_dest=[]
    all_route_lengths=[]
    all_avg_ping=[]
    all_timeouts=[]
    #all_weighted_ping=[]
    all_tailTimes=[]
    all_reached=[]
    #expanded_route=[]
    
    for route in temp_json["data"]:
        populated=True
        #Error handling
        if type(route) is str:
            populated=False
            break
            
        # Determine benign or malicious feature set
        if (len(w)-w.rfind('/')<10):
            all_benign.append(True)
        else:
            all_benign.append(False)
            
        #Parse the string of traceroute data
        split=route["traceroute"].splitlines()
        all_route_lengths.append(len(split)-1)
        hops=[]
        pings=[]
        
        #For each line in the traceroute for a given indicator
        count=0
        timeouts=0
        for x in split:
            regIP=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
            regPing=re.findall("\s\d+[.]\d{3}\s",x)
            pings.append(regPing)
            for ip in regIP:
                if (count ==0):
                    all_dest.append(ip[1:len(ip)-1])
                    count=1
                ip=ip[1:-1]
                hops.append(ip)
                break
               
            if (count == 0):
                all_dest.append("***")
                count=1
                
            if (len(regIP) == 0):
                hops.append("***")
                timeouts=timeouts+1
            
        tailTimes=0
        
        for x in hops[::-1]:
            if (x == '***'):
                tailTimes += 1
            else:
                all_reached.append(x == all_dest[len(all_dest)-1])
                break
        else:
            all_reached.append(False)
             
        all_tailTimes.append(tailTimes)
        # One-hot encoding
        #t0=time.time()
        #newRow=
        #t1=time.time()
        expanded_route.append(oneHotEncode(hops))
        #print((t1-t0),(time.time()-t1))
        
        #all_ping.append(pings[1:])
        all_timeouts.append(timeouts)
        overallPing=0
        #weighted_ping=0
        idx=1
        for trio in pings:
            if (len(trio) != 0):
                overallPing=overallPing+float(min(trio))
                #weighted_ping=weighted_ping+float(min(trio))*idx*idx
                idx+=1
                
        try:
            all_avg_ping.append(overallPing/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        except:
            all_avg_ping.append(0)
            
        #try:
        #    all_weighted_ping.append(weighted_ping/(all_route_lengths[len(all_route_lengths)-1]-timeouts))
        #except:
        #    all_weighted_ping.append(0)
    #print(frames[len(frames)-1].shape)
    #print(len(all_trace))
    #print(w)
    
   
    if (populated):
        frames[len(frames)-1]=frames[len(frames)-1].drop(columns=['traceroute','success'])
        frames[len(frames)-1].insert(1,"Reached",all_reached)
        frames[len(frames)-1].insert(1,"Benign",all_benign)
        frames[len(frames)-1].insert(2,"Dest",all_dest)
        frames[len(frames)-1].insert(2,"NumHops",all_route_lengths)
        frames[len(frames)-1].insert(3,"Tail Timeouts",all_tailTimes)
        frames[len(frames)-1].insert(2,"AveragePing",all_avg_ping)
        frames[len(frames)-1].insert(2,"Timeouts",all_timeouts)
        #frames[len(frames)-1]=pd.concat([frames[len(frames)-1], pd.DataFrame(expanded_route, columns=[str(i) for i in range(len(expanded_route[0]))])], axis=1)
        #frames[len(frames)-1]=pd.concat([frames[len(frames)-1],pd.concat(expanded_route)],axis=1,join_axes=[frames[len(frames)-1].index])
        #frames[len(frames)-1]
    else:
        populated=True
        frames.pop()

#del all_trace
#del all_ping
del all_benign
del all_dest
del all_route_lengths
del all_avg_ping
del all_timeouts
#del all_weighted_ping
del pings
del hops
#del expanded_route

0.0%
1.0%
2.0%
3.0%
4.0%
5.0%
6.0%
7.0%
8.0%
9.0%
10.0%
11.0%
12.0%
13.0%
14.0%
15.0%
16.0%
17.0%
18.0%
19.0%
20.0%
21.0%
22.0%
23.0%
24.0%
25.0%
26.0%
27.0%
28.0%
29.0%
30.0%
31.0%
32.0%
33.0%
34.0%
35.0%
36.0%
37.0%
38.0%
39.0%
40.0%
41.0%
42.0%
43.0%
44.0%
45.0%
46.0%
47.0%
48.0%
49.0%
50.0%
51.0%
52.0%
53.0%
54.0%
55.0%
56.0%
57.0%
58.0%
59.0%
60.0%
61.0%
62.0%
63.0%
64.0%
65.0%
66.0%
67.0%
68.0%
69.0%
70.0%
71.0%
72.0%
73.0%
74.0%
75.0%
76.0%
77.0%
78.0%
79.0%
80.0%
81.0%
82.0%
83.0%
84.0%
85.0%
86.0%
87.0%
88.0%
89.0%
90.0%
91.0%
92.0%
93.0%
94.0%
95.0%
96.0%
97.0%
98.0%
99.0%
100.0%


In [9]:
df=pd.concat(frames)
del frames
df=df.fillna("X")

In [10]:
print('Dimensions',df.shape)
print('Benign', len(df[df.Benign== True]))
print('Malicious', len(df[df.Benign== False]))

Dimensions (515253, 8)
Benign 23975
Malicious 491278


In [None]:
#Spare regex expressions 


#print(time.time()-t1)           

#regDNS=re.findall("(\s[\w\-._~:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=]+\s)([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
#reg1=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])(\s+\d+[.]\d{3}\sms)+",x)
#reg1=re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]\s+\d+[.]\d{3}\s",x)

#q=-1
#for x in s:
    #q=q+1
    #Parse ip adresses and latency
    #print(q, re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]",x), re.findall("\s\d+[.]\d{3}\s",x))
    #print(x)

## Save Dataframe as Parquet

In [13]:
df=df.reset_index(drop=True)
df.to_parquet("BigExtractedFeatures.parquet")

with open('BigExpanded.pickle', 'wb') as output:  # Overwrites any existing file.
    pickle.dump(expanded_route, output, pickle.HIGHEST_PROTOCOL)
