# Traceroute Modeling

In [2]:
# Notebook containing preliminary exploration of cybersecurity-related data pulled from both benign and malicious sources
# with the goal of identifying suspicious websites using various machine learning models.

import json
import re
import time
import pprint
import operator
import os

from scipy.stats import chisquare
from ipwhois import IPWhois
import matplotlib.pyplot as plt
from collections import defaultdict
from bisect import bisect_left
import socket, struct

import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import tree
from sklearn import metrics
from sklearn.svm import SVC
from sklearn import datasets, cluster
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AffinityPropagation

# Convert Raw Data

In [None]:
# Enumerate the filenames of all json documents for straightforward iteration

date_path = '/data/data/2019-04-22/'
enrichment = date_path + 'enrichment/'
ingest = date_path + 'ingest/'
stream = 'traceroute'

# index: maps every entry of the enrichment folder for this date to the list
# of json names found inside it
index = {}
for x in os.listdir(enrichment):
    index[x] = os.listdir(enrichment + x)

In [None]:
# Count the number of times routes cross a certain IP, tallied separately
# for the benign and the malicious feeds
frequency_counts_benign = defaultdict(int)
frequency_counts_malicious = defaultdict(int)

# Accumulators that persist across the whole ingest loop
frames = list()         # Holds a complete dataframe for each json file
populated = True        # Determines whether a json was empty
totalRoutes = list()    # Contains all routes
totalBRoutes = list()   # ^^ for "benign" routes
totalMRoutes = list()   # ^^ for "malicious" routes
totalDest = set()       # Tracks all the destination nodes

### Encode IPs by Subnet

In [None]:
# Load the subnet table. oneHotEncode compares each value against a prefix
# length, so the table maps a network address to its mask size.
# The file is opened in a with-block so the handle is closed after parsing.
with open('consolidated.json') as consolidated_file:
    consolidated=json.load(consolidated_file)

# Sorted list of the table's keys (the network addresses)
k=sorted(consolidated)

In [None]:
def parseCIDR(alpha):
    """Split a CIDR string such as "10.0.0.0/8" into (address, prefix length).

    Returns a tuple of the address part as a string and the part after the
    first "/" converted to int.

    Raises ValueError when there is no "/" in the input or the prefix is not
    an integer. Without the explicit check, a missing "/" would silently drop
    the last character of the address and try to parse the whole string as
    the mask.
    """
    ip, slash, mask = alpha.partition('/')
    if not slash:
        raise ValueError("not a CIDR string (missing '/'): %r" % (alpha,))
    return (ip, int(mask))
    
def dottedQuadToNum(ip):
    "convert a dotted-quad IPv4 string to its integer value (network byte order)"
    packed = socket.inet_aton(ip)
    return int.from_bytes(packed, byteorder='big')

def numToDottedQuad(n):
    "convert an unsigned 32-bit integer to a dotted-quad IPv4 string"
    packed = struct.pack('!L', n)
    dotted = socket.inet_ntoa(packed)
    return dotted
      
def makeMask(n):
    "return the host-part mask for an n-bit prefix: the low (32 - n) bits set, as an integer"
    host_bits = 32 - n
    return (1 << host_bits) - 1

def ipToNetAndMask(ip):
    """Return (network address, prefix length) for a CIDR string.

    ip is a CIDR string such as "192.168.1.77/24". If several CIDRs are
    given separated by commas, only the first one is used.

    Returns a tuple of the dotted-quad network address (host bits cleared)
    and the prefix length as an int.
    """
    # Pick the first entry by splitting on the comma itself. The previous
    # length check (> 18 characters) removed the last character whenever a
    # long string contained no comma.
    first_entry = ip.split(',', 1)[0].strip()
    network, mask = parseCIDR(first_entry)
    n = dottedQuadToNum(network)

    # makeMask yields the host bits; removing them leaves the network address
    host = n & makeMask(mask)
    net = n - host

    return numToDottedQuad(net), mask

def toNet(network,maskbits):
    "returns the dotted-quad network address of an IP for the given mask size (host bits cleared)"
    address = dottedQuadToNum(network)
    host_part = address & makeMask(maskbits)
    return numToDottedQuad(address - host_part)

registered=0    # number of hops successfully matched to a known subnet
allFailures=[]  # hops for which no subnet in `consolidated` matched
def oneHotEncode(hops):
    """Encode a route as a 0/1 vector over the known subnets.

    hops is an iterable of dotted-quad hop addresses; the placeholder '***'
    (unanswered hop) is skipped.

    Returns a numpy vector of length len(k)+1. Slot p is 1 when some hop
    falls inside subnet k[p]. The extra trailing slot is never set here.
    NOTE(review): it is presumably reserved for "unknown" - confirm.

    Side effects: unmatched hops are appended to the global allFailures and
    the global registered counter is incremented per matched hop.
    """
    global allFailures
    global registered
    ret=np.zeros(len(k)+1)
    for point in hops:
        if point == '***':
            continue

        # Try the most specific prefix first and stop at the first subnet
        # whose recorded mask size equals the prefix being tested.
        for i in range (32,0,-1):
            ref=toNet(point,i)
            if consolidated.get(ref) == i:
                # The column is the subnet's position in the sorted key list.
                # Indexing by the mask size itself would restrict the
                # encoding to slots 1..32.
                pos=bisect_left(k,ref)
                ret[pos]=1
                registered+=1
                break
        else:
            # No prefix length produced a known subnet
            allFailures.append(point)

    return ret

In [None]:
#For each json in the "stream" folder for a particular date...

# Raw-string patterns, compiled once: a parenthesised dotted-quad address and a
# whitespace-delimited latency with exactly three decimals.
ip_pattern=re.compile(r"([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])")
ping_pattern=re.compile(r"\s\d+[.]\d{3}\s")

for w in index[stream]:
    with open(enrichment+stream+'/'+w) as json_file:
        temp_json=json.load(json_file)
    data=temp_json["data"]

    # Determine benign or malicious feature set: file names shorter than
    # 10 characters are labelled benign, all others malicious
    is_benign=(len(w)<10)

    #Per-file columns, one entry for each element in the data list from the traceroute json
    all_trace=[]
    all_ping=[]
    all_benign=[]
    all_dest=[]
    all_route_lengths=[]
    all_avg_ping=[]
    all_timeouts=[]
    all_weighted_ping=[]
    expanded_route=[]

    # Skip files whose payload is not a non-empty list. Previously no frame was
    # appended for them, yet the code below still popped/modified frames[-1],
    # i.e. the frame that belonged to the previous file.
    if not isinstance(data,list) or not data:
        continue

    for route in data:

        #Error handling: a string entry marks the file as unusable
        if isinstance(route,str):
            populated=False
            break

        all_benign.append(is_benign)

        #Parse the string of traceroute data
        split=route["traceroute"].splitlines()
        all_route_lengths.append(len(split)-1)
        hops=[]
        pings=[]

        #For each line in the traceroute for a given indicator
        timeouts=0
        for line_no,x in enumerate(split):
            regIP=ip_pattern.findall(x)
            regPing=ping_pattern.findall(x)
            pings.append(regPing)
            if regIP:
                # Only the first address on a line is used
                if line_no == 0:
                    # The first line supplies the destination (kept with its parentheses)
                    all_dest.append(regIP[0])
                ip=regIP[0][1:-1]
                # NOTE(review): every hop is added here, not only destinations - confirm intent
                totalDest.add(ip)
                hops.append(ip)
                if is_benign:
                    frequency_counts_benign[ip]+=1
                else:
                    frequency_counts_malicious[ip]+=1
            else:
                if line_no == 0:
                    all_dest.append("***")
                hops.append("***")
                timeouts=timeouts+1

        if not split:
            # Empty traceroute text: keep the Dest column aligned with the other columns
            all_dest.append("***")

        #Append each IP node from this indicator to a list of lists containing all routes for all jsons on this day
        all_trace.append(hops[1:])
        totalRoutes.append(hops)
        if is_benign:
            totalBRoutes.append(hops)
        else:
            totalMRoutes.append(hops)

        # One-hot encoding
        expanded_route.append(oneHotEncode(hops))

        all_ping.append(pings[1:])
        all_timeouts.append(timeouts)
        overallPing=0
        weighted_ping=0
        idx=1
        for trio in pings:
            if trio:
                # Compare the probes numerically. min() over the raw strings is
                # lexicographic, so " 25.084 " would sort before " 9.123 ".
                fastest=min(float(probe) for probe in trio)
                overallPing=overallPing+fastest
                weighted_ping=weighted_ping+fastest*idx*idx
                idx+=1

        # Average over the hops that answered; 0 when none did
        responding=all_route_lengths[-1]-timeouts
        if responding:
            all_avg_ping.append(overallPing/responding)
            all_weighted_ping.append(weighted_ping/responding)
        else:
            all_avg_ping.append(0)
            all_weighted_ping.append(0)

    if (populated):
        # Build the frame only once the whole file parsed cleanly
        frame=pd.DataFrame(data)
        frame.insert(2,"Route",all_trace)
        frame.insert(3,"Ping",all_ping)
        frame.insert(1,"Benign",all_benign)
        frame.insert(2,"Dest",all_dest)
        frame.insert(2,"NumHops",all_route_lengths)
        frame.insert(2,"AveragePing",all_avg_ping)
        frame.insert(2,"Timeouts",all_timeouts)
        # NOTE(review): all_weighted_ping is computed but never added as a column
        encoded=pd.DataFrame(expanded_route, columns=[str(i) for i in range(len(expanded_route[0]))])
        frames.append(pd.concat([frame, encoded], axis=1))
    else:
        # File discarded; reset the flag for the next one
        populated=True

del all_trace
del all_ping
del all_benign
del all_dest
del all_route_lengths
del all_avg_ping
del all_timeouts
del all_weighted_ping
del pings
del hops
del expanded_route

In [None]:
# Number of hops that oneHotEncode could not match to any known subnet
print(len(allFailures))

In [None]:
# Resolve every unmatched hop through RDAP and collect the subnets it reports:
# upset maps network address -> mask size.
upset={}

# Each distinct address only needs one lookup; dict.fromkeys de-duplicates
# while keeping first-seen order.
pending=list(dict.fromkeys(allFailures))
for iterCount,test in enumerate(pending,start=1):
    if (iterCount % 250 == 0):
        # Progress as a fraction of the real workload
        print (iterCount/len(pending))

    try:
        tmp=IPWhois(test,allow_permutations=True).lookup_rdap()['network']['cidr']
        # The cidr field may list several comma-separated blocks. split()
        # also handles the single-block case, so no comma test is needed
        # (the old `if tmp.find(',')` was truthy for the -1 "not found" result).
        for annoy in tmp.split(','):
            annoy=annoy.strip()
            newSubnet,newMask=ipToNetAndMask(annoy)
            upset[newSubnet]=newMask

    # Exception rather than a bare except, so Ctrl-C can still interrupt
    # this long network loop
    except Exception:
        print("Error:", test)

In [None]:
#Spare regex expressions 


#print(time.time()-t1)           

#regDNS=re.findall("(\s[\w\-._~:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=]+\s)([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])",x)
#reg1=re.findall("([(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)])(\s+\d+[.]\d{3}\sms)+",x)
#reg1=re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]\s+\d+[.]\d{3}\s",x)

#q=-1
#for x in s:
    #q=q+1
    #Parse ip adresses and latency
    #print(q, re.findall("[(]\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}[)]",x), re.findall("\s\d+[.]\d{3}\s",x))
    #print(x)

In [None]:
# Stack the per-file frames from this ingest run and append them to an existing df.
sd=pd.concat(frames)
del frames
# Missing cells are filled with the marker string "X"
sd=sd.fillna("X")
# NOTE(review): the result of drop() is not assigned, so sd still has 'traceroute' - confirm whether that is intended
sd.drop(columns=['traceroute'])
# df must already exist (e.g. from an earlier run or the parquet load) for this line to work
df=pd.concat([sd,df])

In [None]:
# Stack the per-file frames from this ingest run into a fresh df.
# NOTE(review): this cell and the one that builds `sd` both delete `frames`, so presumably only one of them is run per ingest - confirm
df=pd.concat(frames)
del frames
# Missing cells are filled with the marker string "X"
df=df.fillna("X")
# The result is not assigned; as the last expression of the cell it is only displayed
df.drop(columns=['traceroute'])

In [None]:
# Display (rows, columns) of the assembled frame
df.shape

## Save/Read Parquet Dataframe

In [3]:
# Load a previously saved feature table instead of re-running the ingest cells
df=pd.read_parquet('universe.parquet')

In [None]:
#df=df.reset_index(drop=True)
#df=df.drop(columns=['Route','Ping'])
#df.to_parquet("hugeparquet.parquet")
#df.to_feather("hugefeather.feather")

In [4]:
# Display the loaded frame
df

Unnamed: 0,indicator,Benign,Timeouts,AveragePing,NumHops,Tail Timeouts,Dest,success,Ping,traceroute,...,7556,7557,7558,7559,7560,7561,7562,7563,7564,7565
0,aatextiles.com,False,12,18.364556,30,10,(207.148.248.143),True,"[[ 25.084 , 25.074 , 19.292 ], [ 15.858 , 1...",traceroute to aatextiles.com (207.148.248.143)...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,yumproject.com,False,12,63.041000,30,4,(3.94.104.205),True,"[[ 12.833 , 17.712 , 12.814 ], [ 11.582 , 1...","traceroute to yumproject.com (3.94.104.205), 3...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,estudiantesdelinstante.net,False,13,40.019118,30,12,(13.59.60.11),True,"[[ 15.106 , 19.307 , 71.139 ], [ 15.786 , 1...",traceroute to estudiantesdelinstante.net (13.5...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,protecca.com,False,14,60.050875,30,3,(23.20.239.12),True,"[[ 21.939 , 16.705 , 12.341 ], [ 12.940 , 2...","traceroute to protecca.com (23.20.239.12), 30 ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,okeanbg.com,False,19,32.956727,30,14,(104.217.152.168),True,"[[ 21.116 , 13.573 , 12.230 ], [ 18.860 , 1...","traceroute to okeanbg.com (104.217.152.168), 3...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,hanantoseto.com,False,14,19.307562,30,13,(50.63.202.44),True,"[[ 40.267 , 40.259 , 40.245 ], [ 19.413 , 1...","traceroute to hanantoseto.com (50.63.202.44), ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,estelareventos.com,False,3,28.811864,25,0,(192.185.136.46),True,"[[ 16.086 , 15.972 , 15.963 ], [ 14.536 , 1...",traceroute to estelareventos.com (192.185.136....,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,marquesdedaroca.com,False,13,52.988235,30,12,(91.142.217.6),True,"[[ 21.513 , 14.010 , 21.496 ], [ 16.954 , 2...",traceroute to marquesdedaroca.com (91.142.217....,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,razihearing.com,False,9,33.273333,30,7,(184.154.206.17),True,"[[ 13.660 , 17.958 , 17.951 ], [ 14.879 , 2...",traceroute to razihearing.com (184.154.206.17)...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,388cent.com,False,1,66.499045,23,0,(85.214.49.143),True,"[[ 20.884 , 18.927 , 15.769 ], [ 16.891 , 1...","traceroute to 388cent.com (85.214.49.143), 30 ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Supervised Classification Models

### Affinity Propagation Clustering (of IP Addresses)

In [None]:
# Cluster the rows using only the columns whose header consists of digits
# (every other header is removed from the selection via Index.difference)
non_digit_headers = [header for header in df.columns if not (header.isdigit())]
digit_columns = df.columns.difference(non_digit_headers)
clustering = AffinityPropagation().fit(df[digit_columns])

In [None]:
# Display the cluster label of the fitted AffinityPropagation model
clustering.labels_

### Dimensionality Reduction

In [None]:
# Merge the digit-named columns into 50 agglomerated features
ftagl = cluster.FeatureAgglomeration(n_clusters=50)
non_digit_headers = [header for header in df.columns if not (header.isdigit())]
digit_columns = df.columns.difference(non_digit_headers)
reduced = ftagl.fit_transform(df[digit_columns])

In [None]:
# Compare two agglomeration fits: the number of labels each holds, then the
# sorted non-zero labels of each.
# NOTE(review): `otheragl` is not defined in any cell visible here - presumably a second FeatureAgglomeration fitted elsewhere; confirm before running
print(len(ftagl.labels_))
print(len(otheragl.labels_))
q=sorted([x for x in ftagl.labels_ if x != 0])
r=sorted([x for x in otheragl.labels_ if x != 0])

In [None]:
# Project the digit-named columns through `otheragl` and wrap the result in a DataFrame.
# NOTE(review): `otheragl` is not defined in any cell visible here - confirm where it is fitted
reduced=otheragl.transform(df[df.columns.difference([header for header in df.columns if not (header.isdigit())])])
reduced=pd.DataFrame(reduced)

In [None]:
#Reduce Dimensionality for all classifiers below this point:
# remove the digit-named columns from df and attach the reduced features in their place
non_digit_headers = [header for header in df.columns if not (header.isdigit())]
digit_columns = df.columns.difference(non_digit_headers)
df = df.drop(columns=digit_columns)
df = pd.concat([df, reduced], axis=1)
del reduced

In [None]:
# Push an all-zero feature row through the fitted agglomeration.
# transform() takes a 2-D (samples x features) array, so the vector is built as
# a single row. Its width is taken from the fitted model (one label per
# feature) instead of a hard-coded 6684.
vec=np.zeros((1,len(ftagl.labels_)))
print(ftagl.transform(vec))

### MLP Classifier

#### Threshold Modification

In [5]:
def adjust_surface(probs,bound=0.48):
    """Turn class probabilities into boolean predictions with a custom cut-off.

    probs is a 2-D array indexed as probs[:, 1]; the result is True wherever
    that second column is strictly greater than bound.
    """
    second_column = probs[:, 1]
    return second_column > bound

In [9]:
# Split the table by label; `even` is the benign row count
benign_rows = df.Benign == True
malicious_rows = df.Benign == False
bd = df.loc[benign_rows]
even = len(bd)
md = df.loc[malicious_rows]
print(bd.shape, md.shape)

(16750, 7576) (86483, 7576)


In [10]:
# Shuffle each class, then keep only the first `even` malicious rows so both
# classes contribute the same number of samples.
bd=bd.sample(frac=1).reset_index(drop=True)
md=md.sample(frac=1).reset_index(drop=True)
# The mask has to come from md's own (freshly reset) index. A mask built from
# df.index has a different length and does not line up with md's rows.
md=md.loc[md.index < even]
ad=pd.concat([bd,md])
del bd
del md

In [14]:
# Despite its name, `exclude` holds the columns that are KEPT as features:
# everything except the identifier / label / raw-text columns listed here.
exclude = ad.columns.difference(['indicator','Benign','Dest','Route','Ping','index','traceroute'])
ad = ad.sample(frac=1).reset_index(drop=True)
X = ad[exclude]
Y = ad['Benign']

# Train on index labels 0 .. 0.8*2*even; the rest is held out for evaluation
train_stop = 0.8*2*even
clf = MLPClassifier(
    hidden_layer_sizes=(50,50,50),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=3000,
    learning_rate_init=.001,
    shuffle=True,
    verbose=1,
)
clf = clf.fit(X.loc[0:train_stop,:], Y.loc[0:train_stop])

In [16]:
# Evaluate on the held-out rows (index above 0.8*2*even), applying the custom
# probability cut-off from adjust_surface; predictions are computed once and reused
y_true = Y.loc[Y.index > 0.8*2*even]
y_pred = adjust_surface(clf.predict_proba(X.loc[X.index > 0.8*2*even]))
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_true, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_true, y_pred, labels=[True,False]))

Classification report for classifier MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
              learning_rate_init=0.001, max_iter=3000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=False):
              precision    recall  f1-score   support

       False       0.73      0.79      0.76      3385
        True       0.77      0.71      0.74      3314

    accuracy                           0.75      6699
   macro avg       0.75      0.75      0.75      6699
weighted avg       0.75      0.75      0.75      6699


Confusion matrix:
[[2345  969]
 [ 709 2676]]


In [None]:
# Keep a handle on the fitted MLP; `clf` is reassigned by the following sections
mlp=clf

## Random Forest

In [None]:
#exclude=ad.columns.difference(['indicator','Benign','Dest','Route','Ping','index','traceroute'])
#ad=ad.sample(frac=1).reset_index(drop=True)
# Random forest trained on the same feature columns and the same index range 0 .. 0.8*2*even
X = ad[exclude]
Y = ad['Benign']
train_stop = 0.8*2*even
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X.loc[0:train_stop,:], Y.loc[0:train_stop])

In [None]:
# Evaluate on the held-out rows (index above 0.8*2*even); predictions are computed once and reused
y_true = Y.loc[Y.index > 0.8*2*even]
y_pred = clf.predict(X.loc[X.index > 0.8*2*even])
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_true, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_true, y_pred, labels=[True,False]))

In [None]:
# Keep a handle on the fitted random forest; `clf` is reassigned by the following sections
rdf=clf

## SVM (Linearity Check)

In [None]:
# Support vector classifier trained on the same feature columns and the same index range 0 .. 0.8*2*even
X = ad[exclude]
Y = ad['Benign']
train_stop = 0.8*2*even
clf = SVC(gamma='auto').fit(X.loc[0:train_stop,:], Y.loc[0:train_stop])

In [None]:
# Evaluate on the held-out rows (index above 0.8*2*even); predictions are computed once and reused
y_true = Y.loc[Y.index > 0.8*2*even]
y_pred = clf.predict(X.loc[X.index > 0.8*2*even])
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_true, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_true, y_pred, labels=[True,False]))

In [None]:
# Keep a handle on the fitted SVC; `clf` is reassigned by the following sections
svm=clf

## Ada Boost

In [None]:
# AdaBoost trained on the same feature columns and the same index range 0 .. 0.8*2*even
X = ad[exclude]
Y = ad['Benign']
train_stop = 0.8*2*even
clf = AdaBoostClassifier(n_estimators=1000).fit(X.loc[0:train_stop,:], Y.loc[0:train_stop])

In [None]:
# Evaluate on the held-out rows (index above 0.8*2*even); predictions are computed once and reused
y_true = Y.loc[Y.index > 0.8*2*even]
y_pred = clf.predict(X.loc[X.index > 0.8*2*even])
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_true, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_true, y_pred, labels=[True,False]))

In [None]:
# Keep a handle on the fitted AdaBoost model; `clf` is reassigned by the following section
ada=clf

## K Neighbors

In [None]:
# Distance-weighted 5-nearest-neighbours trained on the same feature columns and index range 0 .. 0.8*2*even
X = ad[exclude]
Y = ad['Benign']
train_stop = 0.8*2*even
clf = KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=3).fit(X.loc[0:train_stop,:], Y.loc[0:train_stop])

In [None]:
# Evaluate on the held-out rows (index above 0.8*2*even); predictions are computed once and reused
y_true = Y.loc[Y.index > 0.8*2*even]
y_pred = clf.predict(X.loc[X.index > 0.8*2*even])
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(y_true, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_true, y_pred, labels=[True,False]))

In [None]:
# Keep a handle on the fitted k-neighbours model for the ROC comparison
knb=clf

## ROC Curves


In [None]:
# Score the held-out rows with every fitted model. The second predict_proba
# column is used where available; the SVC contributes its decision function.
X_test = X.loc[X.index > 0.8*2*even]
mlp_score = mlp.predict_proba(X_test)[:,1]
rdf_score = rdf.predict_proba(X_test)[:,1]
svm_score = svm.decision_function(X_test)
ada_score = ada.predict_proba(X_test)[:,1]
knb_score = knb.predict_proba(X_test)[:,1]

In [None]:
plt.title('Receiver Operating Characteristic')

# One ROC curve per model, drawn in this order: (scores, line colour, legend template)
y_test = Y.loc[Y.index > 0.8*2*even]
curves = [
    (mlp_score, 'b', 'AUC MLP = %0.2f'),  #Multilayer perceptron
    (rdf_score, 'r', 'AUC RDF = %0.2f'),  #Random forest
    (svm_score, 'g', 'AUC SVM = %0.2f'),  #SVM
    (ada_score, 'y', 'AUC ADA = %0.2f'),  #AdaBoost
    (knb_score, 'c', 'AUC KNB = %0.2f'),  # K-Neighbors
]
for scores, colour, legend_template in curves:
    fpr, tpr, auc_thresholds = roc_curve(y_test, scores)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, colour, label = legend_template % roc_auc)

plt.legend(loc = 'lower right')
# Dashed diagonal from (0, 0) to (1, 1) for reference
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## Grid Search
## Loss Function

In [None]:
# Replace the set with a sorted list so it can be searched with bisect.
# After this, totalDest no longer supports .add(), which the ingest loop calls.
totalDest=sorted(totalDest)

### Custom One Hot Encoding Function

In [None]:
#print(totalDest)
# Time a bisect-based one-hot encoding of a single route over the sorted address list.
from bisect import bisect_left
sample_onehot=[0]*len(totalDest)
t0=time.time()
for x in hops:
    # Unanswered hops carry no address
    if x == "***":
        continue
    pos=bisect_left(totalDest,x)
    # bisect_left only yields an insertion point; set the slot only when the
    # address is really stored there. Otherwise an unknown address would flag
    # a neighbouring slot or index past the end of the list.
    if pos < len(totalDest) and totalDest[pos] == x:
        sample_onehot[pos]=1
print(time.time()-t0)

## Chi-Square Analysis of IP Frequencies

In [None]:
# Rank the IPs of each feed by how often routes crossed them (most frequent
# first) and show the two rankings side by side
by_count = operator.itemgetter(1)
benign_ranked = sorted(frequency_counts_benign.items(), key=by_count, reverse=True)
malicious_ranked = sorted(frequency_counts_malicious.items(), key=by_count, reverse=True)
srt_benign = pd.DataFrame(benign_ranked, columns=["BenignIP","BenignFreq"])
srt_malicious = pd.DataFrame(malicious_ranked, columns=["MalIP","MalFreq"])
srt = pd.concat([srt_benign, srt_malicious], axis=1)
srt

In [None]:
# Restrict the comparison to IPs that occur in both feeds
shared_ips = [ip for ip in frequency_counts_benign if ip in frequency_counts_malicious]
freqB = [frequency_counts_benign[ip] for ip in shared_ips]
# NOTE(review): 5.4 presumably rescales malicious counts to the size of the benign set - confirm the source of this constant
freqM = [frequency_counts_malicious[ip]/5.4 for ip in shared_ips]

print(pd.Series(freqB).sum(), pd.Series(freqM).sum())
chisquare(freqM, f_exp=freqB)

In [None]:
# Number of benign vs. malicious routes collected by the ingest loop
print(len(totalBRoutes), len(totalMRoutes))

# Sequential Processing

## Network Graph of IP Addresses

In [None]:
# Stretch the notebook's .container element to the full page width.
# display and HTML are imported from the public IPython.display module; the
# IPython.core.display path is deprecated for this use.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
G = nx.Graph()

# Add every route to the graph, ignoring unanswered ("***") hops
for route in totalRoutes:
    answered = [node for node in route if node != "***"]
    G.add_nodes_from(answered)
    if not answered:
        continue

    # Chain consecutive hops from the second entry onward, tagging each edge
    # with its position in the route
    for stop, (near, far) in enumerate(zip(answered[1:], answered[2:]), start=1):
        G.add_edge(near, far, stop=stop)
    # Add edges and connect last node to the final destination (the route's first entry)
    G.add_edge(answered[-1], answered[0])

In [None]:
#Color destination nodes
#TODO: Vary destination color by benign or malicious designation

# One colour per graph node, in G.nodes() order: blue when the address is in
# totalDest, black otherwise.
# A set gives constant-time membership tests whether totalDest is still a set
# or has already been turned into a sorted list.
dest_lookup=set(totalDest)
colorCode=[]
for x in G.nodes():
    if (x in dest_lookup):
        colorCode.append('blue')
    else:
        colorCode.append('black')

# Sanity check: both numbers must match (the undefined `cnt` is no longer printed)
print(len(G.nodes()), len(colorCode))

In [None]:
# Drawing options: nodes coloured by colorCode, drawn very small with thin
# grey edges
options = dict(
    node_color=colorCode,
    node_size=1,
    edge_color='grey',
    linewidths=0,
    width=0.1,
)
plt.figure(figsize=(20,10), dpi=1000)
nx.draw(G, **options)
plt.show()

In [None]:
# Write the current figure to disk.
# NOTE(review): this runs after plt.show() in the previous cell, so the current figure may be empty by now - consider saving before show(); confirm
plt.savefig('network_graph.jpg', dpi=1000)