In [18]:
from netaddr import IPNetwork, IPAddress
import matplotlib.pyplot as plt

# Input locations.
# Three alternative path profiles from the original notebook (tagged c / m / s) are
# kept as comments; only the ACTIVE profile below is assigned. To switch, comment
# out the active block and uncomment exactly one other block.

# profile "c" (never filled in)
#CITY_BLOCKS_PATH    = ''
#CITY_LOCATIONS_PATH = ''
#DATA_PATHS          = ''

# profile "m" (/FileStore uploads) -- previously assigned and then immediately
# overwritten by the local paths, so it never took effect
#CITY_BLOCKS_PATH    = '/FileStore/tables/5ne9pwzz1484570889985/GeoLite2_City_Blocks_IPv4-82d63.csv'
#CITY_LOCATIONS_PATH = '/FileStore/tables/fcj10i6i1484571614099/GeoLite2_City_Locations_en-6f8fe.csv'
#DATA_PATHS          = ['/FileStore/tables/xqubiq301484572436491/NetworkTraffic100.csv',
#                       '/FileStore/tables/810vguox1485019521316/100000am.csv',
#                       '/FileStore/tables/810vguox1485019521316/100000pm.csv']
#DATA_PATHS          = ['/FileStore/tables/xqubiq301484572436491/NetworkTraffic100.csv']

# profile "s" (/FileStore uploads)
#CITY_BLOCKS_PATH    = '/FileStore/tables/jk3parwb1484571151117/GeoLite2_City_Blocks_IPv4-82d63.csv'
#CITY_LOCATIONS_PATH = '/FileStore/tables/1rn58fwl1484571610700/GeoLite2_City_Locations_en-6f8fe.csv'
#DATA_PATHS          = ['/FileStore/tables/tjpvf32z1484575566663/100.csv']

# ACTIVE profile: paths relative to the working directory
CITY_BLOCKS_PATH    = 'geolite/GeoLite2-City-Blocks-IPv4.csv'
CITY_LOCATIONS_PATH = 'geolite/GeoLite2-City-Locations-en.csv'
DATA_PATHS          = ['data/NetworkTraffic100.csv',
                       'data/100000am.csv',
                       'data/100000pm.csv',
                       'data/1000000am.csv',
                       'data/1000000pm.csv']

def isIpInNet(ip, net):
  """
  Tells whether an address lies inside a network.

  ip  -- the address to test, e.g. "192.168.0.1"
  net -- the network to test against, e.g. "192.168.0.0/24"

  Returns True if ip is part of net, otherwise False.
  """
  address = IPAddress(ip)
  network = IPNetwork(net)
  return address in network

def buildNetworkCountryMap():
  """
  Builds a map with data from geoIp2 (http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz)

  1st file (CITY_BLOCKS_PATH) contains the network in column 0 and the location key
  (geoname_id) in column 1; much more data is available but only these two are used.
  2nd file (CITY_LOCATIONS_PATH) contains the location key in column 0 and the
  country name in column 5.
  The two record sets are joined on the location key.

  returns an RDD as [('network', 'country'), ...]
  """
  def splitRows(path):
    # Plain comma split without CSV quoting support.
    # NOTE(review): a quoted field that itself contains a comma would shift the
    # column positions -- confirm the locations file has no such field before column 5.
    return sc.textFile(path, 2).map(lambda line: line.split(','))

  #(geoname_id, network)
  # The header is recognised by comparing its first cell for equality; the previous
  # `not in 'network'` was a substring test. Rows too short to index are skipped
  # explicitly instead of relying on '' being a substring of the header name.
  geoipdata = splitRows(CITY_BLOCKS_PATH)\
    .filter(lambda row: len(row) > 1 and row[0] != 'network')\
    .map(lambda row: (row[1], row[0]))

  #(geoname_id, country_name) / same header and length handling as above
  locationsdata = splitRows(CITY_LOCATIONS_PATH)\
    .filter(lambda row: len(row) > 5 and row[0] != 'geoname_id')\
    .map(lambda row: (row[0], row[5]))

  #join on geoname_id gives (geoname_id, (network, country_name)); keep only the value pair
  networkmap = geoipdata.join(locationsdata).values()
  return networkmap

def getCountryByIpBc(ip):
  """
  Returns the country for the given ip or None if not found.

  Only the networks sharing the ip's first octet are inspected. They are read from
  the broadcast variable reducednetworkmapbc, a dict of the form
  {first octet: [(network, country), ...]}.
  """
  firstoctet = ip.split(".")[0]

  # For speed the lookup map is keyed by the first octet of the address, so only
  # a fraction of all networks has to be scanned.
  candidates = reducednetworkmapbc.value.get(firstoctet)
  if not candidates:
    return None

  # Parse the address once rather than once per candidate network.
  address = IPAddress(ip)
  for network, country in candidates:
    if address in IPNetwork(network):
      return country
  return None

def loadData(filePath):
  """
  Reads one data file into an RDD of field lists.

  Every line is split on ','. The header row (first field equal to 'ts') and rows
  with 10 or fewer fields are dropped.

  filePath -- path passed to sc.textFile
  returns an RDD whose elements are lists of strings
  """
  rows = sc.textFile(filePath).map(lambda line: line.split(","))
  # The header is matched by equality. The previous `not in 'ts'` was a substring
  # test and therefore also dropped rows whose first field was '', 't' or 's'.
  return rows.filter(lambda fields: len(fields) > 10 and fields[0] != 'ts')
  
# Build the (network, country) RDD and mark it as persistent so it is not
# recomputed every time it is used.
networkmap = buildNetworkCountryMap()
networkmap = networkmap.persist()




In [7]:
# Group the (network, country) entries by the first octet of the network, e.g. '255',
# so that each key holds the list of all networks starting with that octet
# (IPv4 has at most 4,294,967,296 addresses).
reducednetworkmap = (
  networkmap
  .map(lambda entry: (entry[0].split(".")[0], [entry]))
  .reduceByKey(lambda left, right: left + right)
)

# Broadcast the collected dict to all nodes so it can be read from inside a map function
reducednetworkmapbc = sc.broadcast(reducednetworkmap.collectAsMap())

In [22]:
#read the firewall data: one RDD per file in DATA_PATHS, combined in a single union
# (list(...) because sc.union needs a real list, also on Python 3)
data = sc.union(list(map(loadData, DATA_PATHS)))

# column positions within a split row
# maybe create an RDD that only contains the relevant columns
sourceAddressIndex = 3
destinationAddressIndex = 4
sourcePortIndex = 5
destinationPortIndex = 6
protocolIndex = 7
inBytesIndex = 12
outBytesIndex = 14

#get list of ips and country -- notes and timings from earlier experiments

#-> we should also apply distinct on the ip here, that would make it much faster again, right?!
# yep, especially since our 4.5s only hold for take(5); with take(100) it is already 47s - so a good optimisation ;o)
#ipCountry = data.map(lambda l: l.split(',')).map(lambda line: (line[3], getCountryByIpBc(line[3]))) #47s
#ipCountry = data.map(lambda l: l.split(',')).map(lambda line: line[3]).distinct().map(lambda sa: (sa, getCountryByIpBc(sa))) # 16s
#ipCountry.take(100)

#print networkmap.count() #2'766'452
#print reducednetworkmap.count() #221 => 2'766'452 / 221 = 12'500; 12'500 / 2 = 6'250 mean average (loops) after dictionary access instead of 2'766'452 / 2 = 1'383'226 (calculation supposes a uniform distribution - which is likely not the case)

In [9]:
# requests by country: top 10 countries by number of distinct source addresses
# (distinct() runs before the country lookup, so every address is resolved and counted once)
requestsByCountry = (
  data
  .map(lambda row: row[sourceAddressIndex])
  .distinct()
  .map(lambda address: (getCountryByIpBc(address), 1))
  .filter(lambda pair: pair[0] is not None)
  .reduceByKey(lambda left, right: left + right)
  .takeOrdered(10, key=lambda pair: -pair[1])  # use takeOrdered for small result sets only
)

In [5]:
# visualize requests by country as a pie chart; the first (largest) slice is pulled out
fig = plt.figure(figsize=(10, 4))
plt.pie([r[1] for r in requestsByCountry],
        explode=[0.1 if i == 0 else 0.0 for i, r in enumerate(requestsByCountry)],
        labels=[r[0] for r in requestsByCountry],
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
# plt.show() instead of fig.show(): Figure.show() only works with a GUI backend and
# merely emits a warning on non-GUI backends such as the notebook inline backend.
plt.show()
#display(fig)

In [16]:
# VPN data definition: rows whose source or destination address starts with the
# VPN prefix (filter only, nothing is collected here)
vpnIdentifier = '192.168'
vpnData = data.filter(
  lambda row: any(row[column].startswith(vpnIdentifier)
                  for column in (sourceAddressIndex, destinationAddressIndex)))

In [20]:
# requests by protocol: number of VPN rows per protocol, top 10
# check for destination?
# print(...) with one parenthesised argument behaves the same on Python 2 and 3
print(vpnData
  .map(lambda r: (r[protocolIndex], 1))
  .reduceByKey(lambda a, b: a + b)
  .takeOrdered(10, lambda x: -x[1]))

[(u'TCP', 1103724), (u'UDP', 503069), (u'ICMP', 320472)]


In [27]:
# requests by port (53: name resolution, 3128: proxy)
# each VPN row counts once for its source port and once for its destination port; top 10
# print(...) with one parenthesised argument behaves the same on Python 2 and 3
print(vpnData
  .flatMap(lambda r: [(r[sourcePortIndex], 1), (r[destinationPortIndex], 1)])
  .reduceByKey(lambda a, b: a + b)
  .takeOrdered(10, lambda x: -x[1]))

[(u'0', 562069), (u'1521', 323750), (u'53', 247700), (u'3128', 141886), (u'8080', 132181), (u'389', 117191), (u'59531', 73327), (u'443', 63730), (u'137', 63110), (u'8612', 60450)]


In [10]:
# heavy senders: sum of out-bytes per source address, top 20, printed in MiB
# (194.9.121.8: proxy)
heavySenders = (data
  .map(lambda r: (r[sourceAddressIndex], int(r[outBytesIndex])))
  .reduceByKey(lambda a, b: a + b)
  .takeOrdered(20, lambda x: -x[1]))

# bytes -> MiB; print(...) with one parenthesised argument behaves the same on Python 2 and 3
print([(x[0], x[1] / 1024.0 / 1024.0) for x in heavySenders])

[(u'10.3.249.112', 77824.37108516693), (u'194.9.121.8', 40888.62271785736), (u'10.3.249.114', 24576.071294784546), (u'192.168.64.253', 9778.159882545471), (u'77.116.55.104', 9037.539945602417), (u'10.3.249.113', 8192.058261871338), (u'93.132.38.113', 2292.731245994568), (u'192.168.65.84', 2250.500946044922), (u'192.168.64.254', 2017.529995918274), (u'192.168.64.247', 1905.39896774292), (u'192.168.64.230', 1635.3367395401), (u'91.224.49.91', 1571.7243366241455), (u'117.248.73.51', 1537.707438468933), (u'37.117.141.93', 1485.3183965682983), (u'79.239.152.23', 1378.3107528686523), (u'213.211.44.177', 1342.7892036437988), (u'178.238.175.185', 1324.6768207550049), (u'192.168.64.186', 1275.199104309082), (u'192.168.65.97', 1230.478343963623), (u'192.168.64.154', 1225.8725833892822)]


In [11]:
# heavy receivers: sum of in-bytes per source address, top 20, printed in MiB
# NOTE(review): keyed by the source address, like the senders cell -- confirm that
# receivers are not meant to be keyed by the destination address.
heavyReceivers = (data
  .map(lambda r: (r[sourceAddressIndex], int(r[inBytesIndex])))
  .reduceByKey(lambda a, b: a + b)
  .takeOrdered(20, lambda x: -x[1]))

# bytes -> MiB; print(...) with one parenthesised argument behaves the same on Python 2 and 3
print([(x[0], x[1] / 1024.0 / 1024.0) for x in heavyReceivers])

[(u'194.9.121.8', 12431.61822795868), (u'192.168.64.155', 987.5785312652588), (u'192.168.64.247', 898.1003284454346), (u'84.168.234.14', 726.1536312103271), (u'185.36.45.240', 725.4786233901978), (u'192.168.65.34', 665.936372756958), (u'10.1.73.133', 491.6970386505127), (u'188.23.26.252', 396.550630569458), (u'31.164.61.158', 395.7440404891968), (u'192.168.65.130', 332.15995693206787), (u'31.10.145.46', 270.9416837692261), (u'79.239.152.23', 216.79542446136475), (u'212.56.82.254', 187.82502555847168), (u'192.168.64.204', 177.5008029937744), (u'85.7.152.14', 146.15049362182617), (u'192.168.65.127', 141.49701309204102), (u'192.168.64.186', 132.32152271270752), (u'192.168.65.19', 130.68046283721924), (u'10.3.229.30', 130.6262435913086), (u'192.168.65.33', 124.64183139801025)]


In [12]:
# overall top talkers: sum of in-bytes plus out-bytes per source address, top 20, printed in MiB
overallDataWhores = (data
  .map(lambda r: (r[sourceAddressIndex], int(r[inBytesIndex]) + int(r[outBytesIndex])))
  .reduceByKey(lambda a, b: a + b)
  .takeOrdered(20, lambda x: -x[1]))

# bytes -> MiB; print(...) with one parenthesised argument behaves the same on Python 2 and 3
print([(x[0], x[1] / 1024.0 / 1024.0) for x in overallDataWhores])

[(u'10.3.249.112', 77824.80150985718), (u'194.9.121.8', 53320.24094581604), (u'10.3.249.114', 24576.155138015747), (u'192.168.64.253', 9804.9662027359), (u'77.116.55.104', 9037.539945602417), (u'10.3.249.113', 8192.1283493042), (u'192.168.64.247', 2803.4992961883545), (u'93.132.38.113', 2386.5851907730103), (u'192.168.65.84', 2317.6403970718384), (u'192.168.64.254', 2030.4778728485107), (u'192.168.64.155', 1939.7226514816284), (u'91.224.49.91', 1696.2405834197998), (u'185.36.45.240', 1645.3963260650635), (u'192.168.64.230', 1642.5212593078613), (u'117.248.73.51', 1640.2734079360962), (u'37.117.141.93', 1598.1467714309692), (u'79.239.152.23', 1595.106177330017), (u'213.211.44.177', 1454.7120351791382), (u'178.238.175.185', 1420.6902751922607), (u'192.168.64.186', 1407.5206270217896)]


In [13]:
# senders and receivers: rows with traffic in both directions, shown as
# (source address, in-bytes + out-bytes); first 10 rows only
(data
  .filter(lambda row: int(row[inBytesIndex]) > 0 and int(row[outBytesIndex]) > 0)
  .map(lambda row: (row[sourceAddressIndex], int(row[inBytesIndex]) + int(row[outBytesIndex])))
  .take(10))

[(u'192.168.64.200', 158),
 (u'192.168.64.200', 129),
 (u'192.168.64.227', 450),
 (u'192.168.64.197', 3427),
 (u'192.168.65.72', 450),
 (u'192.168.64.238', 129),
 (u'192.168.65.72', 450),
 (u'10.12.79.203', 64),
 (u'192.168.65.72', 450),
 (u'192.168.65.72', 94)]