In [1]:
from netaddr import IPNetwork, IPAddress

def isIpInNet(ip, net):
  """
  Tells whether an address lies inside a network.

  ip  -- address in a form accepted by netaddr.IPAddress, e.g. "192.168.0.1"
  net -- network in a form accepted by netaddr.IPNetwork, e.g. "192.168.0.0/24"

  Returns True if ip belongs to net, False otherwise.
  e.g. isIpInNet("192.168.0.1", "192.168.0.0/24") -> True
  """
  address = IPAddress(ip)
  network = IPNetwork(net)
  #the membership test already yields a bool, so it is returned directly
  return address in network

def buildNetworkCountryMap(blocksPath='/FileStore/tables/jk3parwb1484571151117/GeoLite2_City_Blocks_IPv4-82d63.csv',
                           locationsPath='/FileStore/tables/1rn58fwl1484571610700/GeoLite2_City_Locations_en-6f8fe.csv',
                           minPartitions=2):
  """
  Builds a map with data from geoIp2 (http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz)
  1st file contains networks and locationkey (much more data available but we consider only this)
  2nd file contains locationkey and country name.
  The method joins these two data sets on the locationkey (geoname_id).

  blocksPath    -- path of the blocks CSV (column 0 = network, column 1 = geoname_id)
  locationsPath -- path of the locations CSV (column 0 = geoname_id, column 5 = country_name)
  minPartitions -- minimum number of partitions passed to sc.textFile for both files

  returns an RDD as [('network', 'country'), ...]

  NOTE: lines are split with a plain split(','), which does not honour CSV quoting;
  a quoted field that contains a comma would shift or truncate the columns after it.
  """
  geoipfile = sc.textFile(blocksPath, minPartitions)

  #(geoname_id, network)
  #The header is dropped by exact comparison of the first field; the length guard
  #skips blank/short lines that would otherwise fail on the index access below.
  geoipdata = (geoipfile
               .map(lambda l: l.split(','))
               .filter(lambda f: len(f) > 1 and f[0] != 'network')
               .map(lambda f: (f[1], f[0])))

  locationsfile = sc.textFile(locationsPath, minPartitions)

  #(geoname_id, country_name)
  #Same header/short-line handling as above; column 5 must exist to be read.
  locationsdata = (locationsfile
                   .map(lambda l: l.split(','))
                   .filter(lambda f: len(f) > 5 and f[0] != 'geoname_id')
                   .map(lambda f: (f[0], f[5])))

  #join on geoname_id -> (geoname_id, (network, country_name))
  joineddata = geoipdata.join(locationsdata)

  #(network, country_name)
  networkmap = joineddata.map(lambda d: (d[1][0], d[1][1]))
  return networkmap

def getCountryByIpBc(ip):
  """
  Returns the country for the given ip or None if not found.

  ip -- dotted IPv4 address string, e.g. "192.168.0.1"

  Reads the broadcast variable reducednetworkmapbc, a dict that maps the first
  octet of a network (as a string) to a list of (network, country) tuples.
  """
  first = ip.split(".")[0]
  #For speed, only the networks filed under the ip's first octet are scanned.
  #.get() with an empty default makes an octet without any known network a
  #"not found" (None) instead of a KeyError, as the docstring promises.
  for network in reducednetworkmapbc.value.get(first, ()):
    if isIpInNet(ip, network[0]):
      return network[1]
  return None
  
#Build the (network, country) RDD and mark it as persisted so that later
#actions on it can reuse the joined data instead of recomputing the join.
networkmap = buildNetworkCountryMap().persist()




In [2]:
#We build a map keyed by the first part of the network address, e.g. "255",
#whose value is the list of all (network, country) tuples starting with it.
#groupByKey collects the tuples per key directly; concatenating one-element
#lists in reduceByKey would re-copy the growing list on every merge.
#NOTE: a network is filed only under its own first octet, so a prefix shorter
#than /8 would not be found for ips whose first octet differs.
reducednetworkmap = networkmap.map(lambda e: (e[0].split(".")[0], e)).groupByKey().mapValues(list)

#We need to broadcast this to all nodes to be able to access this map from inside a map function
reducednetworkmapbc = sc.broadcast(reducednetworkmap.collectAsMap())

In [3]:
#read the firewall data
#Blank lines are skipped and the header is dropped by its prefix (assumed to
#start with the 'ts' column name - TODO confirm). Testing only the first
#character would also drop any line starting with 't' or 's' and would fail
#on an empty line.
sourcefile = sc.textFile('/FileStore/tables/tjpvf32z1484575566663/100.csv').filter(lambda l: len(l) > 0 and not l.startswith('ts'))
#get list of ips and country

#TODO: we should probably apply distinct() on the ips before the lookup; that would make this considerably faster
#column 3 of each record is passed to the lookup as the ip
ipCountry = sourcefile.map(lambda l: l.split(',')).map(lambda line: (line[3], getCountryByIpBc(line[3])))
ipCountry.take(5)