In [1]:
# setup

import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

number_cores = 4
memory_gb = 8

conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))

sc = pyspark.SparkContext(conf=conf)

In [2]:
import re # for efficient parsing

import requests, json # for ip lookup

In [3]:
# import all data

#logfiles = [ sc.textFile( str("/Users/adrian/Development/CSC-467/assignment_1/auth_log/auth.log"+(('.'+str(i)) if i > 0 else ""))) for i in range(5) ]

logfile = sc.textFile( '/Users/adrian/Development/CSC-467/assignment_1/auth_log/auth.log*' )

#dataset = logfiles[0]

logfile.cache()

logfile.take(5)


['Dec 13 00:06:16 submitty sshd[19509]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=218.92.0.184  user=root',
 'Dec 13 00:06:17 submitty sshd[19509]: Failed password for root from 218.92.0.184 port 3753 ssh2',
 'Dec 13 00:06:21 submitty sshd[19509]: Failed password for root from 218.92.0.184 port 3753 ssh2',
 'Dec 13 00:07:01 submitty CRON[19523]: pam_unix(cron:session): session opened for user submitty_daemon by (uid=0)',
 'Dec 13 00:07:02 submitty CRON[19523]: pam_unix(cron:session): session closed for user submitty_daemon']

In [4]:
def breakdown_re( entry ):
    
    ''' sample 'entry' examples:
    Dec 3 16:11:54 submitty sshd[22178]: Failed password for root from 221.181.185.36 port 38772 ssh2
    Dec 3 16:11:59 submitty sshd[22178]: message repeated 2 times: [ Failed password for root from 221.181.185.36 port 38772 ssh2]
    Dec  6 19:19:42 submitty sshd[31862]: Failed password for invalid user pi from 188.27.33.153 port 36416 ssh2
    Dec 14 20:40:44 submitty systemd: pam_unix(systemd-user:session): session opened for user lngo by (uid=0)
    
    output format ex:
    { 
        'month': 'Dec',
        'day': '7',
        'time': '00:10:01',
        'proc': '6806',
        'msg': 'pam_unix(cron:session): session opened for user submitty_daemon by (uid=0)'
    }
    '''
    extracts = re.search(r'(?P<month>\w+) +(?P<day>\d+) (?P<time>\d+:\d+:\d+) \S+ \S+\[(?P<proc>\d+)\]: ?(?P<msg>[\S ]+)', entry)
    return extracts.groups() if extracts else entry

breakdown_re( 'Dec 3 16:11:54 submitty sshd[22178]: Failed password for root from 221.181.185.36 port 38772 ssh2') 


('Dec',
 '3',
 '16:11:54',
 '22178',
 'Failed password for root from 221.181.185.36 port 38772 ssh2')

In [31]:
def extract_from_pwfail_attempt( entry ):
    
    ''' sample entry
    ('Dec',
     '3',
     '16:11:54',
     '22178',
     'Failed password for root from 221.181.185.36 port 38772 ssh2')
    '''
    
    # '... Failed ...'
    extract = re.search(r'(?:for)?(?P<user>\w+|) from (?P<ip>\d+.\d+.\d+.\d+) port (?P<port>\d+)', entry[4])
    
    if extract:
        return extract.groups(), entry[:4]
    
    # '... Invalid ...'
    extract = re.search(r'(?P<user>\S+)? from (?P<ip>\d+.\d+.\d+.\d+) (?:port )?(?P<port>\d+)?', entry[4])
    
    if extract:
        return extract.groups(), entry[:4]
    
    return ()

In [32]:
pruned_data = logfile.map(breakdown_re).map( extract_from_pwfail_attempt ).filter( lambda entry: entry)

pruned_data.take(5)

[(('root', '218.92.0.184', '3753'), ('Dec', '13', '00:06:17', '19509')),
 (('root', '218.92.0.184', '3753'), ('Dec', '13', '00:06:21', '19509')),
 (('shalini', '27.128.173.81', '49454'), ('Dec', '13', '00:08:29', '19552')),
 (('shalini', '27.128.173.81', '49454'), ('Dec', '13', '00:08:32', '19552')),
 (('user5', '134.175.17.32', '44532'), ('Dec', '13', '00:08:40', '19560'))]

In [13]:
root_fails = pruned_data.filter(lambda entry: entry[0][0] == 'root')
non_root_fails = pruned_data.filter(lambda entry: entry[0][0] != 'root')
    
print('failed root login attempts:    ',root_fails.count())
print('failed non-root login attempts:', non_root_fails.count())    


failed root login attempts:     38287
failed non-root login attempts: 47582


In [10]:
keyd = logfile.filter(lambda entry: entry and entry[0][0] != 'root')

In [15]:
def online_location_lookup( entry ):
    
    request_url = 'https://geolocation-db.com/jsonp/' + entry[0][1]
    
    try:
        response = requests.get(request_url)
        result = response.content.decode()
        # Clean the returned string so it just contains the dictionary data for the IP address
        result = result.split("(")[0][1].strip(")")
        # Convert this data into a dictionary
        return json.loads(result)['country_name']
    except:
        return "err: ("+entry+")"
    

In [12]:
sample = pruned_data.takeSample(False, 100)
sample[:10]

[('root', '218.92.0.165', '8679'),
 ('root', '112.85.42.81', '13116'),
 ('disconnect', '49.234.62.183', '46408'),
 ('string', '164.68.112.178', '44449'),
 ('subida', '60.254.104.109', '54458'),
 ('root', '164.163.23.19', '57664'),
 ('root', '222.187.238.93', '46269'),
 ('nll', '157.245.100.56', '32988'),
 ('informix', '179.131.10.103', '8096'),
 ('root', '218.92.0.223', '28871')]

## Analysis Questions
- How many failed attempts to access the server as root are there? List all countries from which these attempts where carried out.
- How many failed attempts to access the server as non-root users are there? What are the attempted usernames? List all countries from which these attempts where carried out. 
- Through out the durations of the log files, which date has the highest number of attack attempts? Anything interesting ab?out that particular date?

In [21]:
# root accesses 
print('failed root login attempts:    ',root_fails.count())

hist = {}
root_fail_sample = root_fails.takeSample(False, 100)
for entry in root_fail_sample:
    resolve = online_location_lookup(entry)
    if resolve in hist:
        hist[resolve] += 1
    else:
        hist[resolve] = 1
        
for k,v in hist.items():
    print("{0}: {1}%".format(k, v))

failed root login attempts:     38287
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%


failed root login attempts:     38287
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%

In [29]:
# non-root accesses 
print('failed root login attempts:    ',non_root_fails.count())

hist_nr = {}
non_root_fail_sample = non_root_fails.takeSample(False, 100)
for entry in root_fail_sample:
    resolve = online_location_lookup(entry)
    if resolve in hist_nr:
        hist_nr[resolve] += 1
    else:
        hist_nr[resolve] = 1
        
for k,v in hist_nr.items():
    print("{0}: {1}%".format(k, v))

failed root login attempts:     47582
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%


failed root login attempts:     47582
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%

In [83]:
# Most attacked date

def dateonly(entry):
    return entry[1][0], entry[1][1]

date_hist = {'Dec':{},'Nov':{}}

datesonly = pruned_data.map( dateonly )

for m,d in datesonly.takeSample(False, 85000):
    if m in date_hist and d in date_hist[m]:
        date_hist[m][d] += 1
    else:
        date_hist[m][d] = 1

datesonly.count()

85869

In [85]:
for k,v in date_hist.items():
    for d,n in v.items():
        print(k, d,':', n)

Dec 3 : 2773
Dec 8 : 4107
Dec 4 : 2638
Dec 14 : 2181
Dec 10 : 2524
Dec 12 : 2905
Dec 6 : 3603
Dec 13 : 3427
Dec 15 : 2964
Dec 2 : 2493
Dec 7 : 3204
Dec 1 : 2248
Dec 11 : 3218
Dec 5 : 2558
Dec 9 : 2822
Dec 16 : 2421
Nov 16 : 2212
Nov 29 : 2818
Nov 30 : 2000
Nov 27 : 3168
Nov 17 : 2427
Nov 20 : 2349
Nov 15 : 1793
Nov 24 : 3053
Nov 18 : 1951
Nov 28 : 2708
Nov 25 : 2349
Nov 22 : 2812
Nov 19 : 2207
Nov 21 : 2251
Nov 23 : 2575
Nov 26 : 2241
