In [8]:
# setup

import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

number_cores = 4
memory_gb = 8

conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))

sc = pyspark.SparkContext(conf=conf)

In [9]:
import re # for efficient parsing

import requests, json # for ip lookup

In [10]:
# import all data

#logfiles = [ sc.textFile( str("/Users/adrian/Development/CSC-467/assignment_1/auth_log/auth.log"+(('.'+str(i)) if i > 0 else ""))) for i in range(5) ]

logfile = sc.textFile( '/Users/adrian/Development/CSC-467/assignment_1/auth_log/auth.log*' )

#dataset = logfiles[0]

logfile.cache()

logfile.take(5)


['Dec 13 00:06:16 submitty sshd[19509]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=218.92.0.184  user=root',
 'Dec 13 00:06:17 submitty sshd[19509]: Failed password for root from 218.92.0.184 port 3753 ssh2',
 'Dec 13 00:06:21 submitty sshd[19509]: Failed password for root from 218.92.0.184 port 3753 ssh2',
 'Dec 13 00:07:01 submitty CRON[19523]: pam_unix(cron:session): session opened for user submitty_daemon by (uid=0)',
 'Dec 13 00:07:02 submitty CRON[19523]: pam_unix(cron:session): session closed for user submitty_daemon']

In [11]:
def breakdown_re( entry ):
    
    ''' sample 'entry' examples:
    Dec 3 16:11:54 submitty sshd[22178]: Failed password for root from 221.181.185.36 port 38772 ssh2
    Dec 3 16:11:59 submitty sshd[22178]: message repeated 2 times: [ Failed password for root from 221.181.185.36 port 38772 ssh2]
    Dec  6 19:19:42 submitty sshd[31862]: Failed password for invalid user pi from 188.27.33.153 port 36416 ssh2
    Dec 14 20:40:44 submitty systemd: pam_unix(systemd-user:session): session opened for user lngo by (uid=0)
    
    output format ex:
    { 
        'month': 'Dec',
        'day': '7',
        'time': '00:10:01',
        'proc': '6806',
        'msg': 'pam_unix(cron:session): session opened for user submitty_daemon by (uid=0)'
    }
    '''
    extracts = re.search(r'(?P<month>\w+) +(?P<day>\d+) (?P<time>\d+:\d+:\d+) \S+ \S+\[(?P<proc>\d+)\]: ?(?P<msg>[\S ]+)', entry)
    return extracts.groups() if extracts else entry

breakdown_re( 'Dec 3 16:11:54 submitty sshd[22178]: Failed password for root from 221.181.185.36 port 38772 ssh2') 


('Dec',
 '3',
 '16:11:54',
 '22178',
 'Failed password for root from 221.181.185.36 port 38772 ssh2')

In [12]:
def extract_from_pwfail_attempt( entry ):
    
    ''' sample entry
    ('Dec',
     '3',
     '16:11:54',
     '22178',
     'Failed password for root from 221.181.185.36 port 38772 ssh2')
    '''
    
    # '... Failed ...'
    extract = re.search(r'(?:for)?(?P<user>\w+|) from (?P<ip>\d+.\d+.\d+.\d+) port (?P<port>\d+)', entry[4])
    
    if extract:
        return extract.groups(), entry[:4]
    
    # '... Invalid ...'
    extract = re.search(r'(?P<user>\S+)? from (?P<ip>\d+.\d+.\d+.\d+) (?:port )?(?P<port>\d+)?', entry[4])
    
    if extract:
        return extract.groups(), entry[:4]
    
    return ()

In [13]:
pruned_data = logfile.map(breakdown_re).map( extract_from_pwfail_attempt ).filter( lambda entry: entry)

pruned_data.take(5)

[(('root', '218.92.0.184', '3753'), ('Dec', '13', '00:06:17', '19509')),
 (('root', '218.92.0.184', '3753'), ('Dec', '13', '00:06:21', '19509')),
 (('shalini', '27.128.173.81', '49454'), ('Dec', '13', '00:08:29', '19552')),
 (('shalini', '27.128.173.81', '49454'), ('Dec', '13', '00:08:32', '19552')),
 (('user5', '134.175.17.32', '44532'), ('Dec', '13', '00:08:40', '19560'))]

In [14]:
root_fails = pruned_data.filter(lambda entry: entry[0][0] == 'root')
non_root_fails = pruned_data.filter(lambda entry: entry[0][0] != 'root')
    
print('failed root login attempts:    ',root_fails.count())
print('failed non-root login attempts:', non_root_fails.count())    


failed root login attempts:     38287
failed non-root login attempts: 47582


In [15]:
def online_location_lookup( entry ):
    
    request_url = 'https://geolocation-db.com/jsonp/' + entry[0][1]
    
    try:
        response = requests.get(request_url)
        result = response.content.decode()
        # Clean the returned string so it just contains the dictionary data for the IP address
        result = result.split("(")[1].strip(')')
        # Convert this data into a dictionary
        return json.loads(result)['country_name']
    except:
        return "err: ("+str(entry)+")"


## Analysis Questions
1. How many failed attempts to access the server as root are there? List all countries from which these attempts where carried out.
2. How many failed attempts to access the server as non-root users are there? What are the attempted usernames? List all countries from which these attempts where carried out. 
3. Through out the durations of the log files, which date has the highest number of attack attempts? Anything interesting ab?out that particular date?

## Analysis Questions 1 and 2 explanations

- For the root-access fails and non-root access fails, I filtered my data by the username `root`
- I then proceeded to take a sample of 100 (although larger sample sizes may be used) to use in the lookup process. This reduces the number of lookups that I have to perform, while still being representative of the data.

- I found the online lookup to be more robust than using the poorly formatted ip2loc dataset: ipv4 addresses without '.'s is effectively unusable

NOTE: In order to get my instance of jupyter and pyspark running I had to make some additional fixes. Some of the notebook may not work as it does on my machine, so I've included the outputs of each code block execution as reference for what it should look like and what data I used.

In [16]:
# root accesses 

root_fails = pruned_data.filter(lambda entry: entry[0][0] == 'root')

hist = {}
root_fail_sample = root_fails.takeSample(False, 100)

print('sample complete')

for entry in root_fail_sample:
    resolve = online_location_lookup(entry)
    print('*',end='')
    if resolve in hist:
        hist[resolve] += 1
    else:
        hist[resolve] = 1
        
        
print('\nCountries:')        
for k,v in hist.items():
    print("{0}: {1}%".format(k, v))

sample complete
****************************************************************************************************
Countries:
China: 77%
Russia: 1%
Malaysia: 1%
Ireland: 1%
France: 1%
United States: 8%
Hong Kong: 1%
United Kingdom: 1%
Cyprus: 1%
Brazil: 2%
Italy: 1%
Canada: 1%
South Korea: 1%
Hungary: 1%
Germany: 1%
South Africa: 1%


`failed root login attempts:     38287
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%`

In [17]:
# non-root accesses 

non_root_fails = pruned_data.filter(lambda entry: entry[0][0] != 'root')

print('failed root login attempts:    ',non_root_fails.count())

hist_nr = {}
non_root_fail_sample = non_root_fails.takeSample(False, 100)
for entry in root_fail_sample:
    resolve = online_location_lookup(entry)
    if resolve in hist_nr:
        hist_nr[resolve] += 1
    else:
        hist_nr[resolve] = 1
        
for k,v in hist_nr.items():
    print("{0}: {1}%".format(k, v))

failed root login attempts:     47582
China: 77%
Russia: 1%
Malaysia: 1%
Ireland: 1%
France: 1%
United States: 8%
Hong Kong: 1%
United Kingdom: 1%
Cyprus: 1%
Brazil: 2%
Italy: 1%
Canada: 1%
South Korea: 1%
Hungary: 1%
Germany: 1%
South Africa: 1%


`failed root login attempts:     47582
Vietnam: 3%
China: 69%
United States: 5%
Brazil: 2%
Paraguay: 1%
Russia: 1%
Hong Kong: 1%
Venezuela: 1%
France: 3%
Thailand: 1%
Senegal: 1%
Germany: 3%
Argentina: 1%
Singapore: 3%
India: 2%
Taiwan: 1%
Netherlands: 1%
Spain: 1%`

## Analysis Question 3 Explanation

- To determine which date had the most access attempts, I took only the month and day values from each entry in the dataset and simply counted the number of access fail entries existed for each date. I found that December 8th had the most failed access attempts.

In [18]:
# Most attacked date

def dateonly(entry):
    return entry[1][0], entry[1][1]

date_hist = {'Dec':{},'Nov':{}}

datesonly = pruned_data.map( dateonly )

highest = 0, (None, None)

for m in ['Nov','Dec']:
    for d in [ str(i) for i in (range(1,17) if m == 'Dec' else range(15,31)) ]:
        
        n = datesonly.filter(lambda entry: entry[0] == m and entry[1] == d).count()
        
        date_hist[m][d] = n
        
        if highest[0] < n:
            highest = n, (m,d)
        
        print(m,d,':',n)


Nov 15 : 1809
Nov 16 : 2233
Nov 17 : 2458
Nov 18 : 1974
Nov 19 : 2228
Nov 20 : 2375
Nov 21 : 2275
Nov 22 : 2847
Nov 23 : 2601
Nov 24 : 3079
Nov 25 : 2378
Nov 26 : 2268
Nov 27 : 3190
Nov 28 : 2730
Nov 29 : 2857
Nov 30 : 2027
Dec 1 : 2268
Dec 2 : 2516
Dec 3 : 2804
Dec 4 : 2656
Dec 5 : 2586
Dec 6 : 3644
Dec 7 : 3240
Dec 8 : 4149
Dec 9 : 2845
Dec 10 : 2547
Dec 11 : 3254
Dec 12 : 2926
Dec 13 : 3468
Dec 14 : 2199
Dec 15 : 2995
Dec 16 : 2443


In [19]:
print('highest attack day: [',highest[1][0], highest[1][1],'] with', highest[0],'attempts')

highest attack day: [ Dec 8 ] with 4149 attempts


`highest attack day: [ Dec 8 ] with 4149 attempts`

On December 8th 2021: 
    President Biden met with Russian President Vladimir Putin in a secure video call on Tuesday and warned him that the United States and its allies would impose harsh sanctions if Russia invades Ukraine.