In [2]:
import re
from collections import Counter, defaultdict

import requests

In [4]:
def get_data(name='access.log.txt'):
    """Parse an access log into structured records and unmatched lines.

    Every line is matched against a pattern assembled from the per-field
    fragments below. The quoted user-agent is split into three pieces: its
    first whitespace-free token (discarded), the first parenthesised segment
    (captured as ``OS``) and whatever follows it (captured as ``agent``).

    Args:
        name: Path of the log file to read.

    Returns:
        A ``(log_data, anomalies)`` tuple. ``log_data`` is a list of dicts
        with the string-valued keys ``host``, ``user``, ``time``,
        ``request``, ``status``, ``size``, ``referrer``, ``OS`` and
        ``agent``. ``anomalies`` is a list of the lines that did not match,
        with trailing whitespace stripped.

    Raises:
        OSError: If the file cannot be opened.
    """
    parts = [
        r'(?P<host>\S+)',                   # host %h
        r'\S+',                             # ident %l (unused)
        r'(?P<user>\S+)',                   # user %u
        r'\[(?P<time>.+)\]',                # time %t
        r'"(?P<request>.*)"',               # request "%r"
        r'(?P<status>[0-9]+)',              # status %>s
        r'(?P<size>\S+)',                   # size %b (careful, can be '-')
        r'"(?P<referrer>.*)"',              # referrer "%{Referer}i"
        r'"\S+',                            # mozilla
        r'\((?P<OS>.*?)\)',                 # OS
        r'(?P<agent>.*)"',                  # user agent "%{User-agent}i"
    ]
    # Fields are separated by whitespace; trailing whitespace (the newline)
    # is tolerated before the end of the line.
    pattern = re.compile(r'\s+'.join(parts)+r'\s*\Z')

    log_data = []
    anomalies = []
    # The context manager guarantees the handle is closed, even if parsing
    # raises part-way through the file.
    with open(name, 'r') as log_file:
        for line in log_file:
            m = pattern.match(line)
            if m:
                log_data.append(m.groupdict())
            else:
                anomalies.append(line.rstrip())

    return log_data, anomalies

In [5]:
# Parse the default log file ('access.log.txt'): matched records go to
# log_data, lines the pattern rejected go to anomalies.
log_data, anomalies = get_data()

In [6]:
# Number of matched records vs. number of unmatched lines.
len(log_data), len(anomalies)

(50622, 29)

In [7]:
def handle_anomalies(anomalies):
    """Parse log lines with a pattern that keeps the user-agent in one piece.

    Unlike a pattern that splits a parenthesised segment out of the
    user-agent, this one captures the whole quoted string as ``agent``.

    Args:
        anomalies: Iterable of log lines (strings) to parse.

    Returns:
        A list of dicts with the string-valued keys ``host``, ``user``,
        ``time``, ``request``, ``status``, ``size``, ``referrer`` and
        ``agent``, one per matching line and in input order. Lines that do
        not match are left out.
    """
    fields = [
        r'(?P<host>\S+)',  # %h: remote host
        r'\S+',  # %l: ident, not captured
        r'(?P<user>\S+)',  # %u: user
        r'\[(?P<time>.+)\]',  # %t: bracketed timestamp
        r'"(?P<request>.*)"',  # "%r": quoted request line
        r'(?P<status>[0-9]+)',  # %>s: status code
        r'(?P<size>\S+)',  # %b: response size, may be '-'
        r'"(?P<referrer>.*)"',  # "%{Referer}i": quoted referrer
        r'"(?P<agent>.*)"',  # "%{User-agent}i": whole quoted user agent
    ]
    line_re = re.compile(r'\s+'.join(fields) + r'\s*\Z')

    # Match lazily, then keep only the lines that produced a match object.
    attempts = (line_re.match(raw_line) for raw_line in anomalies)
    return [hit.groupdict() for hit in attempts if hit]

# Second pass over the unmatched lines with the looser pattern; the length
# shows how many of them could be parsed this way.
handled_anomalies = handle_anomalies(anomalies)
len(handled_anomalies)

29

In [8]:
def unique(log_data, anomalies):
    """Keep the first record per host, scanning log_data before anomalies.

    Args:
        log_data: Iterable of dicts, each with a ``'host'`` key.
        anomalies: Iterable of dicts, each with a ``'host'`` key.

    Returns:
        A ``(unique_data, unique_bad, set_of_hosts)`` tuple:
        ``unique_data`` is the first ``log_data`` record for each host;
        ``unique_bad`` is the first ``anomalies`` record for each host that
        appeared in neither ``log_data`` nor an earlier anomaly;
        ``set_of_hosts`` is the set of every host seen in either input.
    """
    seen_hosts = set()

    def first_per_host(records):
        # Shares seen_hosts across calls so the second pass skips hosts that
        # the first pass already recorded.
        kept = []
        for record in records:
            host = record['host']
            if host in seen_hosts:
                continue
            kept.append(record)
            seen_hosts.add(host)
        return kept

    unique_data = first_per_host(log_data)
    unique_bad = first_per_host(anomalies)
    return unique_data, unique_bad, seen_hosts

# unique_host: first parsed record per host.
# bad_requests: first re-parsed anomaly per host that never appears in log_data.
# count_unique: despite the name, this is the set of all distinct hosts, so
# its length is the number of unique hosts.
unique_host, bad_requests, count_unique = unique(log_data, handled_anomalies)
print("Number of unique ip: ", len(count_unique), "\n")
print("Unique IP", count_unique, "\n")
print("Bad requests: ", bad_requests, "\n")

Number of unique ip:  35 

Unique IP {'72.21.217.130', '119.205.214.218', '77.47.170.44', '109.207.199.67', '216.165.248.218', '188.163.67.155', '66.102.9.56', '188.163.67.14', '54.198.165.221', '5.101.40.8', '60.191.38.77', '66.118.142.167', '46.219.213.75', '88.135.251.193', '211.234.106.243', '194.183.170.131', '::1', '77.122.225.17', '218.53.106.7', '203.158.202.26', '193.201.105.15', '72.21.217.72', '176.116.89.6', '37.249.7.208', '188.163.8.74', '188.163.67.153', '188.163.67.102', '172.104.108.109', '195.12.59.18', '66.102.9.54', '188.163.67.104', '5.188.203.125', '51.218.218.149', '89.248.172.76', '107.170.255.200'} 

Bad requests:  [{'host': '172.104.108.109', 'user': '-', 'time': '22/Feb/2018:10:52:12 +0000', 'request': 'GET / HTTP/1.1', 'status': '200', 'size': '1588', 'referrer': '-', 'agent': 'Mozilla/5.0'}, {'host': '5.101.40.8', 'user': '-', 'time': '22/Feb/2018:11:14:25 +0000', 'request': '\\x03', 'status': '400', 'size': '0', 'referrer': '-', 'agent': '-'}, {'host': '54

In [9]:
# Frequency table per parsed field: counter[field][value] is the number of
# records in log_data whose `field` equals `value`.
counter = defaultdict(Counter)
for line in log_data:
    for key, value in line.items():
        counter[key][value] += 1

# Distribution of the parenthesised platform segment of the user agent.
print(counter['OS'])

Counter({'Windows NT 6.1': 28081, 'Windows NT 10.0; Win64; x64': 19895, 'Windows NT 6.1; Win64; x64': 1646, 'Windows NT 6.1; WOW64': 438, 'Windows NT 6.1; WOW64; Trident/7.0; rv:11.0': 236, 'Ubuntu': 144, 'iPhone; CPU iPhone OS 11_2_5 like Mac OS X': 79, 'X11; Linux x86_64': 73, 'Macintosh; Intel Mac OS X 10_12_3': 27, 'Windows NT 5.1; rv:9.0.1': 2, 'Macintosh; Intel Mac OS X 10.11; rv:47.0': 1})


In [10]:
# Distribution of the 'agent' field, i.e. the part of the user-agent string
# that follows the parenthesised 'OS' segment in get_data's pattern.
print(counter['agent'])

Counter({'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36': 49491, 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36': 562, 'like Gecko': 236, '(internal dummy connection)': 144, 'AppleWebKit/604.1.34 (KHTML, like Gecko) CriOS/64.0.3282.112 Mobile/15D60 Safari/604.1': 79, 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36': 71, 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36': 27, 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36': 7, 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 Google Favicon': 2, 'Gecko/20100101 Firefox/9.0.1': 2, 'Gecko/20100101 Firefox/47.0': 1})


In [11]:
# Requests per hour of day, aggregated from the per-timestamp counts.
hours = Counter()
for time_, count in counter['time'].items():
    # Timestamps look like "22/Feb/2018:10:52:12 +0000"; the "dd/Mon/yyyy:"
    # prefix is 12 characters long, so [12:14] is the two-digit hour.
    hour = time_[12:14]
    hours[hour] += count

print(hours)

Counter({'13': 12067, '09': 11009, '14': 8525, '20': 7172, '12': 6036, '21': 2330, '10': 988, '08': 918, '15': 455, '17': 269, '16': 231, '18': 166, '07': 131, '11': 97, '23': 93, '22': 79, '06': 18, '19': 16, '05': 9, '02': 7, '00': 4, '03': 2})


In [20]:
# Requests per country: look up each distinct host once and weight the
# result by that host's request count.
countries = Counter()
for host, count in counter['host'].items():
    if host == '::1':
        # IPv6 loopback (requests from the server itself) cannot be
        # geolocated, so skip the lookup.
        continue
    try:
        # The timeout keeps one unresponsive lookup from hanging the cell.
        response = requests.get("https://ipinfo.io/" + host + "/country", timeout=10)
        # Reject HTTP errors (rate limiting, unknown address, ...) so the
        # error body is never tallied as if it were a country code.
        response.raise_for_status()
    except requests.RequestException as error:
        print("Country lookup failed for", host, "-", error)
        continue
    countries[response.text.strip()] += count
print(countries)

Counter({'UA': 50430, 'KR': 18, 'TH': 9, 'PL': 8, 'FR': 7, 'US': 4, 'CN': 1, 'RU': 1})
