# Analysis of shebanq log files

We read the apache2 log files, filter out irrelevant events, analyse remaining events, and produce usable statistics.

First we tell the script where to look for the data.

In [8]:
import os, sys, collections, re, csv, gzip, time, pickle
from subprocess import check_output, CalledProcessError
from glob import glob
from datetime import datetime
from ipaddress import ip_address

# Settings passed to gzip.open(compresslevel=...) and pickle.dump(protocol=...)
# whenever a compiled table is written to disk.
GZIP_LEVEL = 2
PICKLE_PROTOCOL = 3

# Minimum number of retained requests an ip needs in order to be kept as a visitor.
VLIMIT = 6

# Date stamp that selects the log directory (logs-<DATE>).
DATE = '2015-06-09'

base_dir = '/Users/dirk/SURFdrive/current/projects/etcbc/shebanq/work/managementinfo'

# Input: the apache logs of the selected date.
log_dir = '/'.join((base_dir, 'logs-' + DATE))
access_in_file = '/'.join((log_dir, 'access.log'))

# Compiled (gzipped, pickled) tables derived from the logs.
access_file = '/'.join((base_dir, 'access_table.pickle'))
ip_file = '/'.join((base_dir, 'ip.pickle'))
hosts_file = '/'.join((base_dir, 'hosts.pickle'))

# Ip location database, its compiled form, and auxiliary text files.
geo_in_file = '/'.join((base_dir, 'dbip-city.csv'))
geo_out_file = '/'.join((base_dir, 'ip_loc.pickle'))
countryfile = '/'.join((base_dir, 'countrycodes.txt'))
ip_out_file = '/'.join((base_dir, 'ip_geo_sorted.txt'))

# File name stems of the logs to read; log_files is filled in later by globbing.
log_pats = ('access', 'ssl-access')
log_files = []

class Timestamp(object):
    """Stopwatch that writes progress messages to stderr.

    Messages can be prefixed with the time elapsed since the object was
    created or since the last call to reset().
    """

    def __init__(self):
        self.timestamp = time.time()

    def reset(self):
        """Restart the stopwatch at the current time."""
        self.timestamp = time.time()

    def msg(self, m, newline=True, withtime=True):
        """Write message m to stderr and flush.

        withtime: prefix the message with the elapsed time, right-aligned in 7 columns.
        newline: terminate the message with a newline.
        """
        pieces = []
        if withtime:
            pieces.append("{:>7} ".format(self._elapsed()))
        pieces.append(m)
        if newline:
            pieces.append("\n")
        sys.stderr.write(''.join(pieces))
        sys.stderr.flush()

    def _elapsed(self):
        """Return the elapsed time as text: 1.23s, 45s, 3m 07s or 1h 02m 03s."""
        seconds = time.time() - self.timestamp
        if seconds < 10:
            # short intervals keep two decimals
            return "{: 2.2f}s".format(seconds)
        (minutes, secs) = divmod(int(round(seconds)), 60)
        if minutes == 0:
            return "{:>2d}s".format(secs)
        (hours, mins) = divmod(minutes, 60)
        if hours == 0:
            return "{:>2d}m {:>02d}s".format(mins, secs)
        return "{:>2d}h {:>02d}m {:>02d}s".format(hours, mins, secs)


# The single stopwatch shared by all processing steps.
tm = Timestamp()

## Compiling additional data

We load databases with country codes and ip location information.
After processing the big ip location database, we write a compiled, pickled version to disk.
We only compile if the database is newer than the compiled version.

In [10]:
def get_ip_location_data():
    """Load the country code table and the ip location table.

    The location table is taken from the compiled pickle (geo_out_file) when
    that file exists and is at least as new as the csv database (geo_in_file);
    otherwise it is compiled from the csv and written to geo_out_file.

    Returns:
        (locations, countrycodes) where locations is a sorted list of
        (start, end, country, region, city) tuples with start and end as
        integers (ipv4 ranges only), and countrycodes maps code to country.
    """
    tm.reset()

    def get_countrycodes():
        """Read the ``code=country`` lines of countryfile into a dict."""
        tm.reset()
        countrycodes = {}
        with open(countryfile) as cf:
            for line in cf:
                (code, country) = line.rstrip('\n').split('=', 1)
                countrycodes[code] = country
        tm.msg('{} countries'.format(len(countrycodes)))
        return countrycodes

    def ipcompile():
        """Compile the csv database into a sorted list and pickle it to disk."""
        tm.msg('Start compiling')
        progress = '{:>8} ranges ({:>5} ipv6; {:>4} errors)'
        chunk = 100000   # report progress after this many accepted ranges
        ln = 0           # rows read so far
        j = 0            # accepted ranges since the last progress message
        v6 = 0           # skipped ipv6 ranges
        errors = []      # row numbers whose addresses could not be parsed
        locations = []

        with open(geo_in_file, newline='') as csvfile:
            georeader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for (ip_start, ip_end, country, region, city) in georeader:
                ln += 1
                try:
                    start_addr = ip_address(ip_start)
                    end_addr = ip_address(ip_end)
                except ValueError:
                    # ip_address() signals a malformed address with ValueError;
                    # anything else is a real problem and must not be hidden.
                    errors.append(ln)
                    continue
                if start_addr.version == 6:
                    v6 += 1
                    continue
                locations.append((int(start_addr), int(end_addr), country, region, city))
                j += 1
                if j == chunk:
                    tm.msg(progress.format(ln, v6, len(errors)))
                    j = 0
        tm.msg(progress.format(ln, v6, len(errors)))
        tm.msg('Sorting ip address ranges')
        locations.sort()
        tm.msg('Writing to disk')
        with gzip.open(geo_out_file, "wb", compresslevel=GZIP_LEVEL) as f:
            pickle.dump(locations, f, protocol=PICKLE_PROTOCOL)
        tm.msg('End compiling')

        # Diagnostic only: count ranges that start at or before the end of the
        # range sorted directly before them.
        tm.msg('Checking overlaps')
        overlaps = []
        cure = -1
        for x in locations:
            (b, e) = (x[0], x[1])
            if b <= cure:
                overlaps.append(x)
            cure = e
        tm.msg('{} overlaps in {} ranges'.format(len(overlaps), len(locations)))
        return locations

    def ipload():
        """Load the previously compiled location table."""
        tm.msg('Start loading')
        # NOTE: pickle is only safe here because geo_out_file is written by ipcompile().
        with gzip.open(geo_out_file, "rb") as f:
            locations = pickle.load(f)
        tm.msg('End loading')
        return locations

    countrycodes = get_countrycodes()
    uptodate = os.path.exists(geo_out_file) and os.path.getmtime(geo_out_file) >= os.path.getmtime(geo_in_file)
    locations = ipload() if uptodate else ipcompile()
    return (locations, countrycodes)

(locations, countrycodes) = get_ip_location_data()

  0.00s 254 countries
  0.00s Start loading
  8.21s End loading


## Reading the log files

We read the log files and filter out the easily detectable irrelevant ones.
Each line is decomposed into a few parts, and those parts are stored as tuples in a big table.
Dates are parsed from Apache formatted strings into logical tuples with year, month, day, hour, minute and second.

In [11]:
# Work inside the log directory, so that the glob patterns are relative to it.
os.chdir(log_dir)
for log_pat in log_pats:
    log_files.extend(glob('{}.*'.format(log_pat)))
# Pair every log file with a flag telling whether its name starts with 'ssl-'.
log_items = [(name.startswith('ssl-'), name) for name in log_files]

def logical_date(apache_date):
    """Parse the timestamp of an Apache log entry into a datetime.

    apache_date is the text between the square brackets of a log line, such as
    ``05/Apr/2015:07:16:50 +0200``. Only the part before the first blank is
    parsed; whatever follows the blank (the UTC offset) is ignored, so the
    result is a naive datetime.

    Raises:
        ValueError: if the part before the first blank is not of the form
            day/month-abbreviation/year:hour:minute:second.
    """
    # partition() yields the whole string when there is no blank; slicing up to
    # find(' ') would then cut off the last character and shorten the seconds.
    (stamp, _blank, _offset) = apache_date.partition(' ')
    return datetime.strptime(stamp, "%d/%b/%Y:%H:%M:%S")

def readlogs():
    """Produce the table of relevant accesses and the set of frequent visitors.

    The raw table is loaded from the compiled pickles (access_file, ip_file)
    when both exist and are at least as new as access_in_file; otherwise all
    files in log_items are parsed and the pickles are rewritten.
    After that, ips with fewer than VLIMIT accesses are weeded out.

    Returns:
        (new_access_table, freq_visitors, countip) where new_access_table is a
        list of (aid, ip, path, year, month, day, hour, minute, second) tuples
        of the remaining ips, freq_visitors is the list of remaining ips, and
        countip is a Counter of accesses per ip (rare ips included).
    """
    tm.reset()

    def logs_compile():
        """Parse all log files, report line statistics, and pickle the result."""
        tm.msg('Compiling logs')
        access_table = []
        visitors = set()
        unmatched = []
        longscan = re.compile(r'^([0-9.]+) - - \[([^\]]*)\] "GET ([^ ]*) HTTP[^"]*" ([0-9]{3}) [0-9]+')
        shortscan = re.compile(r'^([0-9.]+) - - \[([^\]]*)\] "[^"]*" ([0-9]{3}) [0-9]+')
        count = collections.Counter()

        pf_admin = {'admin', 'appadmin'}
        pf_irrelevant = {'welcome', 'examples', 'default', 'static', 'select_', 'user', '/?'}

        def prefixes(names):
            """Expand path components to the path prefixes under / and /shebanq/."""
            return tuple(root + pf for pf in names for root in ('/', '/shebanq/'))

        admin_prefixes = prefixes(pf_admin)
        irrelevant_prefixes = prefixes(pf_irrelevant)

        # 208.115.111.71 - - [05/Apr/2015:07:16:50 +0200] "GET /hebrew/word?id=2489&mr=r&qw=w HTTP/1.1" 200 38229

        def read_log(item):
            """Parse one log file and add its relevant accesses to access_table."""
            (_is_https, name) = item
            tm.msg('{}'.format(name))
            # NOTE(review): aid restarts at 0 for every file, so it is only
            # unique within one log file - confirm that this is intended.
            aid = 0
            ln = php = nhttp = admin = irrelev = ngood = 0
            with open(name, encoding='utf8') as lf:
                for line in lf:
                    ln += 1
                    if 'php' in line:
                        php += 1
                        continue
                    cline = line.rstrip('\n')
                    match = longscan.search(cline)
                    if not match:
                        # not a GET request: only remember it if it is not a log line at all
                        if not shortscan.search(cline):
                            unmatched.append((name, ln, cline))
                        continue
                    (ip, date, path, code) = match.group(1, 2, 3, 4)
                    if code != '200':
                        continue
                    if path.startswith('http'):
                        nhttp += 1
                        continue
                    if path == '//' or path.startswith(('/?', '/robots')):
                        irrelev += 1
                        continue
                    if path.startswith(admin_prefixes):
                        admin += 1
                        continue
                    if path.startswith(irrelevant_prefixes):
                        irrelev += 1
                        continue
                    ngood += 1
                    ld = logical_date(date)
                    aid += 1
                    access_table.append((aid, ip, path, ld.year, ld.month, ld.day, ld.hour, ld.minute, ld.second))
                    visitors.add(ip)
            count['lntot'] += ln
            count['php'] += php
            count['admin'] += admin
            count['irrelev'] += irrelev
            count['ngood'] += ngood
            count['http'] += nhttp

        for litem in log_items:
            read_log(litem)
            count['fltot'] += 1
        count['nun'] = len(unmatched)

        # The template starts and ends with a newline; the spaces after "http" are part of it.
        report = '\n'.join((
            '',
            'Files       :  {:>5} total',
            'Lines       : {:>6} unmatched    {:>6} php          {:>6} http    ',
            'Lines       : {:>6} relevant     {:>6} irrelevant',
            'Lines       : {:>6} total',
            '',
        ))
        tm.msg(report.format(
            count['fltot'],
            count['nun'], count['php'], count['http'],
            count['ngood'], count['lntot']-count['ngood'],
            count['lntot'],
        ), newline=False, withtime=False)
        tm.msg('Writing')
        with gzip.open(access_file, "wb", compresslevel=GZIP_LEVEL) as f:
            pickle.dump(access_table, f, protocol=PICKLE_PROTOCOL)
        with gzip.open(ip_file, "wb", compresslevel=GZIP_LEVEL) as f:
            pickle.dump(visitors, f, protocol=PICKLE_PROTOCOL)
        tm.msg('End compiling')
        return (access_table, visitors)

    def logs_load():
        """Load the previously compiled access table and visitor set."""
        tm.msg('Start loading')
        # NOTE: pickle is only safe here because both files are written by logs_compile().
        with gzip.open(access_file, "rb") as f:
            access_table = pickle.load(f)
        with gzip.open(ip_file, "rb") as f:
            visitors = pickle.load(f)
        tm.msg('End loading')
        return (access_table, visitors)

    # Only the modification time of access_in_file is compared, not that of the other log files.
    uptodate = os.path.exists(access_file) and os.path.exists(ip_file) and \
        os.path.getmtime(access_file) >= os.path.getmtime(access_in_file) and \
        os.path.getmtime(ip_file) >= os.path.getmtime(access_in_file)
    # uptodate = False # uncomment if you want to force compiling

    (access_table, visitors) = logs_load() if uptodate else logs_compile()

    # count the visitors and remove rare visitors (< VLIMIT)
    countip = collections.Counter(x[1] for x in access_table)
    new_access_table = [x for x in access_table if countip[x[1]] >= VLIMIT]
    freq_visitors = [x for x in visitors if countip[x] >= VLIMIT]
    ntotal = len(countip)
    nfreq = len(freq_visitors)
    nrare = ntotal - nfreq
    tm.msg('{} visitors left after weeding out {} rare visitors from total {} visitors'.format(nfreq, nrare, ntotal))
    return (new_access_table, freq_visitors, countip)

(access_table, visitors, countip) = readlogs()

  0.00s Compiling logs
  0.00s access.log
  0.04s access.log.1
  0.56s access.log.2
  0.82s access.log.3
  1.10s access.log.4
  1.81s access.log.5
  2.21s access.log.6
  2.57s access.log.7
  3.43s ssl-access.log
  4.71s ssl-access.log.1
  4.91s ssl-access.log.2
  5.17s ssl-access.log.3
  5.54s ssl-access.log.4
  5.80s ssl-access.log.5
  6.02s ssl-access.log.6
  6.43s ssl-access.log.7

Files       :     16 total
Lines       :     96 unmatched      8385 php              84 http    
Lines       : 110903 relevant     321530 irrelevant
Lines       : 432433 total
  6.83s Writing
  7.14s End compiling
  7.26s 883 visitors left after weeding out 1288 rare visitors from total 2171 visitors


## IP numbers

Try to filter out as many bots as possible.

In [12]:
# Classification of visitors by ip number.
# Every entry maps a description to (keep, prefixes): an ip that starts with
# one of the prefixes is kept and labelled when keep is True, and treated as
# a bot when keep is False.
known_ip = {
    'andrews': (True, ('143.207',)),
    'vu': (True, ('145.108.',)),
    'radboud': (True, ('145.116.152', '145.116.153')),
    'ruu': (True, ('145.107.78', '145.107.95')),
    'hackers1': (False, ('208.115.113', '208.115.111', '216.107.155.114')),
    'china': (False, (
        '113.105.237.86', '113.240.234.213', '114.112.90.54', '117.78.13.51',
        '121.205.', '180.76.4.', '180.76.6.', '182.118.',
        '183.207.228.52', '183.222.152.78', '202.46.', '220.181.156.195',
        '222.77.', '222.66.95.253', '27.151.', '27.153.',
        '58.20.54.248', '61.174.', '61.240.', '61.91.45.50',
    )),
    'nl': (False, ('77.250.128.118', '80.79.39.125')),
    'italy': (False, ('84.222.108.13',)),
    'canada': (False, ('142.4.209.182', '192.99.', '198.27.')),
    'us': (False, (
        '74.82.47.3', '104.193.9.137', '107.14.54.3', '128.61.240.66',
        '162.210.', '184.105.247.195', '199.116.169.254', '199.16.',
        '199.38.59.36', '199.58.', '199.59.',
    )),
    'israel': (False, (
        '77.126.217.20', '77.127.97.169', '79.182.', '109.64.163.94',
        '109.65.128.124', '109.67.105.232', '84.94.199.30', '89.139.25.20',
        '94.159.130.203', '94.188.248.70', '95.86.102.108', '212.199.169.92',
        '213.184.125.55',
    )),
    'austria': (False, ('212.232.24.2', '212.95.7.89', '213.162.68.83')),
    'germany': (False, (
        '144.76.', '188.138.17.205', '31.186.', '46.165.',
        '79.207.221.215', '80.187.103.171', '84.150.105.68',
    )),
    'france': (False, (
        '188.165.214.26', '212.129.19.248', '212.129.50.222', '62.210.90.118',
        '80.11.154.71', '91.121.169.194',
    )),
    'ukraine': (False, (
        '109.87.114.48', '134.249.53.8', '176.111.61.12', '176.8.89.30',
        '178.137.', '178.94.172.226', '193.201.', '193.110.89.6',
        '195.242.218.133', '217.12.204.117', '37.57.231.241', '46.119.',
        '46.151.', '77.123.2.4', '91.200.', '91.207.',
        '91.228.236.9', '92.113.32.172', '95.133.238.115',
    )),
    'spain': (False, ('84.123.113.44',)),
    'africa': (False, ('196.207.220.97', '41.139.152.172')),
    'chile': (False, ('164.77.240.34', '190.196.11.130', '200.29.139.66')),
    'russia': (False, (
        '109.120.157.179', '109.205.249.141', '188.143.232.19', '188.166.30.200',
        '213.108.208.79', '213.221.41.6', '213.85.68.3', '217.74.44.252',
        '31.192.105.59', '31.184.', '37.1.61.148', '46.161.41.199',
        '46.188.29.17', '46.39.231.103', '78.30.224.104', '81.200.27.2',
        '84.47.145.66', '85.142.20.210', '88.85.172.165', '92.242.35.54',
        '95.213.143.180',
    )),
    'brazil': (False, (
        '131.108.116.9', '177.107.64.52', '177.13.43.3', '177.70.',
        '186.201.228.138', '187.37.76.23', '187.54.173.156', '187.87.207.11',
        '189.114.', '189.26.183.170', '189.31.109.246', '189.40.216.70',
        '189.48.245.3', '189.82.', '189.97.73.90', '200.129.187.61',
        '200.204.228.94', '200.233.159.217', '201.13.78.143', '201.14.41.86',
        '201.24.161.163', '201.3.30.224', '201.38.68.130', '201.39.174.227',
        '201.47.248.146', '201.55.62.247',
    )),
    'nstrein': (True, ('145.15.244.',)),
    'oliver': (True, ('73.50.251.235',)),
    'dirk': (True, ('77.175.245.183', '77.174.60.234', '77.174.235.57')),
}

# Classification of visitors by host information.
# Same layout as known_ip, but here the patterns are matched as substrings
# of the host information of an ip.
known_hi = {
    # kept and labelled
    'vu': (True, ('vu.nl',)),
    'uva': (True, ('uva.nl',)),
    'knaw': (True, ('knaw.nl',)),
    'calvin': (True, ('calvin.edu',)),
    'rug': (True, ('rug.nl',)),
    'radboud': (True, ('ru.nl',)),
    'tudelft': (True, ('tudelft.nl',)),
    # treated as bots
    'google1': (False, ('googlebot.com',)),
    'google2': (False, ('googleusercontent',)),
    'xbot': (False, ('findxbot',)),
    'baidu': (False, ('baidu.com',)),
    'ahrefs': (False, ('ahrefs.com',)),
    'miscrosoft': (False, ('msnbot',)),
    'yahoo': (False, ('crawl.yahoo.net', 'yse.yahoo.net')),
    'yandex': (False, ('yandex.com',)),
    'LAW Milano': (False, ('law.di.unimi.it',)),
    'hackers2': (False, ('reverse-dns.chicago',)),
    'amazon server': (False, ('amazonaws',)),
    'spammer nl': (False, ('kliksafe.nl',)),
    'hackers germany': (False, ('your-server.de',)),
    # kept and labelled
    'telfort': (True, ('telfortglasvezel',)),
    'ziggo': (True, ('ziggo.nl',)),
    'hetnet': (True, ('hetnet.nl',)),
    'planet': (True, ('planet.nl',)),
}

# Results of the filtering steps, filled in by the filter functions.
bots = collections.defaultdict(set)        # description -> ips rejected as bots
bots_ip = set()                            # all ips rejected as bots
noticed = collections.defaultdict(set)     # description -> ips kept with a label
noticed_ip = set()                         # all ips kept with a label
pre_true_visitors = {}                     # ip -> display text, after filtering on ip

def filter_ip():
    """Classify all visitors by ip prefix, using known_ip, and report the result.

    Fills the module-level containers: ips of entries marked as keep go into
    pre_true_visitors (with their description), noticed and noticed_ip; ips of
    the other entries go into bots and bots_ip; ips that match no entry go into
    pre_true_visitors without description.
    """
    tm.reset()

    def ipdetail(ip):
        """Return (keep, description) of the first known_ip entry that matches ip, or (None, '')."""
        for (desc, (keep, pats)) in known_ip.items():
            if any(ip.startswith(pat) for pat in pats):
                return (keep, desc)
        return (None, '')

    tm.msg('Filtering on ip')
    for ip in sorted(visitors):
        (keep, desc) = ipdetail(ip)
        if keep is None:
            pre_true_visitors[ip] = '{:<16}'.format(ip)
        elif keep:
            pre_true_visitors[ip] = '{:<16} {}'.format(ip, desc)
            noticed[desc].add(ip)
            noticed_ip.add(ip)
        else:
            bots[desc].add(ip)
            bots_ip.add(ip)

    nvisitors = len(visitors)
    nnoticed = len(noticed_ip)
    nbots = len(bots_ip)
    nnotspec = nvisitors - nnoticed - nbots
    ntrue = len(pre_true_visitors)
    tm.msg('Done')
    tm.msg('{} visitors = {} noticed + {} bots + {} not specified; that leaves {} true visitor candidates'.format(
        nvisitors, nnoticed, nbots, nnotspec, ntrue,
    ), withtime=False)
    for (src, label) in ((bots, 'bot'), (noticed, 'notable')):
        for x in sorted(src):
            tm.msg('{} ({} with {} ips)'.format(
                x, label, len(src[x]),
            ), withtime=False)

filter_ip()

  0.00s Filtering on ip
  0.04s Done
883 visitors = 92 noticed + 115 bots + 676 not specified; that leaves 768 true visitor candidates
austria (bot with 1 ips)
brazil (bot with 1 ips)
canada (bot with 4 ips)
china (bot with 75 ips)
france (bot with 2 ips)
germany (bot with 6 ips)
hackers1 (bot with 3 ips)
nl (bot with 1 ips)
russia (bot with 1 ips)
ukraine (bot with 6 ips)
us (bot with 15 ips)
andrews (notable with 16 ips)
dirk (notable with 2 ips)
nstrein (notable with 2 ips)
oliver (notable with 1 ips)
radboud (notable with 2 ips)
ruu (notable with 1 ips)
vu (notable with 68 ips)


Now we run the host command for all distinct IP numbers.
This takes time (more than 20 minutes currently), so we save the results.

In [16]:
def get_hostinfo():
    """Collect the output of the ``host`` command for all candidate visitors.

    Previously collected results are loaded from hosts_file; the command is
    run only for ips in pre_true_visitors that are not yet known, and the
    combined results are written back to hosts_file.

    Returns:
        dict mapping ip to (code, text), where code is 0 or the exit status of
        a failed ``host`` call and text is the part of the command output after
        'in-addr.arpa' (the whole output if that marker is absent), with
        newlines turned into blanks and the words 'domain ', 'name ',
        'pointer ' and '(NXDOMAIN)' removed.
    """
    tm.reset()

    def hostinfo_load():
        """Load the stored host information, or return an empty dict if there is none."""
        host_info = {}
        if os.path.exists(hosts_file):
            tm.msg('Start loading')
            # NOTE: pickle is only safe here because hosts_file is written by hostinfo_compile().
            with gzip.open(hosts_file, "rb") as f:
                host_info = pickle.load(f)
            tm.msg('End loading')
        else:
            tm.msg('No file {}'.format(hosts_file))
        return host_info

    def hostinfo_compile(host_info):
        """Look up the ips missing from host_info, add them, and save host_info."""
        new_ips = [ip for ip in pre_true_visitors if ip not in host_info]
        oip = len(pre_true_visitors) - len(new_ips)
        tm.msg('Compiling hostinfo for {} new ips'.format(len(new_ips)))
        chunk = 100   # write a progress line after this many lookups
        for (done, ip) in enumerate(new_ips, 1):
            try:
                detail = (0, check_output(['host', ip], universal_newlines=True).rstrip('\n'))
            except CalledProcessError as e:
                # a failed lookup is a result too: keep exit status and output
                detail = (e.returncode, e.output.rstrip('\n'))
            host_info[ip] = detail
            if done % chunk == 0:
                tm.msg('\n{}'.format(done), newline=False)
            tm.msg('.', withtime=False, newline=False)

        tm.msg('Writing ({} ips already seen, {} new ips, total {} ips)'.format(oip, len(new_ips), len(host_info)))
        with gzip.open(hosts_file, "wb", compresslevel=GZIP_LEVEL) as f:
            pickle.dump(host_info, f, protocol=PICKLE_PROTOCOL)
        tm.msg('End compiling')

    host_info = hostinfo_load()
    hostinfo_compile(host_info)

    new_host_info = {}
    for (ip, (code, hinfo)) in host_info.items():
        # keep what follows the first 'in-addr.arpa', or everything if it does not occur
        ftext = hinfo.split('in-addr.arpa', 1)[-1].replace('\n', ' ')
        for noise in ('domain ', 'name ', 'pointer ', '(NXDOMAIN)'):
            ftext = ftext.replace(noise, '')
        new_host_info[ip] = (code, ftext)
    return new_host_info

host_info = get_hostinfo()

  0.00s Start loading
  0.01s End loading
  0.02s Compiling hostinfo for 493 new ips
...................................................................................................    28s 
100....................................................................................................    59s 
200.................................................................................................... 1m 19s 
300.................................................................................................... 1m 51s 
400.............................................................................................. 2m 16s Writing (275 ips already seen, 493 new ips, total 4549 ips)
 2m 16s End compiling


### IP numbers: filtering

Now we can use the host info for additional filtering.

In [17]:
# ip -> display text of the visitors that survive both filtering steps
true_visitors = {}

def filter_hi():
    """Classify the candidate visitors by host information, using known_hi, and report.

    Fills the module-level containers: candidates of entries marked as keep go
    into true_visitors (with the description appended), noticed and
    noticed_ip; candidates of the other entries go into bots and bots_ip;
    candidates that match no entry go into true_visitors unchanged.
    """
    tm.reset()

    def ipdetail(ip):
        """Return (keep, description) of the first known_hi entry found in the host info of ip, or (None, '')."""
        (code, detail) = host_info[ip]
        for (desc, (keep, pats)) in known_hi.items():
            if any(pat in detail for pat in pats):
                return (keep, desc)
        return (None, '')

    tm.msg('Filtering on hostinfo')
    for (ip, prev) in pre_true_visitors.items():
        (keep, desc) = ipdetail(ip)
        if keep is None:
            true_visitors[ip] = prev
        elif keep:
            true_visitors[ip] = '{} {}'.format(prev, desc)
            noticed_ip.add(ip)
            noticed[desc].add(ip)
        else:
            bots[desc].add(ip)
            bots_ip.add(ip)

    nvisitors = len(visitors)
    nnoticed = len(noticed_ip)
    nbots = len(bots_ip)
    nnotspec = nvisitors - nnoticed - nbots
    ntrue = len(true_visitors)
    tm.msg('Done')
    tm.msg('{} visitors = {} noticed + {} bots + {} not specified; that leaves {} true visitors'.format(
        nvisitors, nnoticed, nbots, nnotspec, ntrue,
    ), withtime=False)
    for (src, label) in ((bots, 'bot'), (noticed, 'notable')):
        for x in sorted(src):
            tm.msg('{} ({} with {} ips)'.format(
                x, label, len(src[x]),
            ), withtime=False)

filter_hi()

# Show the 100 true visitors with the most accesses, busiest first.
for x in sorted(true_visitors, key=lambda x: -countip[x])[0:100]:
    tm.msg('{:<30} {:>7} {}'.format(true_visitors[x], countip[x], host_info[x][1]), withtime=False)

  0.00s Filtering on hostinfo
  0.01s Done
883 visitors = 143 noticed + 416 bots + 324 not specified; that leaves 467 true visitors
ahrefs (bot with 11 ips)
amazon server (bot with 4 ips)
austria (bot with 1 ips)
baidu (bot with 135 ips)
brazil (bot with 1 ips)
canada (bot with 4 ips)
china (bot with 75 ips)
france (bot with 2 ips)
germany (bot with 6 ips)
google1 (bot with 97 ips)
hackers germany (bot with 18 ips)
hackers1 (bot with 3 ips)
hackers2 (bot with 1 ips)
miscrosoft (bot with 28 ips)
nl (bot with 1 ips)
russia (bot with 1 ips)
ukraine (bot with 6 ips)
us (bot with 15 ips)
xbot (bot with 1 ips)
yahoo (bot with 3 ips)
yandex (bot with 3 ips)
andrews (notable with 16 ips)
dirk (notable with 2 ips)
hetnet (notable with 7 ips)
nstrein (notable with 2 ips)
oliver (notable with 1 ips)
planet (notable with 13 ips)
radboud (notable with 3 ips)
ruu (notable with 1 ips)
telfort (notable with 3 ips)
vu (notable with 89 ips)
ziggo (notable with 8 ips)
46.137.119.184                   113

## Locations

Find the locations of the visitors by means of the location database.

In [18]:
def translate_ips():
    """Look up the location of every true visitor in the location table.

    The ips are converted to integers and sorted, so that one forward pass
    over locations (which is sorted by range start) serves all lookups.

    Returns:
        dict mapping ip (string) to the (country, region, city) part of the
        range that contains it. Ips that are not valid addresses, or that fall
        outside every range, have no entry.
    """
    tm.reset()
    ip_errors = {}
    int_ip = {}
    for ip in true_visitors:
        try:
            iip = int(ip_address(ip))
        except ValueError:
            # not a valid address: record it and leave it out of the lookup
            ip_errors[ip] = 'syntax error'
            continue
        int_ip[iip] = ip
    srt_iip = sorted(int_ip)
    tm.msg('{} ips to lookup'.format(len(srt_iip)))
    if ip_errors:
        tm.msg('{} ips skipped because they are not valid ip addresses'.format(len(ip_errors)))

    ip_loc = {}
    notfound = 0
    nlocations = len(locations)
    curloc = -1   # index of the last range that starts at or before the current ip
    for iip in srt_iip:
        while curloc + 1 < nlocations and locations[curloc + 1][0] <= iip:
            curloc += 1
        # curloc < 0 means that no range starts at or before this ip; without
        # this test index -1 would silently select the last range of the table.
        if curloc < 0 or locations[curloc][1] < iip:
            notfound += 1
        else:
            ip_loc[int_ip[iip]] = locations[curloc][2:5]

    found = len(ip_loc)
    tm.msg('{} ips checked. {} ips found; {} not found'.format(found+notfound, found, notfound))
    return ip_loc

ip_loc = translate_ips()

  0.01s 467 ips to lookup
  3.00s 467 ips checked. 467 ips found; 0 not found


## Analysing the paths

We analyse the urls that were requested from apache by assigning each url to a kind.
This results in the following data:

* a mapping from paths to kinds (``path_map``)
* a set of unclassified paths (``unclassified``)
* a counter, that counts the number of accesses per kind (``kinds``) 

In [26]:
# Results of the path analysis, filled in by path_analysis().
path_map = {}                   # path -> kind
kinds = collections.Counter()   # kind -> number of accesses
unclassified = set()            # paths that could not be assigned a kind

# First path components of info pages, and names of the query edit actions.
pf_meta = {'help', 'about', 'news', 'sources'}
pf_action = {'field', 'fields', 'record'}

# Human readable name for every kind; the report is sorted by these names.
label = {
    '': 'ZZ_Unclassified',
    'chartw': 'Charts (word)',
    'chartq': 'Charts (query)',
    'chartn': 'Charts (note)',
    'chartu': 'Charts (other)',
    'csvw': 'CSV (word)',
    'csvq': 'CSV (query)',
    'csvn': 'CSV (note)',
    'csvu': 'CSV (other)',
    'dicthbo': 'Dictionary (Hebrew)',
    'dictarc': 'Dictionary (Aramaic)',
    'dictu': 'Dictionary (other)',
    'matp': 'Verses (passage)',
    'matq': 'Verses (query)',
    'matw': 'Verses (word)',
    'matn': 'Verses (note)',
    'matu': 'Verses (other)',
    'matp-q': 'Queries (side)',
    'matp-w': 'Words (side)',
    'matp-n': 'Notes (side)',
    # path_analysis() assigns this kind to /hebrew/sidem paths without a
    # recognized qw= value; without a label those accesses are not reported.
    'matp-u': 'Other (side)',
    'wview-l': 'Word (view) (ajax)',
    'wview': 'Word (view)',
    'windex': 'Word (index lookup)',
    'qview-l': 'Query (view) (ajax)',
    'qview': 'Query (view)',
    'qnew': 'Query (new)',
    'qedit': 'Query (edit)',
    'qaction': 'Query (edit action)',
    'qapi': 'Query (api)',
    'qother': 'Query (other)',
    'qtree': 'Queries (tree)',
    'text': 'Verses (text)',
    'verse': 'Verse (data)',
    'meta': 'Info pages',
    'index': 'Home page',
}

def path_analysis():
    """Assign a kind to every access of a true visitor and report the totals.

    Fills the module-level containers: path_map (path -> kind), kinds
    (accesses per kind) and unclassified (paths without a kind, which get the
    kind ''). Accesses of ips that are not in true_visitors are skipped.
    """
    tm.reset()

    meta_prefixes = tuple('/' + pf for pf in pf_meta)
    action_prefixes = tuple('/hebrew/' + pf + '.json' for pf in pf_action)

    def by_qw(path, stem):
        """Return stem plus q, w or n for the first qw= value found in path, or stem plus u."""
        for target in ('q', 'w', 'n'):
            if 'qw=' + target in path:
                return stem + target
        return stem + 'u'

    def classify(path):
        """Return the kind of path, or '' if it matches none of the rules.

        The rules are tried in order and the first match wins.
        """
        if path in ('/', '/shebanq/') or path.startswith('/index'):
            return 'index'
        if path.startswith('/hebrew/chart'):
            return by_qw(path, 'chart')
        if path.startswith('/hebrew/item.csv'):
            return by_qw(path, 'csv')
        if path.startswith('/hebrew/dictionary'):
            if 'lan=hbo' in path:
                return 'dicthbo'
            if 'lan=arc' in path:
                return 'dictarc'
            return 'dictu'
        if path.startswith('/hebrew/sidew.load'):
            return 'wview-l'
        if path.startswith(('/hebrew/word', '/hebrew/sidewm')):
            return 'wview'
        if path.startswith('/hebrew/windex.json'):
            return 'windex'
        if path.startswith('/hebrew/material'):
            # a passage request takes precedence over the qw= value
            return 'matp' if 'mr=m' in path else by_qw(path, 'mat')
        if path.startswith('/hebrew/sidem'):
            return by_qw(path, 'matp-')
        if path.startswith(('/hebrew/my_queries', '/hebrew/sideqe.load')):
            if '/edit/' in path:
                return 'qedit'
            if '/new/' in path:
                return 'qnew'
            return 'qother'
        if path.startswith('/hebrew/public_queries'):
            return 'qview'
        if path in ('/hebrew/query', '/hebrew/queries') or path.startswith('/hebrew/queries/'):
            return 'qother'
        if path.startswith(('/hebrew/queries?', '/hebrew/query?')):
            return 'qview'
        if path.startswith('/hebrew/sideq.load'):
            return 'qview-l'
        if path.startswith('/hebrew/sideqm'):
            return 'qview'
        if path.startswith('/hebrew/query.json'):
            return 'qapi'
        if path.startswith('/hebrew/pq.json'):
            return 'qtree'
        if path.startswith('/hebrew/text'):
            return 'text'
        if path.startswith('/hebrew/verse?') or path == '/hebrew/verse':
            return 'verse'
        if path.startswith(meta_prefixes):
            return 'meta'
        if path.startswith(action_prefixes):
            return 'qaction'
        return ''

    for a in access_table:
        ip = a[1]
        if ip not in true_visitors: continue
        path = a[2]
        kind = classify(path)
        if kind == '':
            unclassified.add(path)
        path_map[path] = kind
        kinds[kind] += 1

    # Count every classified access, also those of kinds that have no label,
    # so that the totals below account for all accesses that were processed.
    nclass = sum(n for (kind, n) in kinds.items() if kind != '')
    npaths = len(path_map)
    nunclass = len(unclassified)
    naccess = len(access_table)
    tm.msg('Done')
    for kind in sorted(label, key=lambda x: label[x]):
        tm.msg('{:<30} : {:>6}'.format(label[kind], kinds.get(kind, 0)), withtime=False)
    # kinds without a label are listed under their own name rather than dropped
    for kind in sorted(k for k in kinds if k not in label):
        tm.msg('{:<30} : {:>6}'.format(kind, kinds[kind]), withtime=False)
    tm.msg('{:<30} : {:>6}'.format('Classified', nclass), withtime=False)
    tm.msg('{:<30} : {:>6}'.format('Total', nclass+kinds['']), withtime=False)
    # NOTE(review): 'Check' counts all rows of access_table, including those of
    # ips that are not true visitors, so it can exceed 'Total' - confirm intent.
    tm.msg('{:<30} : {:>6}'.format('Check', naccess), withtime=False)
    tm.msg('{:<30} : {:>6}'.format('Distinct paths (classified)', npaths - nunclass), withtime=False)
    tm.msg('{:<30} : {:>6}'.format('Distinct paths (unclassified)', nunclass), withtime=False)
    tm.msg('{:<30} : {:>6}'.format('Distinct paths (total)', npaths), withtime=False)

    # show a sample of what could not be classified
    for p in sorted(unclassified)[0:100]:
        tm.msg('{}'.format(p), withtime=False)

path_analysis()

  0.27s Done
CSV (note)                     :      1
CSV (other)                    :      0
CSV (query)                    :     80
CSV (word)                     :     27
Charts (note)                  :      3
Charts (other)                 :      0
Charts (query)                 :     54
Charts (word)                  :     28
Dictionary (Aramaic)           :      0
Dictionary (Hebrew)            :      0
Dictionary (other)             :      0
Home page                      :   2354
Info pages                     :    615
Notes (side)                   :    118
Queries (side)                 :   1702
Queries (tree)                 :   2180
Query (api)                    :    213
Query (edit action)            :      0
Query (edit)                   :      0
Query (new)                    :      0
Query (other)                  :   2629
Query (view)                   :   5117
Query (view) (ajax)            :      0
Verse (data)                   :   1073
Verses (note)              

## Data reduction

We compile a reduced table containing buckets for each (ip, kind, year, month, day) combination.
We store the number of accesses per bucket.

In [27]:
def ftitle(title):
    """Emit *title* through ``tm.msg`` as a boxed heading, without a timestamp.

    The box is a rule of dashes, the title between ``| `` and `` |``, and a
    second rule; each rule is four characters wider than the title.
    """
    rule = '-' * (4 + len(title))
    boxed = '''{}\n| {} |\n{}\n'''.format(rule, title, rule)
    tm.msg(boxed, withtime=False)

def data_reduce():
    """Condense ``access_table`` into per-user, per-kind, per-day access counts.

    Reads the notebook globals ``access_table``, ``true_visitors``,
    ``path_map`` and ``ip_loc``.  Only rows whose ip occurs in
    ``true_visitors`` are considered.  Each such ip receives a sequential
    user id (starting at 1) the first time it is seen.  Rows whose path maps
    to the empty kind ``''`` are not counted, but their ip still receives a
    user id and therefore still shows up in ``users``.

    Returns:
        A tuple ``(users, uid_ip, reduced_table)``:
        ``users`` maps a user id to the ``ip_loc`` entry of its ip,
        ``uid_ip`` maps a user id back to its ip, and
        ``reduced_table`` is a list of
        ``(uid, kind, year, month, day, count)`` tuples.
    """
    reduced_access = collections.Counter()
    ip_uid = {}
    users = {}
    uid_ip = {}

    for (aid, ip, path, year, month, day, hour, minute, second) in access_table:
        if ip not in true_visitors:
            continue
        # Look the kind up before registering the user, so that an unknown
        # path fails before any bookkeeping is changed for this row.
        thekind = path_map[path]
        uid = ip_uid.get(ip)
        if uid is None:
            # User ids are handed out in order of first appearance: 1, 2, 3, ...
            uid = len(ip_uid) + 1
            ip_uid[ip] = uid
            uid_ip[uid] = ip
            users[uid] = ip_loc[ip]

        if thekind != '':
            reduced_access[(uid, thekind, year, month, day)] += 1

    # Flatten the counter into rows: the 5-part bucket key plus its count.
    reduced_table = [bucket + (count,) for (bucket, count) in reduced_access.items()]
    tm.msg('{} elements reduced to {} elements for {} users in reduced table'.format(
        len(access_table), len(reduced_access), len(users)))
    return (users, uid_ip, reduced_table)

# Run the reduction once; the cells below read users, uid_ip and reduced_table as globals.
(users, uid_ip, reduced_table) = data_reduce()

    44s 108434 elements reduced to 5812 elements for 467 users in reduced table


## Users

We produce the number of unique visitors per month.
In fact, we count the number of unique ip addresses that have accessed the server per month.

In [28]:
def get_visitors(limit=50):
    """Report unique visitors per month and rank all visitors by visit count.

    Reads the notebook globals ``reduced_table``, ``users``, ``uid_ip``,
    ``true_visitors``, ``countrycodes``, ``host_info`` and ``ip_out_file``.

    Three things are produced:

    * a table of the number of distinct user ids per (year, month), followed
      by the number of distinct user ids over the whole period;
    * a "Top visitors" table showing the first ``limit`` users, ordered by
      descending visit count and then ascending user id;
    * the file ``ip_out_file``, holding one line for *every* user in that
      same order.

    Args:
        limit: number of top visitors shown in the printed table; the output
            file is not limited.

    Returns:
        A ``collections.Counter`` mapping user id to its total visit count.

    NOTE(review): the printed header names five columns (ending in
    'ip address' and 'info') while each printed row has four fields, the last
    one being the host from ``true_visitors`` -- confirm the intended layout.
    """
    tm.reset()
    uvisitors = collections.Counter()
    visits = collections.defaultdict(collections.Counter)
    for (uid, kind, yr, mn, dy, vs) in reduced_table:
        visits[(yr, mn)][uid] += vs
        uvisitors[uid] += vs
    stats = {'{:>4}-{:02d}'.format(*x): len(visits[x]) for x in visits}
    tm.msg('Done')
    tm.msg('{:<7} | visitors'.format('month'), withtime=False)
    tm.msg('------------------', withtime=False)
    for x in sorted(stats):
        tm.msg('{:<7} | {:>5}'.format(x, stats[x]), withtime=False)
    tm.msg('------------------', withtime=False)
    tm.msg('{:<5} unique visitors in whole period'.format(len(uvisitors)), withtime=False)

    tm.msg('Top visitors\n{}'.format('-' * 12), withtime=False)
    ranked = sorted(uvisitors.items(), key=lambda x: (-x[1], x[0]))
    # The context manager closes the file even when a lookup or a format
    # call below raises halfway through the loop.
    with open(ip_out_file, 'w') as outfile:
        tm.msg('{:>6} | {:<20} | {:<20} | {:<14} | {:<12}'.format('visits', 'country', 'city', 'ip address', 'info'), withtime=False)
        tm.msg('-' * 80, withtime=False)

        for (i, (uid, vs)) in enumerate(ranked):
            (country, region, city) = users[uid]
            ip = uid_ip[uid]
            host = true_visitors[ip]
            # Only the head of the ranking is printed; every user goes to the file.
            if i < limit:
                tm.msg('{:>6} | {:<20} | {:<20} | {:<30}'.format(
                    vs, countrycodes[country], city, host,
                ), withtime=False)
            outfile.write('u{:>04} {:>5} {:<20} - {:<20} - {:<20} = {:<30} {}\n'.format(
                    uid, vs, countrycodes[country], region, city, host, host_info[ip][1],
                ))
    return uvisitors

# Print the visitor report, write the ranking file, and keep the per-user visit totals.
uvisitors = get_visitors()

  0.01s Done
month   | visitors
------------------
2015-04 |   201
2015-05 |   323
2015-06 |    89
------------------
467   unique visitors in whole period
Top visitors
------------
visits | country              | city                 | ip address     | info        
--------------------------------------------------------------------------------
  9706 | United Kingdom       | London               | 46.137.119.184                
  1472 | Netherlands          | Rotterdam            | 84.83.215.130    planet       
  1360 | Netherlands          | \'s-Hertogenbosch    | 77.175.245.183   dirk telfort 
  1246 | Canada               | Montreal             | 167.114.172.229               
  1207 | Netherlands          | Rotterdam            | 87.211.158.85                 
  1001 | Netherlands          | Veendam              | 82.173.135.47                 
   993 | Netherlands          | Schiphol-Rijk        | 77.248.252.113                
   976 | Netherlands          | Rotterdam         

In [22]:
def cstats(lbl, keytable, title, srcu, srca):
    """Print a table of user counts and visits per user for each key.

    Args:
        lbl: heading of the first column.
        keytable: mapping from a key to the text shown in the first column.
        title: heading passed to ``ftitle`` above the table.
        srcu: mapping from a key to the collection of users seen for it.
        srca: mapping from a key to the total number of visits for it.

    Rows are ordered by descending user count, then descending visits per
    user, then ascending key.  Visits per user is rounded to two decimals.
    """
    rows = [
        (key, len(members), round(srca[key] / len(members), 2))
        for (key, members) in srcu.items()
    ]
    ruler = '-' * 59

    ftitle(title)
    tm.msg('{:<30} | {:>8} | {:>15}'.format(lbl, '#users', 'visits per user'), withtime=False)
    tm.msg(ruler, withtime=False)
    rows.sort(key=lambda row: (-row[1], -row[2], row[0]))
    for (key, user_count, per_user) in rows:
        tm.msg('{:<30} | {:8} | {:>15}'.format(keytable[key], user_count, per_user), withtime=False)
    tm.msg(ruler + '\n', withtime=False)


def get_visitcountries():
    """Print visitor statistics per country, overall and month by month.

    Reads the notebook globals ``reduced_table``, ``users`` and
    ``countrycodes``.  The country of a user is the first element of its
    ``users`` entry.  Output consists of a whole-period table, a summary of
    the number of countries seen per month, and one table per month, all
    rendered through ``cstats`` / ``tm.msg``.
    """
    tm.reset()
    users_by_country = collections.defaultdict(set)
    hits_by_country = collections.Counter()
    monthly_hits = collections.defaultdict(collections.Counter)
    monthly_users = collections.defaultdict(lambda: collections.defaultdict(set))

    for (uid, _kind, yr, mn, _dy, vs) in reduced_table:
        country = users[uid][0]
        month = (yr, mn)
        monthly_users[month][country].add(uid)
        monthly_hits[month][country] += vs
        users_by_country[country].add(uid)
        hits_by_country[country] += vs

    overall_title = '{} countries in whole period'.format(len(hits_by_country))
    cstats('Country', countrycodes, overall_title, users_by_country, hits_by_country)

    ftitle('Months summary')
    short_rule = '-' * 19
    per_month = {
        '{:>4}-{:02d}'.format(*month): len(hits)
        for (month, hits) in monthly_hits.items()
    }
    tm.msg('{:<7} | {:>9}'.format('month', 'countries'), withtime=False)
    tm.msg(short_rule, withtime=False)
    for month_label in sorted(per_month):
        tm.msg('{:<7} | {:>9}'.format(month_label, per_month[month_label]), withtime=False)
    tm.msg(short_rule + '\n', withtime=False)

    # One detailed table per month, in chronological order.
    for month in sorted(monthly_hits):
        month_title = '{:>4}-{:02d}'.format(*month)
        cstats('Country', countrycodes, month_title, monthly_users[month], monthly_hits[month])

    tm.msg('Done')

# Print the per-country visitor tables (whole period, then per month).
get_visitcountries()


--------------------------------
| 29 countries in whole period |
--------------------------------

Country                        |   #users | visits per user
-----------------------------------------------------------
Netherlands                    |      220 |          117.39
United States                  |       75 |           45.21
France                         |       54 |           45.72
Germany                        |       31 |           91.71
Denmark                        |       14 |           71.07
Brazil                         |       11 |           17.91
United Kingdom                 |       10 |          1041.9
Ukraine                        |        7 |           77.86
Japan                          |        6 |          105.33
Canada                         |        5 |           275.0
Switzerland                    |        5 |            44.4
Australia                      |        4 |           16.75
Korea, Republic of             |        3 |           10.33


## Visits per kind

What parts of shebanq are used most?

In [29]:
def get_visitkinds():
    """Print visitor statistics per kind of access, overall and month by month.

    Reads the notebook globals ``reduced_table`` and ``label``; ``label``
    supplies the text shown for each kind.  Output consists of a
    whole-period table, a summary of the number of kinds seen per month, and
    one table per month, all rendered through ``cstats`` / ``tm.msg``.
    """
    tm.reset()
    users_by_kind = collections.defaultdict(set)
    hits_by_kind = collections.Counter()
    monthly_hits = collections.defaultdict(collections.Counter)
    monthly_users = collections.defaultdict(lambda: collections.defaultdict(set))

    for (uid, kind, yr, mn, _dy, vs) in reduced_table:
        month = (yr, mn)
        monthly_users[month][kind].add(uid)
        monthly_hits[month][kind] += vs
        users_by_kind[kind].add(uid)
        hits_by_kind[kind] += vs

    overall_title = '{} Kinds in whole period'.format(len(hits_by_kind))
    cstats('Kind', label, overall_title, users_by_kind, hits_by_kind)

    ftitle('Months summary')
    short_rule = '-' * 19
    per_month = {
        '{:>4}-{:02d}'.format(*month): len(hits)
        for (month, hits) in monthly_hits.items()
    }
    tm.msg('{:<7} | {:>9}'.format('month', 'kinds'), withtime=False)
    tm.msg(short_rule, withtime=False)
    for month_label in sorted(per_month):
        tm.msg('{:<7} | {:>9}'.format(month_label, per_month[month_label]), withtime=False)
    tm.msg(short_rule + '\n', withtime=False)

    # One detailed table per month, in chronological order.
    for month in sorted(monthly_hits):
        month_title = '{:>4}-{:02d}'.format(*month)
        cstats('Kind', label, month_title, monthly_users[month], monthly_hits[month])

    tm.msg('Done')

# Print the per-kind visitor tables (whole period, then per month).
get_visitkinds()



----------------------------
| 23 Kinds in whole period |
----------------------------

Kind                           |   #users | visits per user
-----------------------------------------------------------
Verses (text)                  |      401 |           21.59
Home page                      |      349 |            6.74
Verses (passage)               |      319 |           10.24
Words (side)                   |      238 |             8.5
Queries (side)                 |      225 |            7.56
Query (view)                   |      221 |           23.15
Query (other)                  |      205 |           12.82
Verses (query)                 |      203 |           31.56
Word (view)                    |      187 |           65.41
Info pages                     |      166 |             3.7
Queries (tree)                 |      165 |           13.21
Verse (data)                   |      138 |            7.78
Verses (word)                  |      111 |            8.23
CSV (query) 