# WEB USAGE MINING: ANALYTICS

### Regular expressions

In [1]:
from datetime import datetime
import pytz
#%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"
def parse_str(x):
    """
    Returns the string delimited by two characters.
    Example:
    `>>> parse_str('[my string]')`
    `'my string'`
    """
    if x is None:
        return '-'
    return x[1:-1]

### parse_int

In [2]:
def parse_int(x):
    """Returns parsed string if no error occured during parse
    else returns 0"""
    if x is None:
        return 0
    try:
        y = int(x)
        return y
    except ValueError:
        return 0

In [3]:
def parse_datetime(x):
    '''
    Parses datetime with timezone formatted as:
    `[day/month/year:hour:minute:second zone]`
    Example:
    `>>> parse_datetime('13/Nov/2015:11:45:42 +0000')`
    `datetime.datetime(2015, 11, 3, 11, 45, 4, tzinfo=<UTC>)`
    Due to problems parsing the timezone (`%z`) with
    `datetime.strptime`, the
    timezone will be obtained using the `pytz` library.
    '''
    try:
        dt = datetime.strptime(x[1:-7], '%d/%b/%Y:%H:%M:%S')
        dt_tz = int(x[-6:-3])*60+int(x[-3:-1])
        return dt.replace(tzinfo=pytz.FixedOffset(dt_tz))
    except ValueError:
        return '-'

In [4]:
import re
import pandas as pd

In [5]:
conv = {'time': parse_datetime,
'request': parse_str,
'status': parse_int,
'size': parse_int,
'referer': parse_str,
'user_agent': parse_str}

In [6]:
data = pd.read_csv('apache_logs.txt', header=None, error_bad_lines=False, sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
engine='python', na_values='-', usecols=[0, 3, 4, 5, 6, 7, 8],
#names=['ip', 'time', 'request', 'status', 'size','referer', 'user_agent'],
names=['Client_IP_address',
'Time_of_activity','Requested_Page', 'Status_Code', 'Size_of_Page',
'Referer_page','Client_Device'], converters=conv)



  data = pd.read_csv('apache_logs.txt', header=None, error_bad_lines=False, sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',


In [7]:
data.head()

Unnamed: 0,Client_IP_address,Time_of_activity,Requested_Page,Status_Code,Size_of_Page,Referer_page,Client_Device
0,83.149.9.216,[17/May/2015:10:05:03 +0000],"""GET /presentations/logstash-monitorama-2013/i...",200,203023.0,"""http://semicomplete.com/presentations/logstas...","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1..."
1,83.149.9.216,[17/May/2015:10:05:43 +0000],"""GET /presentations/logstash-monitorama-2013/i...",200,171717.0,"""http://semicomplete.com/presentations/logstas...","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1..."
2,83.149.9.216,[17/May/2015:10:05:47 +0000],"""GET /presentations/logstash-monitorama-2013/p...",200,26185.0,"""http://semicomplete.com/presentations/logstas...","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1..."
3,83.149.9.216,[17/May/2015:10:05:12 +0000],"""GET /presentations/logstash-monitorama-2013/p...",200,7697.0,"""http://semicomplete.com/presentations/logstas...","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1..."
4,83.149.9.216,[17/May/2015:10:05:07 +0000],"""GET /presentations/logstash-monitorama-2013/p...",200,2892.0,"""http://semicomplete.com/presentations/logstas...","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1..."


In [8]:
IP=set(data['Client_IP_address'])
len(IP)

1754

In [9]:
devices=set(data['Client_Device'])
len(devices)

559

In [10]:
D={}
for i,row in data.iterrows():
    if(tuple((row['Client_IP_address'],row['Client_Device'])) in D):
        D[tuple((row['Client_IP_address'],row['Client_Device']))]+=1
    else:
        D[tuple((row['Client_IP_address'],row['Client_Device']))]=1

10 people who visited frequently

In [11]:
freq = sorted(D, key = D.get, reverse = True)
top_10 = freq[:10]
for i,j in enumerate(top_10, start=1):
    print(i,"",j)

1  ('46.105.14.53', '"UniversalFeedParser/4.2-pre-314-svn +http://feedparser.org/"')
2  ('130.237.218.86', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.91 Safari/537.36"')
3  ('75.97.9.59', '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"')
4  ('66.249.73.135', '"Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"')
5  ('66.249.73.135', '"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"')
6  ('50.16.19.13', '"Tiny Tiny RSS/1.11 (http://tt-rss.org/)"')
7  ('68.180.224.225', '"Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"')
8  ('208.115.111.72', '"Mozilla/5.0 (compatible; Ezooms/1.0; help@moz.com)"')
9  ('198.46.149.143', '"Tiny Tiny RSS/1.11 (http://tt-rss.or

Sessions and page views per session

In [12]:
time=[]
page=[]
al=[]
for i in list(D)[:500]:
    for j,row in data.iterrows():
        if(i==tuple((row['Client_IP_address'],row['Client_Device']))):
            time.append(row['Time_of_activity'])
            page.append(row['Requested_Page'])
            al.append(i)

In [14]:
df=pd.DataFrame({'Time':time,'Pages':page,'Client':al})
df.head()

Unnamed: 0,Time,Pages,Client
0,[17/May/2015:10:05:03 +0000],"""GET /presentations/logstash-monitorama-2013/i...","(83.149.9.216, ""Mozilla/5.0 (Macintosh; Intel ..."
1,[17/May/2015:10:05:43 +0000],"""GET /presentations/logstash-monitorama-2013/i...","(83.149.9.216, ""Mozilla/5.0 (Macintosh; Intel ..."
2,[17/May/2015:10:05:47 +0000],"""GET /presentations/logstash-monitorama-2013/p...","(83.149.9.216, ""Mozilla/5.0 (Macintosh; Intel ..."
3,[17/May/2015:10:05:12 +0000],"""GET /presentations/logstash-monitorama-2013/p...","(83.149.9.216, ""Mozilla/5.0 (Macintosh; Intel ..."
4,[17/May/2015:10:05:07 +0000],"""GET /presentations/logstash-monitorama-2013/p...","(83.149.9.216, ""Mozilla/5.0 (Macintosh; Intel ..."


Five most referrer websites

In [15]:
referers = data['Referer_page'].dropna(how='all')

In [16]:
from urllib.parse import urlparse

In [17]:
ref=[]
for i in referers:
    parsed_uri = urlparse(i.replace('"', ''))
    result = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    ref.append(result)
word_counter = {}

In [18]:
for url in ref:
    if url in word_counter:
        word_counter[url] += 1
    else:
        word_counter[url] = 1

In [19]:
popular_words = sorted(word_counter, key = word_counter.get, reverse = True)

In [20]:
top_5 = popular_words[:6]
for i in top_5:
    if(i==':///'''):
        top_5.remove(i)

In [21]:
for i,j in enumerate(top_5,start=1):
    print(i,"",j)

1  http://www.semicomplete.com/
2  http://semicomplete.com/
3  http://www.google.com/
4  https://www.google.com/
5  http://stackoverflow.com/


Ten most frequent patterns

In [22]:
Pages_visited_per_session=list(df['Pages'])
Pages_accessed={}
for i,row in df.iterrows():
    if(row['Client'] in Pages_accessed):
        Pages_accessed[row['Client']].append(row['Pages'])
    else:
        Pages_accessed[row['Client']]=[row['Pages']]

In [23]:
for i in Pages_accessed:
    print('\n Client {} accessed these pages \n {}\n'.format(i,Pages_accessed[i]))


 Client ('83.149.9.216', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"') accessed these pages 
 ['"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Bold.ttf HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Regular.ttf HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/images/frontend-response-codes.png HTTP/1.1"', '"GET /presentations/logstash-monitorama-2013/images/kibana-das

In [24]:
all_pages=[]
for i in Pages_accessed:
    all_pages.append(Pages_accessed[i])

### Apriori

In [25]:
from apyori import apriori

In [26]:
association_rules = apriori(all_pages, min_support=0.1, min_confidence=0.5, min_lift=1, min_length=7)
association_results = list(association_rules)

In [28]:
support=[]
items=[]
rhs=[]
lhs=[]
con=[]
lift=[]
for i in association_results:
    support.append(i.support)
    items.append(i.items)
    rhs.append(i.ordered_statistics[0][1])
    lhs.append(i.ordered_statistics[0][0])
    con.append(i.ordered_statistics[0][2])
    lift.append(i.ordered_statistics[0][3])

In [29]:
df=pd.DataFrame({'Items':items,'Antecedent':lhs,'Precedent':rhs,'Support':support, 'Confidence':con,'Lift':lift})

In [31]:
df.head()

Unnamed: 0,Items,Antecedent,Precedent,Support,Confidence,Lift
0,"(""GET /favicon.ico HTTP/1.1"", ""GET /images/jor...","(""GET /favicon.ico HTTP/1.1"")","(""GET /images/jordan-80.png HTTP/1.1"")",0.18,0.580645,2.592166
1,"(""GET /favicon.ico HTTP/1.1"", ""GET /images/web...","(""GET /favicon.ico HTTP/1.1"")","(""GET /images/web/2009/banner.png HTTP/1.1"")",0.18,0.580645,2.639296
2,"(""GET /favicon.ico HTTP/1.1"", ""GET /reset.css ...","(""GET /favicon.ico HTTP/1.1"")","(""GET /reset.css HTTP/1.1"")",0.18,0.580645,2.524544
3,"(""GET /favicon.ico HTTP/1.1"", ""GET /style2.css...","(""GET /favicon.ico HTTP/1.1"")","(""GET /style2.css HTTP/1.1"")",0.18,0.580645,2.502781
4,"(""GET /images/jordan-80.png HTTP/1.1"", ""GET /i...","(""GET /images/jordan-80.png HTTP/1.1"")","(""GET /images/web/2009/banner.png HTTP/1.1"")",0.216,0.964286,4.383117


In [30]:
s_p=df[df['Support']>0.20].sort_values('Support',ascending=False)
for i,items in df.iterrows():
    # first index of the inner list
    # Contains base item and add item
    # pair = item[0]
    # items = [x for x in pair]
    # print(item[0])
    print("Rule: " + str(items['Antecedent']) + " -> " + str(items['Precedent']))
    #second index of the inner list
    print("Support: " + str(items['Support']))
    #third index of the list located at 0th
    #of the third index of the inner list
    print("Confidence: " + str(items['Confidence']))
    print("Lift: " + str(items['Lift']))
    print("=====================================")

Rule: frozenset({'"GET /favicon.ico HTTP/1.1"'}) -> frozenset({'"GET /images/jordan-80.png HTTP/1.1"'})
Support: 0.18
Confidence: 0.5806451612903225
Lift: 2.592165898617511
Rule: frozenset({'"GET /favicon.ico HTTP/1.1"'}) -> frozenset({'"GET /images/web/2009/banner.png HTTP/1.1"'})
Support: 0.18
Confidence: 0.5806451612903225
Lift: 2.6392961876832843
Rule: frozenset({'"GET /favicon.ico HTTP/1.1"'}) -> frozenset({'"GET /reset.css HTTP/1.1"'})
Support: 0.18
Confidence: 0.5806451612903225
Lift: 2.5245441795231414
Rule: frozenset({'"GET /favicon.ico HTTP/1.1"'}) -> frozenset({'"GET /style2.css HTTP/1.1"'})
Support: 0.18
Confidence: 0.5806451612903225
Lift: 2.5027808676307
Rule: frozenset({'"GET /images/jordan-80.png HTTP/1.1"'}) -> frozenset({'"GET /images/web/2009/banner.png HTTP/1.1"'})
Support: 0.216
Confidence: 0.9642857142857143
Lift: 4.383116883116883
Rule: frozenset({'"GET /images/jordan-80.png HTTP/1.1"'}) -> frozenset({'"GET /reset.css HTTP/1.1"'})
Support: 0.214
Confidence: 0.955