In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('datasets/WEB_LOG_DATA.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5866 entries, 0 to 5865
Data columns (total 6 columns):
ip           5866 non-null object
date_time    5866 non-null object
request      5866 non-null object
step         5866 non-null int64
session      5866 non-null int64
user_id      5866 non-null int64
dtypes: int64(3), object(3)
memory usage: 275.0+ KB


In [2]:
df.head()

Unnamed: 0,ip,date_time,request,step,session,user_id
0,c210-49-32-6.rochd2.,18/Aug/2017:21:25:07,/,1,3,3
1,visp.inabox.telstra.,19/Aug/2017:08:24:28,/,1,12,12
2,adsl-61-95-54-84.requ,19/Aug/2017:08:33:01,/,1,13,13
3,d220-236-91-52.adsl.n,19/Aug/2017:09:16:06,/,1,15,15
4,allptrs.eq.edu.au,19/Aug/2017:09:47:54,/,1,22,22


In [3]:
df = df[df['request']!='/robots.txt']
df = df[df['request']!='/favicon.ico']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4917 entries, 0 to 5865
Data columns (total 6 columns):
ip           4917 non-null object
date_time    4917 non-null object
request      4917 non-null object
step         4917 non-null int64
session      4917 non-null int64
user_id      4917 non-null int64
dtypes: int64(3), object(3)
memory usage: 268.9+ KB


In [4]:
sessions = df.groupby(['user_id'])['request'].apply(list)
print(sessions.head(10))

user_id
2                         [/code/Global/code/menu.html]
3      [/, /guarantee.html, /more.html, /services.html]
5                            [/code/Ultra/services.htm]
6     [/richlands, /richlands/, /richlands/fileuploa...
7     [/newfarm, /newfarm/specials/, /richlands, /ri...
9                        [/code/Global/code/oform.html]
10    [/cgi-bin/FormMail.pl, /cgi-bin/FormMail.pl, /...
11    [/newfarm/, /richlands/, /richlands/contact/, ...
12                                                  [/]
13                                    [/, /direct.html]
Name: request, dtype: object


In [5]:
from apyori import apriori
# type cast the transactions from pandas into normal list format and run apriori
session_list = list(sessions)
results = list(apriori(session_list, min_support=0.03, min_confidence=0.1))
# print first 5 rules
print(results[:5])

[RelationRecord(items=frozenset({'/'}), support=0.4399271844660194, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'/'}), confidence=0.4399271844660194, lift=1.0)]), RelationRecord(items=frozenset({'/newfarm/'}), support=0.15351941747572814, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'/newfarm/'}), confidence=0.15351941747572814, lift=1.0)]), RelationRecord(items=frozenset({'/newfarm/javascript/menu.js'}), support=0.16868932038834952, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'/newfarm/javascript/menu.js'}), confidence=0.16868932038834952, lift=1.0)]), RelationRecord(items=frozenset({'/newfarm/pricelist'}), support=0.10800970873786407, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'/newfarm/pricelist'}), confidence=0.10800970873786407, lift=1.0)]), RelationRecord(items=frozenset({'/newfarm/pricelist/'}), support=0.1049757281553398, ordere

In [6]:
def convert_apriori_results_to_pandas_df(results):
    rules = []

    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, items_add = right side
            # support, confidence and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
            rule_set.support, rule.confidence, rule.lift])

    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])
result_df = convert_apriori_results_to_pandas_df(results)
result_df.head(20)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
0,,/,0.439927,0.439927,1.0
1,,/newfarm/,0.153519,0.153519,1.0
2,,/newfarm/javascript/menu.js,0.168689,0.168689,1.0
3,,/newfarm/pricelist,0.10801,0.10801,1.0
4,,/newfarm/pricelist/,0.104976,0.104976,1.0
5,,/services.html,0.141383,0.141383,1.0
6,/direct.html,/,0.037621,0.704545,1.601505
7,/index.html,/,0.03034,0.675676,1.535881
8,/,/more.html,0.067961,0.154483,1.831565
9,/more.html,/,0.067961,0.805755,1.831565


In [7]:
# sort all acquired rules descending by lift
result_df = result_df.sort_values(by='Lift', ascending=False)
result_df.head(50)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
170,"/richlands,/richlands/javascript/menu.js",/richlands/,0.033981,0.982456,20.757535
164,"/newfarm/specials/,/newfarm/pricelist",/newfarm/specials,0.032767,0.830769,20.744056
237,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.031553,0.825397,20.609909
229,"/newfarm/specials/,/newfarm/pricelist/,/newfar...",/newfarm/specials,0.030947,0.822581,20.539589
256,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.030947,0.822581,20.539589
225,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.030947,0.822581,20.539589
86,"/richlands/,/",/richlands,0.03034,0.980392,20.451725
85,"/richlands,/",/richlands/,0.03034,0.961538,20.315582
167,"/newfarm/specials/,/newfarm/pricelist/",/newfarm/specials,0.031553,0.8125,20.287879
171,"/richlands/,/richlands/javascript/menu.js",/richlands,0.033981,0.933333,19.470042


In [8]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 170 to 12
Data columns (total 5 columns):
Left_side     260 non-null object
Right_side    260 non-null object
Support       260 non-null float64
Confidence    260 non-null float64
Lift          260 non-null float64
dtypes: float64(3), object(2)
memory usage: 12.2+ KB


In [9]:
result_df

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
170,"/richlands,/richlands/javascript/menu.js",/richlands/,0.033981,0.982456,20.757535
164,"/newfarm/specials/,/newfarm/pricelist",/newfarm/specials,0.032767,0.830769,20.744056
237,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.031553,0.825397,20.609909
229,"/newfarm/specials/,/newfarm/pricelist/,/newfar...",/newfarm/specials,0.030947,0.822581,20.539589
256,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.030947,0.822581,20.539589
225,"/newfarm/specials/,/newfarm/pricelist,/newfarm...",/newfarm/specials,0.030947,0.822581,20.539589
86,"/richlands/,/",/richlands,0.030340,0.980392,20.451725
85,"/richlands,/",/richlands/,0.030340,0.961538,20.315582
167,"/newfarm/specials/,/newfarm/pricelist/",/newfarm/specials,0.031553,0.812500,20.287879
171,"/richlands/,/richlands/javascript/menu.js",/richlands,0.033981,0.933333,19.470042
