In [40]:
%pip install numpy pandas pyfpgrowth

Collecting pyfpgrowth
  Downloading pyfpgrowth-1.0.tar.gz (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
      --------------------------------------- 0.0/1.6 MB 991.0 kB/s eta 0:00:02
     ----- ---------------------------------- 0.2/1.6 MB 3.0 MB/s eta 0:00:01
     ------------------- -------------------- 0.8/1.6 MB 6.2 MB/s eta 0:00:01
     -------------------------------- ------- 1.3/1.6 MB 7.7 MB/s eta 0:00:01
     ---------------------------------------- 1.6/1.6 MB 8.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyfpgrowth
  Building wheel for pyfpgrowth (setup.py): started
  Building wheel for pyfpgrowth (setup.py): finished with status 'done'
  Created wheel for pyfpgrowth: filename=pyfpgrowth-1.0-py2.py3-none-any.whl size=5510 sha256=5ceb35a01c317c4199ad8afce598ad2889bef5e29715260f57d144893bc122b9
  Stored in directory: c:\users\ad

In [41]:
import pandas as pd
import numpy as np
import pyfpgrowth 

In [42]:
df = pd.read_csv('./data/www.csv', sep='\t')
df = df[df.notna().any(axis = 'columns')]
print('Number of rows before removing duplicates:', len(df)) 
df = df.drop_duplicates('Query', keep='first') 
df.reset_index(drop=True, inplace=True) 
print('Number of rows after removing duplicates:', len(df))

Number of rows before removing duplicates: 9999
Number of rows after removing duplicates: 9816


In [45]:
# Convert the Query column to a list of lists 
associations = df['Query'].apply(lambda x: x.split()).tolist() 
 
num_records = len(associations)
print(num_records)

9816


In [46]:
sigma = 100 
min_support = sigma / num_records 
print(f"Minimum support: {min_support}") 
 
# the function find_frequent_patterns takes SIGMA as second parameter 
patterns = pyfpgrowth.find_frequent_patterns(associations, sigma) 
patterns

Minimum support: 0.010187449062754686


{('high',): 101,
 ('with',): 104,
 ('my',): 110,
 ('home',): 113,
 ('you',): 117,
 ('i',): 117,
 ('is',): 118,
 ('state',): 123,
 ('what',): 128,
 ('city',): 132,
 ('florida',): 136,
 ('school',): 154,
 ('lyrics',): 175,
 ('how',): 178,
 ('how', 'to'): 130,
 ('new',): 222,
 ('http',): 224,
 ('free',): 225,
 ('on',): 232,
 ('county',): 237,
 ('a',): 340,
 ('to',): 426,
 ('and',): 546,
 ('for',): 553,
 ('in', 'the'): 117,
 ('of', 'the'): 207,
 ('in',): 844,
 ('of',): 955}

In [47]:
# number of frequent itemsets found 
num_frequent_itemsets = len(patterns) 
 
# maximum size of frequent itemsets 
max_itemset_size = max(len(itemset) for itemset in patterns) 
 
print(f'Number of frequent itemsets: {num_frequent_itemsets}') 
print(f'Maximum size of frequent itemsets: {max_itemset_size}')

Number of frequent itemsets: 28
Maximum size of frequent itemsets: 2


In [49]:
#  the support of an itemset is the fraction of records containing the items in the itemset (in this case, the keywords) 
support = {key: value / len(df) for key, value in patterns.items()} 
support 

{('high',): 0.010289323553382234,
 ('with',): 0.010594947025264874,
 ('my',): 0.011206193969030154,
 ('home',): 0.011511817440912795,
 ('you',): 0.011919315403422982,
 ('i',): 0.011919315403422982,
 ('is',): 0.01202118989405053,
 ('state',): 0.012530562347188265,
 ('what',): 0.013039934800325998,
 ('city',): 0.013447432762836185,
 ('florida',): 0.013854930725346373,
 ('school',): 0.015688671556642216,
 ('lyrics',): 0.0178280358598207,
 ('how',): 0.018133659331703342,
 ('how', 'to'): 0.013243683781581092,
 ('new',): 0.022616136919315404,
 ('http',): 0.022819885900570498,
 ('free',): 0.022921760391198046,
 ('on',): 0.023634881825590873,
 ('county',): 0.024144254278728607,
 ('a',): 0.03463732681336593,
 ('to',): 0.04339853300733496,
 ('and',): 0.055623471882640586,
 ('for',): 0.05633659331703342,
 ('in', 'the'): 0.011919315403422982,
 ('of', 'the'): 0.0210880195599022,
 ('in',): 0.08598207008964955,
 ('of',): 0.09729013854930725}