In [1]:
from mlxtend.frequent_patterns import fpgrowth, association_rules, apriori
import timeit
import pandas as pd
import numpy as np
import json
import csv

In [6]:
#! pip install mlxtend


0

In [29]:
#! python -m wget 'https://github.com/dbdmg/data-science-lab/raw/master/datasets/online_retail.csv' -o online_retail.csv
0

0

In [16]:
! python -m wget "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/modified_coco.json" -o coco.json


Saved under coco.json


In [19]:
file = "coco.json"
with open(file) as f:
    coco_data = json.load(f)


In [22]:
coco_data[0]

{'file_name': '000000095096.png',
 'image_id': 95096,
 'annotations': ['car', 'car', 'train', 'stop sign']}

{  
"file_name": "000000465265.png",  
"image_id": 465265,  
"annotations": \[  
    "person",  
    "person",  
    "person",  
    "fire hydrant",  
    "handbag",  
    "chair",  
    "cell phone"  
]  
}  
This means that the image contains 3 people, a fire hydrant, a handbag, a chair and a cell phone. 

In [23]:
len(coco_data)

5000

## 2.1 Association rules from frequent itemsets

This exercise will work on the Online Retail Data Set.  
In particular, you will do some data preprocessing on the dataset to extract all itemsets available (where each itemset is the collection of items contained in a single invoice).  
Then, using FP-Growth and Apriori implementations, you will extract a list of frequent itemsets.  
From those, you will finally extract several different association rules.

1. First, you need to load the dataset into memory, using the csv module.  
Make sure you identify all valid rows.  
Also consider that rows having an InvoiceNo that starts with C should be discarded, as they indicate that the invoice is about a cancelled purchase.

• InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter §c’, it indicates a cancellation.  
• StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each
distinct product.  
• Description: Product (item) name. Nominal.  
• Quantity: The quantities of each product (item) per transaction. Numeric.  
• InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.  
• UnitPrice: Unit price. Numeric, Product price per unit in sterling.  
• CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.  
• Country: Country name. Nominal, the name of the country where each customer resides.  

In [2]:
file = "online_retail.csv"
retail_data = []
with open(file) as f:
    label = next(f)
    labels_index = {i: k for i, k in enumerate(label.strip().split(","))}
    for row in csv.reader(f):
        if len(row) == 8 and row[0][0] not in ("c", "C"):
            retail_data.append(list([
                row[0],
                row[1],
                row[2].lower().strip(),
                float(row[3]),
                row[4],
                float(row[5]),
                row[6],
                row[7],]
            ))

labels_index

{0: 'InvoiceNo',
 1: 'StockCode',
 2: 'Description',
 3: 'Quantity',
 4: 'InvoiceDate',
 5: 'UnitPrice',
 6: 'CustomerID',
 7: 'Country'}

In [3]:
len(retail_data)

532621

2. Now that you have a dataset of items, you should aggregate it at an “invoice” level.  
For each invoice (identified by InvoiceNo) there can be multiple items (from multiple rows) in the dataset.  
For each invoice, you should build a list of all items belonging to it.

In [25]:
# chars of invoice
set([y for x in retail_data for y in x[0]])

{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A'}

In [4]:
invoice_itemset = {}
global_itemset = set()
for row in retail_data:
    invoice = row[0]
    stock_desc = row[2]
    if row[0] in invoice_itemset:
        invoice_itemset[invoice].append(stock_desc)
    else:
        invoice_itemset[invoice] = [stock_desc]
    global_itemset.add(stock_desc)

In [5]:
invoice_itemset["574021"]

['gardeners kneeling pad keep calm',
 'hot water bottle keep calm',
 'doormat keep calm and come in']

In [8]:
len(global_itemset), len(invoice_itemset)

(4179, 22064)

3. You should now have a list (one for each invoice) of lists (each list containing the items bought for
that invoice).  
Now, we need to convert this into a matrix form.  
Of the many possible formats, we will use the one expected by the Mlxtend library, which is as follows.  
Given an ordered list of M
possible items (in this case, all possible products that can be bought), and given N itemsets (in this
case, invoices), we should build a matrix of N rows and M columns.  
The element at the ith row and jth column should be 1 if the ith itemset (invoice) contains the jth item (product), 0 otherwise.

In [11]:
mat_fast = []
all_items_index = {k:i for i, k in enumerate(global_itemset)}
for iv in invoice_itemset.values():
    presence = np.zeros(len(global_itemset))
    mask = [all_items_index[val] for val in iv]
    presence[mask] = 1
    mat_fast.append(presence)
    

In [6]:
mat = []
for inv in invoice_itemset.values():
    row = []
    for gx in global_itemset:
        if gx in inv:
            row.append(1)
        else:
            row.append(0)
    mat.append(row)        

In [12]:
len(mat[0]), len(mat), len(mat_fast[0]), len(mat_fast)

(4179, 22064, 4179, 22064)

In [16]:
filt = mat_fast[0] > 0
mat_fast[0][filt]

array([1., 1., 1., 1., 1., 1., 1.])

In [14]:
df_np = pd.DataFrame(data= mat_fast, columns= global_itemset)
df_np.head()

Unnamed: 0,Unnamed: 1,cherry blossom luggage tag,pink tall porcelain t-light holder,small hammered silver candlepot,dolly girl baby gift set,christmas star wish list chalkboard,antique glass dressing table pot,wrap summer rose design,"letter ""r"" bling key ring",star t-light holder willie winkie,...,white rosebud pearl earrings,bundle of 3 school exercise books,skulls greeting card,lunch bag suki design,embossed heart trinket box,storage tin vintage doiley,red gingham rose jewellery box,wrap i love london,set 10 cards dinky tree 17076,mirrored wall art skulls
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df = pd.DataFrame(data= mat)

In [24]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4169,4170,4171,4172,4173,4174,4175,4176,4177,4178
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


4. With the df that you defined in the previous exercise,  
you can now use the fp_growth function. This function, which is described in the detail in the official documentation.  
The first argument required is the previously built DataFrame, df.  
The second is the minimum support (minsup), i.e. the minimum fraction of the entire dataset in which the itemset should show up for it to be considered “frequent”.  
Try using different values of minsup, such as 0.5, 0.1, 0.05, 0.02, 0.01.  
How many results do you obtain as minsup varies?  
You can check the number of frequent itemsets identified and print them all with the following code snipped:
```
    fi = fpgrowth(df, 0.05)
    print(len(fi))
    print(fi.to_string())


In [25]:
for x in (0.5,0.1,0.05,0.02,0.01):
    print(x, len(fpgrowth(df, x)))

0.5 0
0.1 1
0.05 23
0.02 304
0.01 1471


In [26]:
fi = fpgrowth(df, 0.1)


In [27]:
fi

Unnamed: 0,support,itemsets
0,0.102429,(2821)


In [28]:
df.iloc[:,2821].sum() / len(df)

0.10242929659173314

In [31]:
fi.values[0][1]

frozenset({2821})

given a tupple (spotty bunting, party bunting), we can check if
spotty bunting ==> party bunting  
or 
party bunting ==> spotty bunting  

lets say minconf = 0.5

In [19]:
# Get all the supports
filt = fi.itemsets == frozenset(('spotty bunting', 'party bunting'))
P_sp = fi.loc[filt, "support"].values[0]
filt = fi.itemsets == frozenset(['spotty bunting'])
P_s = fi.loc[filt, "support"].values[0]
filt = fi.itemsets == frozenset(['party bunting'])
P_p = fi.loc[filt, "support"].values[0]


In [20]:
# spotty bunting ==> party bunting
conf = P_sp / P_s
conf

0.4233128834355828

In [21]:
# party bunting ==> spotty bunting
conf = P_sp / P_p
conf

0.2864768683274021

we can say that [ spotty bunting ==> party bunting ] holds with a probability of above 40%

In [22]:
association_rules(fi, min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(alarm clock bakelike red),(alarm clock bakelike green),0.047634,0.044416,0.029007,0.608944,13.709936,0.026891,2.443597
1,(alarm clock bakelike green),(alarm clock bakelike red),0.044416,0.047634,0.029007,0.653061,13.709936,0.026891,2.745054
2,(alarm clock bakelike pink),(alarm clock bakelike red),0.035488,0.047634,0.021302,0.600255,12.601366,0.019611,2.382436
3,(wooden frame antique white),(wooden picture frame white finish),0.044008,0.049855,0.024429,0.555098,11.134253,0.022235,2.135627
4,(red hanging heart t-light holder),(white hanging heart t-light holder),0.033584,0.102429,0.022435,0.668016,6.52173,0.018995,2.703658
5,(jumbo bag pink polkadot),(jumbo bag red retrospot),0.055203,0.094815,0.037391,0.67734,7.143799,0.032157,2.805382
6,(jumbo storage bag suki),(jumbo bag red retrospot),0.053662,0.094815,0.032814,0.611486,6.449253,0.027726,2.329867
7,(jumbo bag baroque black white),(jumbo bag red retrospot),0.042286,0.094815,0.026514,0.62701,6.612974,0.022504,2.426832
8,(jumbo bag pink vintage paisley),(jumbo bag red retrospot),0.039204,0.094815,0.023477,0.598844,6.315914,0.01976,2.256441
9,(60 teatime fairy cake cases),(pack of 72 retrospot cake cases),0.037527,0.059826,0.020531,0.547101,9.144884,0.018286,2.075904


7. Extract the association rules from the frequent itemsets extracted with minsup = 0.01.  
You can find the documentation for association_rules() on the official documentation.  
You can use the confidence as the metric to identify the rules, and a minimum threshold of 0.85  
(feel free to vary
these values and observe how the results vary).


In [23]:
fi2 = fpgrowth(df, min_support= 0.01, use_colnames= True)

In [145]:
? association_rules

[1;31mSignature:[0m
 [0massociation_rules[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mmetric[0m[1;33m=[0m[1;34m'confidence'[0m[1;33m,[0m[1;33m
[0m    [0mmin_threshold[0m[1;33m=[0m[1;36m0.8[0m[1;33m,[0m[1;33m
[0m    [0msupport_only[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Generates a DataFrame of association rules including the
metrics 'score', 'confidence', and 'lift'

Parameters
-----------
df : pandas DataFrame
  pandas DataFrame of frequent itemsets
  with columns ['support', 'itemsets']

metric : string (default: 'confidence')
  Metric to evaluate if a rule is of interest.
  **Automatically set to 'support' if `support_only=True`.**
  Otherwise, supported metrics are 'support', 'confidence', 'lift',
  'leverage', and 'conviction'
  These metrics are computed as follows:

  - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]

  - confidence(A->C) =

In [24]:
association_rules(fi2, min_threshold=0.85)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(charlotte bag pink polkadot, strawberry charl...",(red retrospot charlotte bag),0.017177,0.046864,0.014775,0.860158,18.354481,0.01397,6.815824
1,"(charlotte bag pink polkadot, charlotte bag su...",(red retrospot charlotte bag),0.012282,0.046864,0.011104,0.904059,19.291256,0.010528,9.934613
2,"(charlotte bag pink polkadot, strawberry charl...",(red retrospot charlotte bag),0.01192,0.046864,0.010968,0.920152,19.634657,0.010409,11.936898
3,"(woodland charlotte bag, charlotte bag suki de...",(red retrospot charlotte bag),0.013733,0.046864,0.011784,0.858086,18.310257,0.01114,6.716286
4,"(jumbo shopper vintage red paisley, jumbo stor...",(jumbo bag red retrospot),0.014005,0.094815,0.012146,0.867314,9.147426,0.010819,6.822003
5,"(set/20 red retrospot paper napkins, set/6 red...",(set/6 red spotty paper plates),0.013416,0.023885,0.012011,0.89527,37.482435,0.01169,9.320323
6,(herb marker rosemary),(herb marker thyme),0.010877,0.010741,0.010016,0.920833,85.726864,0.009899,12.495897
7,(herb marker thyme),(herb marker rosemary),0.010741,0.010877,0.010016,0.932489,85.726864,0.009899,14.651378
8,"(charlotte bag pink polkadot, charlotte bag su...",(red retrospot charlotte bag),0.013234,0.046864,0.011376,0.859589,18.342333,0.010756,6.78819
9,"(suki shoulder bag, jumbo bag red retrospot)",(dotcom postage),0.012781,0.032088,0.011285,0.882979,27.517009,0.010875,8.271244


8. (*) Rerun the experiments from point 4 with apriori()
Do the results match with the ones found by FP-Growth?  
Is Apriori faster or slower than FP-Growth?
You can measure how long a function call takes with the following code snippet:
```python
import timeit
# number=1 means that it executes the function only once
timeit.timeit(lambda: apriori(df, 0.01), number=1)
```




In [158]:
? apriori

[1;31mSignature:[0m
 [0mapriori[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mmin_support[0m[1;33m=[0m[1;36m0.5[0m[1;33m,[0m[1;33m
[0m    [0muse_colnames[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mmax_len[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mlow_memory[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Get frequent itemsets from a one-hot DataFrame

Parameters
-----------
df : pandas DataFrame
  pandas DataFrame the encoded format. Also supports
  DataFrames with sparse data; for more info, please
  see (https://pandas.pydata.org/pandas-docs/stable/
       user_guide/sparse.html#sparse-data-structures)

  Please note that the old pandas SparseDataFrame format
  is no longer supported in mlxtend >= 0.17.2.

  The allowed values are either 0/1 or True/False.
  For example,



In [25]:
ap_fi = apriori(df, min_support= 0.02)
ap_fi

Unnamed: 0,support,itemsets
0,0.065899,(0)
1,0.037527,(3)
2,0.036213,(7)
3,0.052438,(19)
4,0.023885,(50)
...,...,...
299,0.020305,"(3720, 2655)"
300,0.021347,"(3844, 2655)"
301,0.023794,"(4048, 2813)"
302,0.020350,"(3844, 3189)"


In [162]:
? timeit.timeit

[1;31mSignature:[0m
 [0mtimeit[0m[1;33m.[0m[0mtimeit[0m[1;33m([0m[1;33m
[0m    [0mstmt[0m[1;33m=[0m[1;34m'pass'[0m[1;33m,[0m[1;33m
[0m    [0msetup[0m[1;33m=[0m[1;34m'pass'[0m[1;33m,[0m[1;33m
[0m    [0mtimer[0m[1;33m=[0m[1;33m<[0m[0mbuilt[0m[1;33m-[0m[1;32min[0m [0mfunction[0m [0mperf_counter[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mnumber[0m[1;33m=[0m[1;36m1000000[0m[1;33m,[0m[1;33m
[0m    [0mglobals[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Convenience function to create Timer object and call timeit method.
[1;31mFile:[0m      c:\programdata\anaconda3\lib\timeit.py
[1;31mType:[0m      function


In [26]:
timeit.timeit(lambda: apriori(df, 0.02), number=1)


63.1158604000002

In [27]:
timeit.timeit(lambda: fpgrowth(df, 0.02), number=1)


8.867982900000015

## 2.2 Apriori implementation

In [2]:
 from itertools import combinations, permutations
 from collections import defaultdict

In [3]:
data = [
    {'a','b'},
    {'b','c','d'},
    {'a','c','d','e'},
    {'a','d','e'},
    {'a','b','c'},
    {'a','b','c','d'},
    {'b','c'},
    {'a','b','c'},
    {'a','b','d'},
    {'b','c','e'}
]

In [4]:
def scan(data, itemset, limit):
    """Scans dataset and computes support for given itemsets
    Parameters:
    data: list of all transactions
    itemset: list of transactions to be evaluated
    limit: float; minsup threshold

    Returns:
    a list of dictionaries, where each row is a frequent item with values as support 
    """
    N = len(data)
    C = defaultdict(lambda: 0)
    for items in itemset:
        #items = line.split("_")
        support_count = 0
        for t in data:
            if len(t) >= len(items):
                # only the transactions with same or larger N. of items
                support_count += all([x in t for x in items])
        if support_count/N >= limit:
            C[tuple(items)] = support_count/N
    return dict(C)


In [5]:
def prune_by_subset(f_subsets, new_cands):
    for items in new_cands:
        combs = combinations(items, len(items) - 1)
        all_combs = [tuple(i) for i in combs]
        for c in all_combs:
            if c not in f_subsets:
                if items in new_cands:
                    new_cands.remove(items)
                continue
    return new_cands


In [11]:
def generate_candidates(itemset):
    new_cands = []
    pref_main = ""

    # Edge case, when creating candidates for level 2
    if len(itemset[0]) == 1:
        combs = combinations([t for l in itemset for t in l], 2)
        return list(combs)

    for i in range(len(itemset)):
        main_pref = itemset[i][:-1]
        for j in range(i+1, len(itemset)):
            pref = itemset[j][:-1]
            if set(pref) == set(main_pref):
                new_cand = itemset[i] + (itemset[j][-1],)
                new_cands.append(new_cand)
    return new_cands

#pref_matching([('a', 'b'), ('a', 'c'), ('a', 'd'), ('a', 'e'), ('b', 'c'), ('b', 'd'), ('c', 'd'), ('c', 'e'), ('d', 'e')])



In [13]:
def main_apriori(data,minsup = 0.1):
    """ Apriori algorithm to find frequent transactions
    __parameters:__  
         data: unorderd list of transactions with unique, unordered items  
         minsup: minimum support level for a transaction to be frequent  
         where support(t) = freq(t) / |data|

     __returns:__
         A list of dictionaries,  
         where keys are the tuples representing frequent transactions and values as their Support.
    """

    from itertools import combinations
    from collections import defaultdict

    # level: 1
    candidates = []
    fr_items = []
    new_cands = [tuple([w], ) for w in sorted(set([x for subset in data for x in subset ]))]
    fr_items.append(scan(data, new_cands, minsup))
    
    # levels > 1
    while(fr_items[-1] != {}):
        prev_fr_items = list(fr_items[-1].keys())
        new_cands = generate_candidates(prev_fr_items)
        new_cands = prune_by_subset(prev_fr_items, new_cands)
        fr_items.append(scan(data, new_cands, minsup))

    return fr_items[:-1]



In [8]:
from pprint import PrettyPrinter
pp = PrettyPrinter(width=20, compact=True, indent = 4)

In [14]:
fi = main_apriori(data, 0.2)
pp.pprint(fi)

[   {   ('a',): 0.7,
        ('b',): 0.8,
        ('c',): 0.7,
        ('d',): 0.5,
        ('e',): 0.3},
    {   ('a', 'b'): 0.5,
        ('a', 'c'): 0.4,
        ('a', 'd'): 0.4,
        ('a', 'e'): 0.2,
        ('b', 'c'): 0.6,
        ('b', 'd'): 0.3,
        ('c', 'd'): 0.3,
        ('c', 'e'): 0.2,
        ('d', 'e'): 0.2},
    {   ('a', 'b', 'c'): 0.3,
        ('a', 'b', 'd'): 0.2,
        ('a', 'c', 'd'): 0.2,
        ('a', 'd', 'e'): 0.2,
        ('b', 'c', 'd'): 0.2}]


In [15]:
file = "coco.json"
with open(file) as f:
    coco_data = json.load(f)

In [16]:
coco_data_items = [set(image['annotations']) for image in coco_data]

In [68]:
coco_data_items[:2]

[{'car', 'stop sign', 'train'},
 {'bench', 'chair', 'dining table', 'person', 'potted plant'}]

In [17]:
coco_fi = main_apriori(coco_data_items, 0.02)
#pp.pprint(coco_fi)
sum([len(x) for x in coco_fi])

144

In [75]:
global_itemset = set([x for t in coco_data_items for x in t ])

In [76]:
global_itemset.__len__()

78

In [77]:

mat = []
for inv in coco_data_items:
    row = []
    for gx in global_itemset:
        if gx in inv:
            row.append(1)
        else:
            row.append(0)
    mat.append(row)   

In [78]:
df = pd.DataFrame(data = mat, columns= global_itemset)
df.head()

Unnamed: 0,person,scissors,mouse,tennis racket,banana,car,vase,skateboard,fire hydrant,oven,...,umbrella,bird,fork,couch,handbag,toothbrush,airplane,bear,clock,horse
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
print(fpgrowth(df, 0.1, use_colnames= True).to_string())

    support                      itemsets
0    0.3704                         (car)
1    0.1332                   (stop sign)
2    0.5886                      (person)
3    0.4338                       (bench)
4    0.1346                (fire hydrant)
5    0.1286                       (truck)
6    0.3230               (traffic light)
7    0.1230                     (handbag)
8    0.2386                 (car, person)
9    0.3208               (person, bench)
10   0.1032                  (car, truck)
11   0.1978          (traffic light, car)
12   0.1902       (traffic light, person)
13   0.1362  (traffic light, car, person)
14   0.1224             (person, handbag)


In [83]:
print(apriori(df, 0.02, use_colnames= True).to_string())

     support                                       itemsets
0     0.5886                                       (person)
1     0.0214                                (tennis racket)
2     0.3704                                          (car)
3     0.0344                                   (skateboard)
4     0.1346                                 (fire hydrant)
5     0.3230                                (traffic light)
6     0.0354                                   (cell phone)
7     0.4338                                        (bench)
8     0.0602                                        (chair)
9     0.0852                                     (backpack)
10    0.0300                               (baseball glove)
11    0.1286                                        (truck)
12    0.0368                                  (sports ball)
13    0.0762                                      (bicycle)
14    0.0276                                          (dog)
15    0.0410                            

In [18]:
timeit.timeit(lambda: main_apriori(coco_data_items, 0.02), number= 10)/10

2.382462940000005

In [224]:
timeit.timeit(lambda: apriori(df, 0.02, use_colnames= True), number= 1)

0.38145710000026156

In [225]:
timeit.timeit(lambda: fpgrowth(df, 0.02, use_colnames= True), number= 1)

0.22929500000100234

In [None]:
! pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install 