# Test of algorithm efficiency

In [29]:
import numpy as np
import pandas as pd
from apyori import apriori as apriori_baseline
import datetime
import time
from tqdm import tqdm

# efficient_apriori
from efficient_apriori import apriori as eff_apriori

# mlxtend
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.preprocessing import TransactionEncoder

print("Libraries imported")

Libraries imported


In [30]:
# Load the cleaned dataset from a pickle file; the benchmark cells below read
# its 'date' and 'text' columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# produced by this project, never files from an untrusted source.
df = pd.read_pickle("../data/clean_df/clean_df.pkl")

# Time measurement of the baseline and the implemented algorithm
We run both algorithms 10 times and compare their execution times at the end.

## Implementation of mlxtend
I intended to use this as a baseline, but in practice it produces the same results as efficient apriori while taking less computation time.

In [31]:
# Benchmark of mlxtend's Apriori: each of the 10 runs processes every day of
# the dataset and the total elapsed time of the run is stored in mlxtend_times.
mlxtend_times = []

for i in tqdm(range(10)):

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    start = time.perf_counter()

    curr_day = datetime.date(2020, 7, 24)  # first day of the data
    last_day = datetime.date(2020, 8, 30)  # last day of the data

    # iterate on every day of the dataset (until we reach the last one)
    while curr_day <= last_day:

        # prepare the list of items of the day (one entry per row of that day)
        texts_for_apriori = list(df.loc[df['date'] == curr_day, "text"])

        # computation of APriori [only if Total tweets > 0]
        # The one-hot encoding is only needed as input for apriori, so it is
        # skipped as well on days without data.
        if len(texts_for_apriori) > 0:
            te = TransactionEncoder()
            te_ary = te.fit(texts_for_apriori).transform(texts_for_apriori)
            mlxtend_input = pd.DataFrame(te_ary, columns=te.columns_)

            res = mlxtend_apriori(mlxtend_input, min_support=0.012)

        # add 1 day and repeat the procedure
        curr_day = curr_day + datetime.timedelta(days=1)

    end = time.perf_counter()
    mlxtend_apriori_time = end - start
    mlxtend_times.append(mlxtend_apriori_time)

100%|██████████| 10/10 [03:27<00:00, 20.76s/it]


## Implementation of efficient apriori

In [32]:
# Benchmark of efficient_apriori: each of the 10 runs processes every day of
# the dataset and the total elapsed time of the run is stored in
# eff_apriori_times.
eff_apriori_times = []

for i in tqdm(range(10)):

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    start = time.perf_counter()

    curr_day = datetime.date(2020, 7, 24)  # first day of the data
    last_day = datetime.date(2020, 8, 30)  # last day of the data

    # iterate on every day of the dataset (until we reach the last one)
    while curr_day <= last_day:

        # prepare the list of items of the day (one entry per row of that day)
        texts_for_apriori = list(df.loc[df['date'] == curr_day, "text"])

        # computation of APriori [only if Total tweets > 0]
        # Note that this call also derives association rules (min_confidence),
        # in addition to the frequent itemsets.
        if len(texts_for_apriori) > 0:
            res1, res2 = eff_apriori(texts_for_apriori, min_support=0.012, min_confidence=0.8)

        # add 1 day and repeat the procedure
        curr_day = curr_day + datetime.timedelta(days=1)

    end = time.perf_counter()
    eff_apriori_time = end - start
    eff_apriori_times.append(eff_apriori_time)

100%|██████████| 10/10 [10:38<00:00, 63.80s/it]


In [34]:
# Put the per-run timings of both implementations side by side:
# one row per benchmark run, one column per library.
res_df = pd.DataFrame(
    dict(
        eff_apriori_times=eff_apriori_times,
        mlxtend_times=mlxtend_times,
    )
)

res_df

Unnamed: 0,eff_apriori_times,mlxtend_times
0,63.44737,20.809627
1,63.320418,20.751803
2,63.164772,20.748259
3,63.634666,20.852998
4,63.63354,20.732749
5,63.416001,20.758522
6,64.004702,20.741559
7,65.473641,20.738574
8,64.172281,20.745498
9,63.741084,20.734611


In [36]:
# Average elapsed time per run for each library. Dividing by the actual number
# of recorded runs (instead of a hard-coded 10) keeps the mean correct if the
# number of benchmark repetitions is changed.
print("Mean eff_apriori_times: ", sum(eff_apriori_times) / len(eff_apriori_times))
print("Mean mlxtend_times: ", sum(mlxtend_times) / len(mlxtend_times))

Mean eff_apriori_times:  63.8008475780487
Mean mlxtend_times:  20.76141996383667


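## The results are clear: *mlxtend* is the faster implementation of APriori (about 3x faster on average in this benchmark)

Note: the efficient apriori call also generates association rules (`min_confidence=0.8`), while the mlxtend call only computes the frequent itemsets, so the two timings do not measure exactly the same work.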