# Test of algorithm efficiency

In [1]:
import numpy as np
import pandas as pd
from apyori import apriori as apriori_baseline
import datetime
import time
from tqdm import tqdm

# EFFICIENT APRIORI
from efficient_apriori import apriori as eff_apriori

# MLXTEND APRIORI
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.preprocessing import TransactionEncoder

# Confirmation message: reaching this line means every import above succeeded.
print("Libraries imported")

Libraries imported


In [2]:
# Load the cleaned dataset used by the benchmark cells (they read its
# 'date' and 'text' columns).
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
clean_df_path = "../data/clean_df/clean_df.pkl"
df = pd.read_pickle(clean_df_path)

# Time measurement of the baseline and the implemented algorithm
We run both algorithms `n` times and compare their execution times at the end.

In [3]:
# Number of repetitions of each benchmark loop; every repetition processes
# the whole date range once and yields one timing sample.
n = 100

## Implementation of mlxtend
I intended to use this as a baseline, but in practice it produces the same results as efficient apriori while taking less time to compute.

In [4]:
# Benchmark of mlxtend's apriori: each of the `n` repetitions mines the
# frequent itemsets of every day in the date range and records how long the
# whole pass took (in seconds) in `mlxtend_times`.
mlxtend_times = []

first_day = datetime.date(2020, 7, 24)  # first day of the data
last_day = datetime.date(2020, 8, 30)  # last day of the data

for i in tqdm(range(n)):

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot be skewed by system clock adjustments during the long run
    start = time.perf_counter()

    curr_day = first_day

    # iterate on every day of the dataset (until we reach the last one)
    while curr_day <= last_day:

        # prepare the list of items of the day
        # (presumably each 'text' entry is a list of tokens - TODO confirm)
        texts_for_apriori = df.loc[df['date'] == curr_day, "text"].tolist()

        # computation of APriori [only if Total tweets > 0]
        if texts_for_apriori:
            # mlxtend needs a one-hot encoded DataFrame, so the encoding is
            # deliberately kept inside the timed section; it is skipped for
            # empty days, where there is nothing to mine
            te = TransactionEncoder()
            te_ary = te.fit(texts_for_apriori).transform(texts_for_apriori)
            mlxtend_input = pd.DataFrame(te_ary, columns=te.columns_)

            res = mlxtend_apriori(mlxtend_input, min_support=0.012)

        # add 1 day and repeat the procedure
        curr_day = curr_day + datetime.timedelta(days=1)

    end = time.perf_counter()
    mlxtend_apriori_time = end - start
    mlxtend_times.append(mlxtend_apriori_time)

100%|██████████| 100/100 [34:33<00:00, 20.73s/it]


## Implementation of efficient apriori

In [5]:
# Benchmark of efficient_apriori: each of the `n` repetitions runs the
# algorithm on every day in the date range and records how long the whole
# pass took (in seconds) in `eff_apriori_times`.
eff_apriori_times = []

first_day = datetime.date(2020, 7, 24)  # first day of the data
last_day = datetime.date(2020, 8, 30)  # last day of the data

for i in tqdm(range(n)):

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot be skewed by system clock adjustments during the long run
    start = time.perf_counter()

    curr_day = first_day

    # iterate on every day of the dataset (until we reach the last one)
    while curr_day <= last_day:

        # prepare the list of items of the day
        # (presumably each 'text' entry is a list of tokens - TODO confirm)
        texts_for_apriori = df.loc[df['date'] == curr_day, "text"].tolist()

        # computation of APriori [only if Total tweets > 0]
        if texts_for_apriori:
            # NOTE(review): this call returns two results (itemsets and
            # association rules per the library docs), whereas the mlxtend
            # benchmark only mines itemsets - the timings may therefore not be
            # strictly like-for-like; confirm before drawing conclusions
            res1, res2 = eff_apriori(texts_for_apriori, min_support=0.012)

        # add 1 day and repeat the procedure
        curr_day = curr_day + datetime.timedelta(days=1)

    end = time.perf_counter()
    eff_apriori_time = end - start
    eff_apriori_times.append(eff_apriori_time)

100%|██████████| 100/100 [1:46:11<00:00, 63.72s/it]


In [6]:
# Side-by-side table of the per-run timings (seconds): one row per
# repetition, one column per library.
timing_columns = {
    'eff_apriori_times': eff_apriori_times,
    'mlxtend_times': mlxtend_times,
}
res_df = pd.DataFrame(timing_columns)

res_df

Unnamed: 0,eff_apriori_times,mlxtend_times
0,63.780246,21.111056
1,63.704696,20.992171
2,63.827229,20.954445
3,64.018045,20.916792
4,64.135442,20.849082
...,...,...
95,65.233571,20.676677
96,66.813680,20.701171
97,66.007201,20.714602
98,65.957592,20.688023


In [8]:
# Average over the samples actually recorded (the length of each list) rather
# than the global `n`, which would give a wrong mean if a benchmark cell had
# been re-run with a different `n` than the one currently in the kernel.
print("Mean eff_apriori_times: ", sum(eff_apriori_times) / len(eff_apriori_times))
print("Mean mlxtend_times: ", sum(mlxtend_times) / len(mlxtend_times))

Mean eff_apriori_times:  63.71553538799286
Mean mlxtend_times:  20.733588185310364


## The results are clear: in this benchmark, *mlxtend* is the faster implementation of Apriori