# Test of algorithm scalability

In [1]:
import numpy as np
import pandas as pd
from apyori import apriori as apriori_baseline
import datetime
import time
from tqdm import tqdm

# MLXTEND APRIORI
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.preprocessing import TransactionEncoder

print("Libraries imported")

Libraries imported


In [2]:
df = pd.read_pickle("../data/clean_df/clean_df.pkl")

In [3]:
texts_for_apriori = []
for t in df["text"]:
    texts_for_apriori.append(t)

# to reach over 1M tweets, we must duplicate the df at least 6 times
texts_for_apriori = texts_for_apriori*6

print("Total tweets: ", len(texts_for_apriori))
texts_for_apriori[0:4]

Total tweets:  1074648


[['smell',
  'scent',
  'hand',
  'sanit',
  'today',
  'someon',
  'past',
  'would',
  'think',
  'intox'],
 ['hey',
  'yanke',
  'yankeespr',
  'mlb',
  'made',
  'sens',
  'player',
  'pay',
  'respect'],
 ['dian',
  'wdunlap',
  'realdonaldtrump',
  'trump',
  'never',
  'claim',
  'covid',
  'hoax',
  'claim',
  'effort'],
 ['brookbanktv',
  'gift',
  'covid',
  'give',
  'appreci',
  'simpl',
  'thing',
  'alway',
  'around']]

# Time measurament of APriori based on total number of tweets
We run `n` times the algorithm, each time doubling the number of tweets. We keep track of the times of each run.

In [4]:
tweets_tot_number = [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000, 512000, 1024000]
min_support = 0.012
mlxtend_times = []

for i in range(len(tweets_tot_number)):

    tot_tweets = tweets_tot_number[i]
    texts_for_apriori_custom = texts_for_apriori[0:tot_tweets]

    start = time.time()

    te = TransactionEncoder()
    te_ary = te.fit(texts_for_apriori_custom).transform(texts_for_apriori_custom)
    mlxtend_input = pd.DataFrame(te_ary, columns=te.columns_)
    res = mlxtend_apriori(mlxtend_input, min_support=min_support)

    end = time.time()
    mlxtend_apriori_time = end - start
    mlxtend_times.append([tot_tweets, mlxtend_apriori_time])

    print(">", mlxtend_times[i])

> [1000, 0.04387521743774414]
> [2000, 0.14162588119506836]
> [4000, 1.2596051692962646]
> [8000, 2.7318878173828125]
> [16000, 1.5887510776519775]
> [32000, 3.8342647552490234]
> [64000, 14.126312255859375]
> [128000, 42.44559717178345]
> [256000, 2044.1502013206482]


MemoryError: Unable to allocate 49.9 GiB for an array with shape (512000, 104583) and data type bool

### Results

In [5]:
tw_list = []
times_list = []

for i in range(len(mlxtend_times)):
    tw_list.append(mlxtend_times[i][0])
    times_list.append(mlxtend_times[i][1])

In [6]:
res_df = pd.DataFrame(
    {'number_of_tweets': tw_list,
     'time': times_list
    })

res_df

Unnamed: 0,number_of_tweets,time
0,1000,0.043875
1,2000,0.141626
2,4000,1.259605
3,8000,2.731888
4,16000,1.588751
5,32000,3.834265
6,64000,14.126312
7,128000,42.445597
8,256000,2044.150201


In [7]:
res_df.to_csv("../data/efficiency_and_scalability_results/scalability_res.csv")