# APriori algorithm
Now that we have a clean dataset, we can look for frequent itemsets in the data.

The first algorithm that I'll use is APriori.

## Libraries

In [5]:
import pandas as pd
import math
import plotly.express as px     # used to plot the data
import ast                      # used to transform string into list
from collections import Counter # used to count the occurrences of hashtags
from statistics import mean
import string                   # used to remove punctuation from text in an efficient way
import re                       
from tqdm import tqdm           # estimate of time in long for
import datetime
import time

# ORIGINAL APRIORI
from efficient_apriori import apriori as eff_apriori

# BASELINE
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.preprocessing import TransactionEncoder

print("All libreries imported")

All libreries imported


## Import the clean dataset

In [6]:
# Location of the pickled, already-cleaned dataset (relative path)
df_path = "../data/clean_df/clean_df.pkl"

# Load the whole dataset into memory and show it as the cell output
df = pd.read_pickle(filepath_or_buffer=df_path)
df

Unnamed: 0,user_location,date,text,hashtags,source_link
0,astroworld,2020-07-25,"[smell, scent, hand, sanit, today, someon, pas...",[],https://t.co/QZvYbrOgb0
1,"New York, NY",2020-07-25,"[hey, yanke, yankeespr, mlb, made, sens, playe...",[],https://t.co/1QvW0zgyPu
2,"Pewee Valley, KY",2020-07-25,"[dian, wdunlap, realdonaldtrump, trump, never,...",[COVID19],https://t.co/Jkk8vHWHb3
3,Stuck in the Middle,2020-07-25,"[brookbanktv, gift, covid, give, appreci, simp...",[COVID19],https://t.co/Z0pOAlFXcW
4,Jammu and Kashmir,2020-07-25,"[juli, media, bulletin, novel, coronavirusupd,...","[CoronaVirusUpdates, COVID19]",https://t.co/MN0EEcsJHh
...,...,...,...,...,...
179103,"Ilorin, Nigeria",2020-08-29,"[thank, iamohmai, nomin, wearamask, challeng, ...",[WearAMask],https://t.co/IegQDkKudT
179104,Ontario,2020-08-29,"[year, insan, lol, covid]",[COVID19],https://t.co/y48NP0yzgn
179105,🇨🇦 Canada,2020-08-29,"[ctvnew, power, paint, juan, lucena, tribut, g...",[],https://t.co/wnXbbyoCe2
179106,New York City,2020-08-29,"[student, test, posit, covid, major, univers, ...",[COVID19],https://t.co/6aNhSiF5gh


# Simple tests with the various apriori algorithms

### Test of package 1 - `efficient_apriori`

In [7]:
def displayGroup(gr):
    """Print the size of a result dictionary and display it sorted by value.

    Parameters
    ----------
    gr : dict
        Mapping whose values are comparable (in this notebook: itemset -> count).

    The number of entries is printed first; the entries are then shown with
    the notebook's ``display`` helper, ordered from the largest value to the
    smallest. Entries with equal values keep their original relative order.
    """
    print("Total groups: ", len(gr))
    ranked_pairs = sorted(gr.items(), key=lambda pair: pair[1], reverse=True)
    display(dict(ranked_pairs))

In [8]:
# Restrict the analysis to the tweets published on a single day
day = datetime.date(2020, 7, 25)

df_25_jul = df.loc[df['date'] == day]
print("Total tweets: ", len(df_25_jul)) # 16881 tweets

# prepare the input: one transaction per tweet, i.e. the tweet's list of tokens
texts_for_apriori = df_25_jul["text"].tolist()

# computation of APriori
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments (unlike time.time()).
start = time.perf_counter()
itemsets, rules = eff_apriori(texts_for_apriori, min_support=0.012,  min_confidence=0.8)
end = time.perf_counter()

print(" > Elapsed time: ", end - start)

Total tweets:  16881
 > Elapsed time:  5.189152717590332


In [9]:
# save the results in a more convenient way
list_of_groups = []
for item in itemsets.values():
  list_of_groups.append(item)

for res in list_of_groups:
    displayGroup(res)
    print("--- \n")

Total groups:  99


{('covid',): 10248,
 ('case',): 1796,
 ('coronaviru',): 1452,
 ('new',): 1228,
 ('amp',): 1036,
 ('test',): 880,
 ('death',): 877,
 ('peopl',): 858,
 ('report',): 751,
 ('pandem',): 750,
 ('mask',): 743,
 ('posit',): 733,
 ('day',): 707,
 ('us',): 614,
 ('get',): 576,
 ('health',): 504,
 ('total',): 499,
 ('help',): 495,
 ('india',): 472,
 ('time',): 464,
 ('juli',): 462,
 ('updat',): 459,
 ('like',): 459,
 ('go',): 456,
 ('today',): 448,
 ('need',): 441,
 ('state',): 399,
 ('say',): 394,
 ('number',): 383,
 ('wear',): 376,
 ('spread',): 367,
 ('countri',): 357,
 ('patient',): 342,
 ('work',): 339,
 ('even',): 334,
 ('trump',): 333,
 ('live',): 329,
 ('last',): 328,
 ('realdonaldtrump',): 327,
 ('hospit',): 326,
 ('make',): 322,
 ('news',): 319,
 ('world',): 315,
 ('year',): 313,
 ('risk',): 312,
 ('via',): 307,
 ('lockdown',): 304,
 ('school',): 300,
 ('govern',): 298,
 ('home',): 298,
 ('back',): 292,
 ('daili',): 291,
 ('mani',): 287,
 ('viru',): 286,
 ('see',): 277,
 ('month',): 27

--- 

Total groups:  50


{('case', 'covid'): 1477,
 ('coronaviru', 'covid'): 1034,
 ('covid', 'new'): 886,
 ('covid', 'test'): 740,
 ('covid', 'death'): 686,
 ('case', 'new'): 655,
 ('covid', 'posit'): 613,
 ('covid', 'report'): 593,
 ('covid', 'pandem'): 519,
 ('amp', 'covid'): 514,
 ('case', 'report'): 494,
 ('covid', 'day'): 478,
 ('covid', 'peopl'): 450,
 ('case', 'death'): 433,
 ('covid', 'mask'): 384,
 ('covid', 'total'): 383,
 ('posit', 'test'): 364,
 ('covid', 'india'): 364,
 ('case', 'total'): 353,
 ('covid', 'updat'): 350,
 ('covid', 'us'): 327,
 ('case', 'coronaviru'): 327,
 ('covid', 'juli'): 325,
 ('covid', 'get'): 317,
 ('covid', 'health'): 301,
 ('covid', 'spread'): 299,
 ('mask', 'wear'): 294,
 ('covid', 'help'): 293,
 ('covid', 'number'): 291,
 ('new', 'report'): 290,
 ('covid', 'patient'): 280,
 ('covid', 'today'): 269,
 ('death', 'new'): 261,
 ('covid', 'time'): 260,
 ('covid', 'state'): 256,
 ('covid', 'last'): 252,
 ('covid', 'like'): 251,
 ('covid', 'even'): 244,
 ('covid', 'risk'): 242,


--- 

Total groups:  11


{('case', 'covid', 'new'): 545,
 ('case', 'covid', 'report'): 425,
 ('case', 'covid', 'death'): 338,
 ('covid', 'posit', 'test'): 308,
 ('case', 'covid', 'total'): 286,
 ('case', 'new', 'report'): 257,
 ('case', 'coronaviru', 'covid'): 230,
 ('covid', 'new', 'report'): 227,
 ('case', 'covid', 'posit'): 221,
 ('case', 'death', 'new'): 218,
 ('covid', 'death', 'new'): 208}

--- 



### Test of package 2 - `mlxtend`

In [10]:
# Select the tweets whose date is 25 July 2020
day = datetime.date(2020, 7, 25)
is_selected_day = df['date'] == day
df_25_jul = df.loc[is_selected_day]

In [11]:
# Gather the token list of every selected tweet; each list is one transaction
texts_for_apriori = list(df_25_jul["text"])

# Learn the set of distinct tokens, then encode every transaction as a row
# with one column per token
te = TransactionEncoder()
te.fit(texts_for_apriori)
te_ary = te.transform(texts_for_apriori)

# NOTE(review): this rebinds `df`, which until now held the clean tweet dataset.
# After this cell, re-running the cells that filter `df` by 'date' no longer
# operates on the tweets. A separate name would be safer, but the next cell
# reads the encoded frame from `df`, so both would have to change together.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,aa,aaa,aac,aacdiscokaraok,aadhar,aafp,aahataha,aahealth,aajtak,aakash,...,𝕥𝕙𝕖,𝕨𝕙𝕒𝕥,𝕪𝕠𝕦𝕣,𝗖𝗢𝗩𝗜𝗗,𝗚𝗹𝗼𝗯𝗮𝗹,𝗡𝗲𝗽𝗮𝗹,𝗦𝗔,𝗦𝘁𝗮𝘁𝘂𝘀,𝗧𝗼𝘁𝗮𝗹,𝗰𝗮𝘀𝗲𝘀
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16876,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16877,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16878,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16879,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Time the frequent-itemset search on the encoded frame held in `df`.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments (unlike time.time()).
start = time.perf_counter()
res = mlxtend_apriori(df, min_support=0.012, use_colnames=True) # note: we cannot set the min_confidence value
end = time.perf_counter()

print(" > Elapsed time: ", end - start)
res

 > Elapsed time:  1.610689401626587


Unnamed: 0,support,itemsets
0,0.013506,(activ)
1,0.014395,(american)
2,0.061371,(amp)
3,0.017298,(back)
4,0.012499,(busi)
...,...,...
155,0.012914,"(case, death, new)"
156,0.015224,"(report, case, new)"
157,0.012322,"(covid, death, new)"
158,0.013447,"(covid, report, new)"


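In this instance, `mlxtend` requires less time (about 1.6 s versus about 5.2 s for `efficient_apriori`). Note, however, that the two timings are not strictly comparable: the `efficient_apriori` call also generates association rules (`min_confidence=0.8`), while the `mlxtend` timing covers only the frequent-itemset search and excludes the one-hot encoding step.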