# APriori algorithm
Now that we have a clean dataset, we can look for frequent itemsets in the data.

The first algorithm that I'll use is APriori.

## Libraries

In [1]:
import pandas as pd
import math
import plotly.express as px     # used to plot the data
import ast                      # used to transform string into list
from collections import Counter # used to count the occurrences of hashtags
from statistics import mean
import string                   # used to remove punctuation from text in an efficient way
from tqdm import tqdm
import datetime
from efficient_apriori import apriori
import json
import time

# Confirmation message: reaching this line means every import above succeeded.
print("All libreries imported")

All libreries imported


## Import the clean dataset

In [2]:
# Path of the pickled clean dataset (relative to the notebook's working directory).
df_path = "../data/clean_df/clean_df.pkl"
# NOTE(review): unpickling can execute arbitrary code, so only load pickles produced by this project.
# The following cells read the 'date' and 'text' columns of this dataframe.
df = pd.read_pickle(df_path)

## APriori on all days

### \#1 Run the APriori algorithm on each day of the dataset

In [3]:
# Run APriori separately on the tweets of every calendar day and collect, for each day,
# the relative frequency of every frequent group of words.
#
# Result: `results_list` holds one `[day, daily_groups]` entry per day, where `daily_groups`
# is a list of dicts `{group_of_words: relative_frequency}` (one dict per value of the
# `itemsets` mapping returned by apriori). Days without tweets are stored as `[day, [{}]]`.

start = time.perf_counter()  # monotonic clock: not affected by system clock adjustments

curr_day = datetime.date(2020, 7, 24)  # first day of the data
last_day = datetime.date(2020, 8, 30)  # last day of the data
one_day = datetime.timedelta(days=1)

# Tunable parameters, defined once instead of being repeated inside the loop
min_support = 0.012     # minimum support passed to apriori
min_confidence = 0.8    # minimum confidence passed to apriori (`rules` is not used in this cell)
threshold = 0.015       # a group is kept only if its relative frequency is > threshold
                        # (stricter than min_support, so this is the effective cut-off)

results_list = []

# iterate on every day of the dataset (until we reach the last one)
while curr_day <= last_day:

    # transactions of the day: one entry of the "text" column per tweet
    # (a single .loc call selects rows and column together, avoiding chained indexing)
    texts_for_apriori = df.loc[df['date'] == curr_day, "text"].tolist()
    n_tweets = len(texts_for_apriori)

    print(" >> Computing APriori on day: ", curr_day, " | Total tweets: ", n_tweets)

    # computation of APriori [only if Total tweets > 0]
    if n_tweets > 0:
        itemsets, rules = apriori(texts_for_apriori, min_support=min_support, min_confidence=min_confidence)

        # there are always groups of 1, 2 and 3 words. Sometimes more.
        list_of_groups = list(itemsets.values())

        daily_groups = []
        for group in list_of_groups:
            # turn the absolute counts into relative frequencies (computed once per group of words)
            frequencies = {words: count / n_tweets for words, count in group.items()}
            daily_groups.append({words: freq for words, freq in frequencies.items() if freq > threshold})

        results_list.append([curr_day, daily_groups])

    else:
        # Save the day and an empty list, so that every day has an entry
        results_list.append([curr_day, [{}]])

    # add 1 day and repeat the procedure
    curr_day = curr_day + one_day

end = time.perf_counter()
print(" > Elapsed time: ", end - start)

 >> Computing APriori on day:  2020-07-24  | Total tweets:  295
 >> Computing APriori on day:  2020-07-25  | Total tweets:  16881
 >> Computing APriori on day:  2020-07-26  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-27  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-28  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-29  | Total tweets:  2780
 >> Computing APriori on day:  2020-07-30  | Total tweets:  1980
 >> Computing APriori on day:  2020-07-31  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-01  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-02  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-03  | Total tweets:  0
 >> Computing APriori on day:  2020-08-04  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-05  | Total tweets:  0
 >> Computing APriori on day:  2020-08-06  | Total tweets:  7214
 >> Computing APriori on day:  2020-08-07  | Total tweets:  1060
 >> Computing APriori on day:  

### Viewing the results
A simple example

In [4]:
# results_list[2]   # uncomment to inspect the [date, groups] entry of the third day

### \#2 Check if the groups appear on different dates
#### Check 1: groups of two words
#### Check 2: groups of three words

In [5]:
DEBUG = False  # set to True to print every (date, group of words) pair while aggregating

# Maps each group of words to two parallel lists:
#   final_results[group][0] -> dates on which the group was found
#   final_results[group][1] -> frequency of the group on each of those dates
final_results = {}

# every entry of results_list is [date, list of {group_of_words: frequency} dicts]
for day, groups_of_day in tqdm(results_list):

    for frequencies_by_group in groups_of_day:

        for words, frequency in frequencies_by_group.items():

            if DEBUG:
                print(day)
                print(words)

            # fetch the two lists of this group, creating them the first time the group is seen
            dates_seen, frequencies_seen = final_results.setdefault(words, [[], []])
            dates_seen.append(day)
            frequencies_seen.append(frequency)


100%|██████████| 38/38 [00:00<00:00, 19051.34it/s]


## Save the data in a dataframe
The dataframe is then exported both as .csv and as .pkl.

In [6]:
# Split final_results ({group: [dates, frequencies]}) into three aligned columns;
# the dict keeps its insertion order, so the i-th element of each column refers to the same group.
words_column = list(final_results)
dates_column = [dates for dates, _ in final_results.values()]
count_column = [frequencies for _, frequencies in final_results.values()]

# One row per group of words, with the list of dates and the list of frequencies observed
final_results_df = pd.DataFrame({
    'group_of_words': words_column,
    'dates': dates_column,
    'frequencies': count_column,
})

final_results_df

Unnamed: 0,group_of_words,dates,frequencies
0,"(covid,)","[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.5728813559322034, 0.6070730406966411, 0.618..."
1,"(good,)","[2020-07-24, 2020-07-25, 2020-07-28, 2020-08-0...","[0.020338983050847456, 0.015283454771636751, 0..."
2,"(live,)","[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.02711864406779661, 0.019489366743676323, 0...."
3,"(coronaviru,)","[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.0847457627118644, 0.08601386173804869, 0.1,..."
4,"(pandem,)","[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.06440677966101695, 0.0444286475919673, 0.04..."
...,...,...,...
1470,"(hour, last)",[2020-08-30],[0.01528358208955224]
1471,"(covid, india, new)",[2020-08-30],[0.016955223880597014]
1472,"(case, india, new)",[2020-08-30],[0.017194029850746268]
1473,"(case, covid, death, new)",[2020-08-30],[0.020417910447761194]


In [7]:
# save the data (could be useful in the future)
# The CSV export writes the index as an unnamed first column and stores the list-valued
# 'dates' and 'frequencies' columns as text; the pickle keeps the Python objects unchanged.
# NOTE(review): both calls presumably need the ../data/results/ folder to exist already — confirm.
final_results_df.to_csv("../data/results/apriori_df.csv")
final_results_df.to_pickle("../data/results/apriori_df.pkl")