# Algorithm implementation
Now that we have a clean dataset, we can look for the frequent itemsets in the data.

### --> mlxtend apriori version (faster)

## Libraries

In [1]:
import pandas as pd
import math
import plotly.express as px     # used to plot the data
import ast                      # used to transform string into list
from collections import Counter # used to count the occurrences of hashtags
from statistics import mean
import string                   # used to remove punctuation from text in an efficient way
import re                       # regular expressions
from tqdm import tqdm           # progress bar / time estimate for long loops
import datetime                 # used to iterate over the days of the dataset
import time                     # used to time the computation
import json

# MLXTEND APRIORI
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.preprocessing import TransactionEncoder

print("All libreries imported")

All libreries imported


## Import the clean dataset

In [2]:
# Path (relative to the notebook) of the cleaned dataset; later cells read its 'date' and 'text' columns.
df_path = "../data/clean_df/clean_df.pkl"
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle(df_path)

## APriori on all days

### 1. Compute APriori algorithm for each day of the dataset

In [3]:
# Minimum support passed to APriori: the smallest fraction of a day's tweets
# that must contain an itemset for it to be reported as frequent.
MIN_SUPPORT = 0.015

# One [date, frequent_itemsets] entry per day, in chronological order.
# frequent_itemsets is the DataFrame returned by mlxtend (columns "support" and
# "itemsets"), or an empty list for days without tweets.
results_list = []

start = time.time()

curr_day = datetime.date(2020, 7, 24)  # first day of the data
last_day = datetime.date(2020, 8, 30)  # last day of the data
one_day = datetime.timedelta(days=1)

# iterate on every day of the dataset (until we reach the last one)
while curr_day <= last_day:

    # transactions of the day: one entry per tweet, taken from the "text" column
    # (presumably each entry is already a list of tokens — confirm against the cleaning step)
    texts_for_apriori = df.loc[df['date'] == curr_day, "text"].tolist()

    print(" >> Computing APriori on day: ", curr_day, " | Total tweets: ", len(texts_for_apriori))

    if texts_for_apriori:
        # one-hot encode the transactions into the table format expected by mlxtend;
        # done only here so that days without tweets skip the encoding entirely
        te = TransactionEncoder()
        te_ary = te.fit(texts_for_apriori).transform(texts_for_apriori)
        mlxtend_input = pd.DataFrame(te_ary, columns=te.columns_)

        daily_groups = mlxtend_apriori(mlxtend_input, min_support=MIN_SUPPORT, use_colnames=True)
        results_list.append([curr_day, daily_groups])
    else:
        # no tweets on this day: keep an empty placeholder so every day has an entry
        results_list.append([curr_day, []])

    # add 1 day and repeat the procedure
    curr_day += one_day

end = time.time()
mlxtend_apriori_time = end - start  # elapsed wall-clock time, in seconds

 >> Computing APriori on day:  2020-07-24  | Total tweets:  295
 >> Computing APriori on day:  2020-07-25  | Total tweets:  16881
 >> Computing APriori on day:  2020-07-26  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-27  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-28  | Total tweets:  7500
 >> Computing APriori on day:  2020-07-29  | Total tweets:  2780
 >> Computing APriori on day:  2020-07-30  | Total tweets:  1980
 >> Computing APriori on day:  2020-07-31  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-01  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-02  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-03  | Total tweets:  0
 >> Computing APriori on day:  2020-08-04  | Total tweets:  7500
 >> Computing APriori on day:  2020-08-05  | Total tweets:  0
 >> Computing APriori on day:  2020-08-06  | Total tweets:  7214
 >> Computing APriori on day:  2020-08-07  | Total tweets:  1060
 >> Computing APriori on day:  

In [4]:
# elapsed time (in seconds) measured around the per-day APriori loop
print(" > Elapsed time: ", mlxtend_apriori_time)

 > Elapsed time:  9.73398470878601


### Viewing the results
A simple example

In [5]:
# first entry of results_list: the [date, frequent itemsets] pair stored for the first day
results_list[0]

[datetime.date(2020, 7, 24),
       support                       itemsets
 0    0.023729                        (alert)
 1    0.020339                     (american)
 2    0.033898                          (amp)
 3    0.016949                            (b)
 4    0.027119                         (back)
 ..        ...                            ...
 132  0.016949          (global, news, alert)
 133  0.016949        (global, pandem, alert)
 134  0.016949          (pandem, news, alert)
 135  0.016949         (global, news, pandem)
 136  0.016949  (pandem, global, news, alert)
 
 [137 rows x 2 columns]]

### 2. Check if the *groups of words* appear in *different dates*

In [6]:
# Maps every frequent itemset to two parallel lists:
#   final_results[itemset] = [[days on which it was frequent], [its support on each of those days]]
final_results = {}

# iterate on every [date, frequent_itemsets] pair produced by the per-day APriori run
for day, daily_itemsets in results_list:

    # days without tweets carry an empty placeholder: nothing to collect
    if len(daily_itemsets) == 0:
        continue

    # The per-day frame gets its own name on purpose: binding it to `df` would
    # overwrite the clean dataset and break a re-run of the APriori cell.
    # Walk the two columns in lockstep; to_numpy() keeps the support values as
    # NumPy scalars (presumably what label indexing on the column returned — confirm).
    for itemset, support in zip(daily_itemsets["itemsets"], daily_itemsets["support"].to_numpy()):

        # first time we meet this group --> start with two empty lists,
        # otherwise extend the lists already stored for it
        dates, supports = final_results.setdefault(itemset, [[], []])
        dates.append(day)
        supports.append(support)

print(">> Operation Completed")

>> Operation Completed


### Store the data in a dataframe

In [7]:
# split the {itemset: [dates, supports]} mapping into one list per column
words_column = list(final_results.keys())
dates_column = [entry[0] for entry in final_results.values()]
count_column = [entry[1] for entry in final_results.values()]

# one row per itemset, with the days on which it was frequent and its support on each of them
final_results_df = pd.DataFrame({
    'itemsets': words_column,
    'dates': dates_column,
    'supports': count_column,
})

final_results_df

Unnamed: 0,itemsets,dates,supports
0,(alert),[2020-07-24],[0.023728813559322035]
1,(american),"[2020-07-24, 2020-07-28, 2020-07-29, 2020-07-3...","[0.020338983050847456, 0.0152, 0.0158273381294..."
2,(amp),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.03389830508474576, 0.06137077187370416, 0.0..."
3,(b),[2020-07-24],[0.01694915254237288]
4,(back),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.02711864406779661, 0.017297553462472602, 0...."
...,...,...,...
1481,"(india, new)",[2020-08-30],[0.018507462686567163]
1482,"(india, new, case)",[2020-08-30],[0.017194029850746268]
1483,"(covid, india, new)",[2020-08-30],[0.016955223880597014]
1484,"(case, covid, new, death)",[2020-08-30],[0.020417910447761194]


### 3. Remove the rows with fewer than 2 dates
We are not interested in them: they are topics that were frequent on a single day only, not topics that stay frequent over time.

In [8]:
# number of days on which each itemset was frequent
count_dates = [len(days) for days in final_results_df["dates"]]
final_results_df["tot_dates"] = count_dates

# keep only the itemsets found on more than one day, most recurrent first
seen_on_many_days = final_results_df['tot_dates'] > 1
final_results_df = final_results_df.loc[seen_on_many_days].sort_values(by=['tot_dates'], ascending=False)

final_results_df

Unnamed: 0,itemsets,dates,supports,tot_dates
85,(work),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.02711864406779661, 0.02008174871156922, 0.0...",26
27,(go),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.04067796610169491, 0.02701261773591612, 0.0...",26
31,(help),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.023728813559322035, 0.02932290741069842, 0....",26
75,(us),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.030508474576271188, 0.0363722528286239, 0.0...",26
35,(know),"[2020-07-24, 2020-07-25, 2020-07-26, 2020-07-2...","[0.023728813559322035, 0.016112789526686808, 0...",26
...,...,...,...,...
212,(talli),"[2020-07-27, 2020-08-30]","[0.015866666666666668, 0.015761194029850746]",2
218,"(covid, day, case)","[2020-07-27, 2020-08-09]","[0.019333333333333334, 0.0212]",2
7,(close),"[2020-07-24, 2020-08-13]","[0.01694915254237288, 0.0204]",2
230,(chang),"[2020-07-29, 2020-08-07]","[0.015107913669064749, 0.016037735849056604]",2


## Save the results
both as .csv and .pkl

In [9]:
# save the data (could be useful in the future)
# CSV copy: NOTE(review) the list-valued columns ("dates", "supports") are presumably written as plain text — confirm before reloading
final_results_df.to_csv("../data/results/results_df.csv")
# pickle copy: stores the DataFrame object itself, to be read back with pd.read_pickle
final_results_df.to_pickle("../data/results/results_df.pkl")
# NOTE(review): both calls assume that ../data/results/ already exists — confirm

print("Results saved")

Results saved
