In [49]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain

In [50]:
def read_json(path):
    """Count comments per author and subreddit from a JSON-lines file.

    Every line of the file is parsed as an independent JSON document that is
    expected to provide the keys 'author' and 'subreddit'. Lines that are not
    valid JSON (including blank lines) are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded JSON-lines file.

    Returns
    -------
    dict
        Nested mapping ``{author: {subreddit: number_of_lines}}``.

    Raises
    ------
    KeyError
        If a successfully parsed line has no 'author' or 'subreddit' key.
    """
    data = {}
    # Open read-only (the file is never written) and inside a context manager
    # so the handle is closed even when a line raises.
    with open(path, 'r', encoding='utf-8') as file:
        # Stream line by line instead of materialising the whole file.
        for line in file:
            try:
                dic = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: ignore it and keep going.
                continue
            per_author = data.setdefault(dic['author'], {})
            subreddit = dic['subreddit']
            per_author[subreddit] = per_author.get(subreddit, 0) + 1
    return data

In [51]:
def jsonKeys2int(x):
    """Return a copy of dict `x` with every key converted via int(); pass
    any non-dict value through unchanged.

    Shaped for use as a `json.load(..., object_hook=...)` callback. int()
    raises ValueError for keys that are not integer-like."""
    if not isinstance(x, dict):
        return x
    return dict((int(key), value) for key, value in x.items())

In [52]:
# Load the scraped comment counts and the list of subreddit names.
# The files are only read, so open them read-only and inside `with` blocks so
# the handles are closed instead of leaked; UTF-8 matches read_json above.
with open('reddit_scrapper/data/scrapped_data2.json', 'r', encoding='utf-8') as scrapped_file:
    data = json.load(scrapped_file)
with open('reddit_scrapper/data/list_of_unique_subreddits2.json', 'r', encoding='utf-8') as names_file:
    subreddit_names_list = json.load(names_file)
# subreddit_index = json.load(open('subreddit_index.json','r+'))
# index_subreddit = json.load(open('index_subreddit.json','r+'),object_hook=jsonKeys2int)

In [53]:
# Two lookup tables over the same ordering: name -> column position and
# column position -> name.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))

In [54]:
def create_matrix(data,matrix_width,subreddit_index):
    """ Builds a zero matrix with one row per entry of `data` and `matrix_width`
        columns, then writes every {subreddit: value} pair of an entry into the
        column given by `subreddit_index[subreddit]`.
        Raises KeyError for a subreddit missing from `subreddit_index`."""
    counts = np.zeros((len(data), matrix_width))
    # Iterating a 2-D array yields row views, so writes land in `counts`.
    for row, activity in zip(counts, data.values()):
        for subreddit, n_comments in activity.items():
            row[subreddit_index[subreddit]] = n_comments
    return counts

In [55]:
def update_dictionaries(new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """ Registers every subreddit that appears in `new_redditors` but not yet in
        `subreddit_names_list`.

        Each unseen name gets the next free position: it is added to
        `subreddit_index` (name -> position), to `index_subreddit`
        (position -> name) and appended to `subreddit_names_list`.
        All three containers are modified in place and also returned as a
        tuple (subreddit_index, index_subreddit, subreddit_names_list)."""
    # Membership is tested against a set: checking the list directly is a
    # linear scan per key, which makes the whole update quadratic.
    known = set(subreddit_names_list)
    for redditor in new_redditors.values():
        for name in redditor:
            if name in known:
                continue
            subreddit_index[name] = len(subreddit_index)
            index_subreddit[len(index_subreddit)] = name
            subreddit_names_list.append(name)
            known.add(name)
    return subreddit_index,index_subreddit,subreddit_names_list

In [56]:
def resize_matrix_width(matrix,subreddit_index):
    """ Pads `matrix` on the right with zero columns until its width equals
        len(subreddit_index); returns `matrix` itself when the widths already match"""
    missing_columns = len(subreddit_index) - matrix.shape[1]
    if missing_columns == 0:
        return matrix
    padding = np.zeros((matrix.shape[0], missing_columns))
    return np.hstack((matrix, padding))

In [57]:
def update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Merges `new_redditors` into the main matrix.

    Steps: register unseen subreddits (update_dictionaries), widen the existing
    matrix to the new column count (resize_matrix_width), build the rows for the
    new redditors (create_matrix) and stack them below the existing rows.
    Returns (matrix, subreddit_index, index_subreddit, subreddit_names_list)."""
    subreddit_index, index_subreddit, subreddit_names_list = update_dictionaries(
        new_redditors, subreddit_index, index_subreddit, subreddit_names_list
    )
    widened = resize_matrix_width(matrix, subreddit_index)
    new_rows = create_matrix(new_redditors, widened.shape[1], subreddit_index)
    combined = np.vstack((widened, new_rows))
    return combined,subreddit_index,index_subreddit,subreddit_names_list

### Create matrix

Remove rows and columns in which every cell contains an integer less than or equal to the threshold (8 in the code below). E.g. when a redditor comments no more than the threshold number of times in a given subreddit, we consider it negligible input and floor it to zero. If a given redditor has commented no more than the threshold in every subreddit (an all-0/False vector), we remove them from the data. The same applies to subreddits.

In [58]:
# One row per entry of `data`, one column per known subreddit name.
matrix = create_matrix(
    data,
    matrix_width=len(subreddit_names_list),
    subreddit_index=subreddit_index,
)

In [59]:
# A cell counts as "active" only when it holds more than MIN_COMMENTS comments.
MIN_COMMENTS = 8
mask = matrix > MIN_COMMENTS
# Keep rows / columns that have at least one active cell.
rows = mask.any(axis=1)
columns = mask.any(axis=0)
# Stored under a new name: rebinding `data` (the dict loaded from JSON) to an
# array would make the earlier create_matrix(data, ...) cell fail on re-run.
matrix_filtered = matrix[np.ix_(rows, columns)]
# NOTE(review): retained rows/columns keep their original counts, including
# values <= MIN_COMMENTS; they are not zeroed here, so a later astype(bool)
# treats them as present. Confirm whether flooring to zero was intended.
# np.flatnonzero always yields a 1-D array of the surviving column positions;
# np.squeeze(np.argwhere(...)) collapses to a 0-d array when exactly one
# column survives, which pd.DataFrame rejects as `columns`.
df = pd.DataFrame(matrix_filtered, columns=np.flatnonzero(columns)).rename(columns=index_subreddit)

In [60]:
# Rank window applied to the subreddits sorted by descending total:
# positions lower_limit (inclusive) to upper_limit (exclusive) are kept,
# so the `lower_limit` most popular subreddits are skipped.
lower_limit, upper_limit = 20, 1000

In [61]:
# Total per column, ranked from largest to smallest; keep the labels of the
# ranks inside [lower_limit, upper_limit).
subreddit_totals = df.sum(axis=0)
ranked_totals = subreddit_totals.sort_values(ascending=False)
most_popular_reddits = ranked_totals.iloc[lower_limit:upper_limit].index

In [62]:
# Restrict to the selected subreddits and binarise: any non-zero count -> 1.
popular_counts = df.loc[:, most_popular_reddits]
df_bool = popular_counts.astype(bool).astype(int)
#df_bool.drop(columns=["AskReddit", "wallstreetbets", "todayilearned", "Genshin_Memepact"], inplace=True)

In [63]:
# Preview the 0/1 matrix built above (one column per selected subreddit).
df_bool

Unnamed: 0,soccer,AmItheAsshole,teenagers,aww,amcstock,movies,PSTH,nextfuckinglevel,Showerthoughts,mildlyinteresting,...,Reddit_Island,CODZombies,upvote,Meditation,disneyvacation,RealGirls,criterion,securityguards,grandrapids,HighStrangeness
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [1]:
# Thresholds for the mining step: minimum itemset support and minimum rule confidence.
MIN_SUPPORT = 0.003
MIN_CONFIDENCE = 0.1
frequent_itemsets = apriori(df_bool, use_colnames=True, min_support=MIN_SUPPORT)
rules = association_rules(
    frequent_itemsets,
    min_threshold=MIN_CONFIDENCE,
    metric='confidence',
)

NameError: name 'apriori' is not defined

In [None]:
# Display the association rules returned by association_rules above.
rules

In [19]:
# Persist the rules as JSON.
# NOTE(review): assumes the relative directory data/ already exists — confirm.
rules.to_json("data/arules.json")

In [20]:
# (rows, columns) of the filtered count frame `df`.
df.shape

(5938, 1754)