In [1]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain

In [2]:
def read_json(path):
    """Count, per author, how many lines of a JSON-lines file mention each subreddit.

    Every line is parsed on its own as a JSON object and must provide the
    keys 'author' and 'subreddit'. Lines that are not valid JSON are skipped.

    Args:
        path: location of the UTF-8 encoded file to read.

    Returns:
        Nested dict of the form {author: {subreddit: number_of_lines}}.

    Raises:
        KeyError: if a successfully parsed line lacks 'author' or 'subreddit'.
    """
    data = {}
    # Read-only mode is sufficient, and the context manager guarantees the
    # handle is closed even if parsing raises.
    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file line by line instead of loading it whole via readlines().
        for line in file:
            try:
                dic = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: malformed lines are ignored.
                continue
            per_subreddit = data.setdefault(dic['author'], {})
            per_subreddit[dic['subreddit']] = per_subreddit.get(dic['subreddit'], 0) + 1
    return data

In [3]:
# data = read_json('data/RC_2009-10')

In [4]:
def jsonKeys2int(x):
    """Return a copy of a dict with every key passed through int(); pass other objects through.

    Values are left untouched. Anything that is not a dict is returned as is.
    A key that int() cannot convert makes int() raise.
    """
    if not isinstance(x, dict):
        return x
    return dict((int(key), value) for key, value in x.items())

In [5]:
# Load the stored data and lookup tables. Read-only mode is sufficient (the
# files are never written here) and each handle is closed as soon as it is parsed.
with open('data.json', 'r') as fh:
    data = json.load(fh)
with open('subreddits_names.json', 'r') as fh:
    subreddit_names_list = json.load(fh)
with open('subreddit_index.json', 'r') as fh:
    subreddit_index = json.load(fh)
with open('index_subreddit.json', 'r') as fh:
    # JSON object keys are always strings; the hook turns them into ints.
    index_subreddit = json.load(fh, object_hook=jsonKeys2int)

In [6]:
def create_matrix(data,matrix_width,subreddit_index):
    """Build a zero matrix of shape (len(data), matrix_width) and fill it from `data`.

    Row i corresponds to the i-th value of `data` (in iteration order). For
    each (key, value) pair of that inner dict, `value` is written into the
    column given by subreddit_index[key]. All other cells stay zero.
    """
    counts = np.zeros((len(data), matrix_width))
    # Iterating a 2-D array yields row views, so writes land in `counts`.
    for row, activity in zip(counts, data.values()):
        for name, amount in activity.items():
            row[subreddit_index[name]] = amount
    return counts

In [7]:
def update_dictionaries(new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Register every subreddit of `new_redditors` that is not yet in `subreddit_names_list`.

    For each unseen name, in order of first appearance:
      * subreddit_index[name] is set to the current size of subreddit_index,
      * index_subreddit gets the name stored under its own current size,
      * the name is appended to subreddit_names_list.

    The three containers are modified in place and also returned, in the
    order (subreddit_index, index_subreddit, subreddit_names_list).
    `new_redditors` is only read.
    """
    # Set lookup replaces a linear scan of the list for every key; it is kept
    # in sync with the list as names are appended.
    known_names = set(subreddit_names_list)
    for redditor in new_redditors.values():
        for name in redditor:
            if name in known_names:
                continue
            subreddit_index[name] = len(subreddit_index)
            index_subreddit[len(index_subreddit)] = name
            subreddit_names_list.append(name)
            known_names.add(name)
    return subreddit_index,index_subreddit,subreddit_names_list

In [8]:
def resize_matrix_width(matrix,subreddit_index):
    """Pad `matrix` on the right with zero columns until it has len(subreddit_index) columns.

    Returns the input object itself when the widths already agree, otherwise
    a new array made of the original columns followed by the zero columns.

    Raises:
        ValueError: if `matrix` already has more columns than `subreddit_index`
            has entries; it cannot be made to match by padding.
    """
    missing_columns = len(subreddit_index) - matrix.shape[1]
    if missing_columns < 0:
        # Previously this surfaced as numpy's "negative dimensions" error.
        raise ValueError(
            "matrix has %d columns but subreddit_index only has %d entries"
            % (matrix.shape[1], len(subreddit_index))
        )
    if missing_columns:
        extension = np.zeros((matrix.shape[0], missing_columns))
        matrix = np.hstack((matrix, extension))
    return matrix

In [9]:
def update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Merge `new_redditors` into the main matrix and its lookup structures.

    Steps: extend the lookup structures via update_dictionaries, widen
    `matrix` via resize_matrix_width, build a matrix for the new data with
    create_matrix, and stack those rows underneath the widened matrix.

    Returns the tuple
    (matrix, subreddit_index, index_subreddit, subreddit_names_list).
    """
    lookups = update_dictionaries(
        new_redditors, subreddit_index, index_subreddit, subreddit_names_list
    )
    subreddit_index, index_subreddit, subreddit_names_list = lookups
    widened = resize_matrix_width(matrix, subreddit_index)
    # New rows use the widened column count so both parts can be stacked.
    new_rows = create_matrix(new_redditors, widened.shape[1], subreddit_index)
    stacked = np.vstack((widened, new_rows))
    return stacked, subreddit_index, index_subreddit, subreddit_names_list

In [10]:
# One row per entry of `data`, one column per known subreddit name.
matrix = create_matrix(
    data,
    matrix_width=len(subreddit_names_list),
    subreddit_index=subreddit_index,
)

In [11]:
# matrix,subreddit_index,index_subreddit,subreddit_names_list = update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list)

In [12]:
# A cell counts as "present" only when its value is strictly greater than this.
ACTIVITY_THRESHOLD = 5

# Binarise only while `matrix` still holds raw values. This cell overwrites
# `matrix`, so re-running it on the already-boolean array would compare
# True/False against the threshold and discard every row.
if matrix.dtype != bool:
    matrix = matrix > ACTIVITY_THRESHOLD

# Drop rows that have no True cell at all.
matrix = matrix[matrix.any(axis=1)]

# Integer column labels are replaced through index_subreddit; labels missing
# from that mapping are left as they are.
df = pd.DataFrame(matrix).rename(columns=index_subreddit)

In [13]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [14]:
# Itemsets present in at least 1% of the rows of `df`, reported by column name.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.01)

# Rules with confidence of at least 0.7; as the last expression of the cell,
# the resulting table is displayed.
association_rules(frequent_itemsets, min_threshold=0.7, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(self),(AskReddit),0.015186,0.480390,0.011720,0.771772,1.606551,0.004425,2.276711
1,(todayilearned),(AskReddit),0.013818,0.480390,0.010489,0.759076,1.580123,0.003851,2.156736
2,(DoesAnybodyElse),(AskReddit),0.046242,0.480390,0.034476,0.745562,1.551992,0.012262,2.042187
3,(entertainment),(reddit.com),0.015232,0.287121,0.010762,0.706587,2.460933,0.006389,2.429606
4,(offbeat),(reddit.com),0.015460,0.287121,0.011264,0.728614,2.537649,0.006825,2.626802
...,...,...,...,...,...,...,...,...,...
406,"(reddit.com, funny, politics, science)","(pics, WTF)",0.013088,0.066627,0.010078,0.770035,11.557374,0.009206,4.058758
407,"(pics, science, politics, WTF)","(reddit.com, funny)",0.014274,0.054633,0.010078,0.706070,12.923797,0.009299,3.216302
408,"(funny, pics, science, politics)","(reddit.com, WTF)",0.012222,0.085462,0.010078,0.824627,9.649102,0.009034,5.214815
409,"(funny, science, politics, WTF)","(reddit.com, pics)",0.011583,0.091800,0.010078,0.870079,9.477937,0.009015,6.990385
