In [1]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain

In [2]:
def read_json(path):
    """Count records per author and subreddit in a JSON-lines file.

    Each line of the file is parsed as a standalone JSON object. For every
    successfully parsed object the counter stored under
    ``data[author][subreddit]`` is incremented by one.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded file with one JSON object per line.

    Returns
    -------
    dict
        Nested mapping ``{author: {subreddit: number_of_records}}``.

    Raises
    ------
    KeyError
        If a parsed object has no ``'author'`` or ``'subreddit'`` key.
    """
    data = {}
    # Read-only mode is enough here, and the context manager guarantees the
    # handle is closed even when a record raises.
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of materialising every line with readlines().
        for line in file:
            try:
                dic = json.loads(line)
            except json.JSONDecodeError:
                # Lines that are not valid JSON (including blank ones) are skipped.
                continue
            author = dic['author']
            subreddit = dic['subreddit']
            per_author = data.setdefault(author, {})
            per_author[subreddit] = per_author.get(subreddit, 0) + 1
    return data

In [3]:
# data = read_json('data/RC_2009-10')

In [4]:
def jsonKeys2int(x):
    """Return a copy of a dict with every key converted through ``int``.

    Anything that is not a ``dict`` is handed back untouched. Values are
    kept as they are; only the top-level keys are converted.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = value
    return converted

In [15]:
# Load the scraped data and the list of subreddit names.
# The files are only read, so they are opened read-only, and the context
# managers make sure both handles are closed once parsing is done.
with open('../reddit_scrapper/data/scrapped_data2.json', 'r') as data_file:
    data = json.load(data_file)
with open('../reddit_scrapper/data/list_of_unique_subreddits2.json', 'r') as names_file:
    subreddit_names_list = json.load(names_file)

# Two lookup tables built from list positions: name -> column index and back.
subreddit_index = dict(zip(subreddit_names_list,range(len(subreddit_names_list))))
index_subreddit = dict(enumerate(subreddit_names_list))

In [16]:
def create_matrix(data,matrix_width,subreddit_index):
    """Build a zero-initialised matrix with one row per entry of ``data``.

    ``data`` maps each redditor to a ``{subreddit: value}`` dictionary. Row
    ``r`` belongs to the ``r``-th entry of ``data`` (in iteration order) and
    the column of every value is looked up in ``subreddit_index``. The
    result has shape ``(len(data), matrix_width)``.
    """
    filled = np.zeros((len(data), matrix_width))
    for row, counts in enumerate(data.values()):
        for name in counts:
            filled[row, subreddit_index[name]] = counts[name]
    return filled

In [17]:
def update_dictionaries(new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Register subreddits from ``new_redditors`` that are not indexed yet.

    Every subreddit name found in the incoming data that is missing from
    ``subreddit_index`` receives the next free index and is added to all
    three containers. The containers are modified in place and also
    returned.

    Parameters
    ----------
    new_redditors : dict
        Mapping of redditor -> ``{subreddit: value}`` for the incoming data.
    subreddit_index : dict
        Mapping subreddit name -> column index.
    index_subreddit : dict
        Mapping column index -> subreddit name.
    subreddit_names_list : list
        Subreddit names ordered by column index.

    Returns
    -------
    tuple
        ``(subreddit_index, index_subreddit, subreddit_names_list)``.
    """
    for redditor in new_redditors.values():
        for name in redditor:
            # Membership is tested on the dict (O(1)) rather than on the list
            # (O(n) scan per key), which made the whole update quadratic.
            if name in subreddit_index:
                continue
            # One shared index keeps the forward and reverse mappings in sync.
            new_idx = len(subreddit_index)
            subreddit_index[name] = new_idx
            index_subreddit[new_idx] = name
            subreddit_names_list.append(name)
    return subreddit_index,index_subreddit,subreddit_names_list

In [18]:
def resize_matrix_width(matrix,subreddit_index):
    """Pad ``matrix`` with zero columns up to ``len(subreddit_index)``.

    When the matrix already has as many columns as there are entries in
    ``subreddit_index`` it is returned as is; otherwise a block of zeros
    is appended on the right.
    """
    missing_columns = len(subreddit_index) - matrix.shape[1]
    if missing_columns == 0:
        return matrix
    padding = np.zeros((matrix.shape[0], missing_columns))
    return np.hstack((matrix, padding))

In [19]:
def update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Merge incoming redditor data into the main matrix.

    Steps: register unseen subreddits in the lookup containers, widen the
    existing matrix to the new number of columns, build the rows for
    ``new_redditors`` and stack them underneath the existing rows.

    Returns the stacked matrix followed by the three lookup containers.
    """
    lookups = update_dictionaries(
        new_redditors, subreddit_index, index_subreddit, subreddit_names_list
    )
    subreddit_index, index_subreddit, subreddit_names_list = lookups

    widened = resize_matrix_width(matrix, subreddit_index)
    new_rows = create_matrix(new_redditors, widened.shape[1], subreddit_index)
    stacked = np.vstack((widened, new_rows))
    return stacked, subreddit_index, index_subreddit, subreddit_names_list

In [20]:
# One row per entry of `data`, one column per known subreddit name.
matrix = create_matrix(
    data,
    matrix_width=len(subreddit_names_list),
    subreddit_index=subreddit_index,
)

In [21]:
# matrix,subreddit_index,index_subreddit,subreddit_names_list = update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list)

In [22]:
# Binarise: a cell is True only when its value is strictly greater than 5.
matrix = matrix > 5
# Keep only the rows that contain at least one True cell.
matrix = matrix[matrix.any(axis=1)]
# Label the columns with subreddit names instead of positional indices.
df = pd.DataFrame(matrix).rename(columns=index_subreddit)

In [23]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [28]:
# Mine itemsets with support of at least 0.0005, reported by column name.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.0005)
# Derive rules from those itemsets, thresholding confidence at 0.5;
# the resulting table is the cell output.
association_rules(frequent_itemsets, min_threshold=0.5, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(UWMCShareholders),(wallstreetbets),0.000884,0.170815,0.000632,0.714286,4.181636,0.000481,2.902148
1,(Genshin_Memepact),(Genshin_Impact),0.002148,0.031712,0.001516,0.705882,22.259199,0.001448,3.292179
2,(MVIS),(wallstreetbets),0.000884,0.170815,0.000632,0.714286,4.181636,0.000481,2.902148
3,(SNDL),(wallstreetbets),0.00139,0.170815,0.001011,0.727273,4.257665,0.000773,3.040345
4,(wallstreetbetsOGs),(wallstreetbets),0.003159,0.170815,0.002274,0.72,4.215089,0.001735,2.961375
5,(Vitards),(wallstreetbets),0.001011,0.170815,0.000758,0.75,4.390717,0.000585,3.31674
6,(Advice),(AskReddit),0.002021,0.261402,0.001642,0.8125,3.108235,0.001114,3.939187
7,(PolskaPolityka),(Polska),0.000758,0.022868,0.000758,1.0,43.729282,0.000741,inf
8,(konfa),(Polska),0.000632,0.022868,0.000505,0.8,34.983425,0.000491,4.88566
9,(poland),(Polska),0.001011,0.022868,0.000884,0.875,38.263122,0.000861,7.817056
