In [1]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain
import time

In [2]:
def read_json(path):
    """Count entries per author and subreddit from a JSON-lines file.

    Every line of the file is parsed as one JSON document. For each parsed
    document the values under ``'author'`` and ``'subreddit'`` are used to
    increment a nested counter. Lines that are not valid JSON are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded file holding one JSON document per line.

    Returns
    -------
    dict
        Nested mapping ``{author: {subreddit: count}}``.

    Raises
    ------
    KeyError
        If a parsed document has no ``'author'`` or ``'subreddit'`` key.
    """
    data = {}
    # Read-only mode inside a context manager: the file is only read, and the
    # handle is closed even if a line raises (previously it was opened with
    # 'r+' and never closed).
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                dic = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: malformed lines are ignored.
                continue
            per_author = data.setdefault(dic['author'], {})
            subreddit = dic['subreddit']
            per_author[subreddit] = per_author.get(subreddit, 0) + 1
    return data

In [3]:
def jsonKeys2int(x):
    """Return ``x`` with its keys cast to ``int`` when ``x`` is a dict.

    Only the top-level keys are converted; values are carried over untouched.
    Anything that is not a dict is handed back unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = value
    return converted

In [4]:
# Load the scraped data and the list of unique subreddit names.
# Both files are only read, so they are opened read-only inside context
# managers; this closes the handles instead of leaving them open for the
# lifetime of the kernel.
with open('../reddit_scrapper/data/scrapped_data.json', 'r') as scraped_file:
    data = json.load(scraped_file)
with open('../reddit_scrapper/data/list_of_unique_subreddits.json', 'r') as names_file:
    subreddit_names_list = json.load(names_file)


In [5]:
# Two lookup tables over the same enumeration of subreddit_names_list:
# name -> position and position -> name.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))

In [6]:
def create_matrix(data,matrix_width,subreddit_index):
    """Build a dense count matrix with one row per entry of ``data``.

    Row ``r`` belongs to the ``r``-th value of ``data`` (in iteration order).
    Each ``key: value`` pair of that value is written to column
    ``subreddit_index[key]``; cells without an entry stay at 0.
    The result has shape ``(len(data), matrix_width)``.
    """
    n_rows = len(data)
    matrix = np.zeros((n_rows, matrix_width))
    row = 0
    for activity in data.values():
        for subreddit in activity:
            matrix[row, subreddit_index[subreddit]] = activity[subreddit]
        row += 1
    return matrix

In [7]:
def update_dictionaries(new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Register subreddits that appear in ``new_redditors`` but are not yet known.

    A subreddit counts as known when it is present in ``subreddit_names_list``.
    Every unknown name receives the next free position: it is added to
    ``subreddit_index`` (name -> position), to ``index_subreddit``
    (position -> name) and appended to ``subreddit_names_list``.
    All three containers are modified in place and also returned.

    Parameters
    ----------
    new_redditors : dict
        Mapping whose values are dicts keyed by subreddit name.
    subreddit_index : dict
        Name -> position lookup, extended in place.
    index_subreddit : dict
        Position -> name lookup, extended in place.
    subreddit_names_list : list
        Known subreddit names, extended in place.

    Returns
    -------
    tuple
        ``(subreddit_index, index_subreddit, subreddit_names_list)``.
    """
    # Set mirror of the list gives O(1) membership tests; scanning the list
    # for every key made the loop quadratic in the number of subreddits.
    known_names = set(subreddit_names_list)
    for redditor in new_redditors.values():
        for name in redditor:
            if name in known_names:
                continue
            subreddit_index[name] = len(subreddit_index)
            index_subreddit[len(index_subreddit)] = name
            subreddit_names_list.append(name)
            known_names.add(name)
    return subreddit_index,index_subreddit,subreddit_names_list

In [8]:
def resize_matrix_width(matrix,subreddit_index):
    """Pad ``matrix`` with zero columns on the right to ``len(subreddit_index)`` columns.

    When the column count already equals ``len(subreddit_index)`` the input
    object itself is returned.
    """
    target_width = len(subreddit_index)
    current_width = matrix.shape[1]
    if current_width == target_width:
        return matrix
    padding = np.zeros((matrix.shape[0], target_width - current_width))
    return np.hstack((matrix, padding))

In [9]:
def update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Merge a batch of new redditors into the main matrix.

    Steps: register unseen subreddits in the lookup structures, widen the
    existing matrix to the new column count, build a matrix for the incoming
    redditors and stack it below the existing rows.

    Returns the combined matrix followed by the three lookup structures.
    """
    lookups = update_dictionaries(
        new_redditors, subreddit_index, index_subreddit, subreddit_names_list
    )
    subreddit_index, index_subreddit, subreddit_names_list = lookups
    widened = resize_matrix_width(matrix, subreddit_index)
    appended_rows = create_matrix(new_redditors, widened.shape[1], subreddit_index)
    combined = np.vstack((widened, appended_rows))
    return combined, subreddit_index, index_subreddit, subreddit_names_list

### Create matrix

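Remove rows and columns in which every cell is less than or equal to the threshold. E.g. with a threshold of 5, when a redditor has commented fewer than 6 times in a given subreddit we consider it a negligible input, so we floor it to zero. If a given redditor is at or below the threshold in all subreddits (an all-zero/False vector), we remove them from the data. The same applies to subreddits (columns). Note: the cell below currently applies `matrix > 2` (a threshold of 2) and uses the mask only to select rows and columns; the retained cells keep their original counts.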

In [10]:
# Dense count matrix: one row per entry of `data`, one column per known subreddit.
matrix = create_matrix(
    data,
    matrix_width=len(subreddit_names_list),
    subreddit_index=subreddit_index,
)

In [11]:
# Boolean mask of the cells that exceed the activity threshold (2).
mask = matrix > 2
# Keep a row / column only if at least one of its cells exceeds the threshold.
rows = mask.any(axis=1)
columns = mask.any(axis=0)
del mask
data = matrix[np.ix_(rows,columns)]
# Label the surviving columns with their original positions.
# np.flatnonzero always returns a 1-D array; np.squeeze(np.argwhere(...))
# collapsed to a 0-d array when exactly one column survived, which is not a
# valid collection of column labels for the DataFrame constructor.
df = pd.DataFrame(data,columns=np.flatnonzero(columns))
del rows
del columns
del data
del matrix
# Translate positional column labels into subreddit names.
df.rename(columns=index_subreddit,inplace=True)

In [12]:
# Shape of the filtered frame as (rows, columns).
df.shape

(33044, 21383)

In [13]:
upper_limit = 10000 ##  End (exclusive) of the popularity-rank slice of subreddits to keep
lower_limit = 20  ## Start of the slice: the `lower_limit` highest-ranked subreddits are skipped

In [14]:
# Rank subreddits by their column total (descending) and keep the labels at
# rank positions lower_limit .. upper_limit - 1.
most_popular_reddits = (
    df.sum(axis=0)
      .sort_values(ascending=False)
      .iloc[lower_limit:upper_limit]
      .index
)

In [15]:
# Binarise the selected columns: any non-zero count becomes 1, zero stays 0.
df_bool = (
    df.loc[:, most_popular_reddits]
      .astype(bool)
      .astype(int)
)

In [16]:
# Drop names that are no longer used below (presumably to release memory
# before the itemset mining step).
del df
del most_popular_reddits

In [17]:
# Preview the binarised frame.
df_bool

Unnamed: 0,Cringetopia,nextfuckinglevel,funny,hockey,pics,neoliberal,leagueoflegends,nfl,gaming,tennis,...,RivalsOfAether,DerechoGenial,BBWVideos,R6STeams,mintmobile,RealLifeShinies,mc5,VideoGameDealsCanada,recruiting,HairyCurvy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# DVC
We can generate rules with the following parameter pairs `(upper_limit, min_support)`:
## JSON:
(5000, 0.0004) (5000, 0.00035) (10000, 0.00035)
## DB:
(5000, 0.00032) (10000, 0.00032)
The rules are then saved to JSON and to a database under the following file names:
## JSON:
arules-upper_limit-min_support.json
## Database:
arules-upper_limit-min_support.db
For the database export, use the script in `getrulesdb`.

In [19]:
# Mine frequent itemsets of at most 7 subreddits from the binarised frame.
frequent_itemsets = apriori(
    df_bool,
    min_support=0.00035,
    use_colnames=True,
    max_len=7,
    verbose=2,
    low_memory=True,
)

Processing 81416 combinations | Sampling itemset size 76


In [20]:
# The binarised frame is not used after mining (presumably dropped to release memory).
del df_bool

In [21]:
# Derive association rules from the frequent itemsets, filtered on lift with a 1.01 threshold.
rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1.01,
)

In [22]:
# Preview the generated rules.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Cringetopia),(nextfuckinglevel),0.044759,0.079712,0.009866,0.220419,2.765198,0.006298,1.180491
1,(nextfuckinglevel),(Cringetopia),0.079712,0.044759,0.009866,0.123766,2.765198,0.006298,1.090167
2,(funny),(Cringetopia),0.079742,0.044759,0.006839,0.085769,1.916250,0.003270,1.044857
3,(Cringetopia),(funny),0.044759,0.079742,0.006839,0.152806,1.916250,0.003270,1.086242
4,(pics),(Cringetopia),0.072449,0.044759,0.006083,0.083960,1.875842,0.002840,1.042794
...,...,...,...,...,...,...,...,...,...
3963457,(OnlyfansAddict),"(HotOnlyfans, onlyfanssubscribe, OnlyfansXXX, ...",0.001483,0.000424,0.000363,0.244898,578.029155,0.000363,1.323763
3963458,(OnlyfansXXX),"(HotOnlyfans, onlyfanssubscribe, OnlyfansAddic...",0.002239,0.000363,0.000363,0.162162,446.540541,0.000362,1.193115
3963459,(onlyfans_get_noticed),"(HotOnlyfans, onlyfanssubscribe, OnlyfansAddic...",0.001634,0.000393,0.000363,0.222222,564.854701,0.000363,1.285208
3963460,(OnlyfansAmateurs),"(HotOnlyfans, onlyfanssubscribe, OnlyfansAddic...",0.002028,0.000363,0.000363,0.179104,493.194030,0.000362,1.217739


In [23]:
# The itemsets are not used after rule generation (presumably dropped to release memory).
del frequent_itemsets

In [24]:
# Persist the rules as JSON; the file name encodes upper_limit (10000) and
# the digits of min_support (0.00035).
# Earlier variant with a timestamp suffix, kept for reference:
# rules.to_json("../data/arules-10000-0005|"+str(time.time())+".json")
rules.to_json("../data/arules-10000-00035.json")

In [25]:
# The rules have been written to disk above; drop the in-memory copy.
del rules