In [0]:
import networkx as nx
from random import choice
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pylab
import numpy as np
import seaborn as sns
import os
from tqdm import tqdm
from sklearn.preprocessing import minmax_scale, robust_scale, normalize
%matplotlib inline

In [0]:
!tar xf users.tar.xz

In [0]:
def raw_matrix(data_loc='users/', period = None):
    period = str(period) if period else ''
    user_list = next(os.walk(data_loc))[1]
    file_names = data_loc + '{}/subreddits' + period + '.csv'
    df_list = []
    for user in tqdm(user_list):
        filename = file_names.format(user)
        try:
            df_list.append(pd.read_csv(filename, index_col=0, names=[user], header=0))
        except FileNotFoundError as e:
            print(e)
            continue
    raw_df = pd.concat(df_list, axis=1, sort=False).fillna(0)
    return raw_df

In [0]:
def modified_min_max_scaler(m, axis = 0):
    m = m.astype('float')
    m[m == 0] = 'nan'
    if axis == 0:
        m = minmax_scale(m)
    else:
        m = minmax_scale(m.T).T
    m[np.isnan(m)] = 0
    return m

def modified_robust_scaler(m, axis = 0):
    m = m.astype('float')
    m[m == 0] = 'nan'
    if axis == 0:
        m = robust_scale(m)
    else:
        m = robust_scale(m.T).T
    m[np.isnan(m)] = 0
    return m

In [0]:
def AGM_matrix(raw_df, node_type='u', norm=None):
    users_list = list(raw_df)
    subs_list = list(raw_df.index)
    raw_matrix = raw_df.values
    if norm != None:
        raw_matrix = modified_min_max_scaler(raw_matrix, norm)
    if node_type == 's':
        raw_matrix = raw_matrix.T
    sim_matrix = 1 - np.exp(- raw_matrix.T @ raw_matrix)
    column_names = users_list if node_type == 'u' else subs_list
    return pd.DataFrame(data = sim_matrix, index=column_names, columns=column_names)

def cosine_matrix(raw_df, node_type='u', norm=None):
    users_list = list(raw_df)
    subs_list = list(raw_df.index)
    raw_matrix = raw_df.values
    if norm != None:
        raw_matrix = normalize(raw_matrix, axis=norm, norm='l2')
    if node_type == 's':
        raw_matrix = raw_matrix.T
    sim_matrix = raw_matrix.T @ raw_matrix
    column_names = users_list if node_type == 'u' else subs_list
    return pd.DataFrame(data = sim_matrix, index=column_names, columns=column_names)

In [0]:
def create_graph(sim_df):
    dist = sim_df
    G = nx.Graph()
    names = sim_df.index
    l = len(names)
    G.add_nodes_from(sim_df.index)
    for i in tqdm(range(l - 1)):
        for j in range(i + 1, l):
            u = names[i]
            v = names[j]
            weight = dist[u][v]
            if weight == np.inf:
                continue
            G.add_edge(u, v, weight = weight)
    return G

In [0]:
def nbr_subgraph(G, node, n = 20):
    edges = sorted(list(G.edges(node, data = True)), key = lambda x: x[2]['weight'])
    sg = nx.Graph()
    sg.add_edges_from(edges[:n])
    return sg


In [0]:
def graph_with_length(G):
    pos = nx.spring_layout(G, weight ='weight')
    nx.draw_networkx(G, pos=pos)


In [85]:
df = raw_matrix('users/', 30)


  0%|          | 0/514 [00:00<?, ?it/s][A
  8%|▊         | 43/514 [00:00<00:01, 429.92it/s][A
 17%|█▋        | 89/514 [00:00<00:00, 438.48it/s][A
 27%|██▋       | 137/514 [00:00<00:00, 447.74it/s][A
 36%|███▌      | 184/514 [00:00<00:00, 454.06it/s][A
 45%|████▍     | 231/514 [00:00<00:00, 457.89it/s][A

[Errno 2] File b'users/quoiega/subreddits30.csv' does not exist: b'users/quoiega/subreddits30.csv'



 54%|█████▎    | 275/514 [00:00<00:00, 449.46it/s][A
 62%|██████▏   | 319/514 [00:00<00:00, 445.11it/s][A
 70%|███████   | 361/514 [00:00<00:00, 434.86it/s][A
 79%|███████▉  | 406/514 [00:00<00:00, 436.84it/s][A
 88%|████████▊ | 450/514 [00:01<00:00, 437.34it/s][A
 96%|█████████▌| 494/514 [00:01<00:00, 435.44it/s][A
100%|██████████| 514/514 [00:01<00:00, 441.56it/s][A

[Errno 2] File b'users/opiumzxq/subreddits30.csv' does not exist: b'users/opiumzxq/subreddits30.csv'


In [0]:
cos_df = cosine_matrix(df, node_type = 's', norm = 1)

In [0]:
agm_df = AGM_matrix(df, node_type = 's', norm = 1)

In [0]:
G = create_graph(sim_df)

In [99]:
name = choice(sim_df.index)
sub = nbr_subgraph(G, name, 10)
graph_with_length(sub)
plt.show()
try:
    print(sim_df[name].sort_values(ascending=False)[:10])
except Exception as e:
    print(e)

NameError: ignored

In [102]:
name = 'ukpolitics'
print(name)
n = cos_df[name].sort_values(ascending=False)[:10]
print(n)
nl = list(n.index)
rdf = []
for x in nl:
  rdf.append(df[x].sort_values(ascending=False)[:10])
pd.concat(rdf, axis=1, sort=False).fillna(0).head(10)

ukpolitics
ukpolitics       1.000000
LabourUK         0.929659
shield           0.907794
americandad      0.907794
familyguy        0.907794
TwentyFour       0.907794
suits            0.907794
unitedkingdom    0.907794
Jeopardy         0.907794
Terminator       0.907794
Name: ukpolitics, dtype: float64


KeyError: ignored

In [105]:
print(name)
n = agm_df[name].sort_values(ascending=False)[:10]
print(n)
nl = list(n.index)
rdf = []
for x in nl:
  rdf.append(df[x].sort_values(ascending=False)[:10])
pd.concat(rdf, axis=1, sort=False).fillna(0).head(10)

ukpolitics
ukpolitics               0.701262
hillaryclinton           0.632121
MarvelStudiosSpoilers    0.632121
scotus                   0.632121
marvelstudios            0.632121
LiveFromNewYork          0.632121
canada                   0.464739
television               0.384742
ABoringDystopia          0.360840
ChapoTrapHouse           0.358820
Name: ukpolitics, dtype: float64


KeyError: ignored

In [0]:
np.sum(df.loc['The_Donald'] != 0)

143

In [107]:
name = 'ukpolitics'
print(name)
print()
print(cos_df[name].sort_values(ascending=False)[:10])
print()
print(agm_df[name].sort_values(ascending=False)[:10])

ukpolitics

ukpolitics       1.000000
LabourUK         0.929659
shield           0.907794
americandad      0.907794
familyguy        0.907794
TwentyFour       0.907794
suits            0.907794
unitedkingdom    0.907794
Jeopardy         0.907794
Terminator       0.907794
Name: ukpolitics, dtype: float64

ukpolitics               0.701262
hillaryclinton           0.632121
MarvelStudiosSpoilers    0.632121
scotus                   0.632121
marvelstudios            0.632121
LiveFromNewYork          0.632121
canada                   0.464739
television               0.384742
ABoringDystopia          0.360840
ChapoTrapHouse           0.358820
Name: ukpolitics, dtype: float64


In [94]:
df[['Fish_EyeMouth', 'Maedhre', 'ashamedseesaw', 'jonyprepperisrael', 'Montiexx',
'Robotman6900', 'CJnella91', 'Waterkoker', 'Baconboi212121', 'DogfaceZed']].sort_values('Fish_EyeMouth', ascending=False).head(10)

Unnamed: 0,Fish_EyeMouth,Maedhre,ashamedseesaw,jonyprepperisrael,Montiexx,Robotman6900,CJnella91,Waterkoker,Baconboi212121,DogfaceZed
memes,57.0,32.0,15.0,22.0,11.0,13.0,21.0,5.0,7.0,25.0
AskReddit,7.0,1.0,0.0,7.0,15.0,0.0,5.0,0.0,11.0,3.0
hypixel,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
history,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Showerthoughts,2.0,0.0,0.0,13.0,0.0,0.0,1.0,0.0,0.0,0.0
teenagers,2.0,0.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0
politics,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MurderedByWords,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vexillology,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MoeMorphism,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
