In [84]:
import pandas as pd
import networkx as nx
from urllib.parse import urlparse

In [90]:
# Load the scraped Reddit posts, then drop every row whose `title` field is the
# literal string 'title' (presumably header lines repeated inside the CSV --
# TODO confirm against whatever produced the file).
data = pd.read_csv("reddit_post_data.csv")
data = data.loc[data['title'].ne('title')]

In [123]:
def extract_url_host(url):
    """Reduce a URL to a short host label, e.g. 'https://www.cnn.com/a' -> 'cnn'.

    The host is taken from ``urlparse(url).hostname`` (lower-cased, without
    port or credentials). Leading 'www.' / 'mobile.' labels and trailing
    '.com', '.org', '.au', '.uk', '.co' labels are then stripped, and a
    leading 'en.m.' is collapsed to 'en.'.

    Stripping is anchored to the start/end of the host. Removing the tokens
    anywhere in the string corrupts hosts that merely contain them
    ('t-mobile.com' -> 't-com', 'community.org' -> 'munity').

    Parameters
    ----------
    url : str
        Absolute URL. A URL without a network location yields ''.

    Returns
    -------
    str
        The shortened host label.
    """
    prefixes = ('www.', 'mobile.')
    suffixes = ('.com', '.org', '.au', '.uk', '.co')

    # hostname is None when the URL has no network location.
    host = urlparse(url).hostname or ''

    # Repeat until stable so stacked prefixes ('www.mobile.') are all removed.
    changed = True
    while changed:
        changed = False
        for px in prefixes:
            if host.startswith(px):
                host = host[len(px):]
                changed = True

    # Repeat until stable so compound endings ('.co.uk', '.com.au') are
    # removed one label at a time.
    changed = True
    while changed:
        changed = False
        for sx in suffixes:
            if host.endswith(sx):
                host = host[:-len(sx)]
                changed = True

    # Fold the mobile 'en.m.<site>' form into 'en.<site>'.
    if host.startswith('en.m.'):
        host = 'en.' + host[len('en.m.'):]
    return host


def assign_host_ids(data):
    """Attach an integer ``host_id`` column keyed on ``cleaned_url``.

    Hosts are numbered 0, 1, 2, ... in the order in which
    ``data.cleaned_url.value_counts()`` lists them. The resulting lookup
    table is merged back onto ``data`` on ``cleaned_url`` and the merged
    frame is returned.
    """
    ranked_hosts = list(data.cleaned_url.value_counts().keys())
    lookup = pd.DataFrame({
        'cleaned_url': ranked_hosts,
        'host_id': list(range(len(ranked_hosts))),
    })
    return data.merge(lookup, on='cleaned_url')

def add_subreddit_ids(data):
    """Attach an integer ``subreddit_id`` column keyed on ``subreddit``.

    Ids start at ``data.host_id.max() + 1`` so they do not overlap the host
    ids already present, and are handed out in the order in which
    ``data.subreddit.unique()`` lists the subreddits. The lookup table is
    merged back onto ``data`` on ``subreddit`` and the merged frame returned.
    """
    first_free_id = data.host_id.max()+1
    subreddit_names = data.subreddit.unique().tolist()

    lookup = pd.DataFrame({
        'subreddit': subreddit_names,
        'subreddit_id': [pos + first_free_id for pos in range(len(subreddit_names))],
    })
    return data.merge(lookup, on='subreddit')

def clean_data(data):
    """Derive host labels and node ids for the post table.

    Steps:
      1. compute ``cleaned_url`` from ``url`` with ``extract_url_host``;
      2. drop rows whose ``cleaned_url`` is 'i.redd.it', 'reddit' or
         'v.redd.it', or contains 'img' or 'youtu';
      3. add ``host_id`` (``assign_host_ids``) and ``subreddit_id``
         (``add_subreddit_ids``).

    The input frame is not modified; a new frame is returned. Any
    ``cleaned_url`` / ``host_id`` / ``subreddit_id`` columns already present
    are discarded and recomputed, so re-running the cell on an already
    cleaned frame works instead of colliding on duplicate column names in
    the merges.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``url`` and ``subreddit`` columns.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with ``cleaned_url``, ``host_id`` and ``subreddit_id``
        appended.
    """
    # drop() and assign() both return new frames, so the caller's object
    # (possibly a .loc slice) is never written to.
    derived_columns = ['cleaned_url', 'host_id', 'subreddit_id']
    data = data.drop(columns=derived_columns, errors='ignore')
    data = data.assign(cleaned_url=data['url'].apply(extract_url_host))

    excluded_hosts = ['i.redd.it','reddit','v.redd.it']
    keep = (~data.cleaned_url.isin(excluded_hosts)
            & ~data.cleaned_url.str.contains('img')
            & ~data.cleaned_url.str.contains('youtu'))
    data = data.loc[keep]

    data = assign_host_ids(data)

    # Must run after assign_host_ids: subreddit ids are offset by the
    # largest host_id.
    data = add_subreddit_ids(data)

    return data




def generate_node_attributes(data,G):
    """Set ``node_name`` and ``node_type`` attributes on the nodes of ``G``.

    For every distinct ``subreddit`` the ``subreddit_id`` of its first row is
    used as the node key (type 'SUBREDDIT'); for every distinct
    ``cleaned_url`` the ``host_id`` of its first row is used (type
    'WEBPAGE'). Subreddits are processed before hosts.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``subreddit``, ``subreddit_id``, ``cleaned_url`` and
        ``host_id`` columns.
    G : graph object accepted by ``nx.set_node_attributes``
        Modified in place and also returned.

    Returns
    -------
    The same ``G`` that was passed in.
    """
    names = {}
    types = {}

    node_kinds = (
        ('subreddit', 'subreddit_id', 'SUBREDDIT'),
        ('cleaned_url', 'host_id', 'WEBPAGE'),
    )
    for name_col, id_col, node_type in node_kinds:
        # One pass keeps the first row per distinct name, instead of
        # re-filtering the whole frame once per distinct value.
        first_rows = data.drop_duplicates(subset=name_col)
        for node_name, node_id in zip(first_rows[name_col].tolist(),
                                      first_rows[id_col].tolist()):
            names[node_id] = node_name
            types[node_id] = node_type

    nx.set_node_attributes(G,names,'node_name')
    nx.set_node_attributes(G,types,'node_type')

    return G
    
    
def create_reddit_bp_G(data):
    """Build an undirected graph linking each host to the subreddits it appears in.

    One edge ``(host_id, subreddit_id)`` is added per row of ``data``
    (repeated pairs collapse into a single edge in ``nx.Graph``). Node
    attributes are then filled in by ``generate_node_attributes`` and a
    one-line node/edge summary is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the ``host_id`` and ``subreddit_id`` columns plus the columns
        read by ``generate_node_attributes``.

    Returns
    -------
    networkx.Graph
    """
    # Zipping the two id columns avoids building a Series per row as
    # iterrows() does.
    edges = list(zip(data['host_id'].tolist(), data['subreddit_id'].tolist()))
    rednet = nx.Graph()
    rednet.add_edges_from(edges)

    rednet = generate_node_attributes(data,rednet)

    # nx.info() was removed in NetworkX 3.0; build the same summary line
    # explicitly so this works on both old and new releases.
    print(f"{type(rednet).__name__} with {rednet.number_of_nodes()} nodes "
          f"and {rednet.number_of_edges()} edges")

    return rednet

In [77]:
# Add cleaned_url, host_id and subreddit_id; `data` is rebound to the cleaned frame.
data = clean_data(data)

In [125]:
# Build the host/subreddit graph; create_reddit_bp_G also prints a node/edge summary.
rednet = create_reddit_bp_G(data)

Graph with 2358 nodes and 4645 edges


In [136]:
# Print the attribute dict of every node adjacent to node 2337.
for neighbor in rednet.neighbors(2337):
    print(rednet.nodes[neighbor])

{'node_name': 'independent', 'node_type': 'WEBPAGE'}
{'node_name': 'dailymail', 'node_type': 'WEBPAGE'}
{'node_name': 'thehill', 'node_type': 'WEBPAGE'}
{'node_name': 'nbcnews', 'node_type': 'WEBPAGE'}
{'node_name': 'yahoo', 'node_type': 'WEBPAGE'}
{'node_name': 'latimes', 'node_type': 'WEBPAGE'}
{'node_name': 'cnn', 'node_type': 'WEBPAGE'}
{'node_name': 'time', 'node_type': 'WEBPAGE'}
{'node_name': 'reuters', 'node_type': 'WEBPAGE'}
{'node_name': 'ibtimes', 'node_type': 'WEBPAGE'}
{'node_name': 'theguardian', 'node_type': 'WEBPAGE'}
{'node_name': 'washingtonpost', 'node_type': 'WEBPAGE'}
{'node_name': 'fastcompany', 'node_type': 'WEBPAGE'}
{'node_name': 'mediaite', 'node_type': 'WEBPAGE'}
{'node_name': 'newsweek', 'node_type': 'WEBPAGE'}
{'node_name': 'businessinsider', 'node_type': 'WEBPAGE'}
{'node_name': 'techdirt', 'node_type': 'WEBPAGE'}
{'node_name': 'abcnews.go', 'node_type': 'WEBPAGE'}
{'node_name': 'nypost', 'node_type': 'WEBPAGE'}
{'node_name': 'sciencemag', 'node_type': 'WE

In [94]:
# Degree of every node, displayed from highest to lowest.
degs = dict(rednet.degree())
dict(sorted(degs.items(), key=lambda pair: pair[1], reverse=True))

{2339: 405,
 2328: 327,
 2344: 314,
 2340: 293,
 2325: 259,
 2337: 248,
 2338: 238,
 2331: 191,
 2327: 169,
 2351: 166,
 2332: 164,
 2326: 152,
 2352: 151,
 2343: 135,
 2336: 129,
 2330: 121,
 2323: 115,
 2324: 113,
 2329: 102,
 2345: 99,
 2341: 89,
 2320: 85,
 2333: 82,
 2321: 79,
 2322: 78,
 2342: 76,
 2349: 70,
 2334: 51,
 2335: 42,
 2350: 36,
 1: 28,
 2: 25,
 3: 25,
 10: 25,
 0: 24,
 5: 24,
 11: 24,
 2347: 24,
 7: 23,
 15: 22,
 29: 22,
 24: 21,
 4: 21,
 12: 21,
 6: 20,
 13: 20,
 16: 18,
 19: 18,
 17: 18,
 33: 18,
 61: 17,
 37: 17,
 28: 16,
 76: 16,
 63: 16,
 66: 16,
 9: 16,
 42: 15,
 78: 15,
 23: 15,
 26: 15,
 72: 15,
 21: 14,
 20: 14,
 27: 14,
 40: 13,
 31: 13,
 79: 13,
 96: 12,
 44: 12,
 41: 12,
 18: 12,
 34: 12,
 84: 12,
 75: 11,
 59: 11,
 110: 11,
 64: 11,
 54: 11,
 57: 11,
 2348: 11,
 55: 10,
 50: 10,
 46: 10,
 32: 10,
 52: 10,
 95: 10,
 45: 10,
 36: 10,
 56: 10,
 98: 10,
 74: 10,
 122: 10,
 14: 10,
 8: 10,
 43: 10,
 70: 10,
 151: 9,
 159: 9,
 124: 9,
 114: 9,
 115: 9,
 130: 9

In [103]:
# Rows whose subreddit_id is 2331.
data.loc[data['subreddit_id'].eq(2331)]

Unnamed: 0,id,title,url,ups,post_date,subreddit,posted_by_id,posted_by_name,author_karma,author_user_created,cleaned_url,host_id,subreddit_id
6242,tadum8,Russian Agent Maria Butina Claims Ukrainians A...,https://www.rollingstone.com/politics/politics...,1566,2022-03-09,ukrainianconflict,2v89ysw6,Wickedkiss246,26159,2018-12-27 09:13:49,rollingstone,75,2331
6243,ta5q97,Russian troops stranded in 40-mile convoy near...,https://www.independent.co.uk/news/world/europ...,23570,2022-03-09,ukrainianconflict,9hy8hkx,HypnotizedNeverLie,164863,2017-08-05 20:38:57,independent,2,2331
6244,t6ns5w,'Three of Putin's top commanders killed on fro...,https://www.independent.co.uk/news/world/europ...,6783,2022-03-04,ukrainianconflict,,,,,independent,2,2331
6245,t66y97,Ukraine warns of disaster ‘10 times larger tha...,https://www.independent.co.uk/news/world/russi...,3820,2022-03-04,ukrainianconflict,eqwls,TopHalfAsian,18655,2014-01-10 15:24:31,independent,2,2331
6246,t4xjsz,Navalny calls for Russians to fight against ‘w...,https://www.independent.co.uk/news/world/europ...,2759,2022-03-02,ukrainianconflict,1tlg57y,evissimus,334955,2017-08-07 00:07:59,independent,2,2331
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7079,tcialw,The Russian Orthodox Church in Amsterdam (The ...,https://www.at5.nl/artikelen/214098/amsterdams...,1375,2022-03-12,ukrainianconflict,4ohht,noorderling,10512,2011-01-02 22:42:52,at5.nl,2063,2331
7080,tj620h,Russian spread fake news on Twitter and Telegr...,https://www.t-online.de/nachrichten/ausland/id...,1374,2022-03-21,ukrainianconflict,4zhs,cito,59675,2006-04-26 11:10:14,t-online.de,2069,2331
7081,t8v2vw,Ukraine Is Buying Bulletproof Vests and Night-...,https://cryptotelegram.com/ukraine-is-buying-b...,1374,2022-03-07,ukrainianconflict,jnxlcen,cryptomir,19270,2017-11-03 16:21:34,cryptotelegram,2097,2331
7082,u2p68w,Russia threatens to attack NATO and US vehicle...,https://www.svoboda.org/a/zamglavy-mid-konvoi-...,1360,2022-04-13,ukrainianconflict,knsrdu2l,shpak37,63171,2022-03-13 14:19:49,svoboda,2122,2331
