In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import networkx as nx
import numpy as np
import pandas as pd
import random
import os
import sklearn.cluster 
import plotly.express as px
import urllib
from collections import Counter
from collections import defaultdict

import msg_passing
import utils
import run
import display
import parse_data
import run

In [10]:
def calculate_influence(g, n, iters, path_length, discount, save_period=None, print_period=None):
    initialized = defaultdict(bool)
    update_mag = []
    for n in g.nodes():
        g.nodes()[n]["value"] = 0

    for iter in range(iters):
        curr_node = n 
        issue_weight = 1/discount
        seen = []
        for _ in range(path_length):
            next_node = random.choice(list(g.neighbors(curr_node)))
            seen.append(next_node)
            issue_weight *= discount*g[curr_node][next_node][0]["weight"]
            if(not initialized[next_node]):
                g.nodes()[next_node]["next_value"] = issue_weight 
                g.nodes()[next_node]["num_visits"] = 1
                initialized[next_node] = True 
                continue 
            curr_avg = g.nodes()[next_node]["value"]
            g.nodes()[next_node]["next_value"] = curr_avg + (issue_weight-curr_avg) / g.nodes()[next_node]["num_visits"]
            g.nodes()[next_node]["num_visits"] += 1
        if(save_period and (iter % save_period == 0 or iter == iters-1)):
            update_mag.append(sum([abs(g.nodes()[n]["next_value"] - g.nodes()[n]["value"]) for n in seen]))
        if(print_period and (iter % print_period == 0 or iter == iters-1)):
            print("Completed iter " + str(iter) + " last update mag: " + str(update_mag[-1]))
        for n in seen:
            g.nodes()[n]["value"] = g.nodes()[n]["next_value"]

    return initialized, update_mag

def sort_influenced(g, initialized):
    influences = [(i, g.nodes()[i]["value"], g.nodes()[i]["num_visits"]) for i, n in initialized.items() if n]
    influences = sorted(influences, key=lambda x: abs(x[1]), reverse=True)
    return influences 

def get_first_differing(inf1, inf2):
    for i, inf in enumerate(inf1):
        for j, target_inf in enumerate(inf2):
            if(target_inf[0] == inf[0] and ((target_inf[1] < 0 and inf[1] > 0) or (target_inf[1] > 0 and inf[1] < 0))):
                return i, inf, j, target_inf

def get_influence_path(g, start, end):
    path = [start]
    while(path[-1] != end):
        neighbors = [(n, g.nodes()[n]["value"]) for n in g.neighbors(start)]

def analyze_pair(g, node1, node2):
    g1 = g.copy()
    g2 = g.copy()
    iters = 100000
    init1, update_mag1 = calculate_influence(g1, node1, iters, 10, 0.95)
    init2, update_mag2 = calculate_influence(g2, node2, iters, 10, 0.95) 
    inf1 = sort_influenced(g1, init1)
    inf2 = sort_influenced(g2, init2)
    first_differing1 = get_first_differing(inf1, inf2)
    first_differing2 = get_first_differing(inf2, inf1)
    return first_differing1, first_differing2, g1, g2, init1, init2, inf1, inf2 



In [13]:
g = msg_passing.load_graph_csv("input/Incremental_Datasets/gun_regulations_network.csv", clean_data=True)
pg, _ = msg_passing.prune_graph(g) 
fd1, fd2, g1, g2, init1, init2, inf1, inf2 = analyze_pair(pg, "biden", "republican")
print(fd1)
print(fd2)

"""
pg_paul, _ = msg_passing.prune_graph(g)
pg_biden = pg_paul.copy()
iters = 1000000
pper = iters // 10
sper = iters // 1000
init_paul, update_mag_paul = calculate_influence(pg_paul, "paul krugman", iters, 10, 0.95, print_period=pper, save_period=sper)
init_biden, update_mag_biden = calculate_influence(pg_biden, "biden", iters, 10, 0.95, print_period=pper, save_period=sper)
"""

(201, ('mike kelly', -0.09078538814029342, 1145), 236, ('mike kelly', 0.06667292645536324, 1077))
(236, ('mike kelly', 0.06667292645536324, 1077), 201, ('mike kelly', -0.09078538814029342, 1145))


'\npg_paul, _ = msg_passing.prune_graph(g)\npg_biden = pg_paul.copy()\niters = 1000000\npper = iters // 10\nsper = iters // 1000\ninit_paul, update_mag_paul = calculate_influence(pg_paul, "paul krugman", iters, 10, 0.95, print_period=pper, save_period=sper)\ninit_biden, update_mag_biden = calculate_influence(pg_biden, "biden", iters, 10, 0.95, print_period=pper, save_period=sper)\n'

In [9]:
for i in inf2:
    if(i[1] < 0):
        print(i)

In [36]:
def get_place(influences, name):
    for i, inf in enumerate(influences):
        if(inf[0] == name):
            return i



influences_paul = sort_influenced(pg_paul, init_paul) 
influences_biden = sort_influenced(pg_biden, init_biden)

print(get_first_differing(influences_paul, influences_biden))
print(get_first_differing(influences_biden, influences_paul))

print("PAUL:")
print(influences_paul[:10])
print("BIDEN") 
print(influences_biden[:10])

(111, ('nouriel roubini', 0.013547398043609726, 29167), 58, ('nouriel roubini', -0.0666612777915885, 28681))
(58, ('nouriel roubini', -0.0666612777915885, 28681), 111, ('nouriel roubini', 0.013547398043609726, 29167))
PAUL:
[('senate democratic deal', 0.4141467559164906, 42344), ('inflation reduction act’s ‘the inflation reduction act’', 0.3961240099831666, 28375), ('u.s. transportation', 0.37006667886622147, 28416), ('alfredo ortiz', 0.3667696247265916, 28811), ('msnbc', 0.35523712636198396, 56594), ('senate', 0.3531404682126282, 28168), ('job creator’s network', 0.3361946457616918, 28433), ('democratic', 0.31886365627476315, 28346), ('super', 0.2923428853177559, 29363), ('joe manchin', 0.2873413153502854, 42896)]
BIDEN
[('inflation reduction act’s ‘the inflation reduction act’', 0.4493146551232062, 29704), ('senate', 0.41658039647058215, 27667), ('alfredo ortiz', 0.40915122548555677, 29154), ('senate democratic deal', 0.3987694722371398, 41863), ('job creator’s network', 0.3758537296

In [30]:
infile = "input/Incremental_Datasets/trump_impeachment_network.csv" 
df = pd.read_csv(infile) 
def get_hosts(df):
    urls = df["url"] 
    hosts = [urllib.parse.urlparse(url).hostname for url in urls]
    c = Counter(hosts) 
    print(c)
"""
get_hosts(df)
hosts = [
    "www.msn.com",
    "www.marketwatch.com",
    "www.theepochtimes.com",
    "www.foxnews.com",
    "www.politico.com",
    "news.yahoo.com",
    "www.washingtonexaminer.com",
    "markets.businessinsider.com",
    "www.zerohedge.com",
    "www.breitbart.com",
    "dailycaller.com",
    "www.foxbusiness.com",
    "www.washingtonpost.com"    
]
df = parse_data.filter_by_host(df, hosts[0])
g = msg_passing.load_graph_df(df, True, "from_node", "raw_answer")
print(len(g.nodes()))
pg, _ = msg_passing.prune_graph(g)
print(len(pg.nodes()))
print(pg.nodes())

iters = 10000
num_print = 10
num_save = 1000
pper, sper = iters // num_print, iters // num_save

msg_passing.initialize_node_values(pg, size=3)
hist, diagnostic_hist = msg_passing.pass_messages_with_random_walks(
    pg, 10**-3, 10**-3, iters, True, print_period=pper, save_period=sper, history={}, 
    discount=0.95, path_length=10, batch_size=10
)
""" 




Counter({'www.msn.com': 672, 'news.yahoo.com': 414, 'www.politico.com': 371, 'www.jsonline.com': 313, 'www.theepochtimes.com': 298, 'www.breitbart.com': 199, 'www.foxnews.com': 183, 'www.npr.org': 163, 'www.ny1.com': 148, 'www.washingtonexaminer.com': 146, 'www.wsws.org': 142, 'www.theday.com': 139, 'www.washingtonpost.com': 136, 'www.mediaite.com': 133, 'townhall.com': 131, 'www.washingtontimes.com': 115, 'www.sott.net': 110, 'abcnews.go.com': 106, 'www.channel3000.com': 96, 'www.crainsnewyork.com': 89, 'www.thenation.com': 89, 'www.opensecrets.org': 79, 'www.nydailynews.com': 75, 'www.wbur.org': 74, 'www.yahoo.com': 74, 'newsbusters.org': 74, 'www.sun-sentinel.com': 71, 'www.kgw.com': 71, 'www.zerohedge.com': 70, 'www.cnn.com': 69, 'www.economist.com': 69, 'www.naturalnews.com': 60, 'kvia.com': 56, 'www.usatoday.com': 54, 'www.theamericanconservative.com': 52, 'www.baltimoresun.com': 50, 'www.orlandosentinel.com': 50, 'www.mcall.com': 50, 'www.capitalgazette.com': 50, 'www.9news.com'

In [33]:
_ = display.plot_diagnostic(diagnostic_hist)
_ = display.plot_confusion_matrix(pg, utils.get_top_n_nodes(pg, 20), 2)

In [None]:

def predict_links():
    war_fname = "output/gun_regulations_v2_raw_answer_processed_names_uniform_init_update_weights_30K_lr_3_dim_10.graphml" 
    war_fname_hist = "output/gun_regulations_v2_raw_answer_processed_names_uniform_init_update_weights_30K_lr_3_dim_10.pkl" 
    war_fname = "output/combined_v2_general_msg_passing_batch_10_path_10_100K_lr_3_dim_3.graphml" 
    war_fname_hist = "output/combined_v2_general_msg_passing_batch_10_path_10_100K_lr_3_dim_3.pkl" 
    wg = msg_passing.load_graph_graphml(war_fname)
    pg, aux = msg_passing.prune_graph(wg)
    avg_g = msg_passing.avg_edge_weights(pg)
    hist, diagnostic_hist = msg_passing.load_history(war_fname_hist)
    def sign(n):
        if(n > 0):
            return 1
        elif(n < 0):
            return -1
        else:
            return 0

    outliers = []
    for n in pg.nodes():
        for n2 in pg.neighbors(n):
            weights = np.array([v["weight"] for v in wg[n][n2].values()])
            avg_weight = np.average(weights)

            angle = utils.between_angle(pg.nodes()[n]["value"], pg.nodes()[n2]["value"])
            if(abs(avg_weight) > 0.1 or abs(np.pi/2-angle) < 1):
                continue
            if(sign((np.pi/2) - angle) != sign(avg_weight)):
                outliers.append((n, n2, avg_weight, (np.pi/2-angle)/1.5))

    print(len(outliers))
    print(outliers)

In [6]:
def get_url_host_set(df):
    hosts = set(df["url"].apply(parse_data.get_url_host))
    return hosts

def load(df, clean_data, source_col="from_node", target_col="raw_answer"):
    g = nx.MultiGraph()
    df["publish_date"] = df["publish_date"].fillna("")
    df["url_host"] = df["url"].apply(parse_data.get_url_host)
    hosts = get_url_host_set(df)
    if(clean_data):
        parse_data.clean_df(df)
    

    edge_questions = defaultdict(set)
    for _, row in df.iterrows():
        edge_weight, ques = utils.get_ques_val(row["valence"])
        if(not edge_weight):
            continue

        # don't add duplicate questions to the graph
        source, target = row[source_col], row[target_col]
        if((ques in edge_questions[(source, target)]) or (ques in edge_questions[(target, source)])):
                continue
        edge_questions[(source, target)].add(ques)
        edge_questions[(target, source)].add(ques)
        
        source, target = row[source_col], row[target_col]
        g.add_edge(source, target,
            weight=edge_weight,
            valence=row["valence"],
            confidence=row["confidence"],
            publish_date=row["publish_date"],
            url_host=row["url_host"],
            #full_text=row["full_text"],
            #summary=row["summary"],
            #keywords=row["keywords"],
            #publish_date=row["publish_date"],
            #authors=row["authors"],
            #url=row["url"],
            #leaf_label=row["leaf_label"],
            #root_label=row["root_label"]
        )

    pg, aux = msg_passing.prune_graph(g) 
    for host in hosts:
        g.add_node("NEWS SOURCE: " + host)
    new_edges = []
    for u, v, h in g.edges(data="url_host"):
        new_edges.append((u, "NEWS SOURCE: " + h))
        new_edges.append((v, "NEWS SOURCE: " + h))

    for u, v in new_edges:
        g.add_edge(u, v) 
    
    g.remove_nodes_from(aux)

    return g


df = pd.read_csv("input/Incremental_Datasets/roe_v_wade_network.csv")
g = load(df, True) 
print(len(g.nodes()))

433


In [7]:
msg_passing.save_graph(g, "output/test.graphml")