In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import networkx as nx
import numpy as np
import pandas as pd
import random
import os
import sklearn.cluster 
import plotly.express as px

import msg_passing
import utils
import run
import display
import parse_data

In [3]:
def load_results(fdir, f_prefix):
    """Load the saved graph and both histories of a single run.

    File paths are built by plain string concatenation, so ``fdir`` must
    already end with a path separator.

    Args:
        fdir: Directory prefix of the run outputs (with trailing separator).
        f_prefix: Run file name without extension.

    Returns:
        A tuple ``(g, hist, diagnostic_hist)``: the graph read from
        ``<fdir><f_prefix>.graphml`` followed by the two values returned by
        ``msg_passing.load_history`` for ``<fdir><f_prefix>.pkl``.
    """
    run_path = fdir + f_prefix
    graph = msg_passing.load_graph_graphml(run_path + ".graphml")
    history, diagnostics = msg_passing.load_history(run_path + ".pkl")
    return graph, history, diagnostics

def view_history(fdir, f_prefix):
    """Return the keys of the history saved for a run.

    Args:
        fdir: Directory prefix of the run outputs (with trailing separator).
        f_prefix: Run file name without extension.

    Returns:
        ``hist.keys()`` where ``hist`` is the first of the two values returned
        by ``msg_passing.load_history`` for ``<fdir><f_prefix>.pkl``.
    """
    history_path = fdir + f_prefix + ".pkl"
    history, _diagnostics = msg_passing.load_history(history_path)
    return history.keys()

def plot_run(fdir, f_prefix, target):
    """Plot the diagnostic and value histories saved for a run.

    Only the ``.pkl`` history file is read. The graph file is not needed for
    either plot, so it is no longer loaded.

    Args:
        fdir: Directory prefix of the run outputs (with trailing separator).
        f_prefix: Run file name without extension.
        target: Forwarded to ``display.plot_history`` as ``target``.
    """
    run_path = fdir + f_prefix
    print("Plotting run for", run_path)
    hist, diagnostic_hist = msg_passing.load_history(run_path + ".pkl")
    display.plot_diagnostic(diagnostic_hist)
    display.plot_history(hist, target=target)

In [5]:
# Write a confusion-matrix plot for the top nodes of every issue network.
#
# The list below used to be assigned here and was then immediately overwritten
# by the active `infiles`; it is kept as a commented alternative.
#infiles = [
#    "global_warming_network",
#    "gun_regulations_network",
#    "immigration_network",
#    "inflation_network",
#    "roe_v_wade_network",
#    "trump_impeachment_network",
#    "ukraine_war_network",
#    "vaccine_hesitancy_network",
#    "combined"
#]

infiles = [
    "gun_regulations",
    "immigration",
    "recession_fears",
    "roe_v_wade",
    "ukraine_war",
    "vaccine_hesitancy",
]

indir = "input/Networks/"

#outdir = "output/Incremental_Datasets_2/"
#outdir = "output/archive/"
outdir = "output/Networks_v1/"

in_suffix = "_network.csv"

#out_suffix = "_random_walks_lr_10-3_30K_dc_095_pl_10_bs_10"
#out_suffix = "_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3"
# NOTE(review): this is a plain string, not an f-string, so "{path_length}"
# appears verbatim in the file name -- presumably that is how the files are
# named on disk; confirm.
out_suffix = "_network_random_walks_lr_10-3_20K_dc_095_pl_{path_length}_bs_10"

# Load one run for the interactive inspection toggles below.
net = infiles[5]
g, hist, diagnostic_hist = load_results(outdir, net + out_suffix)

#print(hist.keys())
#display.plot_history_with_reference(hist, "anthony fauci")
#display.plot_top_n_cluster_evaluations(g, [20, 50, 100], 2, 10)
#print(view_history(outdir, net + out_suffix))
#plot_run(outdir, net + out_suffix, "biden")
#_ = display.plot_edge_weight_histogram(g, log_scale=False)
#_ = display.plot_degree_histogram(g, log_scale=True)

#fig = display.plot_confusion_matrix(g, utils.get_top_n_nodes(g, 20), 2, title=net)
#_ = display.plot_cos_dist_histogram(g, title=net)
#fig.write_html("images/vaccine_heatmap.html")

write_dir = "images/networks_cluster_hdbscan/"
#write_dir = "output/cyto_pruned_networks/"
# write_html does not create missing directories, so make sure the target exists.
os.makedirs(write_dir, exist_ok=True)
# Number of top nodes plotted; also embedded in the output file name.
top_nodes = 50#len(pg.nodes())
for n in infiles:
    print(n)
    fname = outdir + n + out_suffix + ".graphml"
    #fname = indir + n + in_suffix
    #g = msg_passing.load_graph_csv(fname, clean_data=True)
    #msg_passing.save_graph(g, write_dir + n + "_full.graphml")
    g = msg_passing.load_graph_graphml(fname)
    pg, _ = msg_passing.prune_graph(g)
    #print(parse_data.compute_network_stats(g, include_diameter=False))
    fig = display.plot_confusion_matrix(pg, utils.get_top_n_nodes(pg, top_nodes), 2, title=n, show=False)
    fig.write_html(write_dir + n + f"_{top_nodes}_-1s.html")
    #msg_passing.save_graph(pg, write_dir + n + "_count.graphml")


gun_regulations
[-1  0  1  2]
['letitia james', 'dr. mehmet oz', 'gop', 'everytown for gun safety', 'matt castelli', 'u.s. supreme court', 'georgia', 'gabby giffords', 'buffalo', 'national rifle association', 'state', 'nra', 'arizona', 'democrat', 'national association for gun rights', 'republicans', 'kathy hochul', 'new york', 'republican', 'gun owners of america', 'democratic', 'texas', 'democrats']
immigration
[-1  0  1]
['former president donald trump', 'democratic', 'steve bannon', 'washington', 'doug ducey', 'mexico', 'migrants']
recession_fears
[-1  0  1  2]
['elon musk', 'white house council of economic advisers', 'silicon valley', 'mark zuckerberg', 'jason furman', 'meta', 'bloomberg', 'saudi arabia', 'larry summers', 'san francisco', 'biden']
roe_v_wade
[-1  0  1  2]
['texas', 'democrats', 'kentucky', 'aclu', 'planned parenthood of illinois', 'illinois', 'democratic', 'joe biden']
ukraine_war
[-1  0  1]
['ukraine', 'michael fanone', 'democrats', 'united nations', 'cowboys', '

In [4]:
def get_shortest_path_network(g, nodes1, nodes2):
    """Return the subgraph spanned by all shortest paths between two node sets.

    For every pair ``(n1, n2)`` with ``n1`` in ``nodes1`` and ``n2`` in
    ``nodes2``, every node lying on any shortest ``n1 -> n2`` path is
    collected. Pairs with no connecting path contribute nothing instead of
    aborting the whole collection.

    Args:
        g: Graph to search; must be accepted by ``nx.all_shortest_paths`` and
            provide ``subgraph``.
        nodes1: Iterable of source nodes.
        nodes2: Iterable of target nodes.

    Returns:
        ``g.subgraph(...)`` over the collected nodes (empty when no pair is
        connected or either input is empty).
    """
    subgraph_nodes = set()
    for source in nodes1:
        for target in nodes2:
            try:
                # The paths are produced lazily, so the iteration has to be
                # inside the try block for the exception to be caught.
                for path in nx.all_shortest_paths(g, source, target):
                    subgraph_nodes.update(path)
            except nx.NetworkXNoPath:
                # Unreachable pair: skip it and keep the other pairs.
                continue

    return g.subgraph(subgraph_nodes)

# Export the shortest-path sub-network between two hand-picked node sets.
#
# Exactly one graph file and one n1/n2 pair is active. The alternatives below
# were previously live assignments that got overwritten before use; they are
# kept as comments so they can be swapped back in.
#gf = "output/Networks_v1/gun_regulations_network_random_walks_lr_10-3_20K_dc_095_pl_{path_length}_bs_10.graphml"
#n1 = ["second amendment", "bruen", "greg abbott", "gerald smith", "iowa firearms coalition", "supreme court"]
#n2 = ["second amendment foundation", "adam kraut"]
#n2 = ["joe biden", "kamala harris", "a ban on assault weapons", "biden", "white house"]
#gf = "output/Networks_v1/roe_v_wade_network_random_walks_lr_10-3_20K_dc_095_pl_{path_length}_bs_10.graphml"
#n1 = ["donald trump", "white house", "supreme court", "clarence thomas"]
#n2 = ["republican", "republicans", "gop", "arizona", "planned parenthood", "anti-abortion"]
gf = "output/Networks_v1/recession_fears_network_random_walks_lr_10-3_20K_dc_095_pl_{path_length}_bs_10.graphml"
n1 = ["opec", "fox news", "saudis"]
n2 = ["jerome powell", "joe biden", "janet yellen"]

g = msg_passing.load_graph_graphml(gf)
pg, _ = msg_passing.prune_graph(g)

sg = get_shortest_path_network(pg, n1, n2)
write_dir = "output/cyto_pruned_networks/"
# Make sure the export directory exists before saving into it.
os.makedirs(write_dir, exist_ok=True)
outname = write_dir + "recession_1.graphml"
msg_passing.save_graph(sg, outname)

In [None]:
# Compare one issue network (g1/pg1) with a combined network (g/pg): plot
# confusion matrices for the issue network's top nodes that also occur in the
# combined network, then a cosine-distance histogram and the final loss.
networks = [
    "global_warming", 
    "gun_regulations",
    "immigration",
    "inflation",
    "roe_v_wade",
    "trump_impeachment",
    "ukraine_war",
    "vaccine_hesitancy",
]
#prefix = "output/Incremental_Datasets_old/"
prefix = "output/Incremental_Datasets_weighted/"
network = networks[6]
# No file extension here: ".graphml", ".pkl" and ".html" are appended where
# the name is used. (The suffix previously ended in ".graphml", which yielded
# "...graphml.graphml", "...graphml.pkl" and "...graphml.html".)
suffix = "_network_random_walks_lr_10-3_15K_dc_095_pl_10_bs_10"

fg1 = prefix + network + suffix + ".graphml"
fh1 = prefix + network + suffix + ".pkl"

#fg = "output/gun_regulations_v2_raw_answer_dedup_ques_50K_lr_3_dim_10.graphml"
#fh = "output/gun_regulations_v2_raw_answer_dedup_ques_50K_lr_3_dim_10.pkl"
#fg = "output/combined_v2_general_msg_passing_batch_10_path_10_100K_lr_3_dim_3.graphml"
#fh = "output/combined_v2_general_msg_passing_batch_10_path_10_100K_lr_3_dim_3.pkl"


#fg = "output/gun_regulations_v2_general_msg_passing_batch_10_path_10_500K_lr_3_dim_3.graphml" 
#fh = "output/gun_regulations_v2_general_msg_passing_batch_10_path_10_500K_lr_3_dim_3.pkl"

fg = "output/archive/combined_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3.graphml"
fh = "output/archive/combined_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3.pkl"

#hist, diagnostic_hist = msg_passing.load_history(fh) 
g = msg_passing.load_graph_graphml(fg)
pg, aux = msg_passing.prune_graph(g)

g1 = msg_passing.load_graph_graphml(fg1)
pg1, aux1 = msg_passing.prune_graph(g1)
#display.plot_diagnostic(diagnostic_hist) 

# First element of each entry returned by utils.node_degrees
# (presumably (node, degree) pairs ordered by degree -- TODO confirm).
top_nodes = [u[0] for u in utils.node_degrees(pg1)]
# Keep the first 20 of those nodes that also exist in the combined graph.
# Slicing yields fewer than 20 when fewer are shared, where the previous
# counter-driven while loop ran past the end of the list.
max_shared_nodes = 20
real_top_nodes = [node for node in top_nodes if node in pg.nodes()][:max_shared_nodes]

#pg = msg_passing.initialize_node_values(pg, size=3)
#fig = display.node_confusion_matrix(pg, real_top_nodes, title=network) 
# NOTE(review): the other cells call display.plot_confusion_matrix /
# display.plot_cos_dist_histogram; the un-prefixed names used below may be an
# older API -- confirm they still exist in display.
fig = display.node_confusion_matrix(pg1, real_top_nodes, title="single network")
fig = display.confusion_matrix(pg, 20, title=network)
os.makedirs("images", exist_ok=True)
fig.write_html("images/" + network + suffix + ".html")
fig = display.cos_dist_histogram(pg, title=network)
print("final loss:", msg_passing.loss_cos_dist(pg))

In [38]:
# Save one cluster-evaluation plot (with random baseline) per issue network.

infiles = [
    "global_warming_network",
    "gun_regulations_network",
    "immigration_network",
    "inflation_network",
    "roe_v_wade_network",
    "trump_impeachment_network",
    "ukraine_war_network",
    "vaccine_hesitancy_network"
]
outdir = "output/Incremental_Datasets/"
#outdir = "output/archive/"
out_suffix = "_random_walks_lr_10-3_40K_dc_095_pl_10_bs_10"
#out_suffix = "_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3"
imdir = "images/clustering/"
# write_html does not create missing directories.
os.makedirs(imdir, exist_ok=True)
top_n_nodes = [20, 50, 100]
for f in infiles:
    g, hist, diagnostic_hist = load_results(outdir, f + out_suffix)
    pg, _ = msg_passing.prune_graph(g)
    #msg_passing.initialize_node_values(pg, size=3)
    # Evaluate the fixed sizes plus the whole pruned graph.
    curr_top_n_nodes = top_n_nodes + [len(pg.nodes())]
    fig = display.plot_top_n_cluster_evaluations(pg, curr_top_n_nodes, 2, 10, title=f, with_random_baseline=True, show=False)
    fig.write_html(imdir + f + "_cluster_evals.html")

In [64]:
# Score each issue network's top-N nodes using the embeddings of the combined
# graph, next to a random baseline, and save the result as one line chart.

infiles = [
    "global_warming_network",
    "gun_regulations_network",
    "immigration_network",
    "inflation_network",
    "roe_v_wade_network",
    "trump_impeachment_network",
    "ukraine_war_network",
    "vaccine_hesitancy_network"
]
outdir = "output/Incremental_Datasets_extreme/"
#outdir = "output/archive/"
out_suffix = "_random_walks_lr_10-3_30K_dc_095_pl_10_bs_10"
#out_suffix = "_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3"
# Directory the chart is written to (was defined as "images/clustering/" but
# never used, while the write went to the hard-coded path below).
imdir = "images/clustering_extreme/"
os.makedirs(imdir, exist_ok=True)
top_n_nodes = [20, 50, 100]
columns = ["Issues", "Silhouette Score", "Top N Nodes Clustered", "Is Random Baseline"]
records = []
combined = msg_passing.load_graph_graphml(outdir + "combined" + out_suffix + ".graphml")
for f in infiles:
    g, hist, diagnostic_hist = load_results(outdir, f + out_suffix)
    pg, _ = msg_passing.prune_graph(g)
    # Random baseline: a copy of the pruned graph whose node values are
    # re-initialised with the same length as the stored "value" vectors.
    cp_g = pg.copy()
    value_dim = pg.nodes()[list(pg)[0]]["value"].shape[0]
    msg_passing.initialize_node_values(cp_g, size=value_dim)
    n_all = len(pg.nodes())
    for n in top_n_nodes + [n_all]:
        # The whole-graph run is labelled "all" instead of its node count.
        num = "all" if n == n_all else str(n)
        nodes = utils.get_top_n_nodes(pg, n)
        score = parse_data.evaluate_clusters(combined, nodes, 2)
        random_score = parse_data.evaluate_clusters(cp_g, nodes, 2)
        records.append(dict(zip(columns, (f, score, num, False))))
        records.append(dict(zip(columns, (f, random_score, num, True))))

df = pd.DataFrame(records, columns=columns)
plt_title = "Cluster Evaluations"
# plt_title was previously defined but never passed to the figure.
fig = px.line(df, x="Issues", y="Silhouette Score", color="Top N Nodes Clustered", line_dash="Is Random Baseline", title=plt_title)
fig.show()
fig.write_html(imdir + "cluster_eval_combined.html")

In [7]:
# Save confusion matrices for each issue network's top-N nodes, computed on
# three graphs: the issue network itself, a randomly re-initialised copy
# (baseline), and the combined network.

infiles = [
    "global_warming_network",
    "gun_regulations_network",
    "immigration_network",
    "inflation_network",
    "roe_v_wade_network",
    "trump_impeachment_network",
    "ukraine_war_network",
    "vaccine_hesitancy_network"
    #"combined"
]
outdir = "output/Incremental_Datasets_extreme/"

#outdir = "output/archive/"
out_suffix = "_random_walks_lr_10-3_30K_dc_095_pl_10_bs_10"
#out_suffix = "_v3_random_walk_batch_10_path_10_50K_lr_3_dim_3"
imdir = "images/confusion_matrices_extreme/"
# write_html does not create missing directories.
os.makedirs(imdir, exist_ok=True)
top_n_nodes = [20, 50, 100]
combined = msg_passing.load_graph_graphml(outdir + "combined" + out_suffix + ".graphml")
for f in infiles:
    g, hist, diagnostic_hist = load_results(outdir, f + out_suffix)
    pg, _ = msg_passing.prune_graph(g)
    cp_g = pg.copy()
    # Give the random baseline the same vector length as the stored node
    # values instead of a hard-coded 3, as the combined cluster-eval cell does.
    # NOTE(review): assumes every node "value" has a .shape -- confirm.
    value_dim = pg.nodes()[list(pg)[0]]["value"].shape[0]
    msg_passing.initialize_node_values(cp_g, size=value_dim)
    curr_top_n_nodes = top_n_nodes + [len(pg.nodes())]
    for n in curr_top_n_nodes:
        curr_nodes = utils.get_top_n_nodes(pg, n)
        fig = display.plot_confusion_matrix(pg, curr_nodes, 2, title=f + " top " + str(n) + " nodes", show=False)
        fig.write_html(imdir + f + f"_{n}_nodes.html")
        fig = display.plot_confusion_matrix(cp_g, curr_nodes, 2, title=f + " random top " + str(n) + " nodes", show=False)
        fig.write_html(imdir + f + f"_{n}_nodes_random.html")
        fig = display.plot_confusion_matrix(combined, curr_nodes, 2, title=f + " combined top " + str(n) + " nodes", show=False)
        fig.write_html(imdir + f + f"_{n}_nodes_combined.html")

In [8]:
# For every saved graph of the variable-path runs, write a top-20 confusion
# matrix (with random baseline) under images/cm_variable/.
outdir = "output/Incremental_Datasets_variable_path/"
cm_dir = "images/cm_variable/"
# write_html does not create missing directories.
os.makedirs(cm_dir, exist_ok=True)
# Sorted so the processing order does not depend on the file system.
for f in sorted(os.listdir(outdir)):
    # Match on the real extension: a substring test would also accept names
    # such as "x.graphml.bak" and then cut the wrong characters off the name.
    stem, ext = os.path.splitext(f)
    if(ext != ".graphml" or "cyto" in f):
        continue
    print(f)
    full_fname = outdir + f
    g = msg_passing.load_graph_graphml(full_fname)
    pg, _ = msg_passing.prune_graph(g)
    curr_nodes = utils.get_top_n_nodes(pg, 20)
    fig = display.plot_confusion_matrix_with_random_baseline(pg, curr_nodes, 2, title=f, show=False)
    outf = cm_dir + stem + ".html"
    print(outf)
    fig.write_html(outf)

global_warming_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.graphml
images/cm_variable/global_warming_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.html
gun_regulations_network_random_walks_lr_10-3_20K_dc_095_pl_6_bs_10.graphml
images/cm_variable/gun_regulations_network_random_walks_lr_10-3_20K_dc_095_pl_6_bs_10.html
immigration_network_random_walks_lr_10-3_20K_dc_095_pl_4_bs_10.graphml
images/cm_variable/immigration_network_random_walks_lr_10-3_20K_dc_095_pl_4_bs_10.html
inflation_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.graphml
images/cm_variable/inflation_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.html
roe_v_wade_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.graphml
images/cm_variable/roe_v_wade_network_random_walks_lr_10-3_20K_dc_095_pl_2_bs_10.html
trump_impeachment_network_random_walks_lr_10-3_20K_dc_095_pl_9_bs_10.graphml
images/cm_variable/trump_impeachment_network_random_walks_lr_10-3_20K_dc_095_pl_9_bs_10.html
ukraine_war_network_random_walks

In [73]:
# Save a top-20 confusion matrix (with random baseline) for each of the
# selected "weighted" runs. The single-run loads near the top only feed the
# commented-out inspection toggles.
titles = [
    "global_warming_network",
    "gun_regulations_network",
    "immigration_network",
    "inflation_network",
    "roe_v_wade_network",
    "trump_impeachment_network",
    "ukraine_war_network",
    "vaccine_hesitancy_network"
]

# Only the first six networks are processed in this cell.
titles = titles[:6]

histdir = "output/Incremental_Datasets_weighted/" 
#suffix = "_random_walks_lr_10-3_30K_dc_095_pl_10_bs_10"
#suffix = "_random_walks_lr_10-3_20K_dc_095_pl_10_bs_10"
suffix = "_random_walks_lr_10-3_15K_dc_095_pl_10_bs_10"
hist_suffix = suffix + ".pkl"
graph_suffix = suffix + ".graphml"

hist_files = [histdir + t + hist_suffix for t in titles]
graph_files = [histdir + t + graph_suffix for t in titles]
hist, dhist = msg_passing.load_history(hist_files[1])
g = msg_passing.load_graph_graphml(graph_files[1])
pg, _ = msg_passing.prune_graph(g)
#fig = display.plot_diagnostic(dhist)
#fig = display.plot_cos_dist_histogram(pg)
nodes = utils.get_top_n_nodes(pg, 20)
#nodes.append("russia's war in ukraine")
#fig = display.plot_confusion_matrix(pg, nodes, 2)
#fig = display.plot_top_n_cluster_evaluations(pg, [20, 50, 100], 2, 10, with_random_baseline=True)
#fig = display.plot_diagnostic_grid(hist_files, titles, "Update Magnitude and Loss", 4, 2)
#fig = display.plot_cos_dist_histogram_grid(graph_files, titles, "Histogram of Pairwaise Cosine Distances - Random Baseline", 2, 4, random_baseline=True)

# Disabled export of the pruned largest connected component of each network
# (previously parked inside a bare string literal):
#for i in range(len(titles)):
#    g = msg_passing.load_graph_graphml(graph_files[i])
#    g = msg_passing.load_graph_csv("Input/Incremental_Datasets/" + titles[i] + ".csv", clean_data=True)
#    pg, _ = msg_passing.prune_graph(g)
#    pg = utils.largest_connected_component(pg)
#    msg_passing.save_graph(pg, "output/cyto_pruned/" + titles[i]+ ".graphml")
top_n_nodes = ["20", "50", "100", "all"]
#fig = display.plot_k_cluster_evals(graph_files, titles, top_n_nodes, 2, "Clustering Comparison with Random Baseline")


cm_dir = "images/clustering_weighted/"
# write_html does not create missing directories.
os.makedirs(cm_dir, exist_ok=True)
for title, graph_file in zip(titles, graph_files):
    g = msg_passing.load_graph_graphml(graph_file)
    pg, _ = msg_passing.prune_graph(g)
    fig = display.plot_confusion_matrix_with_random_baseline(pg, utils.get_top_n_nodes(pg, 20), 2, title + " top 20 nodes", show=False)
    #fig = display.plot_top_n_cluster_evals(pg, top_n_nodes, 2, 10, "N Cluster Scores - " + title, show=True)
    fig.write_html(cm_dir + "cm_" + title + ".html")


#fig.write_html("images/diagnostic_grid_3.html")
#fig.write_html("images/histo_grid.html")
#fig.write_html("images/cluster_eval_test.html")
#fig.write_html("images/cm_test.html")
#print(fig)

In [77]:
# Export two sub-networks of one pruned issue graph: the radius-2 ego network
# around "roe v. wade" and the subgraph induced by a hand-picked node list.
titles = [
    "global_warming_network",
    "gun_regulations_network",
    "immigration_network",
    "inflation_network",
    "roe_v_wade_network",
    "trump_impeachment_network",
    "ukraine_war_network",
    "vaccine_hesitancy_network"
]

# Lower the bound to restrict the networks; 8 keeps all of them.
titles = titles[:8]

histdir = "output/Incremental_Datasets_2/" 
run_suffix = "_random_walks_lr_10-3_30K_dc_095_pl_10_bs_10"
hist_suffix = run_suffix + ".pkl"
graph_suffix = run_suffix + ".graphml"

hist_files = [histdir + name + hist_suffix for name in titles]
graph_files = [histdir + name + graph_suffix for name in titles]

# Index 4 of `titles` is "roe_v_wade_network".
g = msg_passing.load_graph_graphml(graph_files[4])
pg, _ = msg_passing.prune_graph(g)

# Hand-picked node lists; only the one bound to `subnet` below is exported.
gun_regulations_subnet = [
    'supreme court',
    'defund the police',
    'joe biden',
    'maga republicans',
    'biden administration',
    'republican',
    'a ban on assault weapons',
    'biden',
    'pennsylvania',
    'an end to gun control "nationwide."',
    'scotus',
    'bruen',
    'right to bear arms',
    'second amendment',
    'white house',
    'donald trump',
    'florida',
    'texas',
    'ron desantis',
    'democrats',
    'nra',
    'national rifle association',
]

roe_v_wade_subnet = [
    'mark kelly',
    'roe v. wade',
    'democratic',
    'pro-life republicans'
]

subnet = roe_v_wade_subnet
test = utils.ego_network(pg, "roe v. wade", radius=2)
sub = pg.subgraph(subnet)
msg_passing.save_graph(test, "output/save.graphml")
msg_passing.save_graph(sub, "output/sub.graphml")

# Disabled edge-weight inspection (previously parked inside a bare string
# literal): counts strongly weighted edges and lists node pairs joined by
# more than one edge.
#abs_thresh = 0.8 
#edges = []
#num_pos = 0
#num_neg = 0
#for u, v, w in pg.edges(data="weight"):
#    if(abs(w) > abs_thresh):
#        edges.append((u, v, w))
#        if(w > 0):
#            num_pos += 1
#        else:
#            num_neg += 1
#
#e2 = []
#for n in pg.nodes():
#    for neighbor in pg.neighbors(n):
#        if(len(pg[n][neighbor].keys()) > 1):
#            weights = [pg[n][neighbor][k]["weight"] for k in pg[n][neighbor].keys()]
#            e2.append([n, neighbor, weights])
#
#print(utils.node_degrees(pg)[0][1])
#print(utils.node_degrees(pg))  
#print(len(e2), len(e2)/2)
#for e in e2:
#    print(e)

#print(num_pos, num_neg)
#print(edges)

# Switched off: flip the condition to regenerate the per-network confusion
# matrices under images/cm_pruned/.
if False:
    for title, graph_file in zip(titles, graph_files):
        g = msg_passing.load_graph_graphml(graph_file)
        pg, _ = msg_passing.prune_graph(g)
        fig = display.plot_confusion_matrix_with_random_baseline(pg, utils.get_top_n_nodes(pg, 20), 2, title + " top 20 nodes", show=False)
        fig.write_html("images/cm_pruned/confusion_matrix_plot_" + title + ".html")

#fig.write_html("images/diagnostic_grid.html")
#fig.write_html("images/histo_grid.html")
#fig.write_html("images/cluster_eval_extreme.html")
#print(fig)