# analysis

In [1]:
import itertools
import pathlib

import numpy as np
import pandas as pd
import networkx as nx
from scipy.spatial import distance

from n2i.__main__ import n2i_main, n2i_nx_graph
from n2i.node2vec import read_graph

%pylab inline
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Edge list of the lesmiserables dataset, read with weighted=False and directed=False.
graph_path = '../data/graph/lesmiserables/lesmiserables_edgelist.txt'

graph = read_graph(graph=graph_path, weighted=False, directed=False)

In [None]:
%%time
emb = n2i_nx_graph(nx_graph=graph, topics=3, tf=False)

In [None]:
# similarity between connected nodes
def sim_in(G):
    """Return the mean cosine similarity of 'interests' over adjacent node pairs.

    Every node is paired with each neighbour reported by ``G.neighbors``, so
    a pair is visited once per direction in which it is reported.

    Parameters
    ----------
    G : graph whose ``G.nodes[n]['interests']`` holds a vector for every node.

    Returns
    -------
    ``np.mean`` of ``1 - cosine distance`` over all (node, neighbour) pairs.
    """
    node_data = G.nodes
    similarities = [
        1 - distance.cosine(node_data[node]['interests'], node_data[nbr]['interests'])
        for node in G.nodes
        for nbr in G.neighbors(node)
    ]
    return np.mean(similarities)

def select_notedge(G):
    """Draw a random pair of distinct nodes that are not joined by an edge.

    Candidate pairs are drawn with ``np.random.choice`` and rejected until
    the two nodes differ and ``(v1, v2)`` is not in ``G.edges``.

    Parameters
    ----------
    G : graph exposing ``nodes()``, ``neighbors(n)`` and ``edges``.

    Returns
    -------
    tuple
        ``(v1, v2)``, two distinct nodes with ``(v1, v2)`` not in ``G.edges``.

    Raises
    ------
    ValueError
        If no such pair exists (fewer than two nodes, or every node is
        adjacent to all the others). Rejection sampling would otherwise
        loop forever.
    """
    # Materialise the node list once instead of rebuilding it for every draw.
    nodes = list(G.nodes())

    # A valid pair exists only if some node lacks at least one of the other
    # n - 1 nodes among its neighbours (a self-loop does not count).
    others = len(nodes) - 1
    if others < 1 or not any(
        len(set(G.neighbors(u)) - {u}) < others for u in nodes
    ):
        raise ValueError('graph has no pair of distinct non-adjacent nodes')

    while True:
        v1 = np.random.choice(nodes)
        v2 = np.random.choice(nodes)
        if v1 != v2 and (v1, v2) not in G.edges:
            return v1, v2

# similarity between disconnected nodes
def sim_out(G, samples):
    """Return the mean cosine similarity of 'interests' over sampled non-edges.

    Parameters
    ----------
    G : graph whose ``G.nodes[n]['interests']`` holds a vector for every node.
    samples : number of non-adjacent pairs to draw with ``select_notedge``.

    Returns
    -------
    ``np.mean`` of ``1 - cosine distance`` over the sampled pairs.
    """
    def sampled_similarity():
        # One draw: pick a non-adjacent pair, then compare their vectors.
        u, v = select_notedge(G)
        return 1 - distance.cosine(G.nodes[u]['interests'], G.nodes[v]['interests'])

    return np.mean([sampled_similarity() for _ in range(samples)])

def homophily(G, samples=10000):
    """Return the ratio of connected-pair to disconnected-pair similarity.

    Parameters
    ----------
    G : graph whose nodes carry an 'interests' vector.
    samples : number of non-adjacent pairs sampled by ``sim_out``.
        Defaults to 10000, the value that used to be hard-coded.

    Returns
    -------
    ``sim_in(G) / sim_out(G, samples)``.
    """
    return sim_in(G) / sim_out(G, samples)

In [None]:
# Edge list used for the parameter sweep, read with weighted=False and directed=False.
graph = '../data/graph/lesmiserables/lesmiserables_edgelist.txt'
G = read_graph(graph=graph, weighted=False, directed=False)

# Values tried for the p and q arguments of n2i_nx_graph (same grid for both).
p_val = [0.1, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2, 4, 8, 10, 20, 40]
q_val = list(p_val)

# Remaining n2i_nx_graph arguments: one value each, kept in lists so the
# sweep below can iterate over them.
topics = [15]
dimensions = [128]
walk_length = [80]
num_walks = [10]
window_size = [10]
iiter = [1]

In [None]:
# Number of repetitions (one seed each) per parameter combination.
nr_experiments = 20

# One [t, d, wk, n, wi, ii, p, q, seed] list per run. itertools.product varies
# the right-most factor fastest, i.e. the same order the nested loops produced.
args_list = [
    list(combo)
    for combo in itertools.product(
        topics, dimensions, walk_length, num_walks, window_size,
        iiter, p_val, q_val, range(nr_experiments),
    )
]

In [None]:
def run_experiment(*args, graph=None):
    """Embed the graph for one parameter combination and score its homophily.

    Parameters
    ----------
    *args : t, d, wk, n, wi, ii, p, q, seed
        Passed to ``n2i_nx_graph`` as topics, dimensions, walk_length,
        num_walks, window_size, iiter, p and q. ``seed`` is offset by
        ``int(1000 * (q + p))`` before being passed on.
    graph : optional, keyword-only
        Graph to embed and annotate. Defaults to the notebook-level ``G``.

    Returns
    -------
    tuple
        ``args`` followed by ``sim_in / sim_out`` (5000 sampled non-edges).

    Notes
    -----
    Overwrites the 'interests' attribute of every node of the graph.
    """
    t, d, wk, n, wi, ii, p, q, seed = args
    target = G if graph is None else graph

    G_emb = n2i_nx_graph(
        nx_graph=target,
        topics=t,
        dimensions=d,
        walk_length=wk,
        num_walks=n,
        window_size=wi,
        iiter=ii,
        p=p,
        q=q,
        seed=seed + int(1000 * (q + p)),
    )

    # Graph.node was removed in networkx 2.4; Graph.nodes is the accessor the
    # similarity helpers in this notebook already use.
    for node in target.nodes:
        target.nodes[node]['interests'] = G_emb[node]

    si = sim_in(target)
    so = sim_out(target, 5000)
    return args + (si / so,)

In [None]:
# Run every parameter combination in order; each entry of args_list is
# unpacked into run_experiment's positional arguments.
result = [run_experiment(*params) for params in args_list]

In [None]:
# One row per run: the nine input arguments plus the homophily score.
df = pd.DataFrame(
    result,
    columns=['t', 'd', 'wk', 'n', 'wi', 'ii', 'p', 'q', 'seed', 'hom'],
)

# Average 'hom' over the rows sharing (p, q), then pivot q into columns.
df2 = (
    df.groupby(['p', 'q'])['hom']
    .mean()
    .unstack()
)

In [None]:
# Show the row labelled 2420 as a one-row frame.
df.loc[[2420]]

In [None]:
# Heatmap of the mean homophily for every (p, q) combination.
sns.heatmap(df2)
# Take the repetition count from nr_experiments so the title cannot go stale.
plt.title(f'Homophily (mean of {nr_experiments} realizations)')

In [None]:
def analysis(graph, weighted, directed):
    '''
    Pipeline for the heatmap creation.

    Runs run_experiment for every entry of args_list on `graph` and returns
    the mean homophily per (p, q) as a frame indexed by p with one column
    per q.

    graph    : graph to run the sweep on.
    weighted : accepted for interface compatibility; not used here.
    directed : accepted for interface compatibility; not used here.
    '''
    global G
    # run_experiment reads the notebook-level G. Binding a local G (as this
    # function used to do) left the sweep running on whatever graph the
    # global G already held, so the global is rebound for the duration of the
    # sweep and restored afterwards.
    had_previous = 'G' in globals()
    previous = globals().get('G')
    G = graph
    try:
        # experiments
        result = [run_experiment(*params) for params in args_list]
    finally:
        if had_previous:
            G = previous
        else:
            del G

    # df
    df = pd.DataFrame(result, columns=['t', 'd', 'wk', 'n', 'wi', 'ii', 'p', 'q', 'seed', 'hom'])
    df2 = df.groupby(['p', 'q'])['hom'].mean().unstack()

    return df2

In [None]:
graph = nx.karate_club_graph()
# edge_attr_dict_factory() always builds a fresh empty dict, so comparing it
# with {} was always true and `weighted` was always False. Ask networkx
# whether the edges actually carry a 'weight' attribute instead.
weighted = nx.is_weighted(graph)
directed = graph.is_directed()
df = analysis(graph, weighted, directed)

# plot
sns.heatmap(df)
# Take the repetition count from nr_experiments so the title cannot go stale.
plt.title(f'Homophily (mean of {nr_experiments} realizations)')

# testing score in link prediction

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [4]:
# Embedding used as features for the link-prediction test below.
# NOTE: the output saved with this cell ends in
# "IndexError: list index out of range".
emb = n2i_nx_graph(
    nx_graph=graph,
    topics=12,
    tf=True,
    seed=10,
    translate=False,
    reduce=False,
    window_size=20,
    walk_length=100,
    num_walks=30,
    dimensions=20,
)

[[39, 55, 16, 19, 18, 20, 16, 26, 43, 26, 72, 26, 16, 19, 23, 20, 22, 20, 23, 12, 11, 2, 0, 7, 0, 1, 0, 5, 0, 2, 3, 0, 1, 0, 8, 0, 4, 0, 9, 0, 1, 0, 5, 0, 8, 0, 1, 0, 9, 0, 8, 0, 11, 72, 11, 10, 11, 24, 70, 71, 25, 70, 58, 61, 65, 58, 55, 57, 62, 59, 57, 59, 60, 59, 48, 74, 48, 65, 63, 65, 61, 62, 58, 64, 60, 62, 60, 61, 58, 59, 60, 65, 76, 66, 59, 64, 61, 65, 60, 62], [69, 71, 68, 24, 42, 41, 70, 25, 11, 44, 28, 45, 28, 27, 43, 26, 54, 51, 53, 51, 26, 24, 26, 72, 26, 24, 26, 16, 20, 21, 17, 19, 17, 21, 16, 22, 16, 26, 54, 26, 11, 25, 55, 56, 55, 56, 55, 57, 48, 68, 48, 55, 65, 55, 54, 51, 55, 54, 51, 53, 51, 55, 26, 27, 71, 70, 27, 26, 54, 51, 53, 51, 52, 39, 55, 39, 52, 39, 55, 56, 55, 57, 58, 70, 58, 59, 57, 59, 65, 63, 55, 61, 60, 62, 61, 63, 48, 64, 48, 11], [29, 27, 70, 71, 68, 75, 69, 48, 68, 70, 75, 48, 58, 63, 48, 25, 69, 68, 71, 41, 24, 41, 68, 71, 41, 57, 59, 55, 41, 57, 55, 25, 71, 25, 26, 24, 50, 24, 42, 25, 26, 55, 16, 18, 20, 23, 27, 48, 66, 64, 59, 57, 65, 64, 76, 66, 6

Instructions for updating:
Use tf.cast instead.


Learning embeddings:   0%|          | 9/92400 [00:00<3:15:28,  7.88it/s, loss=11.8]

IndexError: list index out of range

In [None]:
# Display the embedding returned by n2i_nx_graph in the cell above.
emb

In [None]:
# Positive examples: for every edge, the two endpoint embeddings
# concatenated (u first, then v), labelled 1.
X, y = [], []
for u, v in graph.edges():
    X.append([*emb[u], *emb[v]])
    y.append(1)

In [None]:
import itertools as IT
import random

# Every unordered node pair that is not joined by an edge.
missing = [pair for pair in IT.combinations(graph.nodes(), 2) if not graph.has_edge(*pair)]

# Negative examples: as many as there are edges, capped by the number of
# available non-edges. Sampling is without replacement, because
# random.choices can return the same pair several times and identical rows
# may then end up on both sides of the train/test split. A dedicated, seeded
# generator keeps the draw reproducible without touching the global random
# state.
n_negative = min(len(missing), len(graph.edges()))
no_edges = random.Random(42).sample(missing, k=n_negative)
for u, v in no_edges:
    arr_u = list(emb[u])
    arr_v = list(emb[v])
    X.append(arr_u + arr_v)
    y.append(0)

In [None]:
# Hold out 33% of the labelled pairs for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Create the logistic regression object. The target is binary (edge vs.
# non-edge), so the former multi_class='ovr' argument had no effect on the
# fit; it is deprecated in recent scikit-learn releases and is omitted.
clf = LogisticRegression(random_state=0)
# Train model
model = clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out pairs.
model.score(X=X_test, y=y_test)

In [None]:
def make_score(**kwargs):
    """Print each keyword argument and pass its value to ``prova``.

    For every ``name=value`` pair, in the order given, prints the name and
    the value on one line and then calls ``prova(value)``. Returns None.
    ``prova`` must be defined by the time this function is called.
    """
    for name in kwargs:
        setting = kwargs[name]
        print(name, setting)
        prova(setting)
    

In [None]:
# NOTE: make_score calls prova, which is only defined in the next cell, so on
# a fresh kernel this call raises NameError unless that cell is run first.
make_score(topics=128)

In [None]:
def prova(i):
    """Print ``i``; called by ``make_score`` once per keyword value."""
    print(i)