# Fix 

In [1]:
from n2i.__main__ import n2i_main, n2i_nx_graph
from n2i.node2vec import read_graph
import tensorflow as tf
import networkx as nx
from scipy.spatial import distance
import itertools as IT
import pandas as pd

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [3]:
EPSILON = 1E-5
METRIC_NAMES = ['ratio', 'sub', 'like']
def homophily_metrics(similarity_edges, similarity_nonedges):
    return {
        'ratio': np.mean(similarity_edges) / np.mean(similarity_nonedges),
        'sub': np.mean(similarity_edges) - np.mean(similarity_nonedges),
        'like': np.sum(np.log(similarity_edges + EPSILON)) + np.sum(np.log(1 - similarity_nonedges + EPSILON)) ,
    }

In [4]:
# similarity between connected nodes
def sim_in(G):
    sims = []
    for i in G.nodes:
        for j in list(G.neighbors(i)):
            #print(G.nodes[i]['interests'], emb[i])
            sims.append(1 - distance.cosine(G.nodes[i]['interests'], G.nodes[j]['interests']))
    return np.mean(sims)

def select_notedge(G):
    v1 = np.random.choice(G.nodes())
    v2 = np.random.choice(G.nodes())

    while (v1,v2) in G.edges or v1==v2:
        v1 = np.random.choice(G.nodes())
        v2 = np.random.choice(G.nodes())
    return v1, v2
#     n = nx.number_of_nodes(G)
#     while True:
#         a, b = np.random.randint(0, n, size=2)
#         if (a, b) not in G.edges:
#             return a, b

# similarity between disconnected nodes
def sim_out(G, samples):
    sims_out = []
    for c in range(samples):
        i, j = select_notedge(G)
        sims_out.append(1 - distance.cosine(G.nodes[i]['interests'], G.nodes[j]['interests']))
    return np.mean(sims_out)

def homophily(G):
    return sim_in(G) / sim_out(G, 5000)

In [5]:
def evaluate_metrics(graph, emb):
    '''
    Returns the three metrics for si and so defined in homophily_metrics function
    '''
    # building the graph
    for i in graph.nodes:
        graph.nodes[i]['interests'] = emb[int(i)]
        
    # calculating si and so
    output = {}
    si = sim_in(graph)
    so = sim_out(graph, 5000)
    output['si'] = round(si, 4)
    output['so'] = round(so, 4)
    for name, metric in homophily_metrics(si, so).items():
        output[name] = round(metric, 4)

    return output

In [6]:
def evaluate_score(graph, emb):
    '''
    Evaluetes the score in link prediction
    '''
    # creating X and y concatenating embeddings of the two linked nodes (y=1)
    X = []
    y = []
    for u, v in graph.edges():
        arr_u = list(emb[int(u)])
        arr_v = list(emb[int(v)])
        X.append(arr_u + arr_v)
        y.append(1)
        
    # adding no-linked nodes to X and y (y=0)
    missing = [pair for pair in IT.combinations(graph.nodes(), 2) if not graph.has_edge(*pair)]
    import random
    random.seed(42)
    no_edges = random.choices(missing, k=len(graph.edges()))
    for u, v in no_edges:
        arr_u = list(emb[int(u)])
        arr_v = list(emb[int(v)])
        X.append(arr_u + arr_v)
        y.append(0)
        
    # testing
    from sklearn.utils import shuffle
    X, y = shuffle(X, y, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.32, random_state=42)
    # Create one-vs-rest logistic regression object
    clf = LogisticRegression(random_state=0, multi_class='ovr')
    # Train model
    model = clf.fit(X_train, y_train)
    
    return model.score(X_test, y_test)

## dim 2 - Les miserables

In [7]:
def n2v_results(G, p, q, beta_list, av_list, bv_list, prior='beta', topics=2, seed=42):
    '''
    Returns a list of parameters and homophily metrics
    '''
    result = []
    for b in beta:
        for av in alpha_prior_value:
            for bv in beta_prior_value:
                tf.reset_default_graph()
                tf_emb = n2i_nx_graph(nx_graph=G, 
                                       seed=seed, 
                                       window_size=10,
                                       walk_length=80,
                                       num_walks=10,
                                       dimensions=topics,
                                       p=p,
                                       q=q,
                                       beta=b,
                                       alpha_value=av,
                                       beta_value=bv,
                                       prior=prior)

                result.append([b, av, bv, evaluate_metrics(graph=G, emb=tf_emb)])
    return result

In [8]:
graph_path = '../data/graph/lesmiserables/lesmiserables_edgelist.txt'

G = read_graph(weighted=False, 
                   graph=graph_path,
                   directed=False)

In [9]:
# high homophily
p = 0.01
q = 10
beta = [5]
prior = 'beta'
# if prior is beta:
alpha_prior_value = [0.5, 1.]
beta_prior_value = [0.5, 1.]

result = n2v_results(G, p, q, beta, alpha_prior_value, beta_prior_value, topics=2, seed=33)

AttributeError: module 'tensorflow' has no attribute 'reset_default_graph'

In [19]:
# high homophily
p = 0.05
q = 50
beta = [3]
prior = 'beta'
# if prior is beta:
alpha_prior_value = [2.]
beta_prior_value = [2.]

result = n2v_results(G, p, q, beta, alpha_prior_value, beta_prior_value, topics=2, seed=33)

In [20]:
print(result)

[[3, 2.0, 2.0, {'si': 0.9629, 'so': 0.869, 'ratio': 1.108, 'sub': 0.0939, 'like': -2.0702}]]


In [59]:
print('beta', 'a-value', 'b-value')
for _ in result:
    print(_)

beta a-value b-value
[0, 0.5, 0.5, {'si': 0.8183, 'so': 0.7183, 'ratio': 1.1392, 'sub': 0.1, 'like': -1.4673}]
[0, 0.5, 1.0, {'si': 0.8183, 'so': 0.7183, 'ratio': 1.1392, 'sub': 0.1, 'like': -1.4673}]
[0, 1.0, 0.5, {'si': 0.8183, 'so': 0.7183, 'ratio': 1.1392, 'sub': 0.1, 'like': -1.4673}]
[0, 1.0, 1.0, {'si': 0.8183, 'so': 0.7183, 'ratio': 1.1392, 'sub': 0.1, 'like': -1.4673}]
[5, 0.5, 0.5, {'si': 0.7828, 'so': 0.6699, 'ratio': 1.1685, 'sub': 0.1129, 'like': -1.3532}]
[5, 0.5, 1.0, {'si': 0.7205, 'so': 0.5969, 'ratio': 1.207, 'sub': 0.1236, 'like': -1.2364}]
[5, 1.0, 0.5, {'si': 0.8988, 'so': 0.8326, 'ratio': 1.0796, 'sub': 0.0662, 'like': -1.894}]
[5, 1.0, 1.0, {'si': 0.8484, 'so': 0.7534, 'ratio': 1.1261, 'sub': 0.095, 'like': -1.5643}]


In [68]:
# low homophily
p = 10
q = 0.01
beta = [0, 5]
prior = 'beta'
# if prior is beta:
alpha_prior_value = [0.5, 1.]
beta_prior_value = [0.5, 1.]

result = n2v_results(G, p, q, beta, alpha_prior_value, beta_prior_value, topics=5, seed=99)

In [69]:
print('beta', 'a-value', 'b-value')
for _ in result:
    print(_)

beta a-value b-value
[0, 0.5, 0.5, {'si': 0.9521, 'so': 0.9296, 'ratio': 1.0243, 'sub': 0.0225, 'like': -2.7022}]
[0, 0.5, 1.0, {'si': 0.9521, 'so': 0.9296, 'ratio': 1.0243, 'sub': 0.0225, 'like': -2.7022}]
[0, 1.0, 0.5, {'si': 0.9521, 'so': 0.9296, 'ratio': 1.0243, 'sub': 0.0225, 'like': -2.7022}]
[0, 1.0, 1.0, {'si': 0.9521, 'so': 0.9296, 'ratio': 1.0243, 'sub': 0.0225, 'like': -2.7022}]
[5, 0.5, 0.5, {'si': 0.8867, 'so': 0.8466, 'ratio': 1.0473, 'sub': 0.0401, 'like': -1.9949}]
[5, 0.5, 1.0, {'si': 0.9558, 'so': 0.9393, 'ratio': 1.0176, 'sub': 0.0165, 'like': -2.8464}]
[5, 1.0, 0.5, {'si': 0.9207, 'so': 0.8758, 'ratio': 1.0513, 'sub': 0.0449, 'like': -2.1687}]
[5, 1.0, 1.0, {'si': 0.9256, 'so': 0.889, 'ratio': 1.0412, 'sub': 0.0366, 'like': -2.2756}]
