In [88]:
# imports
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

### Child vocabulary

In [52]:
# Read the child vocabulary: one entry per line, surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built.
with open('../data/vertomul.txt') as vocab_file:
    vocab = [line.strip() for line in vocab_file]

### Unpack Layers

In [53]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only use this on pickle files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One pickle file per layer.
free_assoc = _load_pickle("free_assoc.pickle")
co_oc = _load_pickle("co_oc.pickle")
phon_conn = _load_pickle("phon_conn.pickle")
feat_norms = _load_pickle("feat_norms.pickle")
word_emb_layer = _load_pickle("word_emb_layer.pickle")
visual_layer = _load_pickle("visual_graph.pickle")
lancaster_layer = _load_pickle("lancaster.pickle")

In [54]:
# How many elements do the free-association and Lancaster layers share?
len(set(free_assoc).intersection(lancaster_layer))

489

In [55]:
# Layers that are merged into the aggregate graph below.
# lancaster_layer is left out on purpose: it introduces new nodes.
lists = [ co_oc, phon_conn, free_assoc, feat_norms, word_emb_layer, visual_layer]

### Construct weighted graph

In [56]:
# Empty undirected graph that will hold the aggregated (weighted) network.
aggregate = nx.Graph()

In [57]:
# Add every vocabulary entry as a node before any edges are added.
aggregate.add_nodes_from(vocab)

In [58]:
# Node count right after adding the vocabulary (baseline for the checks below).
aggregate.number_of_nodes()

529

In [59]:
# Accumulate edge weights over all layers: the first time a pair is seen the
# edge is created with weight 1; every further layer that contains the same
# pair bumps the weight by 1.
for layer in lists:
    for pair in layer:
        source, target = pair[0], pair[1]
        if not aggregate.has_edge(source, target):
            aggregate.add_edge(source, target, weight=1)
        else:
            aggregate[source][target]['weight'] += 1
    # Node count after each layer; it grows if a layer introduces new nodes.
    print(aggregate.number_of_nodes())

529
529
529
529
529
529


In [60]:
# TODO: Known issue: with the Lancaster layer included, the network ends up with more nodes than the vocab. That layer is therefore left out of `lists` for now.

In [79]:
def normalize_weights(G):
    '''
     Return each node's share of the total node strength in G.

     A node's strength is its weighted degree, G.degree(weight='weight').
     The shares are returned as a list in the order G.degree yields the
     nodes, and they sum to 1.

     If the total strength is 0 (e.g. the graph has no edges), every node
     gets the same share 1/n instead of raising ZeroDivisionError. A graph
     without nodes yields an empty list.
    '''
    # strength of each node
    strengths = [strength for _, strength in G.degree(weight='weight')]
    if not strengths:
        return []

    # sum of all node strengths
    overall = sum(strengths)
    if overall == 0:
        # Nothing to be proportional to: fall back to a uniform distribution.
        return [1 / len(strengths)] * len(strengths)

    return [strength / overall for strength in strengths]

In [80]:
def get_node_connections(G, batch):
    '''
     Collect the edges of every node in batch.

     The result has one entry per node, in batch order; each entry is
     whatever G.edges(node) returns for that node.
    '''
    return [G.edges(node) for node in batch]

In [81]:
 def preferential_attachment(G, vocab, weights, batch_size):
    '''
     Sample a batch of nodes and score each one by the mean
     preferential-attachment score over its edges in G.

     G          -- graph the scores are computed on
     vocab      -- candidate nodes to sample from
     weights    -- sampling probabilities passed to np.random.choice as p
                   (None means uniform sampling)
     batch_size -- number of nodes to draw

     Returns (batch_avg, batch): the mean of the per-node averages and the
     array of sampled nodes.
    '''

    # create a random batch of nodes
    batch = np.random.choice(vocab, batch_size, p=weights)

    # one collection of edges per sampled node
    list_conns = get_node_connections(G, batch)

    node_avgs_list = []
    for node_edges in list_conns:
        # Score against the graph that was passed in, not a notebook global.
        scores = [p for u, v, p in nx.preferential_attachment(G, node_edges)]
        # A node without edges has no scores; record NaN explicitly rather
        # than letting np.mean warn about an empty list.
        node_avgs_list.append(np.mean(scores) if scores else float('nan'))

    zip_lists = list(zip(batch, node_avgs_list))

    print(zip_lists) # list with averages preferential attachment scorer for each node in batch

    # Mean over the whole batch; NaN when the batch is empty.
    batch_avg = np.mean(node_avgs_list) if node_avgs_list else float('nan')
    print("batch average: ", batch_avg)

    return batch_avg, batch

In [82]:
# Sampling probabilities: each node's share of the total strength in `aggregate`.
# Computed here so the cell does not depend on a variable left over in the kernel.
# NOTE(review): np.random.choice pairs p with vocab by position, so this assumes
# the graph's node order matches vocab (nodes were added from vocab) - confirm
# vocab has no duplicate entries.
normalized_weights = normalize_weights(aggregate)
preferential_attachment(aggregate, vocab, normalized_weights, 20)

[('daddy', 1864.0), ('street', 554.0), ('now', 2864.0), ('cold', 1497.0), ('walk', 1152.0), ('bus', 2985.0), ('hair', 1505.0), ('time', 1781.0), ('sing', 991.0), ('tiger', 3344.0), ('nice', 1647.0), ('dress', 3162.0), ('horse', 4015.0), ('couch', 3190.0), ('truck', 2794.0), ('potty', 1417.0), ('little', 2475.0), ('penguin', 3331.0), ('buy', 1400.0), ('kiss', 412.0)]

batch average:  2119.0


(2119.0,
 array(['daddy', 'street', 'now', 'cold', 'walk', 'bus', 'hair', 'time',
        'sing', 'tiger', 'nice', 'dress', 'horse', 'couch', 'truck',
        'potty', 'little', 'penguin', 'buy', 'kiss'], dtype='<U12'))

In [87]:
# Baseline for comparison: weights=None makes np.random.choice sample the batch uniformly.
preferential_attachment(aggregate, vocab, None, 20)

[('toothbrush', 46.0), ('broken', 105.0), ('farm', 742.0), ('ouch', 171.0), ('jello', 147.0), ('full', 435.0), ('pour', 520.0), ('hand', 1524.0), ('sink', 1716.0), ('sidewalk', 75.0), ('why', 2781.0), ('long', 1317.0), ('banana', 3554.0), ('tractor', 3526.0), ('face', 857.0), ('snow', 548.0), ('now', 2864.0), ('build', 1039.0), ('tooth', 243.0), ('high', 409.0)]

batch average:  1130.95


(1130.95,
 array(['toothbrush', 'broken', 'farm', 'ouch', 'jello', 'full', 'pour',
        'hand', 'sink', 'sidewalk', 'why', 'long', 'banana', 'tractor',
        'face', 'snow', 'now', 'build', 'tooth', 'high'], dtype='<U12'))