In [240]:
# imports
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from scipy.stats import pearsonr

### Child vocabulary

In [241]:
# Read the child vocabulary, one word per line. The context manager closes the
# file handle, which the bare open() call left open.
with open('../data/vertomul.txt') as vocab_file:
    vocab = [line.strip() for line in vocab_file]

### Unpack Layers

In [242]:
def _load_pickle(path):
    '''Unpickle the object stored at path and close the file afterwards.'''
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only use it on pickle files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


free_assoc = _load_pickle("free_assoc.pickle")
co_oc = _load_pickle("co_oc.pickle")
phon_conn = _load_pickle("phon_conn.pickle")
feat_norms = _load_pickle("feat_norms.pickle")
word_emb_layer = _load_pickle("word_emb_layer.pickle")
visual_layer = _load_pickle("visual_graph.pickle")
lancaster_layer = _load_pickle("lancaster.pickle")

In [244]:
# All layers that get merged into the aggregate graph. The weighting loop further
# down iterates each layer as a collection of word pairs.
# NOTE: lancaster_layer was observed to introduce nodes that are not in vocab.
lists = [ co_oc, phon_conn, free_assoc, feat_norms, word_emb_layer, visual_layer, lancaster_layer]

### Construct weighted graph

In [245]:
# Undirected graph that accumulates the edges of every layer
aggregate = nx.Graph()

In [246]:
# Add every vocabulary word up front so that words without any edge are still nodes
aggregate.add_nodes_from(vocab)

In [247]:
# Node count right after adding the vocabulary words (baseline for the checks below)
aggregate.number_of_nodes()

529

In [248]:
# Setting the weights: every occurrence of a pair in a layer adds 1 to the weight
# of that edge, so a connection found in several layers ends up heavier.
# Pairs with a word outside the vocabulary are skipped: add_edge would otherwise
# create that node silently, and the graph would no longer have exactly one node
# per vocabulary word (which the weighted sampling later relies on).
# NOTE: this cell mutates `aggregate`; running it twice doubles every weight.
vocab_words = set(vocab)
for a_list in lists:
    skipped = 0
    for pair in a_list:
        u, v = pair[0], pair[1]
        if u not in vocab_words or v not in vocab_words:
            skipped += 1
            continue
        if aggregate.has_edge(u, v):
            aggregate[u][v]['weight'] += 1
        else:
            aggregate.add_edge(u, v, weight=1)
    if skipped:
        print("skipped", skipped, "pairs with out-of-vocabulary words")
    # the node count must stay constant across layers
    print(aggregate.number_of_nodes())

529
529
529
529
529
529
529


In [249]:
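# TODO: the Lancaster layer can introduce nodes that are not in the vocab. The node count printed above must stay equal to the vocab size; otherwise the per-node weights no longer line up with vocab when sampling.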

In [250]:
def normalize_weights(G):
    '''
     Compute each node's share of the total weighted degree of G.

     Returns a list with one float per node, in the order in which
     G.degree(weight='weight') yields the nodes. The values add up to 1.
    '''
    # weighted degree ("strength") of every node
    strengths = [strength for _, strength in G.degree(weight='weight')]
    total_strength = sum(strengths)

    return [strength / total_strength for strength in strengths]

In [251]:
def get_node_connections(G, batch):
    '''
     Collect the edges of every node in batch.

     Returns a list with one entry per node, in batch order; each
     entry is whatever G.edges(node) returns for that node.
    '''
    return [G.edges(node) for node in batch]

In [252]:
 def preferential_attachment(G, vocab, weights, batch_size):
     '''
      Sample a batch of words and score each one by preferential attachment.

      G          -- graph whose edges are scored
      vocab      -- sequence of words to sample from
      weights    -- sampling probability of each word; must have one entry per
                    word in vocab, in the same order, and sum to 1
      batch_size -- number of distinct words to draw

      Returns a list of (word, score) tuples, where score is the mean
      preferential attachment score over all edges of that word in G
      (NaN for a word that has no edges).
     '''

     # draw batch_size distinct words; weights gives each word's chance of being picked
     batch = np.random.choice(vocab, batch_size, replace=False, p=weights)

     # one collection of edges per sampled word
     list_conns = get_node_connections(G, batch)

     node_avgs_list = []
     for edges in list_conns:
         # score every edge on the graph that was passed in, not on the
         # notebook-level `aggregate`
         scores = [p for u, v, p in nx.preferential_attachment(G, edges)]

         # a word without edges has nothing to average: keep NaN, but without
         # triggering numpy's "mean of empty slice" warning
         node_avgs_list.append(np.mean(scores) if scores else float('nan'))

     # pair each sampled word with its average score
     return list(zip(batch, node_avgs_list))

In [253]:
# Per-node sampling probabilities derived from the weighted degrees in aggregate
normalized_weights = normalize_weights(aggregate)

In [254]:
# Sample 20 words and compute the mean preferential attachment score of each.
# The batch is drawn at random and no seed is set in the cells shown here, so
# the values differ between runs.
res = preferential_attachment(aggregate, vocab, normalized_weights, 20)
res

[('bug', 7888.0),
 ('animal', 6202.0),
 ('a', 14494.0),
 ('comb', 5284.0),
 ('box', 7605.0),
 ('little', 8605.0),
 ('before', 5403.0),
 ('downtown', 6505.0),
 ('all', 12796.0),
 ('throw', 1996.0),
 ('mailman', 9756.0),
 ('finish', 9705.0),
 ('spill', 5012.0),
 ('airplane', 12941.0),
 ('red', 2553.0),
 ('ant', 5141.0),
 ('egg', 2597.0),
 ('where', 10883.0),
 ('what', 9440.0),
 ('please', 5850.0)]

### Correlation between batch and AoA dataset

In [255]:
# dataframe for the batch: one row per sampled word with its mean
# preferential attachment score --> (word, score)
df = pd.DataFrame(data=res, columns=['word', 'preferential attachment score'])

In [256]:
# dataframe for the AoA dataset --> (word, age_avg)
aoa = pd.read_excel('../data/AoA_ratings.xlsx')

# discard the columns at positions 1, 2, 3, 5 and 6: only the mean rating is of interest
aoa = aoa.drop(columns=aoa.columns[[1, 2, 3, 5, 6]])

### A note on the Pearson correlation
The Pearson correlation varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.


In [257]:
def AoA_correlation(aoa, df):
    '''
    Pearson correlation between preferential attachment scores and AoA ratings.

    aoa -- dataframe with the columns 'Word' and 'Rating.Mean'
    df  -- dataframe with the columns 'word' and
           'preferential attachment score'

    Only words present in both dataframes take part in the correlation, and
    scores are matched to ratings by word, so the row order of either
    dataframe does not matter. Neither dataframe is modified.

    Returns the Pearson correlation coefficient.
    '''
    # keep one rating per word; a repeated word would otherwise duplicate
    # rows in the merge below
    ratings = aoa[['Word', 'Rating.Mean']].drop_duplicates(subset='Word')

    # pair every score with the rating of the same word; words that are
    # missing from the AoA dataset drop out of the inner join
    paired = df.merge(ratings, left_on='word', right_on='Word', how='inner')
    paired = paired.dropna(subset=['preferential attachment score', 'Rating.Mean'])

    # report the batch words that could not be matched
    missing = sorted(set(df['word']) - set(paired['word']))
    if missing:
        print("The lists are not the same; words without an AoA rating:", missing)

    # pearsonr returns the correlation score and the p-value; only the
    # correlation is needed here
    correlation, _ = pearsonr(
        paired['preferential attachment score'].to_numpy(),
        paired['Rating.Mean'].to_numpy(),
    )

    return correlation
        

In [258]:
corr = AoA_correlation(aoa, df)
corr

-0.08957911265388938