In [1]:
#imports
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
import networkx as nx
import torch
from networkx.algorithms import approximation
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from tqdm.notebook import tqdm
import pickle
from nltk.stem import WordNetLemmatizer
import collections

# Read child vocabulary 

In [2]:
# Read the vocabulary inside a context manager so the file handle is closed
# as soon as the list is built instead of being left open.
with open('../data/vertomul.txt') as vocab_file:
    words = [line.strip() for line in vocab_file]
print(len(words))

529


# Recreate the Multiplex Network

In [3]:
def mean_degree_connectivity(graph_name):
    """Return the average number of incident edges per node.

    For every node the size of ``graph_name.edges(node)`` is counted, and the
    total is divided by ``graph_name.number_of_nodes()``.

    Parameters
    ----------
    graph_name : graph object
        Must provide ``nodes()``, ``edges(node)`` and ``number_of_nodes()``.

    Returns
    -------
    float
        The mean count, or ``0.0`` for a graph without nodes (previously this
        case raised ``ZeroDivisionError``).
    """
    node_count = graph_name.number_of_nodes()
    if node_count == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    incident_edges = sum(len(graph_name.edges(node)) for node in graph_name.nodes())
    return incident_edges / node_count

In [4]:
def perc_nodes_in_lcc(graph_name):
    """Return the fraction of nodes that belong to the largest connected component.

    The result is a ratio between 0 and 1 (component size divided by
    ``graph_name.number_of_nodes()``), not a value scaled to 100.
    """
    lcc_size = max(len(component) for component in nx.connected_components(graph_name))
    return lcc_size / graph_name.number_of_nodes()
   

In [5]:
def mean_shortest_path_lcc(graph_name):
    """Return the average shortest path length inside the largest connected component.

    The largest component is selected directly by size (the first one wins on
    ties, as before) and the path length is computed on a subgraph view of it.
    The earlier version copied every component into its own graph and re-ran
    ``connected_components`` on each copy only to learn its size.

    Raises
    ------
    ValueError
        If the graph has no nodes, because there is no component to select.
    """
    largest_component = max(nx.connected_components(graph_name), key=len)
    return nx.average_shortest_path_length(graph_name.subgraph(largest_component))


## Get the data for the 4 layers

In [6]:
def _read_stripped_lines(path):
    """Return every line of ``path`` with surrounding whitespace removed.

    The file is opened in a ``with`` block so the handle is closed once the
    lines have been read.
    """
    with open(path) as handle:
        return [line.strip() for line in handle]


phonological_sim_words = _read_stripped_lines('../data/phcmul.txt')
feature_sharing_words = _read_stripped_lines('../data/mrmul.txt')
free_association_words = _read_stripped_lines('../data/famul.txt')
co_occurrances_words = _read_stripped_lines('../data/cumul.txt')

## Clean up the data and store it into lists

In [7]:
# Each raw line is split on tabs and kept as a tuple of its string fields.
phonological_sim_list = [tuple(pair.split('\t')) for pair in phonological_sim_words]
feature_sharing_list = [tuple(pair.split('\t')) for pair in feature_sharing_words]
free_association_list = [tuple(pair.split('\t')) for pair in free_association_words]
co_occurrances_list = [tuple(pair.split('\t')) for pair in co_occurrances_words]



## Instantiate a graph for multiplex and populate it

In [8]:
multiplex = nx.Graph()
multiplex.add_nodes_from(words)

## We'll add one layer at a time:

## Free associations Layer

In [9]:
free_assoc = nx.Graph()
free_assoc.add_nodes_from(words)
len(free_assoc.nodes)

529

In [11]:
# Insert the free-association pairs into the combined multiplex first, then
# into the layer's own graph; only the first two fields of each pair are used.
for target_graph in (multiplex, free_assoc):
    target_graph.add_edges_from((pair[0], pair[1]) for pair in free_association_list)

In [12]:
len(free_assoc.nodes)

529

### 1) Mean degree of connectivity _k_

In [13]:
mean_degree_connectivity(free_assoc)

9.228733459357278

### 2) Mean Clustering Coefficient _CC_

In [14]:
nx.average_clustering(free_assoc)

0.1941295782803327

### 3) Assortativity Coefficient _a_

In [15]:
nx.degree_assortativity_coefficient(free_assoc)

-0.10131692039561056

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [16]:
perc_nodes_in_lcc(free_assoc)

0.996219281663516

### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [17]:
mean_shortest_path_lcc(free_assoc)

3.1650493142185123

### We have added one layer to multiplex

In [18]:
multiplex.number_of_edges()

2441

In [19]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("free_assoc.pickle", "wb") as pickle_file:
    pickle.dump(free_association_list, pickle_file)

In [20]:
len(free_assoc.nodes)

529

## Feature Sharing Norms Layer

In [21]:
feat_norms = nx.Graph()
feat_norms.add_nodes_from(words)

In [22]:
# Insert the feature-sharing pairs into the combined multiplex first, then
# into the layer's own graph; only the first two fields of each pair are used.
for target_graph in (multiplex, feat_norms):
    target_graph.add_edges_from((pair[0], pair[1]) for pair in feature_sharing_list)

In [23]:
len(feat_norms.nodes)

529

### 1) Mean degree of connectivity _k_

In [24]:
mean_degree_connectivity(feat_norms)

9.032136105860113

### 2) Mean Clustering Coefficient _CC_

In [25]:
nx.average_clustering(feat_norms)

0.1520222342517882

### 3) Assortativity Coefficient _a_

In [26]:
nx.degree_assortativity_coefficient(feat_norms)

-0.010608927346368343

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [27]:
perc_nodes_in_lcc(feat_norms)

0.24196597353497165

### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [28]:
mean_shortest_path_lcc(feat_norms)

1.7570127952755905

### We have added two layers to multiplex

In [29]:
multiplex.number_of_edges()

4686

In [30]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("feat_norms.pickle", "wb") as pickle_file:
    pickle.dump(feature_sharing_list, pickle_file)

In [31]:
len(feat_norms.nodes)

529

## Co-occurrences Layer

In [32]:
co_oc = nx.Graph()
co_oc.add_nodes_from(words)

In [33]:
# Insert the co-occurrence pairs into the combined multiplex first, then
# into the layer's own graph; only the first two fields of each pair are used.
for target_graph in (multiplex, co_oc):
    target_graph.add_edges_from((pair[0], pair[1]) for pair in co_occurrances_list)

In [34]:
len(co_oc.nodes)

529

### 1) Mean Degree of connectivity _k_

In [35]:
mean_degree_connectivity(co_oc)

8.117202268431003

### 2) Mean Clustering Coefficient _CC_

In [36]:
nx.average_clustering(co_oc)

0.34767890644326055

### 3) Assortativity Coefficient _a_

In [37]:
nx.degree_assortativity_coefficient(co_oc)

-0.43924523453557973

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [38]:
perc_nodes_in_lcc(co_oc)

0.6238185255198487

### 5) Mean shortest path length of the Largest Connected Component _d_

In [39]:
mean_shortest_path_lcc(co_oc)

2.154517822602929

### We have added three layers to multiplex

In [40]:
multiplex.number_of_edges()

6696

In [41]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("co_oc.pickle", "wb") as pickle_file:
    pickle.dump(co_occurrances_list, pickle_file)

In [42]:
len(co_oc.nodes)

529

## Phonological Layer

In [43]:
phon_conn = nx.Graph()
phon_conn.add_nodes_from(words)

In [44]:
# Insert the phonological pairs into the combined multiplex first, then
# into the layer's own graph; only the first two fields of each pair are used.
for target_graph in (multiplex, phon_conn):
    target_graph.add_edges_from((pair[0], pair[1]) for pair in phonological_sim_list)

In [45]:
len(phon_conn.nodes)

529

### 1) Mean Degree of Connectivity _k_

In [46]:
mean_degree_connectivity(phon_conn)

1.3194706994328922

### 2) Mean Clustering Coefficient _CC_

In [47]:
nx.average_clustering(phon_conn)

0.1176973624988748

### 3) Assortativity Coefficient _a_

In [48]:
nx.degree_assortativity_coefficient(phon_conn)

0.4824768300873068

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [49]:
perc_nodes_in_lcc(phon_conn)

0.33081285444234404

### 5) Mean shortest path in the Largest Connected Component _d_

In [50]:
mean_shortest_path_lcc(phon_conn)

7.694844006568145

In [51]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("phon_conn.pickle", "wb") as pickle_file:
    pickle.dump(phonological_sim_list, pickle_file)

### We have added all layers to the multiplex

In [52]:
multiplex.number_of_edges()

6998

In [53]:
len(phon_conn.nodes)

529

## The original multiplex layer has been populated

### 1) Mean degree of connectivity _k_

In [54]:
mean_degree_connectivity(multiplex)

26.457466918714555

### 2) Mean Clustering Coefficient _CC_

In [55]:
nx.average_clustering(multiplex)

0.3279300913138615

### 3) Assortativity Coefficient _a_

In [56]:
nx.degree_assortativity_coefficient(multiplex)

-0.07063907775109278

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [57]:
perc_nodes_in_lcc(multiplex)

1.0

### 5) Mean shortest path in the Largest Connected Component _d_

In [58]:
mean_shortest_path_lcc(multiplex)

2.391905825743255

In [59]:
# Persist the populated multiplex graph object. The context manager makes
# sure the file is flushed and closed once the dump has finished.
with open("multiplex.pickle", "wb") as pickle_file:
    pickle.dump(multiplex, pickle_file)

In [60]:
len(multiplex.nodes)

529

# Create the Word Embedding Network

In [61]:
glove_embedding = WordEmbeddings('glove')

In [62]:
# Map each vocabulary word to an embedding produced by `glove_embedding`.
word_emb = {}
for word in words:
    sentence = Sentence(word)
    glove_embedding.embed(sentence)

    # When the Sentence yields several tokens, only the last token's embedding
    # is stored for the word; a Sentence with no tokens adds no entry.
    tokens = list(sentence)
    if tokens:
        word_emb[word] = tokens[-1].embedding

In [63]:
word_emb_layer = nx.Graph()
word_emb_layer.add_nodes_from(words)

### Connect words whose cosine similarity exceeds 0.8


In [64]:
# Link every pair of distinct words whose embeddings have a cosine
# similarity above 0.8. Every ordered pair is visited, so both (a, b) and
# (b, a) end up in word_emb_list; the undirected graph stores the edge once.
word_emb_list = []
for item in tqdm(word_emb):
    x = word_emb[item]
    # Loop-invariant: build the batched view of x once per outer word.
    x_row = x.unsqueeze(0)

    for word in word_emb:
        # Compare strings by value. `is not` tests object identity and only
        # worked here because both loops iterate over the same dict keys.
        if word != item:
            y = word_emb[word]
            cosine_sim = torch.cosine_similarity(x_row, y.unsqueeze(0))

            if cosine_sim > .8:
                word_emb_list.append((item, word))
                word_emb_layer.add_edge(item, word)

  0%|          | 0/529 [00:00<?, ?it/s]

In [65]:
len(word_emb_layer.nodes)

529

### 1) Mean Degree of connectivity _k_

In [66]:
mean_degree_connectivity(word_emb_layer)

0.9300567107750473

### 2) Mean Clustering Coefficient _CC_

In [67]:
nx.average_clustering(word_emb_layer)

0.06606594537207236

### 3) Assortativity Coefficient _a_

In [68]:
nx.degree_assortativity_coefficient(word_emb_layer)

0.4112425700024269

### 4) Percentage of nodes in the LCC _Conn._

In [69]:
perc_nodes_in_lcc(word_emb_layer)

0.1342155009451796

### 5) Mean shortest path in LCC _d_

In [70]:
mean_shortest_path_lcc(word_emb_layer)

3.2305835010060364

In [71]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("word_emb_layer.pickle", "wb") as pickle_file:
    pickle.dump(word_emb_list, pickle_file)

In [72]:
len(word_emb_layer.nodes)

529

# Create the Visual Embedding Network

In [73]:
# Read all lines of the CLIP embedding file; the `with` block closes the
# handle afterwards (it was previously left open for the rest of the session).
with open('../data/clip_embeddings.txt', 'r') as file:
    lines = file.readlines()

In [74]:
# The first whitespace-separated field of each line is the key; the remaining
# fields are kept as a list of strings (no numeric conversion happens here).
visual_vecs = {fields[0]: fields[1:] for fields in map(str.split, lines)}

In [75]:
len(visual_vecs['couch'])

513

In [76]:
visual_graph = nx.Graph()
visual_graph.add_nodes_from(words)
len(visual_graph.nodes)

529

In [77]:
# Link every pair of distinct entries whose vectors have a cosine similarity
# above 0.8. Every ordered pair is visited, so both (a, b) and (b, a) are
# appended to visual_list; the undirected graph stores the edge once.
visual_list = []

# visual_vecs holds lists of numeric strings. Convert each one to a (1, d)
# float array once, instead of building string arrays for every pair and
# relying on an implicit cast inside cosine_similarity.
visual_arrays = {
    name: np.array(values, dtype=float).reshape(1, -1)
    for name, values in visual_vecs.items()
}

for item in tqdm(visual_vecs):
    x = visual_arrays[item]

    for word in visual_vecs:
        # Compare strings by value. `is not` tests object identity and only
        # worked here because both loops iterate over the same dict keys.
        if word != item:
            y = visual_arrays[word]

            cosine_sim = cosine_similarity(x, y)

            if cosine_sim > .8:
                visual_list.append((item, word))
                visual_graph.add_edge(item, word)

  0%|          | 0/496 [00:00<?, ?it/s]

In [78]:
len(visual_graph.nodes)

529

### 1) Mean Degree of Connectivity _k_

In [79]:
mean_degree_connectivity(visual_graph)

1.1568998109640831

### 2) Mean Clustering Coefficient _CC_

In [80]:
nx.average_clustering(visual_graph)

0.06509661244878844

### 3) Assortativity Coefficient _a_

In [81]:
nx.degree_assortativity_coefficient(visual_graph)

-0.3743916999602581

### 4) Percentage of nodes in the LCC _Conn._

In [82]:
perc_nodes_in_lcc(visual_graph)

0.09829867674858223

### 5) Mean shortest length in the LCC _d_

In [83]:
mean_shortest_path_lcc(visual_graph)

1.9079939668174963

In [84]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("visual_graph.pickle", "wb") as pickle_file:
    pickle.dump(visual_list, pickle_file)

In [85]:
len(visual_graph.nodes)

529

# Create the Lancaster Embedding Network

In [86]:
# Load the sensorimotor norms table.
norms = pd.read_csv('../data/Sensorimotor_norms.csv')
# Columns that describe() summarises; these are the ones normalised and used
# as vector components below.
# NOTE(review): presumably these are exactly the numeric rating columns - confirm against the CSV.
cols = norms.describe().columns

In [87]:
# Scale each selected column by its own maximum, in one vectorised step.
# This overwrites the columns of `norms` in place, as before.
norms[cols] = norms[cols] / norms[cols].max()

In [88]:
# Map each lower-cased Word to the values of its `cols` columns. If two rows
# lower-case to the same word, the later row wins.
vecs = {
    row.Word.lower(): row[cols].values
    for _, row in norms.iterrows()
}

In [89]:
# lemmatizer = WordNetLemmatizer()
# new_words = []

# for word in words:
#     new_words.append(lemmatizer.lemmatize(word))
    
# print("new words", len(new_words))
# print([item for item, count in collections.Counter(new_words).items() if count > 1]) # is there duplicates?

# # remove the duplicate
# for idx, word in enumerate(new_words):
#     if word == 'glass':
#         new_words[idx] = 'glasses'
#         break
    
    
# print([item for item, count in collections.Counter(new_words).items() if count > 1]) # is there duplicates?
# print("new words", len(new_words))

In [90]:
lancaster = nx.Graph()
lancaster.add_nodes_from(words)
len(lancaster.nodes)

529

In [100]:
# Link every pair of distinct vocabulary words whose vectors have a cosine
# similarity above 0.95. Every ordered pair is visited, so both (a, b) and
# (b, a) are appended to lancaster_list; the graph stores the edge once.
lancaster_list = []

# Set lookup replaces repeated scans of the `words` list.
vocabulary = set(words)

for item in tqdm(vecs):
    # Words outside the vocabulary can never produce an edge, so skip them
    # before doing any arithmetic.
    if item not in vocabulary:
        continue

    x = vecs[item]
    x_norm = norm(x)  # loop-invariant for the inner loop

    for word in vecs:
        # Compare strings by value. `is not` tests object identity and only
        # worked here because both loops iterate over the same dict keys.
        if word == item or word not in vocabulary:
            continue

        y = vecs[word]
        cosine_sim = dot(x, y)/(x_norm*norm(y))

        if cosine_sim > .95:
            lancaster_list.append((item, word))
            lancaster.add_edge(item, word)


  0%|          | 0/524 [00:00<?, ?it/s]

In [101]:
print(len(lancaster.nodes))

529


### 1) Mean Degree of Connectivity _k_

In [102]:
mean_degree_connectivity(lancaster)

28.748582230623818

### 2) Mean Clustering Coefficient _CC_

In [103]:
nx.average_clustering(lancaster)

0.4827661753441736

### 3) Assortativity Coefficient _a_

In [104]:
nx.degree_assortativity_coefficient(lancaster)

0.350448606017677

### 4) Percentage of nodes in the LCC _Conn._

In [105]:
perc_nodes_in_lcc(lancaster)

0.8638941398865785

### 5) Mean shortest path length in LCC _d_

In [106]:
mean_shortest_path_lcc(lancaster)

3.1677127720833815

In [107]:
# Write through a context manager so the output file is flushed and closed
# deterministically rather than relying on garbage collection.
with open("lancaster.pickle", "wb") as pickle_file:
    pickle.dump(lancaster_list, pickle_file)

In [108]:
len(lancaster.nodes)

529

# Now, we will add the above created layers to the multiplex

## Multiplex + Visual

### Make a copy of the multiplex to add each layer

In [109]:
multiplex_visual = multiplex.copy()

In [110]:
# Overlay the visual-layer pairs on the copy of the multiplex.
multiplex_visual.add_edges_from((pair[0], pair[1]) for pair in visual_list)

In [111]:
len(multiplex_visual.nodes)

529

### 1) _k_

In [112]:
mean_degree_connectivity(multiplex_visual)

27.330812854442343

### 2) _CC_


In [113]:
nx.average_clustering(multiplex_visual)

0.33048673859820415

### 3) _a_

In [114]:
nx.degree_assortativity_coefficient(multiplex_visual)

-0.06984398586988612

### 4) _Conn._

In [115]:
perc_nodes_in_lcc(multiplex_visual)

1.0

### 5)  _d_

In [116]:
mean_shortest_path_lcc(multiplex_visual)

2.375379503923927

## Multiplex + Word

In [117]:
multiplex_word = multiplex.copy()

In [118]:
# Overlay the word-embedding pairs on the copy of the multiplex.
multiplex_word.add_edges_from((pair[0], pair[1]) for pair in word_emb_list)

In [119]:
len(multiplex_word.nodes)

529

### 1) _k_

In [120]:
mean_degree_connectivity(multiplex_word)

26.801512287334592

### 2) _CC_

In [121]:
nx.average_clustering(multiplex_word)

0.33161217183466807

### 3) _a_

In [122]:
nx.degree_assortativity_coefficient(multiplex_word)

-0.06763528402764445

### 4) _Conn._

In [123]:
perc_nodes_in_lcc(multiplex_word)

1.0

### 5) _d_

In [124]:
mean_shortest_path_lcc(multiplex_word)

2.389220656470184

## Multiplex + Lancaster

In [125]:
multiplex_lancaster = multiplex.copy()

In [126]:
# Overlay the Lancaster-layer pairs on the copy of the multiplex.
multiplex_lancaster.add_edges_from((pair[0], pair[1]) for pair in lancaster_list)

In [127]:
len(multiplex_lancaster.nodes)

529

### 1) _k_

In [128]:
mean_degree_connectivity(multiplex_lancaster)

51.406427221172024

### 2) _CC_

In [129]:
nx.average_clustering(multiplex_lancaster)

0.4064225821552918

### 3) _a_

In [130]:
nx.degree_assortativity_coefficient(multiplex_lancaster)

0.11362202923405687

### 4) _Conn._

In [131]:
perc_nodes_in_lcc(multiplex_lancaster)

1.0

### 5) _d_

In [132]:
mean_shortest_path_lcc(multiplex_lancaster)

2.1268760382654524