In [74]:
#imports
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
import networkx as nx
import torch
from networkx.algorithms import approximation
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from tqdm import tqdm_notebook as tqdm

# Read child vocabulary 

In [2]:
def read_data(datafile):
    with open(datafile) as f:
        lines = f.readlines()
    return lines

In [3]:
words = read_data('../data/vertomul.txt')
words = [line.strip() for line in open('../data/vertomul.txt')]

# Recreate the Multiplex Network

## Get the data for the 4 layers

In [4]:
phonological_sim_words = [line.strip() for line in open('../data/phcmul.txt')]
feature_sharing_words = [line.strip() for line in open('../data/mrmul.txt')]
free_association_words = [line.strip() for line in open('../data/famul.txt')]
co_occurrances_words = [line.strip() for line in open('../data/cumul.txt')]

## Clean up the data and store it into lists

In [5]:
phonological_sim_list = []
feature_sharing_list = []
free_association_list = []
co_occurrances_list = []

for pair in phonological_sim_words:
    phonological_sim_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in feature_sharing_words:
    feature_sharing_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in free_association_words:
    free_association_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in co_occurrances_words :
    co_occurrances_list.append(tuple(map(str, pair.split('\t'))))



## Instantiate a graph for multiplex and populate it

In [6]:
multiplex = nx.Graph()
multiplex.add_nodes_from(words)

## We'll add one layer at a time:

## Free associations Layer

In [7]:
free_assoc = nx.Graph()
free_assoc.add_nodes_from(words)

In [8]:
# Add the free association layer to the multiplex
for pair in free_association_list:
    multiplex.add_edge(pair[0], pair[1])
    
# Add the free association layer to its graph
for pair in free_association_list:
    free_assoc.add_edge(pair[0], pair[1])

### 1) Mean degree of connectivity _k_

In [9]:
num_edges = 0
for node in free_assoc.nodes():
    num_edges += len(free_assoc.edges(node))

print(num_edges/free_assoc.number_of_nodes())

9.228733459357278


### 2) Mean Clustering Coefficient _CC_

In [10]:
nx.average_clustering(free_assoc)

0.1941295782803327

### 3) Assortativity Coefficient _a_

In [11]:
nx.degree_assortativity_coefficient(free_assoc)

-0.10131692039561056

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [12]:
largest_cc = max(nx.connected_components(free_assoc), key=len)
lenght = len(largest_cc)
conn = lenght/free_assoc.number_of_nodes()
conn

0.996219281663516

### 5) Mean Shortest Path lenght in the Largest Connected Component _d_

In [13]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


### We have added one layer to multiplex

In [14]:
multiplex.number_of_edges()

2441

## Feature Sharing Norms Layer

In [15]:
feat_norms = nx.Graph()
feat_norms.add_nodes_from(words)

In [16]:
for pair in feature_sharing_list:
    multiplex.add_edge(pair[0], pair[1])
    
for pair in feature_sharing_list:
    feat_norms.add_edge(pair[0], pair[1])

### 1) Mean degree of connectivity _k_

In [17]:
num_edges = 0
for node in feat_norms.nodes():
    num_edges += len(feat_norms.edges(node))

print(num_edges/feat_norms.number_of_nodes())

9.032136105860113


### 2) Mean Clustering Coefficient _CC_

In [18]:
nx.average_clustering(feat_norms)

0.1520222342517882

### 3) Assortativity Coefficient _a_

In [19]:
nx.degree_assortativity_coefficient(feat_norms)

-0.010608927346368343

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [20]:
largest_cc = max(nx.connected_components(feat_norms), key=len)
lenght = len(largest_cc)
conn = lenght/feat_norms.number_of_nodes()
conn

0.24196597353497165

### 5) Mean Shortest Path lenght in the Largest Connected Component _d_

In [21]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


### We have added two layers to multiplex

In [22]:
multiplex.number_of_edges()

4686

## Co-occurrences Layer

In [23]:
co_oc = nx.Graph()
co_oc.add_nodes_from(words)

In [24]:
for pair in co_occurrances_list:
    multiplex.add_edge(pair[0], pair[1])
    
for pair in co_occurrances_list:
    co_oc.add_edge(pair[0], pair[1])

### 1) Mean Degree of connectivity _k_

In [25]:
num_edges = 0
for node in co_oc.nodes():
    num_edges += len(co_oc.edges(node))

print(num_edges/co_oc.number_of_nodes())

8.117202268431003


### 2) Mean Clustering Coefficient _CC_

In [26]:
nx.average_clustering(co_oc)

0.34767890644326055

### 3) Assortativity Coefficient _a_

In [27]:
nx.degree_assortativity_coefficient(co_oc)

-0.43924523453557973

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [28]:
largest_cc = max(nx.connected_components(co_oc), key=len)
lenght = len(largest_cc)
conn = lenght/co_oc.number_of_nodes()
conn

0.6238185255198487

### 5) Mean shortest path length of the Largest Connect Component _d_

In [29]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


### We have added three layers to multiplex

In [30]:
multiplex.number_of_edges()

6696

## Phonological Layer

In [31]:
phon_conn = nx.Graph()
phon_conn.add_nodes_from(words)

In [32]:
for pair in phonological_sim_list:
    multiplex.add_edge(pair[0], pair[1])

for pair in phonological_sim_list:
    phon_conn.add_edge(pair[0], pair[1])

### 1) Mean Degree of Connectivity _a_

In [33]:
num_edges = 0
for node in phon_conn.nodes():
    num_edges += len(phon_conn.edges(node))

print(num_edges/phon_conn.number_of_nodes())

1.3194706994328922


### 2) Mean Clustering Coefficient _CC_

In [34]:
nx.average_clustering(phon_conn)

0.1176973624988748

### 3) Assortativity Coefficient _a_

In [35]:
nx.degree_assortativity_coefficient(phon_conn)

0.4824768300873068

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [36]:
largest_cc = max(nx.connected_components(phon_conn), key=len)
lenght = len(largest_cc)
conn = lenght/phon_conn.number_of_nodes()
conn

0.33081285444234404

### 5) Mean shortest path in the Largest Connected Component _d_

In [37]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


### We have added all layers to the multiplex

In [38]:
multiplex.number_of_edges()

6998

## The original multiplex layer has been populated

### 1) Mean degree of connectivity _k_

In [39]:
num_edges = 0
for node in multiplex.nodes():
    num_edges += len(multiplex.edges(node))

print(num_edges/multiplex.number_of_nodes())

26.457466918714555


### 2) Mean Clustering Coefficient _CC_

In [40]:
nx.average_clustering(multiplex)

0.3279300913138615

### 3) Assortativity Coefficient _a_

In [41]:
nx.degree_assortativity_coefficient(multiplex)

-0.07063907775109278

### 4) Number of nodes in the Lagest Connected Component _Conn._

In [42]:
largest_cc = max(nx.connected_components(multiplex), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex.number_of_nodes()
conn

1.0

### 5) Mean shortest path in the Largest Connected Component _d_

In [43]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Create the Word Embedding Network

In [44]:
glove_embedding = WordEmbeddings('glove')

In [45]:
word_emb = {}
for word in words:
    sentence = Sentence(word) # --> strip() removes the white space from beginning and end of word
     # embed a sentence using glove.
    glove_embedding.embed(sentence)
    for token in sentence:

        word_emb[word]=token.embedding

In [46]:
word_emb_layer = nx.Graph()
word_emb_layer.add_nodes_from(words)

### Compute distance using cosine similarity


In [47]:
word_emb_list = []
for item in tqdm(word_emb):
    x = word_emb[item]
    
    for word in word_emb:
        if word is not item:
            y = word_emb[word]
            cosine_sim = torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0))
            
            if cosine_sim > .8:
                word_emb_list.append((item, word))
                word_emb_layer.add_edge(item, word)

### 1) Mean Dregree of connectivity _k_

In [48]:
num_edges = 0
for node in word_emb_layer.nodes():
    num_edges += len(word_emb_layer.edges(node))

print(num_edges/word_emb_layer.number_of_nodes())

0.9300567107750473


### 2) Mean Clustering Coefficient _CC_

In [49]:
nx.average_clustering(word_emb_layer)

0.06606594537207236

### 3) Assortativity Coefficient _a_

In [50]:
nx.degree_assortativity_coefficient(word_emb_layer)

0.4112425700024269

### 4) Percentage of nodes in the LCC _Conn._

In [51]:
largest_cc = max(nx.connected_components(word_emb_layer), key=len)
length = len(largest_cc)
conn = length/word_emb_layer.number_of_nodes()
conn

0.1342155009451796

### 5) Mean shortest path in LLC _d_

In [52]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Create the Visual Embedding Network

In [66]:
file = open('../data/clip_embeddings.txt', 'r')
lines = file.readlines()

In [67]:
visual_vecs = {}
for line in lines:
    visual_vecs[line.split()[0]] = line.split()[1:]

In [68]:
len(visual_vecs['couch'])

513

In [69]:
visual_graph = nx.Graph()
visual_graph.add_nodes_from(words)

In [76]:
visual_list = []
for item in tqdm(visual_vecs):
    x = np.array(visual_vecs[item])
   
    for word in visual_vecs:
        if word is not item:
            y = np.array(visual_vecs[word])
          
            cosine_sim = cosine_similarity(x.reshape(1,-1),y.reshape(1,-1))
            
            if cosine_sim > .8:
                visual_list.append((item, word))
                visual_graph.add_edge(item, word)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/496 [00:00<?, ?it/s]

In [77]:
len(visual_list)

612

In [78]:
for pair in visual_list:
    visual_graph.add_edge(pair[0], pair[1])

### 1) Mean Degree of Connectivity _k_

In [79]:
num_edges = 0
for node in visual_graph.nodes():
    num_edges += len(visual_graph.edges(node))

print(num_edges/visual_graph.number_of_nodes())

1.1568998109640831


### 2) Mean Clustering Coefficient _CC_

In [80]:
nx.average_clustering(visual_graph)

0.06509661244878844

### 3) Assortativity Coefficient _a_

In [81]:
nx.degree_assortativity_coefficient(visual_graph)

-0.3743916999602581

### 4) Percentage of nodes in the LCC _Conn._

In [82]:
largest_cc = max(nx.connected_components(visual_graph), key=len)
lenght = len(largest_cc)
conn = lenght/visual_graph.number_of_nodes()
conn

0.09829867674858223

### 5) Mean shortest length in the LCC _d_

In [83]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Create the Lancaster Embedding Network

In [84]:
norms = pd.read_csv('../data/Sensorimotor_norms.csv')
cols = norms.describe().columns

In [85]:
# normalize all columns
for col in cols:
    m = norms[col].max()
    norms[col] = norms[col] / m

In [86]:
vecs = {}

for i,row in norms.iterrows():
    vecs[row.Word.lower()] =  row[cols].values

In [87]:
lancaster = nx.Graph()
lancaster.add_nodes_from(words)

In [88]:
lancaster_list = []
for item in tqdm(vecs):
    x =  vecs[item]
    
    for word in  vecs:
        if word is not item:
            y =  vecs[word]
            cosine_sim = dot(x, y)/(norm(x)*norm(y))
            
            if cosine_sim > .95:
                lancaster_list.append((item, word))
                lancaster.add_edge(item, word)

### 1) Mean Degree of Connectivity _k_

In [89]:
num_edges = 0
for node in lancaster.nodes():
    num_edges += len(lancaster.edges(node))

print(num_edges/lancaster.number_of_nodes())

30.16936936936937


### 2) Mean Clustering Coefficient _CC_

In [90]:
nx.average_clustering(lancaster)

0.4884309389715761

### 3) Assortativity Coefficient _a_

In [91]:
nx.degree_assortativity_coefficient(lancaster)

0.3429306184865875

### 4) Percetage of nodes in the LCC _Conn._

In [92]:
largest_cc = max(nx.connected_components(lancaster), key=len)
conn = len(largest_cc)/lancaster.number_of_nodes()
conn

0.8738738738738738

### 5) Mean shortest path length in LCC _d_

In [93]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Now, we will add the above created layers to the multiplex

## Multiplex + Visual

### Make a copy of the multiplex to add each layer

In [94]:
multiplex_visual = multiplex.copy()

In [95]:
# multiplex + visual
for pair in visual_list:
    multiplex_visual.add_edge(pair[0], pair[1])

### 1) _k_

In [96]:
num_edges = 0
for node in multiplex_visual.nodes():
    num_edges += len(multiplex_visual.edges(node))

print(num_edges/multiplex_visual.number_of_nodes())

27.330812854442343


### 2) _CC_


In [97]:
nx.average_clustering(multiplex_visual)

0.33048673859820415

### 3) _a_

In [98]:
nx.degree_assortativity_coefficient(multiplex_visual)

-0.06984398586988612

### 4) _Conn._

In [99]:
largest_cc = max(nx.connected_components(multiplex_visual), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex_visual.number_of_nodes()
conn

1.0

### 5)  _d_

In [100]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


## Multiplex + Word

In [79]:
multiplex_word = multiplex.copy()

In [80]:
for pair in word_emb_list:
    multiplex_word.add_edge(pair[0], pair[1])

### 1) _k_

In [81]:
num_edges = 0
for node in multiplex_word.nodes():
    num_edges += len(multiplex_word.edges(node))

print(num_edges/multiplex_word.number_of_nodes())

26.801512287334592


### 2) _CC_

In [82]:
nx.average_clustering(multiplex_word)

0.33161217183466807

### 3) _a_

In [83]:
nx.degree_assortativity_coefficient(multiplex_word)

-0.06763528402764445

### 4) _Conn._

In [84]:
largest_cc = max(nx.connected_components(multiplex_word), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex_word.number_of_nodes()
conn

1.0

### 5) _d_

In [85]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Multiplex + Lancaster

In [86]:
multiplex_lancaster = multiplex.copy()

In [87]:
for pair in lancaster_list:
    multiplex_lancaster.add_edge(pair[0], pair[1])

### 1) _k_

In [88]:
num_edges = 0
for node in multiplex_lancaster.nodes():
    num_edges += len(multiplex_lancaster.edges(node))

print(num_edges/multiplex_lancaster.number_of_nodes())

51.765765765765764


### 2) _CC_

In [89]:
nx.average_clustering(multiplex_lancaster)

0.41222324697813234

### 3) _a_

In [90]:
nx.degree_assortativity_coefficient(multiplex_lancaster)

0.12874644984634204

### 4) _Conn._

In [91]:
largest_cc = max(nx.connected_components(multiplex_lancaster), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex_lancaster.number_of_nodes()
conn

1.0

### 5) _d_

In [92]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0
