# Phase 1: Create a Word-Embedding layer 

In [1]:
#imports
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
import networkx as nx
import torch
from networkx.algorithms import approximation
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm

In [2]:
def read_data(datafile):
    with open(datafile) as f:
        lines = f.readlines()
    return lines

In [3]:
glove_embedding = WordEmbeddings('glove')
words = read_data('../data/vertomul.txt')
words = [line.strip() for line in open('../data/vertomul.txt')]

In [4]:
word_emb = {}
for word in words:
    sentence = Sentence(word) # --> strip() removes the white space from beginning and end of word
     # embed a sentence using glove.
    glove_embedding.embed(sentence)
    for token in sentence:

        word_emb[word]=token.embedding
    

In [11]:
word_emb['red']

tensor([-0.3002,  0.5015, -0.1275, -0.8164,  0.3361,  0.3221, -0.0474,  0.0371,
        -0.6158, -0.2233, -0.3913, -0.3189,  0.8709,  0.7445,  0.2371,  0.3177,
         0.6132, -0.4816,  0.5545, -0.4877, -0.1187,  0.1520, -0.4388,  0.0452,
         0.6666,  0.6442, -0.2181, -0.2422,  0.1765, -0.7179,  0.4889,  0.2287,
         0.0800,  0.1224,  0.1864,  0.2052, -0.3514,  0.8317,  0.8658,  0.3340,
         0.4451, -0.9813, -0.1045, -0.1020,  0.6549,  0.1068, -0.0953,  0.5637,
         0.0488, -0.1084,  0.1054,  0.0412, -0.2939,  1.0227, -0.8657, -2.5878,
        -0.5008,  0.9758,  1.5560,  0.4521, -0.5428,  0.8199, -0.6083,  0.1992,
         0.7497, -0.3914,  0.0605, -0.0569, -0.0121,  0.0621,  0.0706, -0.4798,
        -0.8661, -0.5934,  0.5765,  0.9837, -0.0351,  0.4203, -0.4059,  0.3510,
         0.8739, -0.0694, -0.6869,  0.1860, -0.3690, -0.0218, -0.1014, -0.0376,
         0.5682,  0.7438, -0.2871, -1.0705, -0.5070, -0.1258, -0.9040, -0.2559,
        -1.3706,  0.1731,  0.1293, -0.48

###  We decided to compute the distance using cosine similarity

In [5]:
G = nx.Graph()

In [6]:
G.add_nodes_from(words)

In [7]:
len(list(G.nodes))

529

A similarity of 1 means that the words are the same.
Cosine similarity value smaller than 80 makes the edges increase by a lot. 

In [8]:
for item in word_emb:
    x = word_emb[item]
    
    for word in word_emb:
        if word is not item:
            y = word_emb[word]
            cosine_sim = torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0))
            
            if cosine_sim > .8:
                G.add_edge(item, word)
#             print("x:", item, "y:", word, "distance: ", cosine_sim)

In [9]:
G.number_of_nodes()

529

In [10]:
G.number_of_edges()

246

### Mean clustering coefficient (CC) [doc here](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.average_clustering.html)

In [205]:
print(nx.average_clustering(G))

0.06606594537207236


### Assortativity coefficient (a) [doc here](https://networkx.org/documentation/stable/reference/algorithms/assortativity.html)

In [206]:
nx.degree_assortativity_coefficient(G)

0.4112425700024269

### Mean degree of connectivity (k)

In [207]:
num_edges = 0
for node in G.nodes():
    num_edges += len(G.edges(node))

print(num_edges/G.number_of_nodes())

0.9300567107750473


### Largest Connected Component (Conn)

In [208]:
largest_cc = max(nx.connected_components(G), key=len)
len(largest_cc)

71

In [209]:
conn = 71/G.number_of_nodes()
conn

0.1342155009451796

### Mean shortest path length of the largest connected component <_d_>

In [210]:
largest_cc = nx.complete_graph(largest_cc)
type(largest_cc)

networkx.classes.graph.Graph

In [211]:
print(nx.average_shortest_path_length(largest_cc))

1.0



The graph is not connected 

In [212]:
print(nx.is_connected(G))

False


Nodes with higher closeness centrality are better connected with the rest of the nodes. I am assuming that nodes with a closeness centrality equal to zero are not connected to any other node. 

In [213]:
word_emb_cc = nx.closeness_centrality(G)

# Create the Lancaster embedding layer

In [214]:
norms = pd.read_csv('../data/Sensorimotor_norms.csv')
cols = norms.describe().columns

In [215]:
# normalize all columns
for col in cols:
    m = norms[col].max()
    norms[col] = norms[col] / m

In [216]:
vecs = {}

for i,row in norms.iterrows():
    vecs[row.Word.lower()] =  row[cols].values

In [217]:
vecs['red']

array([0.0797948717948718, 0.0444, 0.0454731667349447, 0.1462078035949145,
       0.037851314596554844, 0.9112, 0.011907164480322906,
       0.05999999999999999, 0.6824, 0.0942, 0.05351856069232521,
       0.39078498293515357, 0.22258326563769293, 0.26746589499793305,
       0.5825902335456475, 0.21853741496598642, 0.38273142382731423,
       0.10460611278519157, 0.3260135135135135, 0.8757322175732217,
       0.4368098159509203, 0.2880694143167028], dtype=object)

In [218]:
lancaster = nx.Graph()

In [219]:
x = vecs['foot']
y = vecs['candy']
result = dot(x, y)/(np.sqrt(np.dot(x,x)) * np.sqrt(np.dot(y,y)))
result

0.7487981418968008

In [220]:
for item in vecs:
    x =  vecs[item]
    
    for word in  vecs:
        if word is not item:
            y =  vecs[word]
            cosine_sim = dot(x, y)/(norm(x)*norm(y))
            
            if cosine_sim > .95:
                lancaster.add_edge(item, word)
#             print("x:", item, "y:", word, "distance: ", cosine_sim)

In [221]:
lancaster.number_of_nodes()

492

In [222]:
lancaster.number_of_edges()

8372

In [223]:
print(nx.average_clustering(lancaster))

0.5509739250593991


In [224]:
nx.degree_assortativity_coefficient(lancaster)

0.3429306184865875

In [225]:
num_edges = 0
for node in lancaster.nodes():
    num_edges += len(lancaster.edges(node))

print(num_edges/lancaster.number_of_nodes())

34.03252032520325


In [226]:
largest_cc = max(nx.connected_components(G), key=len)
conn = len(largest_cc)/G.number_of_nodes()
conn

0.1342155009451796

In [227]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


# Multiplex Network - 4 layers 

## Get the data for the 4 layers in the multiplex

In [228]:
phonological_sim_words = [line.strip() for line in open('../data/phcmul.txt')]
feature_sharing_words = [line.strip() for line in open('../data/mrmul.txt')]
free_association_words = [line.strip() for line in open('../data/famul.txt')]
co_occurrances_words = [line.strip() for line in open('../data/cumul.txt')]

## Clean up the data and store it into a list

In [229]:
phonological_sim_list = []
feature_sharing_list = []
free_association_list = []
co_occurrances_list = []

for pair in phonological_sim_words:
    phonological_sim_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in feature_sharing_words:
    feature_sharing_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in free_association_words:
    free_association_list.append(tuple(map(str, pair.split('\t'))))
    
for pair in co_occurrances_words :
    co_occurrances_list.append(tuple(map(str, pair.split('\t'))))


## Instantiate another graph for the multiplex

In [230]:
multiplex = nx.Graph()

In [231]:
multiplex.add_nodes_from(words)

In [232]:
len(list(multiplex.nodes))

529

In [233]:
multiplex.number_of_edges()

0

Adding the phonological connections to the graph

In [234]:
phon_conn = nx.Graph()

In [235]:
phon_conn.add_nodes_from(words)

In [236]:
for pair in phonological_sim_list:
    multiplex.add_edge(pair[0], pair[1])

for pair in phonological_sim_list:
    phon_conn.add_edge(pair[0], pair[1])
    

In [237]:
phon_conn.number_of_edges()

349

In [238]:
print(nx.average_clustering(phon_conn))

0.1176973624988748


In [239]:
approximation.average_clustering(phon_conn, trials=1000, seed=10)

0.119

In [240]:
nx.degree_assortativity_coefficient(phon_conn)

0.4824768300873068

## implementation of mean degree

In [241]:
num_edges = 0
for node in phon_conn.nodes():
    num_edges += len(phon_conn.edges(node))

print(num_edges/phon_conn.number_of_nodes())

1.3194706994328922


## Number of nodes in the largest connected component

In [242]:
largest_cc = max(nx.connected_components(phon_conn), key=len)
lenght = len(largest_cc)
conn = lenght/phon_conn.number_of_nodes()
conn

0.33081285444234404

### Mean shortest path length of the largest connected component [doc here](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.average_shortest_path_length.html#networkx.algorithms.shortest_paths.generic.average_shortest_path_length)


In [243]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

# for C in (phon_conn.subgraph(c).copy() for c in largest_cc):
#     print(nx.average_shortest_path_length(C))

1.0


Adding the feature sharing connections to the graph

In [244]:
feat_norms = nx.Graph()

In [245]:
feat_norms.add_nodes_from(words)

In [246]:
for pair in feature_sharing_list:
    multiplex.add_edge(pair[0], pair[1])
for pair in feature_sharing_list:
    feat_norms.add_edge(pair[0], pair[1])

In [247]:
len(list(multiplex.nodes))


529

In [248]:
feat_norms.number_of_edges()

2389

In [249]:
print(nx.average_clustering(feat_norms))

0.1520222342517882


In [250]:
nx.degree_assortativity_coefficient(feat_norms)

-0.010608927346368343

In [251]:
num_edges = 0
for node in feat_norms.nodes():
    num_edges += len(feat_norms.edges(node))

print(num_edges/feat_norms.number_of_nodes())

9.032136105860113


In [252]:
largest_cc = max(nx.connected_components(feat_norms), key=len)
lenght = len(largest_cc)
conn = lenght/feat_norms.number_of_nodes()
conn

0.24196597353497165

In [253]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


Adding the free association connections to the graph

In [254]:
free_assoc = nx.Graph()

In [255]:
free_assoc.add_nodes_from(words)

In [256]:
for pair in free_association_list:
    multiplex.add_edge(pair[0], pair[1])
for pair in free_association_list:
    free_assoc.add_edge(pair[0], pair[1])

In [257]:
free_assoc.number_of_edges()

2441

In [258]:
print(nx.average_clustering(free_assoc))

0.1941295782803327


In [259]:
nx.degree_assortativity_coefficient(free_assoc)

-0.10131692039561056

In [260]:
num_edges = 0
for node in free_assoc.nodes():
    num_edges += len(free_assoc.edges(node))

print(num_edges/free_assoc.number_of_nodes())

9.228733459357278


In [261]:
largest_cc = max(nx.connected_components(free_assoc), key=len)
lenght = len(largest_cc)
conn = lenght/free_assoc.number_of_nodes()
conn

0.996219281663516

In [262]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


In [263]:
multiplex.number_of_edges()

5001

Adding the cooccurrances connections to the graph

In [264]:
co_oc = nx.Graph()

In [265]:
co_oc.add_nodes_from(words)

In [266]:
for pair in co_occurrances_list:
    multiplex.add_edge(pair[0], pair[1])
for pair in co_occurrances_list:
    co_oc.add_edge(pair[0], pair[1])

In [267]:
co_oc.number_of_edges()

2147

In [268]:
print(nx.average_clustering(co_oc))

0.34767890644326055


In [269]:
nx.degree_assortativity_coefficient(co_oc)

-0.43924523453557973

In [270]:
num_edges = 0
for node in co_oc.nodes():
    num_edges += len(co_oc.edges(node))

print(num_edges/co_oc.number_of_nodes())

8.117202268431003


In [271]:
largest_cc = max(nx.connected_components(co_oc), key=len)
lenght = len(largest_cc)
conn = lenght/co_oc.number_of_nodes()
conn

0.6238185255198487

In [272]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


In [273]:
multiplex.number_of_edges()

6998

##  The original multiplex layer has been constucted. 

In [274]:
# computes the difference between the multiplex and the word-embedding layer
print(multiplex.number_of_nodes())
print(G.number_of_nodes())

529
529


In [12]:
print(multiplex.number_of_edges())

NameError: name 'multiplex' is not defined

In [275]:
multiplex_word = multiplex.copy()

In [88]:
print(nx.average_clustering(multiplex))

0.3279300913138615


In [89]:
nx.degree_assortativity_coefficient(multiplex)

-0.07063907775109278

In [91]:
num_edges = 0
for node in multiplex.nodes():
    num_edges += len(multiplex.edges(node))

print(num_edges/multiplex.number_of_nodes())

26.457466918714555


In [92]:
largest_cc = max(nx.connected_components(multiplex), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex.number_of_nodes()
conn

1.0

In [93]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


In [94]:
multiplex_cc = nx.closeness_centrality(multiplex)

In [95]:
print(nx.is_connected(multiplex))

True


# Now, we add the word embedding to the multiplex

In [276]:
multiplex_word = nx.union(multiplex, G)

NetworkXError: ('The node sets of G and H are not disjoint.', 'Use appropriate rename=(Gprefix,Hprefix)or use disjoint_union(G,H).')

# Now, we add the visual layer to the multiplex

In [96]:
visual_words = [line.strip().replace('"', "") for line in open('../data/wac_clip_freechild.txt')]

In [97]:
visual_list = []

for pair in visual_words:   
    visual_list.append(tuple(map(str, pair.split('->'))))

In [98]:
visual_graph = nx.Graph()

In [99]:
visual_graph.add_nodes_from(words)
visual_graph.number_of_nodes()

529

In [100]:
for pair in visual_list:
    visual_graph.add_edge(pair[0], pair[1])

In [101]:
for pair in visual_list:
    multiplex.add_edge(pair[0], pair[1])

### Mean degree of connectivity + mean clustering coefficient

In [123]:
num_edges = 0
for node in visual_graph.nodes():
    num_edges += len(visual_graph.edges(node))

print(num_edges/visual_graph.number_of_nodes())

3.4251497005988023


In [124]:
num_edges = 0
for node in multiplex.nodes():
    num_edges += len(multiplex.edges(node))

print(num_edges/multiplex.number_of_nodes())

17.393213572854293


### Assortativity coefficient

In [125]:
nx.degree_assortativity_coefficient(multiplex)

-0.16446899275620427

In [126]:
nx.degree_assortativity_coefficient(visual_graph)

-0.3603314446075859

### Largest connected component _Conn._

In [127]:
largest_cc = max(nx.connected_components(multiplex), key=len)
lenght = len(largest_cc)
conn = lenght/multiplex.number_of_nodes()
conn

0.5279441117764471

In [128]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


In [129]:
largest_cc = max(nx.connected_components(visual_graph), key=len)
lenght = len(largest_cc)
conn = lenght/visual_graph.number_of_nodes()
conn

0.4720558882235529

In [130]:
largest_cc = nx.complete_graph(largest_cc)
print(nx.average_shortest_path_length(largest_cc))

1.0


In [293]:
multiplex_visual_cc = nx.closeness_centrality(multiplex)


| Empirical Network | <_k_> | CC | _a_ | _Conn._| <_d_> | 
| --- | --- | --- | --- | --- | --- |
| Word Embedding | 7.97 | 0.06 | 0.41 | 13.4% | 1.0 |
| Visual Embedding | 8.76 | 0.0 | -0.36 | 47.2% | 1.0 |
| Multiplex Aggregate | 44.7 | 0.17 | -0.16 | 52.7% | 1.0 |