In [1]:
import networkit as nk # The one Dr. Kennington had been using in make_big_network.py
import networkx as nx # The one compatible w/ eliorc/node2vec implementation and used in Phase_1.ipynb
import torch
from networkx.algorithms import approximation
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm
from tqdm import tqdm
import pickle
from nltk.stem import WordNetLemmatizer
from operator import itemgetter
import collections
from itertools import chain, combinations
from datetime import datetime

## Define Shared Functions (For use with NetworkX)

In [3]:
def mean_degree_connectivity_nx(graph_name):
    num_edges = 0
    for node in graph_name.nodes():
        num_edges += len(graph_name.edges(node))

    return num_edges/graph_name.number_of_nodes()

In [4]:
def perc_nodes_in_lcc_nx(graph_name):
    largest_cc = max(nx.connected_components(graph_name), key=len)
    lenght = len(largest_cc)
    return lenght/graph_name.number_of_nodes()
   

In [5]:
def mean_shortest_path_lcc_nx(graph_name):
    S = [graph_name.subgraph(c).copy() for c in nx.connected_components(graph_name)]
    comps = [len(max(nx.connected_components(i), key=len)) for i in S]
    index_max = max(range(len(comps)), key=comps.__getitem__)
    return nx.average_shortest_path_length(S[index_max])


## Define Shared Functions (For use with NetworKit)

In [6]:
def mean_degree_connectivity_nk(graph_name):
    num_edges = 0
    for node in graph_name.iterNodes():
        num_edges += graph_name.degree(node)

    return num_edges/graph_name.numberOfNodes()

In [7]:
def perc_nodes_in_lcc_nk(graph_name):
    largest_cc = nk.components.ConnectedComponents.extractLargestConnectedComponent(graph_name)
    length = largest_cc.numberOfNodes()
    return length/graph_name.numberOfNodes()

## Load and Clean Data

### Cleaned BERT Vocab


In [8]:
with open("cleaned_bert_vocab.pickle", "rb") as input_file:
    cleaned_bert_vocab = pickle.load(input_file)    


In [9]:
print(f"{len(cleaned_bert_vocab):,}")

26,573


### Cleaned Word Embedding List 


In [11]:
threshold = 90
with open(fr"cleaned_word_emb_list_{threshold}_cos_thresh.pickle", "rb") as input_file:
    bert_word_emb_list = pickle.load(input_file)    

In [12]:
print(f"{len(bert_word_emb_list):,}")

177,013


In [13]:
## Remove duplicate tuple pairs -- only keep one of (a, one) and (one, a) if both are present.
bert_word_emb_list = (list(set(tuple(sorted(l)) for l in bert_word_emb_list)))

In [14]:
print(f"{len(bert_word_emb_list):,}")

177,013


In [15]:
bert_word_emb_list[:10]

[('multitude', 'reclaimed'),
 ('spearheaded', 'twists'),
 ('misses', 'surpassing'),
 ('distinguishing', 'tensions'),
 ('finely', 'layered'),
 ('scarf', 'tractor'),
 ('binding', 'merging'),
 ('bothering', 'complained'),
 ('downward', 'successors'),
 ('closet', 'successors')]

### Free Association


In [16]:
with open(r"free_assoc_full.pickle", "rb") as input_file:
    free_assoc = pickle.load(input_file)  

#### Intersect free_assoc w/ bert vocab

In [17]:
free_assoc_list_intersect_bert = [word_pair for word_pair in free_assoc if all(word in cleaned_bert_vocab for word in word_pair)] # TODO: if I make cleaned_bert_vocab a set will it be a faster lookup? (hashed)

In [18]:
print(f"Original length: {len(free_assoc):,} vs Intersect length: {len(free_assoc_list_intersect_bert):,}. Diff: {len(free_assoc)-len(free_assoc_list_intersect_bert):,}")

Original length: 19,960 vs Intersect length: 12,755. Diff: 7,205


In [19]:
## Remove duplicate tuple pairs -- only keep one of (a, one) and (one, a) if both are present.
free_assoc_list_intersect_bert = (list(set(tuple(sorted(l)) for l in free_assoc_list_intersect_bert)))

In [20]:
print(f"Length after removing duplicate tuples: {len(free_assoc_list_intersect_bert):,}")

Length after removing duplicate tuples: 11,765


In [21]:
free_assoc_list_intersect_bert[:5]

[('anger', 'fury'),
 ('assembly', 'school'),
 ('salary', 'wage'),
 ('bright', 'brilliance'),
 ('activate', 'start')]

In [22]:
pickle.dump(free_assoc_list_intersect_bert, open( "free_assoc_list_intersect_bert.pickle", "wb" ))

### Visual


In [23]:
with open(r"../data/fullmultiplex/visual_layer_vectors_full.pickle", "rb") as input_file:
    visual_list = pickle.load(input_file)      

#### Intersect visual_list w/ bert vocab

This was going to take 120+ hours to build so instead of intersecting w/ bert vocab before building (since it should have negligible impact on time) I intersect after (takes 5 mins). To stay consistent, I did the same for all layers as well. 

In [24]:
visual_list_intersect_bert = [word_pair for word_pair in visual_list if all(word in cleaned_bert_vocab for word in word_pair)]

In [25]:
print(f"Original length: {len(visual_list):,} vs Intersect length: {len(visual_list_intersect_bert):,}. Diff: {len(visual_list)-len(visual_list_intersect_bert):,}")

Original length: 541,033 vs Intersect length: 461,317. Diff: 79,716


In [26]:
## Remove duplicate tuple pairs -- only keep one of (a, one) and (one, a) if both are present.
visual_list_intersect_bert = (list(set(tuple(sorted(l)) for l in visual_list_intersect_bert)))

In [27]:
print(f"Length after removing duplicate tuples: {len(visual_list_intersect_bert):,}")

Length after removing duplicate tuples: 461,317


In [28]:
visual_list_intersect_bert[102:110]

[('highlands', 'plants'),
 ('nanjing', 'toulouse'),
 ('bowie', 'hepburn'),
 ('hailey', 'receptions'),
 ('interiors', 'princeton'),
 ('breaststroke', 'molecules'),
 ('medicines', 'ticked'),
 ('crowns', 'peerage')]

In [29]:
pickle.dump(visual_list_intersect_bert, open( "visual_list_intersect_bert.pickle", "wb" ))

### Lancaster

In [31]:
with open(r"lancaster_full.pickle", "rb") as input_file:
    lancaster_list = pickle.load(input_file)    

#### Intersect lancaster_list w/ bert vocab

In [32]:
# lancaster_list_intersect_bert = [word_pair for word_pair in lancaster_list if all(word in cleaned_bert_vocab for word in word_pair)]

In [35]:
with open("lancaster_list_intersect_bert.pickle", "rb") as input_file:
    lancaster_list_intersect_bert = pickle.load(input_file)    


In [36]:
print(f"Original length: {len(lancaster_list):,} vs Intersect length: {len(lancaster_list_intersect_bert):,}. Diff: {len(lancaster_list)- len(lancaster_list_intersect_bert):,}")

Original length: 13,114,740 vs Intersect length: 1,375,293. Diff: 11,739,447


In [37]:
## Remove duplicate tuple pairs -- only keep one of (a, one) and (one, a) if both are present.
lancaster_list_intersect_bert = (list(set(tuple(sorted(l)) for l in lancaster_list_intersect_bert)))

In [38]:
print(f"Length after removing duplicate tuples: {len(lancaster_list_intersect_bert):,}")

Length after removing duplicate tuples: 1,375,293


In [39]:
lancaster_list_intersect_bert[:5]

[('reformed', 'unlawful'),
 ('lengthy', 'near'),
 ('disguised', 'projectile'),
 ('host', 'reception'),
 ('content', 'obsolete')]

In [40]:
pickle.dump(lancaster_list_intersect_bert, open(f"lancaster_list_intersect_bert.pickle", "wb" ) )   

## Build NetworkX and NetworKit networks

In [2]:
file_date_info = datetime.utcnow().strftime("%m-%d_%H:%M")

In [176]:
layers = {
    'free_assoc':free_assoc_list_intersect_bert,
    'visual_list':visual_list_intersect_bert,
    'lancaster_list':lancaster_list_intersect_bert,
    'bert_word_emb_list':bert_word_emb_list
         }

In [177]:
layer_names = list(layers.keys())

In [178]:
words = []
for layer_name in layer_names:
    i = layers[layer_name]
    for a,b in i:
        words.append(a)
        words.append(b)

words = list(set(words)) 
print(len(words))
word_to_index = {words[i]:i for i in range(len(words))} # Dict of word:index
index_to_word = {i:words[i] for i in range(len(words))} # Dict of index:word

16706


In [179]:
pickle.dump(index_to_word, open("generated_layer_data/" + "_".join(layer_names) + "_index_to_word_BERT_intersect.pickle", "wb" ) ) 

## Instantiate a graph for multiplex and populate it

In [180]:
## NetworkX
multiplex_nx = nx.Graph()
multiplex_nx.add_nodes_from(words)
nx.is_weighted(multiplex_nx) # Will return true once we have added edges which result in at least one edge weight greater than 1

False

In [181]:
## NetworKit
multiplex_nk = nk.Graph(len(words), weighted=True)
multiplex_nk.isWeighted()

True

## Add one layer at a time:

### Free associations Layer

In [182]:
free_association_words = list(set([a for tup in layers['free_assoc'] for a in tup]))
free_assoc_word_to_index = {free_association_words[i]:i for i in range(len(free_association_words))} # Dict of word:index
free_assoc_index_to_word = {i:free_association_words[i] for i in range(len(free_association_words))} # Dict of index:word
pickle.dump(free_assoc_index_to_word, open( "generated_layer_data/free_assoc_index_to_word_BERT_intersect.pickle", "wb" ) ) 


## NetworkX
free_association_layer_nx = nx.Graph()
free_association_layer_nx.add_nodes_from(free_association_words)

free_association_layer_full_vocab_nx = nx.Graph()
free_association_layer_full_vocab_nx.add_nodes_from(words)


## NetworKit
free_association_layer_nk = nk.Graph(len(free_association_words))
free_association_layer_full_vocab_nk = nk.Graph(len(words))


In [183]:
# Add the free association layer to the multiplex
for pair in layers['free_assoc']:
    ## NetworkX
    multiplex_nx.add_edge(pair[0], pair[1], weight=1) # first layer, can safely assume weight of 1
    free_association_layer_nx.add_edge(pair[0], pair[1])
    free_association_layer_full_vocab_nx.add_edge(pair[0], pair[1])
    
    ## NetworKit (IMPORTANT: NetworKit will add duplicate edges whereas NetworkX will not. This explains original edge diffs between networkit and networkx
    ## Docs show bool checkMultiEdge param but it's not working)
    ## (duplicate edges should no longer be an issue now that I am removing duplicate pairs in the lists)
    if not multiplex_nk.hasEdge(word_to_index[pair[0]], word_to_index[pair[1]]):
        multiplex_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]], w=1)

    if not free_association_layer_nk.hasEdge(free_assoc_word_to_index[pair[0]], free_assoc_word_to_index[pair[1]]):
        free_association_layer_nk.addEdge(free_assoc_word_to_index[pair[0]], free_assoc_word_to_index[pair[1]])

    if not free_association_layer_full_vocab_nk.hasEdge(word_to_index[pair[0]], word_to_index[pair[1]]):
        free_association_layer_full_vocab_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]])

In [184]:
nk.overview(multiplex_nk)

Network Properties:
nodes, edges			16706, 11765
directed?			False
weighted?			True
isolated nodes			8876
self-loops			3
density				0.000084
min/max/avg degree		0, 131, 1.408296
degree assortativity		-0.152464
number of connected components	8975
size of largest component	7590 (45.43 %)


#### Remove self loops -- I don't expect them to contribute any value to this task. 

In [185]:
multiplex_nx.remove_edges_from(nx.selfloop_edges(multiplex_nx))
multiplex_nk.removeSelfLoops()

free_association_layer_nx.remove_edges_from(nx.selfloop_edges(free_association_layer_nx))
free_association_layer_nk.removeSelfLoops()


free_association_layer_full_vocab_nx.remove_edges_from(nx.selfloop_edges(free_association_layer_full_vocab_nx))
free_association_layer_full_vocab_nk.removeSelfLoops()

#### Save Layers

In [186]:
print(f"Layer-Only Vocab: # Edges -> NetworkX: {free_association_layer_nx.number_of_edges()} | NetworKit: {free_association_layer_nk.numberOfEdges()}")
print(f"Full Vocab:       # Edges -> NetworkX: {free_association_layer_full_vocab_nx.number_of_edges()} | NetworKit: {free_association_layer_full_vocab_nk.numberOfEdges()}")
print("\n")
print(f"Layer-Only Vocab: # Nodes -> NetworkX: {free_association_layer_nx.number_of_nodes()}  | NetworKit: {free_association_layer_nk.numberOfNodes()}")
print(f"Full Vocab:       # Nodes -> NetworkX: {free_association_layer_full_vocab_nx.number_of_nodes()} | NetworKit: {free_association_layer_full_vocab_nk.numberOfNodes()}")

Layer-Only Vocab: # Edges -> NetworkX: 11762 | NetworKit: 11762
Full Vocab:       # Edges -> NetworkX: 11762 | NetworKit: 11762


Layer-Only Vocab: # Nodes -> NetworkX: 7830  | NetworKit: 7830
Full Vocab:       # Nodes -> NetworkX: 16706 | NetworKit: 16706


In [187]:
#### NetworkX
nx_filename = f"networkx_graphs/layers/free_association_layer_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(free_association_layer_nx, open(nx_filename, "wb" ) )   
print(nx_filename)

nx_full_vocab_filename = f"networkx_graphs/layers/free_association_layer_full_vocab_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(free_association_layer_full_vocab_nx, open(nx_full_vocab_filename, "wb" ) )   
print(nx_full_vocab_filename)


#### NetworKit
nk_filename = f"networkit_graphs/layers/free_association_layer_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(free_association_layer_nk, open(nk_filename, "wb" ) )   
print(nk_filename)

nk_full_vocab_filename = f"networkit_graphs/layers/free_association_layer_full_vocab_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(free_association_layer_full_vocab_nk, open(nk_full_vocab_filename, "wb" ) )   
print(nk_full_vocab_filename)

networkx_graphs/layers/free_association_layer_90_cos_thresh_nx_12-14_05:42.pickle
networkx_graphs/layers/free_association_layer_full_vocab_90_cos_thresh_nx_12-14_05:42.pickle
networkit_graphs/layers/free_association_layer_90_cos_thresh_nk_12-14_05:42.pickle
networkit_graphs/layers/free_association_layer_full_vocab_90_cos_thresh_nk_12-14_05:42.pickle


#### Layer Analysis

##### 1) Mean degree of connectivity _k_
Number of nodes adjacent to the node

In [54]:
mean_degree_connectivity_nx(free_association_layer_nx)

3.0043422733077905

In [55]:
mean_degree_connectivity_nk(free_association_layer_nk)

3.0043422733077905

##### 2) Mean Clustering Coefficient _CC_
The degree to which nodes tend to cluster together

In [56]:
nx.average_clustering(free_association_layer_nx)

0.12318254461546252

##### 3) Assortativity Coefficient _a_
The similarity of connections in the graph with respect to the node degree

In [57]:
nx.degree_assortativity_coefficient(free_association_layer_nx)

-0.15229921739271393

##### 4) Percentage of nodes in the Largest Connected Component (LCC)
The largest connected sugraph in the network

In [58]:
perc_nodes_in_lcc_nx(free_association_layer_nx)

0.9693486590038314

In [59]:
perc_nodes_in_lcc_nk(free_association_layer_nk)

0.9693486590038314

##### 5) Mean Shortest Path length in the Largest Connected Component _d_
Average shorted path among the nodes in LCC

In [62]:
mean_shortest_path_lcc_nx(free_association_layer_nx)

7.34155117723784

#### One layer has been added to multiplex ✅

In [188]:
print(f"# Edges -> NetworkX: {multiplex_nx.number_of_edges()} | NetworKit: {multiplex_nk.numberOfEdges()}")
print(f"# Nodes -> NetworkX: {multiplex_nx.number_of_nodes()} | NetworKit: {multiplex_nk.numberOfNodes()}")

# Edges -> NetworkX: 11762 | NetworKit: 11762
# Nodes -> NetworkX: 16706 | NetworKit: 16706


### Visual Layer

In [189]:
visual_words = list(set([a for tup in layers['visual_list'] for a in tup]))
visual_word_to_index = {visual_words[i]:i for i in range(len(visual_words))} # Dict of word:index
visual_index_to_word = {i:visual_words[i] for i in range(len(visual_words))} # Dict of index:word
pickle.dump(visual_index_to_word, open( "generated_layer_data/visual_index_to_word_BERT_intersect.pickle", "wb" ) ) 


## NetworkX
visual_layer_nx = nx.Graph()
visual_layer_nx.add_nodes_from(visual_words)

visual_layer_full_vocab_nx = nx.Graph()
visual_layer_full_vocab_nx.add_nodes_from(words)

## NetworKit
visual_layer_nk = nk.Graph(len(visual_words))
visual_layer_full_vocab_nk = nk.Graph(len(words))

In [190]:
# Add the free association layer to the multiplex
for pair in layers['visual_list']:
    ## NetworkX
    if multiplex_nx.has_edge(pair[0], pair[1]):
        multiplex_nx[pair[0]][pair[1]]['weight'] += 1
    else:
        multiplex_nx.add_edge(pair[0], pair[1], weight=1)    
    
    visual_layer_full_vocab_nx.add_edge(pair[0], pair[1])
    visual_layer_nx.add_edge(pair[0], pair[1])
    
    ## NetworKit
    if multiplex_nk.hasEdge(word_to_index[pair[0]], word_to_index[pair[1]]):
        multiplex_nk.increaseWeight(word_to_index[pair[0]], word_to_index[pair[1]], w=1)

    else:
        multiplex_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]], w=1)
    
    visual_layer_nk.addEdge(visual_word_to_index[pair[0]], visual_word_to_index[pair[1]])
    visual_layer_full_vocab_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]])



In [191]:
nk.overview(multiplex_nk)

Network Properties:
nodes, edges			16706, 473002
directed?			False
weighted?			True
isolated nodes			5548
self-loops			0
density				0.003390
clustering coefficient		0.227748
min/max/avg degree		0, 2025, 56.626601
degree assortativity		-0.241270
number of connected components	5644
size of largest component	10929 (65.42 %)


#### Remove self loops -- I don't expect them to contribute any value to this task. 

In [192]:
multiplex_nx.remove_edges_from(nx.selfloop_edges(multiplex_nx))
multiplex_nk.removeSelfLoops()

visual_layer_nx.remove_edges_from(nx.selfloop_edges(visual_layer_nx))
visual_layer_nk.removeSelfLoops()

visual_layer_full_vocab_nx.remove_edges_from(nx.selfloop_edges(visual_layer_full_vocab_nx))
visual_layer_full_vocab_nk.removeSelfLoops()

#### Save Layers

In [193]:
print(f"Layer-Only Vocab: # Edges -> NetworkX: {visual_layer_nx.number_of_edges()} | NetworKit: {visual_layer_nk.numberOfEdges()}")
print(f"Full Vocab:       # Edges -> NetworkX: {visual_layer_full_vocab_nx.number_of_edges()} | NetworKit: {visual_layer_full_vocab_nk.numberOfEdges()}")
print("\n")
print(f"Layer-Only Vocab: # Nodes -> NetworkX: {visual_layer_nx.number_of_nodes()}  | NetworKit: {visual_layer_nk.numberOfNodes()}")
print(f"Full Vocab:       # Nodes -> NetworkX: {visual_layer_full_vocab_nx.number_of_nodes()} | NetworKit: {visual_layer_full_vocab_nk.numberOfNodes()}")

Layer-Only Vocab: # Edges -> NetworkX: 461317 | NetworKit: 461317
Full Vocab:       # Edges -> NetworkX: 461317 | NetworKit: 461317


Layer-Only Vocab: # Nodes -> NetworkX: 3881  | NetworKit: 3881
Full Vocab:       # Nodes -> NetworkX: 16706 | NetworKit: 16706


In [194]:
#### NetworkX
nx_filename = f"networkx_graphs/layers/visual_layer_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(visual_layer_nx, open(nx_filename, "wb" ) )   
print(nx_filename)

nx_full_vocab_filename = f"networkx_graphs/layers/visual_layer_full_vocab_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(visual_layer_full_vocab_nx, open(nx_full_vocab_filename, "wb" ) )   
print(nx_full_vocab_filename)


#### NetworKit
nk_filename = f"networkit_graphs/layers/visual_layer_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(visual_layer_nk, open(nk_filename, "wb" ) )   
print(nk_filename)

nk_full_vocab_filename = f"networkit_graphs/layers/visual_layer_full_vocab_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(visual_layer_full_vocab_nk, open(nk_full_vocab_filename, "wb" ) )   
print(nk_full_vocab_filename)

networkx_graphs/layers/visual_layer_90_cos_thresh_nx_12-14_05:42.pickle
networkx_graphs/layers/visual_layer_full_vocab_90_cos_thresh_nx_12-14_05:42.pickle
networkit_graphs/layers/visual_layer_90_cos_thresh_nk_12-14_05:42.pickle
networkit_graphs/layers/visual_layer_full_vocab_90_cos_thresh_nk_12-14_05:42.pickle


#### Layer Analysis

##### 1) Mean degree of connectivity _k_

In [71]:
mean_degree_connectivity_nx(visual_layer_nx)

237.73099716567896

In [72]:
mean_degree_connectivity_nk(visual_layer_nk)

237.73099716567896

##### 2) Mean Clustering Coefficient _CC_

In [73]:
nx.average_clustering(visual_layer_nx)

0.765524311803005

##### 3) Assortativity Coefficient _a_

In [74]:
nx.degree_assortativity_coefficient(visual_layer_nx)

-0.3372534316681479

##### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [75]:
perc_nodes_in_lcc_nx(visual_layer_nx)

0.9963926822983767

In [76]:
perc_nodes_in_lcc_nk(visual_layer_nk)

0.9963926822983767

##### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [77]:
mean_shortest_path_lcc_nx(visual_layer_nx)

2.3737910725626032

#### Two layers have been added to multiplex ✅

In [195]:
print(f"# Edges -> NetworkX: {multiplex_nx.number_of_edges()} | NetworKit: {multiplex_nk.numberOfEdges()}")
print(f"# Nodes -> NetworkX: {multiplex_nx.number_of_nodes()} | NetworKit: {multiplex_nk.numberOfNodes()}")

# Edges -> NetworkX: 473002 | NetworKit: 473002
# Nodes -> NetworkX: 16706 | NetworKit: 16706


##### As expected, now that we have edges with a weight > 1, the network returns True for weighted. 

In [196]:
nx.is_weighted(multiplex_nx) # Will return true once we have added edges which result in at least one edge weight greater than 1

True

### Lancaster Layer

In [197]:
lancaster_words = list(set([a for tup in layers['lancaster_list'] for a in tup]))
lancaster_word_to_index = {lancaster_words[i]:i for i in range(len(lancaster_words))} # Dict of word:index
lancaster_index_to_word = {i:lancaster_words[i] for i in range(len(lancaster_words))} # Dict of index:word
pickle.dump(lancaster_index_to_word, open( "generated_layer_data/lancaster_index_to_word_BERT_intersect.pickle", "wb" ) ) 


## NetworkX
lancaster_layer_nx = nx.Graph()
lancaster_layer_nx.add_nodes_from(lancaster_words)

lancaster_layer_full_vocab_nx = nx.Graph()
lancaster_layer_full_vocab_nx.add_nodes_from(words)



## NetworKit
lancaster_layer_nk = nk.Graph(len(lancaster_words))
lancaster_layer_full_vocab_nk = nk.Graph(len(words))

In [198]:
# Add the free association layer to the multiplex
for pair in layers['lancaster_list']:
    ## NetworkX
    if multiplex_nx.has_edge(pair[0], pair[1]):
        multiplex_nx[pair[0]][pair[1]]['weight'] += 1
    else:
        multiplex_nx.add_edge(pair[0], pair[1], weight=1)    
    
    lancaster_layer_nx.add_edge(pair[0], pair[1])
    lancaster_layer_full_vocab_nx.add_edge(pair[0], pair[1])
    
    
    ## NetworKit
    if multiplex_nk.hasEdge(word_to_index[pair[0]], word_to_index[pair[1]]):
        multiplex_nk.increaseWeight(word_to_index[pair[0]], word_to_index[pair[1]], w=1)

    else:
        multiplex_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]], w=1)
    
    lancaster_layer_nk.addEdge(lancaster_word_to_index[pair[0]], lancaster_word_to_index[pair[1]])
    lancaster_layer_full_vocab_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]])


In [199]:
nk.overview(multiplex_nk)

Network Properties:
nodes, edges			16706, 1846353
directed?			False
weighted?			True
isolated nodes			1825
self-loops			0
density				0.013232
clustering coefficient		0.428246
min/max/avg degree		0, 2638, 221.040704
degree assortativity		0.185446
number of connected components	1837
size of largest component	14856 (88.93 %)


#### Remove self loops -- I don't expect them to contribute any value to this task. 

In [200]:
multiplex_nx.remove_edges_from(nx.selfloop_edges(multiplex_nx))
multiplex_nk.removeSelfLoops()

lancaster_layer_nx.remove_edges_from(nx.selfloop_edges(lancaster_layer_nx))
lancaster_layer_nk.removeSelfLoops()

lancaster_layer_full_vocab_nx.remove_edges_from(nx.selfloop_edges(lancaster_layer_full_vocab_nx))
lancaster_layer_full_vocab_nk.removeSelfLoops()

#### Save Layers

In [201]:
print(f"Layer-Only Vocab: # Edges -> NetworkX: {lancaster_layer_nx.number_of_edges()} | NetworKit: {lancaster_layer_nk.numberOfEdges()}")
print(f"Full Vocab:       # Edges -> NetworkX: {lancaster_layer_full_vocab_nx.number_of_edges()} | NetworKit: {lancaster_layer_full_vocab_nk.numberOfEdges()}")
print("\n")
print(f"Layer-Only Vocab: # Nodes -> NetworkX: {lancaster_layer_nx.number_of_nodes()}  | NetworKit: {lancaster_layer_nk.numberOfNodes()}")
print(f"Full Vocab:       # Nodes -> NetworkX: {lancaster_layer_full_vocab_nx.number_of_nodes()} | NetworKit: {lancaster_layer_full_vocab_nk.numberOfNodes()}")

Layer-Only Vocab: # Edges -> NetworkX: 1375293 | NetworKit: 1375293
Full Vocab:       # Edges -> NetworkX: 1375293 | NetworKit: 1375293


Layer-Only Vocab: # Nodes -> NetworkX: 11273  | NetworKit: 11273
Full Vocab:       # Nodes -> NetworkX: 16706 | NetworKit: 16706


In [203]:
#### NetworkX
nx_filename = f"networkx_graphs/layers/lancaster_layer_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(lancaster_layer_nx, open(nx_filename, "wb" ) )   
print(nx_filename)

nx_full_vocab_filename = f"networkx_graphs/layers/lancaster_layer_full_vocab_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(lancaster_layer_full_vocab_nx, open(nx_full_vocab_filename, "wb" ) )   
print(nx_full_vocab_filename)


#### NetworKit
nk_filename = f"networkit_graphs/layers/lancaster_layer_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(lancaster_layer_nk, open(nk_filename, "wb" ) )   
print(nk_filename)

nk_full_vocab_filename = f"networkit_graphs/layers/lancaster_layer_full_vocab_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(lancaster_layer_full_vocab_nk, open(nk_full_vocab_filename, "wb" ) )   
print(nk_full_vocab_filename)

networkx_graphs/layers/lancaster_layer_90_cos_thresh_nx_12-14_05:42.pickle
networkx_graphs/layers/lancaster_layer_full_vocab_90_cos_thresh_nx_12-14_05:42.pickle
networkit_graphs/layers/lancaster_layer_90_cos_thresh_nk_12-14_05:42.pickle
networkit_graphs/layers/lancaster_layer_full_vocab_90_cos_thresh_nk_12-14_05:42.pickle


#### Layer Analysis

##### 1) Mean degree of connectivity _k_

In [85]:
mean_degree_connectivity_nx(lancaster_layer_nx)

243.997693604187

In [86]:
mean_degree_connectivity_nk(lancaster_layer_nk)

243.997693604187

##### 2) Mean Clustering Coefficient _CC_

In [87]:
nx.average_clustering(lancaster_layer_nx)

0.4431936950059712

##### 3) Assortativity Coefficient _a_

In [88]:
nx.degree_assortativity_coefficient(lancaster_layer_nx)

0.38844316153996505

##### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [89]:
perc_nodes_in_lcc_nx(lancaster_layer_nx)

0.9968952364055709

In [90]:
perc_nodes_in_lcc_nk(lancaster_layer_nk)

0.9968952364055709

##### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [91]:
mean_shortest_path_lcc_nx(lancaster_layer_nx)

3.1951176723515418

#### Three layers have been added to multiplex ✅

In [204]:
print(f"# Edges -> NetworkX: {multiplex_nx.number_of_edges()} | NetworKit: {multiplex_nk.numberOfEdges()}")
print(f"# Nodes -> NetworkX: {multiplex_nx.number_of_nodes()} | NetworKit: {multiplex_nk.numberOfNodes()}")

# Edges -> NetworkX: 1846353 | NetworKit: 1846353
# Nodes -> NetworkX: 16706 | NetworKit: 16706


### BERT Word Embedding Layer

In [205]:
bert_words = list(set([a for tup in layers['bert_word_emb_list'] for a in tup]))
bert_word_to_index = {bert_words[i]:i for i in range(len(bert_words))} # Dict of word:index
bert_index_to_word = {i:bert_words[i] for i in range(len(bert_words))} # Dict of index:word
pickle.dump(bert_index_to_word, open( "generated_layer_data/bert_index_to_word_BERT_intersect.pickle", "wb" ) ) 


## NetworkX
bert_layer_nx = nx.Graph()
bert_layer_nx.add_nodes_from(bert_words)

bert_layer_full_vocab_nx = nx.Graph()
bert_layer_full_vocab_nx.add_nodes_from(words)

## NetworKit
bert_layer_nk = nk.Graph(len(bert_words))
bert_layer_full_vocab_nk = nk.Graph(len(words))

In [206]:
# Add the free association layer to the multiplex
for pair in layers['bert_word_emb_list']:
    ## NetworkX
    if multiplex_nx.has_edge(pair[0], pair[1]):
        multiplex_nx[pair[0]][pair[1]]['weight'] += 1
    else:
        multiplex_nx.add_edge(pair[0], pair[1], weight=1)    
    
    bert_layer_nx.add_edge(pair[0], pair[1])
    bert_layer_full_vocab_nx.add_edge(pair[0], pair[1])

    
    ## NetworKit
    if multiplex_nk.hasEdge(word_to_index[pair[0]], word_to_index[pair[1]]):
        multiplex_nk.increaseWeight(word_to_index[pair[0]], word_to_index[pair[1]], w=1)

    else:
        multiplex_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]], w=1)
    
    bert_layer_nk.addEdge(bert_word_to_index[pair[0]], bert_word_to_index[pair[1]])
    bert_layer_full_vocab_nk.addEdge(word_to_index[pair[0]], word_to_index[pair[1]])


In [207]:
nk.overview(multiplex_nk)

Network Properties:
nodes, edges			16706, 2020659
directed?			False
weighted?			True
isolated nodes			0
self-loops			0
density				0.014481
clustering coefficient		0.457812
min/max/avg degree		1, 2638, 241.908177
degree assortativity		0.171620
number of connected components	110
size of largest component	16441 (98.41 %)


#### Remove self loops -- I don't expect them to contribute any value to this task. 

In [208]:
multiplex_nx.remove_edges_from(nx.selfloop_edges(multiplex_nx))
multiplex_nk.removeSelfLoops()

bert_layer_nx.remove_edges_from(nx.selfloop_edges(bert_layer_nx))
bert_layer_nk.removeSelfLoops()

bert_layer_full_vocab_nx.remove_edges_from(nx.selfloop_edges(bert_layer_full_vocab_nx))
bert_layer_full_vocab_nk.removeSelfLoops()

#### Save Layers

In [209]:
print(f"Layer-Only Vocab: # Edges -> NetworkX: {bert_layer_nx.number_of_edges()} | NetworKit: {bert_layer_nk.numberOfEdges()}")
print(f"Full Vocab:       # Edges -> NetworkX: {bert_layer_full_vocab_nx.number_of_edges()} | NetworKit: {bert_layer_full_vocab_nk.numberOfEdges()}")
print("\n")
print(f"Layer-Only Vocab: # Nodes -> NetworkX: {bert_layer_nx.number_of_nodes()}  | NetworKit: {bert_layer_nk.numberOfNodes()}")
print(f"Full Vocab:       # Nodes -> NetworkX: {bert_layer_full_vocab_nx.number_of_nodes()} | NetworKit: {bert_layer_full_vocab_nk.numberOfNodes()}")

Layer-Only Vocab: # Edges -> NetworkX: 177013 | NetworKit: 177013
Full Vocab:       # Edges -> NetworkX: 177013 | NetworKit: 177013


Layer-Only Vocab: # Nodes -> NetworkX: 5639  | NetworKit: 5639
Full Vocab:       # Nodes -> NetworkX: 16706 | NetworKit: 16706


In [210]:
#### NetowrkX
nx_filename = f"networkx_graphs/layers/bert_layer_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(bert_layer_nx, open(nx_filename, "wb" ) )   
print(nx_filename)

nx_full_vocab_filename = f"networkx_graphs/layers/bert_layer_full_vocab_{threshold}_cos_thresh_nx_{file_date_info}.pickle"
pickle.dump(bert_layer_full_vocab_nx, open(nx_full_vocab_filename, "wb" ) )   
print(nx_full_vocab_filename)

#### NetworKit
nk_filename = f"networkit_graphs/layers/bert_layer_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(bert_layer_nk, open(nk_filename, "wb" ) )   
print(nk_filename)

nk_full_vocab_filename = f"networkit_graphs/layers/bert_layer_full_vocab_{threshold}_cos_thresh_nk_{file_date_info}.pickle"
pickle.dump(bert_layer_full_vocab_nk, open(nk_full_vocab_filename, "wb" ) )   
print(nk_full_vocab_filename)

networkx_graphs/layers/bert_layer_90_cos_thresh_nx_12-14_05:42.pickle
networkx_graphs/layers/bert_layer_full_vocab_90_cos_thresh_nx_12-14_05:42.pickle
networkit_graphs/layers/bert_layer_90_cos_thresh_nk_12-14_05:42.pickle
networkit_graphs/layers/bert_layer_full_vocab_90_cos_thresh_nk_12-14_05:42.pickle


#### Layer Analysis

##### 1) Mean degree of connectivity _k_

In [98]:
mean_degree_connectivity_nx(bert_layer_nx)

62.78169888278064

In [99]:
mean_degree_connectivity_nk(bert_layer_nk)

62.78169888278064

##### 2) Mean Clustering Coefficient _CC_

In [100]:
nx.average_clustering(bert_layer_nx)

0.36108860249068137

##### 3) Assortativity Coefficient _a_

In [101]:
nx.degree_assortativity_coefficient(bert_layer_nx)

0.053698120711544256

##### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [102]:
perc_nodes_in_lcc_nx(bert_layer_nx)

0.778152154637347

In [103]:
perc_nodes_in_lcc_nk(bert_layer_nk)

0.778152154637347

##### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [104]:
mean_shortest_path_lcc_nx(bert_layer_nx)

3.5770940245886838

#### Four layers have been added to multiplex ✅

In [211]:
print(f"# Edges -> NetworkX: {multiplex_nx.number_of_edges()} | NetworKit: {multiplex_nk.numberOfEdges()}")
print(f"# Nodes -> NetworkX: {multiplex_nx.number_of_nodes()} | NetworKit: {multiplex_nk.numberOfNodes()}")

# Edges -> NetworkX: 2020659 | NetworKit: 2020659
# Nodes -> NetworkX: 16706 | NetworKit: 16706


### Save NetworkX graph

In [212]:
filename = f"networkx_graphs/full_adult_intersect_bert_{threshold}_cos_thresh_nx"

if nx.is_weighted(multiplex_nx):
    filename += '_weighted'

filename += f'_{file_date_info}.pickle'

pickle.dump(multiplex_nx, open(filename, "wb" ) )   
print(filename)

networkx_graphs/full_adult_intersect_bert_90_cos_thresh_nx_weighted_12-14_05:42.pickle


In [213]:
list(dict(sorted(nx.get_edge_attributes(multiplex_nx, 'weight').items(), key=lambda item: item[1], reverse=True)).items())[:2]

[(('hurried', 'rushed'), 3), (('dog', 'puppy'), 3)]

### Save NetworKit graph

In [214]:
filename = f"networkit_graphs/full_adult_intersect_bert_{threshold}_cos_thresh_nk"


if multiplex_nk.isWeighted():
    filename += '_weighted'

filename += f'_{file_date_info}.pickle'

pickle.dump(multiplex_nk, open(filename, "wb" ) )   
print(filename)

networkit_graphs/full_adult_intersect_bert_90_cos_thresh_nk_weighted_12-14_05:42.pickle
