In [16]:
#imports
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings
from flair.data import Sentence
import networkx as nx
from pytorch_pretrained_bert import BertTokenizer
import torch
from networkx.algorithms import approximation
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from tqdm.notebook import tqdm
import pickle
from nltk.stem import WordNetLemmatizer
import collections

In [47]:
# Cosine-similarity cutoff above which two BERT word embeddings get an edge.
word_thresh = 0.98
# Cosine-similarity cutoff above which two vectors from the clip.bertvocab file get an edge.
visual_thresh = 0.9
# Cosine-similarity cutoff above which two Lancaster norm vectors get an edge.
lancaster_thresh = 0.98
# A free-association row is kept only when its fifth field is greater than this value.
free_asoc_threshold =  0.1

# Recreate the Multiplex Network

In [18]:
def mean_degree_connectivity(graph_name):
    """Return the mean number of incident edges per node of ``graph_name``.

    Parameters
    ----------
    graph_name : graph object
        Must provide ``nodes()``, ``edges(node)`` and ``number_of_nodes()``.

    Returns
    -------
    float
        Sum over all nodes of ``len(graph_name.edges(node))`` divided by the
        node count, or ``0.0`` when the graph has no nodes.
    """
    node_count = graph_name.number_of_nodes()
    if node_count == 0:
        # Nothing to average; avoids a ZeroDivisionError on an empty graph.
        return 0.0

    incident_edges = sum(len(graph_name.edges(node)) for node in graph_name.nodes())
    return incident_edges / node_count

In [19]:
def perc_nodes_in_lcc(graph_name):
    """Return the fraction of nodes that lie in the largest connected component.

    Parameters
    ----------
    graph_name : networkx graph
        Graph accepted by ``nx.connected_components``.

    Returns
    -------
    float
        Size of the largest connected component divided by the node count,
        or ``0.0`` when the graph has no nodes.
    """
    node_count = graph_name.number_of_nodes()
    if node_count == 0:
        # max() over zero components would raise ValueError.
        return 0.0

    largest_cc = max(nx.connected_components(graph_name), key=len)
    return len(largest_cc) / node_count
   

In [20]:
def mean_shortest_path_lcc(graph_name):
    """Return the average shortest path length inside the largest connected component.

    Parameters
    ----------
    graph_name : networkx graph
        Graph accepted by ``nx.connected_components``.

    Returns
    -------
    float
        ``nx.average_shortest_path_length`` of the subgraph induced by the
        largest connected component (the first one found when sizes tie).

    Raises
    ------
    ValueError
        If the graph has no nodes, because there is no component to select.
    """
    # Select the largest component directly instead of copying every component
    # and re-running the component search on each copy.
    largest_cc = max(nx.connected_components(graph_name), key=len)
    return nx.average_shortest_path_length(graph_name.subgraph(largest_cc))


## Create the free association layer

In [7]:
# Read the tab-separated free-association file; the context manager closes
# the handle as soon as all rows have been read.
with open('../data/freeassoc_full.csv') as faw_file:
    faw = [line.strip().split('\t') for line in faw_file]

# Unique values found in the first field of the rows.
words = list(set(line[0] for line in faw))
# faw = [line for line in faw if len(line) == 5]

In [8]:
# Keep the first two fields (tab-joined) of every row whose fifth field,
# read as a float, is greater than free_asoc_threshold.
free_association_words = []
for fields in faw:
    if float(fields[4]) > free_asoc_threshold:
        free_association_words.append('\t'.join(fields[:2]))

free_association_words[:5]

['a\tone', 'a\tthe', 'aardvark\tanimal', 'aardvark\tanteater', 'abacus\tmath']

## Clean up the data and store it into lists

In [9]:
# Split every tab-joined entry back into a tuple of strings.
free_association_list = [
    tuple(pair.split('\t')) for pair in free_association_words
]

## Free associations Layer

In [10]:
# Multiplex graph, seeded with the unique first-field words of the free-association file.
multiplex = nx.Graph()
multiplex.add_nodes_from(words)

# Separate graph for the free-association layer, seeded with the same words.
free_assoc = nx.Graph()
free_assoc.add_nodes_from(words)
# Node count before any edge has been added.
len(free_assoc.nodes)

12217

In [11]:
# Add the free association layer to the multiplex
multiplex.add_edges_from(free_association_list)

# Add the free association layer to its graph
free_assoc.add_edges_from(free_association_list)

In [12]:
len(free_assoc.nodes)

12600

### 1) Mean degree of connectivity _k_

In [13]:
mean_degree_connectivity(free_assoc)

2.9576984126984125

### 2) Mean Clustering Coefficient _CC_

In [14]:
nx.average_clustering(free_assoc)

0.12690427571331633

### 3) Assortativity Coefficient _a_

In [15]:
nx.degree_assortativity_coefficient(free_assoc)

-0.15218634277569373

### 4) Percentage of nodes in the Largest Connected Component _Conn._

In [16]:
perc_nodes_in_lcc(free_assoc)

0.92

### 5) Mean Shortest Path length in the Largest Connected Component _d_

In [17]:
# mean_shortest_path_lcc(free_assoc)

### We have added one layer to multiplex

In [18]:
multiplex.number_of_edges()

18636

In [19]:
# with open(r"free_assoc_full.pickle", "rb") as input_file:
#     free_association_list = pickle.load(input_file)
    
# free_association_list = {a:b for a,b in free_association_list}

In [20]:
# Persist the free-association edge list; the context manager closes the file
# instead of leaving an anonymous handle open.
with open("free_assoc_full.pickle", "wb") as pickle_file:
    pickle.dump(free_association_list, pickle_file)

# Create the BERT Word Embedding Network

In [21]:
# glove_embedding = WordEmbeddings('glove')
# Transformer word embedding for 'bert-base-uncased', configured with layers='-1'.
embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1', layer_mean=True)
# Tokenizer for the same checkpoint; its vocabulary supplies the word list used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# embd_vocab  = list(glove_embedding.precomputed_word_embeddings.key_to_index.keys())

# len(embd_vocab)

In [34]:
bert_vocab = list(tokenizer.vocab.keys())

len(bert_vocab)

30522

In [35]:
# Maps each vocabulary entry to an embedding tensor.
word_emb = {}
for word in tqdm(bert_vocab):
    # Wrap the vocabulary entry in a flair Sentence so it can be embedded.
    sentence = Sentence(word)
    # Embed the sentence with the transformer embedding (`embedding`).
    embedding.embed(sentence)
    for token in sentence:
        # Every token overwrites the previous value, so the last token's
        # embedding is what ends up stored for this word.
        # NOTE(review): confirm this is intended for entries that yield several tokens.
        word_emb[word]=token.embedding
        

  0%|          | 0/30522 [00:00<?, ?it/s]

In [36]:
len(word_emb['red'])

768

In [37]:
len(word_emb)

30522

In [38]:
# Persist the word -> embedding dictionary; the context manager closes the file.
with open("word_emb_layer_full.pickle", "wb") as pickle_file:
    pickle.dump(word_emb, pickle_file)

In [39]:
word_emb_layer = nx.Graph()
word_emb_layer.add_nodes_from(bert_vocab)

### Compute distance using cosine similarity


In [49]:
# Build the BERT layer: connect every pair of words whose embeddings have a
# cosine similarity above word_thresh.
word_emb_list = []
emb_vocab = list(word_emb.keys())

if emb_vocab:
    # Stack all embeddings once and scale every row to unit length, so the dot
    # product of two rows equals their cosine similarity. This replaces one
    # torch call per word pair with one matrix-vector product per word.
    emb_matrix = torch.nn.functional.normalize(
        torch.stack([word_emb[word] for word in emb_vocab]), dim=1
    )

    # Exact number of unordered pairs, used as the progress-bar total.
    with tqdm(total=len(emb_vocab) * (len(emb_vocab) - 1) // 2) as pbar:
        # Walk from the last word to the first and compare each word only with
        # the words that precede it, so each pair is evaluated exactly once and
        # edges are recorded in the same order as a pop()-based pairwise loop.
        for item_idx in range(len(emb_vocab) - 1, 0, -1):
            item = emb_vocab[item_idx]
            sims = emb_matrix[:item_idx] @ emb_matrix[item_idx]

            for word_idx in torch.nonzero(sims > word_thresh).flatten().tolist():
                word = emb_vocab[word_idx]
                word_emb_list.append((item, word))
                word_emb_layer.add_edge(item, word)

            pbar.update(item_idx)
            

  0%|          | 0/465796242.0 [00:00<?, ?it/s]

In [50]:
# Persist the BERT-layer edge list; the context manager closes the file.
with open("word_emb_list_98.pickle", "wb") as pickle_file:
    pickle.dump(word_emb_list, pickle_file)

In [51]:
len(word_emb_list)

147726

### 1) Mean Degree of Connectivity _k_

In [30]:
mean_degree_connectivity(word_emb_layer)

0.0

### 2) Mean Clustering Coefficient _CC_

In [31]:
nx.average_clustering(word_emb_layer)

0.0

### 3) Assortativity Coefficient _a_

In [32]:
nx.degree_assortativity_coefficient(word_emb_layer)

ValueError: max() arg is an empty sequence

### 4) Percentage of nodes in the LCC _Conn._

In [None]:
perc_nodes_in_lcc(word_emb_layer)

### 5) Mean shortest path in LCC _d_

In [None]:
mean_shortest_path_lcc(word_emb_layer)

In [None]:
len(word_emb_layer.nodes)

# Create the Visual Embedding Network

In [33]:
# Read every line of the visual embedding file; the context manager closes
# the handle once the lines are in memory.
with open('../data/clip.bertvocab.embeddings.513.txt', 'r') as file:
    lines = file.readlines()

In [34]:
# First whitespace-separated field of each line is the key; the remaining
# fields (still strings) form the vector.
visual_vecs = {}
for line in lines:
    fields = line.split()
    visual_vecs[fields[0]] = fields[1:]

visual_words = list(visual_vecs)

len(visual_words)

30522

In [35]:
len(visual_vecs['car'])

513

In [64]:
# Store the visual vectors in their own file. Writing them to
# "word_emb_layer_full.pickle" would overwrite the BERT embedding dictionary
# that is saved under that name earlier in the notebook.
with open("visual_vecs_full.pickle", "wb") as pickle_file:
    pickle.dump(visual_vecs, pickle_file)

In [37]:
visual_graph = nx.Graph()
visual_graph.add_nodes_from(visual_words)
len(visual_graph.nodes)

30522

In [38]:
# Build the visual layer: connect every pair of words whose vectors have a
# cosine similarity above visual_thresh.
visual_list = []

visual_words_chop = visual_words.copy()

if visual_words_chop:
    # Parse the string components to floats once (instead of once per pair)
    # and scale each row to unit length, so a dot product of two rows is
    # their cosine similarity.
    visual_matrix = np.array(
        [visual_vecs[word] for word in visual_words_chop], dtype=float
    )
    visual_row_norms = np.linalg.norm(visual_matrix, axis=1, keepdims=True)
    # All-zero rows keep similarity 0 instead of producing a division by zero.
    visual_row_norms[visual_row_norms == 0] = 1.0
    visual_matrix /= visual_row_norms

    # Exact number of unordered pairs, used as the progress-bar total.
    with tqdm(total=len(visual_words_chop) * (len(visual_words_chop) - 1) // 2) as pbar:
        # Compare each word only with the words that precede it, walking from
        # the end of the list, so each pair is evaluated exactly once.
        for item_idx in range(len(visual_words_chop) - 1, 0, -1):
            item = visual_words_chop[item_idx]
            sims = visual_matrix[:item_idx] @ visual_matrix[item_idx]

            for word_idx in np.flatnonzero(sims > visual_thresh):
                word = visual_words_chop[word_idx]
                visual_list.append((item, word))
                visual_graph.add_edge(item, word)

            pbar.update(item_idx)


  0%|          | 0/465796242.0 [00:00<?, ?it/s]

In [39]:
len(visual_graph.nodes)

30522

### 1) Mean Degree of Connectivity _k_

In [None]:
mean_degree_connectivity(visual_graph)

### 2) Mean Clustering Coefficient _CC_

In [None]:
nx.average_clustering(visual_graph)

### 3) Assortativity Coefficient _a_

In [None]:
nx.degree_assortativity_coefficient(visual_graph)

### 4) Percentage of nodes in the LCC _Conn._

In [None]:
perc_nodes_in_lcc(visual_graph)

### 5) Mean shortest length in the LCC _d_

In [None]:
mean_shortest_path_lcc(visual_graph)

In [62]:
# Persist the visual-layer edge list; the context manager closes the file.
with open("visual_layer_vectors_full.pickle", "wb") as pickle_file:
    pickle.dump(visual_list, pickle_file)

In [None]:
len(visual_graph.nodes)

# Create the Lancaster Embedding Network

In [40]:
norms = pd.read_csv('../data/lancaster_full.csv')
cols = norms.describe().columns

In [41]:
# Scale every column listed in `cols` by that column's maximum value.
norms[cols] = norms[cols].div(norms[cols].max(), axis='columns')

In [42]:
# Maps each lower-cased Word to that row's values in the `cols` columns.
vecs = {}

for i,row in norms.iterrows():
    # NOTE(review): rows whose Word lower-cases to the same string overwrite each other.
    vecs[row.Word.lower()] =  row[cols].values

In [43]:
len(vecs['red'])

39

In [44]:
lancaster_words = list(vecs.keys())

lancaster_words[:10]

['a',
 'a cappella',
 'aardvark',
 'aback',
 'abacus',
 'abandon',
 'abandoned',
 'abandonee',
 'abandoner',
 'abandonment']

In [45]:
# Persist the Lancaster vectors; the context manager closes the file.
with open("lancaster_layer_vectors_full.pickle", "wb") as pickle_file:
    pickle.dump(vecs, pickle_file)

In [46]:
lancaster = nx.Graph()
lancaster.add_nodes_from(lancaster_words)
len(lancaster.nodes)

39707

In [47]:
# Build the Lancaster layer: connect every pair of words whose norm vectors
# have a cosine similarity above lancaster_thresh.
lancaster_list = []

lan_words = list(vecs.keys())

if lan_words:
    # Stack the vectors as floats and scale each row to unit length, so a dot
    # product of two rows is their cosine similarity. Row norms are computed
    # once here rather than once per pair.
    lan_matrix = np.array([vecs[word] for word in lan_words], dtype=float)
    lan_row_norms = np.linalg.norm(lan_matrix, axis=1, keepdims=True)
    # All-zero rows keep similarity 0 instead of producing a division by zero.
    lan_row_norms[lan_row_norms == 0] = 1.0
    lan_matrix /= lan_row_norms

    # Exact number of unordered pairs, used as the progress-bar total.
    with tqdm(total=len(lan_words) * (len(lan_words) - 1) // 2) as pbar:
        # Each word is compared only with the words that precede it, so it is
        # never paired with itself and no identity check on strings is needed.
        for item_idx in range(len(lan_words) - 1, 0, -1):
            item = lan_words[item_idx]
            sims = lan_matrix[:item_idx] @ lan_matrix[item_idx]

            for word_idx in np.flatnonzero(sims > lancaster_thresh):
                word = lan_words[word_idx]
                lancaster_list.append((item, word))
                lancaster.add_edge(item, word)

            pbar.update(item_idx)


  0%|          | 0/788322924.5 [00:00<?, ?it/s]

In [48]:
print(len(lancaster.nodes))

39707


In [49]:
# Persist the Lancaster-layer edge list; the context manager closes the file.
with open("lancaster_full.pickle", "wb") as pickle_file:
    pickle.dump(lancaster_list, pickle_file)

### 1) Mean Degree of Connectivity _k_

In [None]:
mean_degree_connectivity(lancaster)

### 2) Mean Clustering Coefficient _CC_

In [None]:
nx.average_clustering(lancaster)

### 3) Assortativity Coefficient _a_

In [None]:
nx.degree_assortativity_coefficient(lancaster)

### 4) Percentage of nodes in the LCC _Conn._

In [None]:
perc_nodes_in_lcc(lancaster)

### 5) Mean shortest path length in LCC _d_

In [None]:
mean_shortest_path_lcc(lancaster)

In [None]:
len(lancaster.nodes)

# Now, we will add the above created layers to the multiplex

In [None]:
# Reload the edge list of every layer from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
with open(r"free_assoc_full.pickle", "rb") as input_file:
    free_assoc = pickle.load(input_file)

# The BERT edge list is saved as word_emb_list_98.pickle.
# word_emb_layer_full.pickle holds the word -> embedding dictionary, not pairs.
with open(r"word_emb_list_98.pickle", "rb") as input_file:
    word_emb_list = pickle.load(input_file)

# The visual edge list is saved as visual_layer_vectors_full.pickle; no cell
# in this notebook writes a file named visual_graph_full.pickle.
with open(r"visual_layer_vectors_full.pickle", "rb") as input_file:
    visual_list = pickle.load(input_file)

with open(r"lancaster_full.pickle", "rb") as input_file:
    lancaster_list = pickle.load(input_file)

In [None]:
print(len(free_assoc))
print(len(word_emb_list))
print(len(visual_list))
print(len(lancaster_list))

### Full list

In [None]:
layers = {'free_assoc':free_assoc,
          'word_emb_list':word_emb_list,
          'visual_list':visual_list, 
          'lancaster_list':lancaster_list}

len(layers)

In [None]:
from itertools import chain, combinations

def powerset(iterable):
    """Return every combination of the items, ordered by size.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)
    subsets = []
    # Sizes run from 0 (the empty tuple) up to the full length.
    for size in range(len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

# Every combination of layer names, from the empty tuple up to all layers.
all_subsets = powerset(layers.keys())
all_subsets.pop(0) # the first subset is the empty set
print(len(all_subsets))
# NOTE(review): with the four entries in `layers` there are 15 non-empty
# subsets, so [14:] shows only the last one (all four layers) — confirm intended.
all_subsets[14:]

In [None]:
# For each selected combination of layers, merge their edge lists into one
# graph and print its summary statistics.
for sub_layers in all_subsets[14:]:
    print(sub_layers)

    # Unique endpoints over all pairs of the selected layers.
    words = list({
        node
        for k in sub_layers
        for a, b in layers[k]
        for node in (a, b)
    })
    print(len(words))

    new_3 = nx.Graph()
    # new_3.add_nodes_from(words)

    for k in sub_layers:
        new_3.add_edges_from(layers[k])

    print(len(new_3.nodes))
    print('k', mean_degree_connectivity(new_3))
    print('CC', nx.average_clustering(new_3))
    print('a', nx.degree_assortativity_coefficient(new_3))
    print('conn', perc_nodes_in_lcc(new_3))
    print('d', mean_shortest_path_lcc(new_3))

    print()

('free_assoc',)
13744
k 5.968277066356229
CC 0.18140117405816272
a -0.1378838837754419
conn 0.9985448195576252
d 4.9834091852721665

('word_emb_list',)
25558
k 1119.9501525940998
CC 0.6476716252456551
a 0.1769889275645062
conn 0.8268643868847327
d 3.069093343628048

('visual_list',)
11811
k 2464.240284480569
CC 0.8647897461330403
a -0.4776667293884356
conn 0.9993226653119973
d 1.868065777870662

('lancaster_list',)
39662
k 2542.666784327568
CC 0.5281151568371529