In [None]:
import numpy as np
import scipy.sparse as sparse
import collections
import math
import matplotlib.pyplot as plt


import os
import pandas as pd

import Embeddings_Auxiliary_Functions as emb

In [None]:
#Note: the user can either load their own bipartite matrix, use the sample one, or generate a random matrix.
#Alternatively, the user can skip this part and load a nested list directly.

In [None]:
#USE RANDOM TEST DATA OR LOAD USER DATA
###################################################################################################################
#Option 1 (disabled): generate a random binary matrix that can be used as the adjacency matrix of the bipartite network.
#If the two lines below are uncommented, also comment out the sample-matrix load further down,
#because that assignment overwrites example_data_matrix.
#example_data_matrix = sparse.random(1000, 100, density=0.1, data_rvs=np.ones)
#example_data_matrix = example_data_matrix.toarray()

#Option 2 (default): use the fixed sample bipartite matrix.
#USE SAMPLE BIPARTITE MATRIX

example_data_matrix = emb.load_sample_matrix()

#Extract the nested list that the algorithm uses as input.
#The later cells read it under the name example_data_nested_list.
example_data_nested_list = emb.extract_input_data(example_data_matrix)

In [None]:
#CREATION OF THE EMBEDDINGS WITH THE SAMPLE DATA

#Tunable settings of this cell.
#vocabulary_size is the number of words to embed: the default of 500 depends on the size of the
#example matrix, so adjust it when running the code on different data.
#n_run is the number of independent realizations of the embedding algorithm.
vocabulary_size = 500
n_run = 60

#Turn the nested list into the database used to train Word2Vec
data, reverse_dictionary, accumulated = emb.data_preprocessing(
    example_data_nested_list,
    vocabulary_size,
)

#Train one embedding per realization and collect them in order
ListOfEmbeddings = []
for cont_run in range(n_run):
    print("realization number: ", cont_run, sep="")
    final_embeddings, codestoexp = emb.create_the_embeddings(
        data,
        reverse_dictionary,
        accumulated,
        vocabulary_size,
        num_steps=20000,
    )
    ListOfEmbeddings += [final_embeddings]

#The word list is taken from the last realization
List_of_Words = codestoexp

#NOTICE: LAUNCHING THIS FUNCTION WILL OVERWRITE THE EXISTING EMBEDDINGS UNLESS A DIFFERENT NAME TO SAVE THEM IS PROVIDED
#emb.save_embeddings(ListOfEmbeddings,List_of_Words)

In [None]:
#Load the embedding tensor, shape: [n_run, vocabulary_size, embedding_size]
EmbeddingTensor, ListOfWords = emb.load_embeddings()

#Scalar products between every pair of word vectors, computed separately for each run
scalar_product_matrix = np.stack([np.dot(run_embedding, np.transpose(run_embedding)) for run_embedding in EmbeddingTensor])

#Context similarity: the scalar products averaged over all the runs
context_similarity_matrix = np.mean(scalar_product_matrix, axis=0)

#Index pairs (i, j) with i < j whose averaged similarity is non-zero, one pair per row
cs_indexes = np.transpose(np.stack(np.nonzero(np.triu(context_similarity_matrix, k=1))))

#One row [i, j, similarity] per pair. The values must be read from context_similarity_matrix,
#i.e. the same matrix the indexes were extracted from (and the only one defined at this point of the notebook).
row_index, column_index = cs_indexes[:, 0], cs_indexes[:, 1]
context_similarity = np.column_stack([row_index, column_index, context_similarity_matrix[row_index, column_index]])

#Saving one instance of context similarity; specify a different name if you don't want to overwrite the existing file
emb.save_context_similarity(context_similarity)

In [None]:
#This cell requires the default embeddings provided by the authors (already loaded as EmbeddingTensor).
#It repeatedly splits the runs into two disjoint halves, computes the context similarity of each half
#and records the correlation between the two, to check the self-correlation between different runs.
#With the default embeddings the authors report an average correlation of 0.958 +- 0.001.

n_randomization = 100 #number of random splits
n_embedding_runs = EmbeddingTensor.shape[0] #60 with the default embeddings
half_size = n_embedding_runs // 2 #30 with the default embeddings; with an odd number of runs one run is left out

#The scalar products of a run do not depend on the split, so they are computed once, outside the loop
all_scalar_products = np.stack([np.dot(run_embedding, np.transpose(run_embedding)) for run_embedding in EmbeddingTensor])

#Both halves must be read at the same (i, j), i < j, pairs: extracting the pairs separately from each
#half (e.g. via the non-zero entries) can give vectors that differ in length or are not aligned
pair_rows, pair_columns = np.triu_indices(all_scalar_products.shape[1], k=1)
cs_indexes = np.column_stack([pair_rows, pair_columns])

correlation_list = []
for n in range(n_randomization):
    #Random split of the runs into two disjoint sets of half_size runs each
    #(call np.random.seed beforehand if a reproducible sequence of splits is needed)
    iteration_perm = np.random.permutation(n_embedding_runs)
    index_0 = iteration_perm[:half_size]
    index_1 = iteration_perm[half_size:2 * half_size]

    #Context similarity of each half: average over its runs of the scalar products between the embeddings
    context_similarity_matrix_0 = np.mean(all_scalar_products[index_0], axis=0)
    context_similarity_matrix_1 = np.mean(all_scalar_products[index_1], axis=0)

    #One row [i, j, similarity] per pair, in the same pair order for both halves
    context_similarity_0 = np.column_stack([pair_rows, pair_columns, context_similarity_matrix_0[pair_rows, pair_columns]])
    context_similarity_1 = np.column_stack([pair_rows, pair_columns, context_similarity_matrix_1[pair_rows, pair_columns]])

    correlation_list.append(np.corrcoef(context_similarity_0[:, 2], context_similarity_1[:, 2])[0, 1])
    print(n,end="\r")

#Saving one instance of context similarity.
#emb.save_context_similarity(context_similarity_0)



In [None]:
#Plot the distribution of the saved context similarity values (third column of the table).
#The correlation figures in the title are hard-coded text, they are not computed in this cell.
context_similarity = emb.load_context_similarity()

similarity_axes = plt.gca()
similarity_axes.hist(context_similarity[:, 2], bins=40)
similarity_axes.set_title("Sample Context Similarity Distribution\nAverage Correlations: 0.958 +- 0.001")
similarity_axes.set_yscale("log")
similarity_axes.set_xlabel("context similarity")
similarity_axes.set_ylabel("N. Couples")
plt.show()