In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.sparse as ss
import scipy.stats  # explicit submodule import so that `sp.stats` is always available
import seaborn as sns

from par2vec.common import *

### Load Reuters

In [None]:
# Load tokenized reuters
def _load_reuters(name):
    """Load ``data/reuters/reuters_<name>.npy``.

    The files are read with ``allow_pickle=True``: ``.item(0)`` / ``list(...)``
    below unpack Python objects stored in the arrays, and NumPy >= 1.16.3
    refuses to read pickled object arrays unless this flag is set.
    NOTE: unpickling can execute arbitrary code -- only load trusted files.
    """
    return np.load('data/reuters/reuters_{}.npy'.format(name), allow_pickle=True)


# Topic lookup tables and the per-document topic lists
topic2id = _load_reuters('topic2id').item(0)
id2topic = _load_reuters('id2topic').item(0)
topics = list(_load_reuters('topics'))

# Vocabulary lookup tables and the tokenized documents
word2id = _load_reuters('word2id').item(0)
id2word = _load_reuters('id2word').item(0)
tokenized = list(_load_reuters('tokenized'))

### Compute document graphs and essentials

In [None]:
# Create graphs for (up to) N documents and collect, per document,
# a histogram of the integer graph weights and the entropy of the weights.
N=100000
max_w=13

# `tokenized` may hold fewer than N documents. Size the table by what is
# actually processed, otherwise the unused all-zero rows bias mean and std.
docs = tokenized[:N]
n_docs = len(docs)

counts = np.zeros((n_docs, max_w))
entropies = []
for i, (_, A_o, A_i, _, _) in enumerate(get_lapl(docs, word2id)):
    print ('\r %d/%d' % (i+1, n_docs), end='')
    # `%` binds tighter than `+`: the former `i+1 % 1000` was just `i+1`,
    # so this periodic progress line was never printed.
    if (i+1) % 1000 == 0:
        print('\n {}/{} \n'.format(i+1, n_docs))

    # Graph: stored weights of the combined matrix
    # NOTE(review): assumes A_o and A_i are scipy sparse matrices -- confirm against get_lapl
    A = (A_o + A_i).data

    # Weight counts: column j = number of stored entries whose integer weight is j;
    # weights >= max_w are dropped. minlength pads short histograms with zeros.
    bincount = np.bincount(A.astype(int), minlength=max_w)
    counts[i] = bincount[:max_w]

    # Entropies (scipy normalises the weights to sum to 1 before computing)
    entropies.append(sp.stats.entropy(A))


# Parse data: column 0 (weight 0) is skipped so only weights 1..max_w-1 are summarised
x = np.arange(1, max_w)
y_mean = np.mean(counts[:,1:], axis=0)
y_std = np.std(counts[:,1:], axis=0)

### Plot distribution of non-zero graph weights

In [None]:
# Error-bar plot of y_mean against x with y_std as the error bars,
# drawn on a log-scaled y axis and saved to the plots/ directory.
fig, ax = plt.subplots(figsize=(7,5))
ax.set_title('Distribution of non-zero graph weights')
ax.set_xlabel('Weight')
ax.set_ylabel('Average Count')
ax.set_yscale('log')
ax.errorbar(x, y_mean, yerr=y_std)
fig.savefig('plots/dist_nonzero_graph_weights.png')
plt.show()

### Plot distribution of graph entropies

In [None]:
# Histogram (50 bins, with seaborn's default density overlay) of the
# per-document graph entropies.
plt.figure(figsize=(7,5))
plt.title('Distribution of graph entropies')
plt.xlabel('Entropy')
plt.ylabel('Occurence')
# The entropy list is named `entropies`; the previous `ents` was never defined
# and raised a NameError on a fresh kernel.
# NOTE: sns.distplot is deprecated since seaborn 0.11 -- consider sns.histplot when upgrading.
sns.distplot(entropies, bins=50)
plt.savefig('plots/dist_graph_entropies.png')
plt.show()

### Plot distribution of topics

In [None]:
# Histogram (no density overlay) of the number of topics attached to each document.
topic_count = list(map(len, topics))
ax = plt.gca()
ax.set_xlabel('# Topics per document')
ax.set_ylabel('Amount of Documents')
sns.distplot(topic_count, kde=False, ax=ax)
ax.figure.savefig('plots/dist_topics_over_docs.png')
plt.show()

### Plot class imbalance

In [None]:
# Count how often each topic occurs over all documents and plot it as a bar chart.
flat_topics = [item for sublist in topics for item in sublist]
# topic_i: the distinct topic values (sorted); topic_count: occurrences of each
topic_i, topic_count = np.unique(flat_topics, return_counts=True)
# Use the arrays computed here: the former `x`/`y` referred to the weight axis
# of an earlier cell (`x`) and to a name that is never defined (`y`).
print('Counts per class:', topic_count)
plt.bar(topic_i, topic_count)
plt.xlabel('Topic')
plt.ylabel('Count')
plt.savefig('plots/dist_topic_counts.png')
plt.show()

In [None]:
# Name of the most frequent topic: look up the topic with the highest count.
# (Previously indexed with `x`/`y`, which are not the per-topic arrays.)
id2topic[topic_i[np.argmax(topic_count)]]