In [1]:
import numpy as np, string, utils, preprocess

In [2]:
# Load the pre-processed King James Bible text (project helper utils.load_text)
# and preview its first 100 characters.
book = utils.load_text("books/king-james-bible-processed.txt")
book[:100]

'In the beginning God created the heaven and the earth.\r\nAnd the earth was without form, and void; an'

In [3]:
# Tokenise the text with the project helper preprocess.words_lems and preview
# the first 15 tokens. Judging by the displayed output it yields lower-cased
# lemmas and keeps punctuation as separate tokens -- TODO confirm in preprocess.
book_lems = preprocess.words_lems(book, lower=True)
book_lems[:15]

['in',
 'the',
 'beginning',
 'god',
 'create',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.',
 'and',
 'the',
 'earth',
 'be']

In [4]:
# Map tokens to integer ids (project helper preprocess.words_to_int), then show
# the id of each of the first 15 tokens; tokens without an id are shown as -1.
# NOTE(review): with ignore_punct=True the displayed output has no id for '.',
# so punctuation is presumably left out of the map -- confirm in preprocess.
words_map = preprocess.words_to_int(book_lems, ignore_punct=True)
[(w, words_map.get(w, -1)) for w in book_lems[:15]]

[('in', 6305),
 ('the', 1379),
 ('beginning', 2713),
 ('god', 5515),
 ('create', 8232),
 ('the', 1379),
 ('heaven', 1102),
 ('and', 6259),
 ('the', 1379),
 ('earth', 6955),
 ('.', -1),
 ('and', 6259),
 ('the', 1379),
 ('earth', 6955),
 ('be', 9229)]

In [5]:
def build_link(adj, weight, words_map, words, from_index, to_index):
    """Add one weighted edge from ``words[from_index]`` to the next linkable token.

    Starting at ``to_index``, tokens that cannot be linked are skipped. A token
    is skipped when it is punctuation (``token in string.punctuation``) or when
    it has no id in ``words_map``. Every skipped token halves ``weight``. If a
    linkable token is found before the end of ``words``, the current ``weight``
    is added in place to ``adj[source_id, target_id]``.

    Args:
        adj: 2-D array indexed by word ids on both axes; mutated in place.
        weight: weight available for this link before any token is skipped.
        words_map: mapping from token to its row/column index in ``adj``.
        words: sequence of tokens.
        from_index: position in ``words`` of the source token.
        to_index: position in ``words`` at which the search for a target starts.

    Returns:
        A ``(next_weight, next_index)`` tuple: the weight after skipping,
        halved once more, and the position right after the one where the
        search stopped. Both are meant to be fed into the next call.

    Raises:
        KeyError: if a target is found but ``words[from_index]`` has no id in
            ``words_map``.
    """
    words_len = len(words)
    # `in string.punctuation` is a substring test, so multi-character
    # punctuation such as "--" or "..." is not recognised by it. Tokens without
    # an id cannot be linked anyway, so they are skipped the same way instead
    # of raising KeyError on the lookup below.
    while to_index < words_len and (
        words[to_index] in string.punctuation or words[to_index] not in words_map
    ):
        to_index += 1
        weight /= 2

    if to_index < words_len:
        adj[words_map[words[from_index]], words_map[words[to_index]]] += weight
    # The next link from the same source is worth half as much.
    weight /= 2
    return weight, to_index + 1

def build_graph(lemmas, lemmas_map, window=4, base_weight=16):
    """Build a dense, directed, weighted co-occurrence matrix of the lemmas.

    Every lemma that is not punctuation and has an id in ``lemmas_map`` is
    used as a source. For each source, ``build_link`` is called ``window``
    times in a row, starting at the token right after the source and with an
    initial weight of ``base_weight``. ``build_link`` adds the edge weights to
    the matrix and returns the reduced weight and the next position to link.

    Args:
        lemmas: sequence of tokens in reading order.
        lemmas_map: mapping from lemma to its row/column index in the matrix.
            Assumes the ids lie in ``range(len(lemmas_map))`` -- TODO confirm
            against ``preprocess.words_to_int``.
        window: number of ``build_link`` calls made per source lemma
            (default 4, the previously hard-coded count).
        base_weight: weight passed to the first ``build_link`` call of each
            source (default 16, the previously hard-coded value).

    Returns:
        A ``(len(lemmas_map), len(lemmas_map))`` float array where entry
        ``[i, j]`` accumulates the weights of all links from lemma ``i`` to
        lemma ``j``. The array is dense, so memory grows with the square of
        the vocabulary size.
    """
    vocab_size = len(lemmas_map)
    adj = np.zeros((vocab_size, vocab_size))
    for index, lemma in enumerate(lemmas):
        # Punctuation and lemmas without an id have no row in the matrix, so
        # they never act as a link source.
        # TODO: stop words are still linked like any other lemma unless they
        # are left out of lemmas_map.
        if lemma in string.punctuation or lemma not in lemmas_map:
            continue
        weight = base_weight
        next_index = index + 1
        for _ in range(window):
            weight, next_index = build_link(adj, weight, lemmas_map, lemmas, index, next_index)

    return adj

In [6]:
# Build the weighted adjacency matrix for the whole book and display it.
graph = build_graph(book_lems, words_map)
graph

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  2.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  9.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [7]:
# Share of zero entries in the adjacency matrix, as a percentage.
print("Sparsity: {:05.2f}%".format(100 * (1 - np.count_nonzero(graph) / graph.size)))

Sparsity: 99.64%
