In [82]:
import os, codecs, nltk, numpy as np
from nltk.corpus import wordnet

In [58]:
corpus_root = "books"

In [59]:
book_file = "king-james-bible-processed.txt"

In [60]:
book = None

with codecs.open(os.path.join(corpus_root,book_file),encoding="utf8") as f:
    book = f.read().lower()

In [61]:
book[:100]

'in the beginning god created the heaven and the earth.\r\nand the earth was without form, and void; an'

In [62]:
book_sents = nltk.sent_tokenize(book)

In [63]:
book_tokens = [t for t in nltk.word_tokenize(book)]

In [64]:
book_tokens[:15]

['in',
 'the',
 'beginning',
 'god',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.',
 'and',
 'the',
 'earth',
 'was']

In [65]:
book_pos = nltk.pos_tag(book_tokens)

In [66]:
book_pos[:15]

[('in', 'IN'),
 ('the', 'DT'),
 ('beginning', 'NN'),
 ('god', 'NN'),
 ('created', 'VBD'),
 ('the', 'DT'),
 ('heaven', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('earth', 'NN'),
 ('.', '.'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('earth', 'NN'),
 ('was', 'VBD')]

In [67]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [68]:
wnl = nltk.WordNetLemmatizer()

In [69]:
book_lems = [wnl.lemmatize(t,pos=get_wordnet_pos(p)) for t,p in book_pos]

In [70]:
book_lems[:15]

['in',
 'the',
 'beginning',
 'god',
 'create',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.',
 'and',
 'the',
 'earth',
 'be']

In [71]:
set_book_lems = set(book_lems)

In [73]:
len(set_book_lems)

13080

In [97]:
words_map = {w:i for w,i in zip(set_book_lems, range(0, len(set_book_lems)))}

In [98]:
words_map["god"]

8724

In [104]:
def build_graph(lemmas, lemmas_map):
    len_dist_lemmas = len(lemmas_map)
    len_lemmas = len(lemmas)
    adj = np.zeros((len_dist_lemmas, len_dist_lemmas))
    for index, lemma in enumerate(lemmas):
        # TODO Take into account punctuation / stop words
        if(index < len_lemmas - 1):
            adj[lemmas_map[lemma], lemmas_map[lemmas[index + 1]]] = 16
        if(index < len_lemmas - 2):
            adj[lemmas_map[lemma], lemmas_map[lemmas[index + 2]]] = 8
        if(index < len_lemmas - 3):
            adj[lemmas_map[lemma], lemmas_map[lemmas[index + 3]]] = 4
    
    return adj

In [105]:
graph = build_graph(book_lems, words_map)

In [106]:
graph

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [107]:
unique, counts = np.unique(graph, return_counts=True)
dict(zip(unique, counts))

{0.0: 170627922, 4.0: 186803, 8.0: 153651, 16.0: 118024}