In [36]:
import numpy as np
from nltk import sent_tokenize, word_tokenize as tokenize
from string import punctuation, whitespace
import re

In [43]:
# Sentinel symbols placed before the first and after the last character of a word.
START_TOKEN, END_TOKEN = "<START>", "<END>"

# ASCII punctuation extended with guillemets, dashes, ellipsis and curly quotes;
# `punct` is the same collection of characters as a set.
punctuation = punctuation + "«»—…“”–"
punct = {*punctuation}

In [44]:
# Load the whole corpus into memory; the context manager guarantees the file
# handle is closed instead of being left to the garbage collector.
# NOTE(review): the file is decoded with the platform default encoding —
# confirm the file's encoding and pass `encoding=` explicitly if needed.
with open("../dictation_text.txt") as corpus_file:
    text = corpus_file.read()

In [45]:
def char_pairs(word):
    """Yield each pair of adjacent symbols of ``word``, padded with boundary tokens.

    The symbols of ``word`` are framed by START_TOKEN in front and END_TOKEN
    behind, so the first pair is ``(START_TOKEN, first symbol)`` and the last
    is ``(last symbol, END_TOKEN)``. An empty ``word`` yields the single pair
    ``(START_TOKEN, END_TOKEN)``. The argument itself is not modified.
    """
    symbols = [START_TOKEN, *word, END_TOKEN]
    yield from zip(symbols, symbols[1:])
        
def normalize(text: str):
    """Tokenize ``text`` and return the tokens with surrounding punctuation removed.

    Every token produced by ``tokenize`` has leading and trailing characters
    from the module-level ``punctuation`` string stripped; tokens that end up
    empty (i.e. consisted only of punctuation) are dropped. Punctuation
    inside a token is kept.
    """
    stripped = (token.strip(punctuation) for token in tokenize(text))
    return [token for token in stripped if token]

In [46]:
# List of punctuation-stripped tokens from the raw text.
corpus = normalize(text)

# Sorted list of the distinct non-whitespace characters occurring in the corpus.
all_chars = sorted({ch for word in corpus for ch in word if ch not in whitespace})

In [47]:
# Symbol -> integer id. Ids 0 and 1 are reserved for the boundary tokens;
# corpus characters follow in sorted order starting at id 2.
char_to_id = {
    START_TOKEN: 0,
    END_TOKEN: 1,
    **{symbol: index for index, symbol in enumerate(all_chars, start=2)},
}

# Inverse lookup: integer id -> symbol.
id_to_char = {index: symbol for symbol, index in char_to_id.items()}

In [48]:
# Sanity check: char_to_id and id_to_char must be exact inverses of each other.
# Ids are assigned contiguously from 0, so every id is checked deterministically
# rather than spot-checking 100 unseeded random samples (which may miss entries
# and is not reproducible between runs).
assert len(id_to_char) == len(char_to_id)
for char_id in range(len(id_to_char)):
    assert char_to_id[id_to_char[char_id]] == char_id

In [49]:
# Square matrix with one row and one column per known symbol (boundary tokens
# included); entry [a, b] will hold how often symbol b directly follows symbol a.
vocab_size = len(char_to_id)
M = np.zeros((vocab_size, vocab_size))
M.shape

(56, 56)

In [50]:
# Count every adjacent symbol pair of every word, including the pairs formed
# with the START/END boundary tokens.
# The matrix is zeroed first so that re-running this cell (for instance after
# the rows have already been normalized) rebuilds the counts from scratch
# instead of adding them on top of stale values.
M.fill(0)
for word in corpus:
    for a, b in char_pairs(word):
        M[char_to_id[a], char_to_id[b]] += 1

In [51]:
# Cross-check M against bigram counts recomputed directly from the corpus.
# M must still hold raw counts here (i.e. this runs before row normalization).
#
# Adjacent characters are paired with zip so that overlapping occurrences are
# counted: str.count only counts non-overlapping matches, so a run such as
# "aaa" would give 1 for "aa" while char_pairs records the pair twice.
expected_counts = {}
for word in corpus:
    for pair in zip(word, word[1:]):
        expected_counts[pair] = expected_counts.get(pair, 0) + 1

# Ids 0 and 1 are the START and END tokens; they never occur inside a word's
# own characters, so only ids >= 2 are compared. Every pair is checked, which
# keeps the test deterministic.
for a in range(2, M.shape[0]):
    for b in range(2, M.shape[1]):
        count = expected_counts.get((id_to_char[a], id_to_char[b]), 0)
        assert int(M[a, b]) == count, (id_to_char[a], id_to_char[b], M[a, b], count)

In [53]:
# Scale each row so that it sums to 1. Rows whose sum is not positive are
# left unchanged, which also avoids dividing by zero.
for row in M:
    row_total = row.sum()
    if row_total > 0:
        row /= row_total  # `row` is a view, so this updates M in place

In [54]:
def chain_from_char(char: str) -> str:
    """Greedily extend ``char`` by repeatedly appending the most likely next symbol.

    Starting from ``char``, the symbol with the largest entry in the current
    symbol's row of the module-level matrix ``M`` is appended, until one of
    the following happens:

    * the most likely successor is END_TOKEN (normal termination);
    * the current symbol has no recorded successor (its row has no positive
      entry) — without this stop, argmax of an all-zero row would return id 0
      and the START token text would be spliced into the result;
    * the most likely successor has already been emitted — because the walk is
      deterministic, revisiting a symbol means the chain would loop forever
      without ever reaching END_TOKEN, so the prefix built so far is returned.

    Args:
        char: a single character, or START_TOKEN / END_TOKEN.

    Returns:
        The generated string, with a leading "<START>" marker removed.

    Raises:
        AssertionError: if ``char`` is neither a single character nor a boundary token.
        KeyError: if ``char`` is not a key of ``char_to_id``.
    """
    assert len(char) == 1 or char in (START_TOKEN, END_TOKEN)
    accum: str = char
    prev: str = char
    visited = {char}
    while True:
        prev_id: int = char_to_id[prev]
        row = M[prev_id]
        next_id: int = int(np.argmax(row))
        if row[next_id] <= 0:
            # No successor was ever observed for `prev`.
            break
        next_char: str = id_to_char[next_id]
        if next_char == END_TOKEN or next_char in visited:
            # Either a proper end of word, or a cycle that can never terminate.
            break
        accum += next_char
        visited.add(next_char)
        prev = next_char
    return re.sub("^<START>", '', accum)

def chain_from_id(id_: int) -> str:
    """Run ``chain_from_char`` starting from the symbol whose integer id is ``id_``.

    Args:
        id_: a key of the module-level ``id_to_char`` mapping.

    Returns:
        The string produced by ``chain_from_char`` for that symbol.

    Raises:
        KeyError: if ``id_`` is not a known id.
    """
    # id_to_char is a dict, so it must be subscripted; calling it raised TypeError.
    return chain_from_char(id_to_char[id_])

In [70]:
import json
import os

# Persist the id -> symbol mapping and the transition matrix under ./markov.
os.makedirs("markov", exist_ok=True)

# JSON object keys are always strings, so the integer ids are written as strings.
with open("markov/id_to_char.json", 'w') as vocab_file:
    vocab_file.write(json.dumps(id_to_char) + "\n")

# np.save writes NumPy's binary .npy format, regardless of the .txt file name.
with open("markov/matrix.txt", 'wb') as matrix_file:
    np.save(matrix_file, M)