# Story Time with Markov Twain

## Cleaning Text and Getting Frequency

In [4]:
using JLD;
using ForwardDiff;

Add two workers to do processing.

In [None]:
# addprocs(2);

Remember: the Julia interpreter prints the value of the last expression in a cell.

In [88]:
"""
    clean_corpus(text, regex; normalize = true, lower_case = true)

Return `text` with every match of `regex` deleted and the remaining tokens
separated by exactly one space (no leading, trailing or repeated spaces).

# Keywords
- `normalize`: when `true`, first pass `text` through `normalize_string` with
  `stripmark`, `stripignore` and `stripcc` enabled.
- `lower_case`: when `true`, lowercase the text before `regex` is applied, so a
  lowercase-only whitelist regex still keeps letters that were uppercase.
"""
function clean_corpus(text, regex; normalize = true, lower_case = true)
    if normalize
        # strip combining marks, ignorable characters and control characters
        text = normalize_string(text, stripmark = true, stripignore = true, stripcc = true)
    end

    if lower_case
        text = lowercase(text)
    end

    # remove unwanted characters
    text = replace(text, regex, "")

    # Deleting characters can leave consecutive spaces, which become empty
    # tokens after splitting on " "; drop all of them in a single pass.
    tokens = filter(token -> !isempty(token), split(text, " "))
    return join(tokens, " ")
end;

Read in file

In [12]:
# Read the whole book into a string, then release the file handle.
f = open("mark_twain_books/adventures_of_tom_sawyer.txt")
ats = readall(f)
close(f);

Clean text

In [13]:
# create regex object (I prefer whitelisting characters I want to keep)
chars_to_remove = r"[^a-z ]"
ats_clean = clean_corpus(ats, chars_to_remove);

Define text to numeric function

In [19]:
"""
    text_to_numeric(text, symbols)

Encode each element of `text` as its position in `symbols`.

Returns a `Vector{Int}` with one entry per element of `text`. When a symbol
occurs more than once in `symbols` the position of its first occurrence is
used; an element that does not occur in `symbols` is encoded as `0`.
"""
function text_to_numeric(text, symbols)
    # Build the symbol -> index table once so that encoding is a hash lookup
    # per token instead of a scan over the whole vocabulary per token.
    lookup = Dict{eltype(symbols),Int}()
    for (idx, symbol) in enumerate(symbols)
        # keep the first occurrence only
        if !haskey(lookup, symbol)
            lookup[symbol] = idx
        end
    end

    numeric_text = Int[]
    for each in text
        push!(numeric_text, get(lookup, each, 0))
    end
    return numeric_text
end;

In [20]:
"""
    numeric_to_text(numeric, symbols)

Decode a sequence of indices back into symbols.

Each entry of `numeric` is used to index `symbols`; the looked-up values are
returned, in order, in a freshly allocated untyped array.
"""
function numeric_to_text(numeric, symbols)
    decoded = Any[]
    for idx in numeric
        symbol = symbols[idx]
        push!(decoded, symbol)
    end
    return decoded
end;

In [89]:
"""
    get_corpus_frequencies(corpus, ngram; groupby = "words")

Count every run of `ngram` consecutive symbols in `corpus`.

`corpus` is split into characters when `groupby == "chars"` and on single
spaces for any other value. Symbols are numbered in order of first appearance
(`unique` of the split corpus).

Returns an integer array `M` with `ngram` dimensions, each as long as the number
of distinct symbols. For `ngram = 2`, `M[i, j]` is the number of times symbol
`i` is immediately followed by symbol `j`.

Throws `ArgumentError` when `ngram < 1`.
"""
function get_corpus_frequencies(corpus, ngram; groupby = "words")
    ngram >= 1 || throw(ArgumentError("ngram must be at least 1, got $ngram"))

    if groupby == "chars"
        corpus = split(corpus, "")
    else
        corpus = split(corpus, " ")
    end

    # find unique symbols
    unique_symbols = unique(corpus)
    # convert text to numbers
    corpus_numeric = text_to_numeric(corpus, unique_symbols)
    # create M: one axis per position in the n-gram, all initialised to zero
    dimensions = fill(length(unique_symbols), ngram)
    M = zeros(Int, dimensions...)
    # slide a window of `ngram` symbols over the text and count each window
    for i in 1:length(corpus)-ngram+1
        M[corpus_numeric[i:i+ngram-1]...] += 1
    end

    return M
end;

Let's make sure this frequency array works on a subset of the text.

In [None]:
# number of space-separated tokens in the cleaned text
len_ats_clean = length(split(ats_clean, " "))
# text subset: the first half of the tokens, re-joined into a single string
ats_subset = join(split(ats_clean, " ")[1:round(Int64, len_ats_clean/2)], " ")
# time the construction of the 2-gram frequency array on that subset
@time M = get_corpus_frequencies(ats_subset, 2);

Now let's combine all three Mark Twain novels and create a frequency array for the whole text.

In [14]:
# import other books
# import other books, closing each file handle once its contents are read
f = open("mark_twain_books/huckleberry_finn.txt")
hf = readall(f)
close(f)
f = open("mark_twain_books/the_prince_and_the_pauper.txt")
tpatp = readall(f)
close(f)

# clean other books
hf_clean = clean_corpus(hf, chars_to_remove)
tpatp_clean = clean_corpus(tpatp, chars_to_remove)

# combine all books, separated by a single space so words at the seams do not merge
big_corpus_clean = ats_clean * " " * hf_clean * " " * tpatp_clean;
# M_2 = get_corpus_frequencies(big_corpus_clean, 2);

Run this cell on the desktop machine (the call is left commented out here).

In [None]:
# @time M_3 = get_corpus_frequencies(big_corpus_clean, 3);

Save M_2 object.

In [None]:
# NOTE: M_2 must already exist in the session; the line that builds it from
# big_corpus_clean is commented out in the cell that combines the books.
save("M_2.jld", "M_2", M_2);

## Markov Model

Import M_2 object.

In [5]:
# read back the array stored under the name "M_2" in M_2.jld
M_2 = load("M_2.jld", "M_2");

In [26]:
"""
    choose_next_state(distribution, r)

Pick an index of `distribution` by inverse-CDF sampling.

Entries equal to zero are skipped. The remaining entries are accumulated in
order and the index of the first entry whose running total exceeds `r` is
returned.

If `r` is not below the final running total (for example because the weights
sum to slightly less than one after floating-point rounding), the last non-zero
index is returned instead of falling through.

Throws `ArgumentError` when `distribution` has no non-zero entry or its running
total is `NaN`, since no state can be chosen in that case.
"""
function choose_next_state(distribution, r)
    cumulative = 0.0
    last_nonzero = 0

    for idx in eachindex(distribution)
        weight = distribution[idx]
        # only consider entries that are non-zero
        weight == 0 && continue
        cumulative += weight
        last_nonzero = idx
        if r < cumulative
            return idx
        end
    end

    if last_nonzero == 0 || isnan(cumulative)
        throw(ArgumentError("distribution has no usable probability mass"))
    end
    # r landed at or beyond the accumulated mass: clamp to the final candidate
    return last_nonzero
end;

In [98]:
"""
    markov_model(ϕ, num_steps, unique_symbols, ngram, M_2)

Generate text by walking a Markov chain whose transition counts are in `M_2`.

# Arguments
- `ϕ`: starting state, `ngram` symbols joined by single spaces.
- `num_steps`: maximum number of symbols to generate after the starting state.
- `unique_symbols`: vocabulary; position `i` is the symbol for index `i` of
  every axis of `M_2`.
- `ngram`: number of symbols that make up a state.
- `M_2`: count array with `ngram + 1` dimensions; the leading `ngram` indices
  select the current state and the last axis holds the counts of each possible
  next symbol.

Returns an untyped array holding the symbols of `ϕ` followed by the generated
symbols. The walk stops early if it reaches a state with no recorded successor.

Throws `ArgumentError` when `M_2` does not have `ngram + 1` dimensions, when `ϕ`
does not consist of `ngram` symbols, or when `ϕ` contains a symbol that is not
in `unique_symbols`.
"""
function markov_model(ϕ, num_steps, unique_symbols, ngram, M_2)
    ndims(M_2) == ngram + 1 ||
        throw(ArgumentError("M_2 must have ngram + 1 = $(ngram + 1) dimensions"))

    seed_symbols = split(ϕ, " ")
    length(seed_symbols) == ngram ||
        throw(ArgumentError("ϕ must consist of $ngram space-separated symbol(s)"))

    current_state = text_to_numeric(seed_symbols, unique_symbols)
    0 in current_state &&
        throw(ArgumentError("ϕ contains a symbol that is not in unique_symbols"))

    # create array to store result of Markov jumping from state to state;
    # the seed is stored symbol by symbol so every entry is a single symbol
    markov_chain_text = []
    append!(markov_chain_text, seed_symbols)

    for step in 1:num_steps
        # counts of every symbol that followed the current state
        counts = vec(M_2[current_state..., :])
        total = sum(counts)
        # dead end: nothing ever followed this state in the corpus
        total == 0 && break
        # normalize row
        distribution = counts / total

        # randomly choose next word
        # generate random number between 0 and 1
        r = rand()
        next_word_idx = choose_next_state(distribution, r)
        push!(markov_chain_text, unique_symbols[next_word_idx])

        # slide the state window forward by one symbol
        current_state = vcat(current_state[2:end], next_word_idx)
    end

    return markov_chain_text
end;

In [99]:
"""
    get_phi(cleaned_corpus, ngram; groupby = "words")

Pick a random run of `ngram` consecutive symbols from `cleaned_corpus`.

The corpus is split into characters when `groupby == "chars"` and on single
spaces otherwise. The start position is drawn from `1:length - ngram`, so the
very last window of the corpus is never selected. The chosen symbols are
returned joined by single spaces.
"""
function get_phi(cleaned_corpus, ngram; groupby = "words")
    delimiter = groupby == "chars" ? "" : " "
    symbols = split(cleaned_corpus, delimiter)

    first_idx = rand(1:length(symbols)-ngram)
    last_idx = first_idx + ngram - 1
    return join(symbols[first_idx:last_idx], " ")
end;

In [101]:
# number of symbols to generate after the starting state
num_steps = 10
# built the same way get_corpus_frequencies builds its vocabulary (unique over
# the space-split corpus); assumes M_2 was computed from big_corpus_clean so
# that these positions line up with the axes of M_2
unique_symbols = unique(split(big_corpus_clean, " "))
# number of symbols that make up the chain's current state
ngram = 1
# choose random ngram set of symbols from text
ϕ = get_phi(big_corpus_clean, ngram, groupby = "words")

markov_chain_text = markov_model(ϕ, num_steps, unique_symbols, ngram, M_2);