## Data cleaning and preprocessing

In [1]:
import re

def cleaned_text(text):
    """Normalize one line of raw text before tokenization.

    The text is lower-cased; quotes, commas, semicolons, periods, colons,
    question/exclamation marks, parentheses, angle brackets and hyphens are
    deleted; then every run of whitespace is collapsed to a single space and
    leading/trailing whitespace is removed.

    Punctuation is removed *before* whitespace is normalized, so a line made
    only of punctuation and spaces (e.g. '- -') cleans to the empty string and
    'a , b' cleans to 'a b' rather than keeping a double space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty when the input held nothing but
        punctuation and whitespace.
    """
    text = text.lower()
    text = re.sub(r'[\"\',;.:?!()<>-]', '', text)
    # Collapse whitespace last: deleting punctuation can create new runs of
    # spaces or expose leading/trailing ones.
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [2]:
# Load the raw corpus in one go.
with open("robert_frost.txt", "r") as file:
    text = file.read()

# Emit one cleaned line per source line, skipping lines that clean to
# nothing, and terminate each with an explicit end-of-line marker.
with open("preprocessed.txt", "w") as file:
    for line in map(cleaned_text, text.splitlines()):
        if not line:
            continue
        file.write(line + " <END>\n")

In [3]:
# Split every preprocessed line on whitespace; lines that yield no tokens
# are dropped so each entry has at least one element.
with open("preprocessed.txt", "r") as file:
    tokenized_sentences = [words for words in map(str.split, file) if words]

print(tokenized_sentences[:5])
print(len(tokenized_sentences))

[['two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood', '<END>'], ['and', 'sorry', 'i', 'could', 'not', 'travel', 'both', '<END>'], ['and', 'be', 'one', 'traveler', 'long', 'i', 'stood', '<END>'], ['and', 'looked', 'down', 'one', 'as', 'far', 'as', 'i', 'could', '<END>'], ['to', 'where', 'it', 'bent', 'in', 'the', 'undergrowth', '<END>']]
1436


#### Initial word probabilities


In [4]:
# Relative frequency with which each word opens a line.
lines_total_count = len(tokenized_sentences)

# First pass: raw counts of line-opening words.
first_word_counts = {}
for sentence in tokenized_sentences:
    opener = sentence[0]
    first_word_counts[opener] = first_word_counts.get(opener, 0) + 1

# Second pass: divide by the number of lines to get probabilities.
initial_prob = {
    word: count / lines_total_count
    for word, count in first_word_counts.items()
}

print(dict(list(initial_prob.items())[:10]))
print(len(initial_prob))

{'two': 0.005571030640668524, 'and': 0.08983286908077995, 'to': 0.034818941504178275, 'then': 0.008356545961002786, 'because': 0.0006963788300835655, 'though': 0.004874651810584958, 'had': 0.002785515320334262, 'in': 0.0201949860724234, 'oh': 0.002785515320334262, 'yet': 0.0020891364902506965}
305


#### First-order transitions


In [5]:
first_order = {}

# Count, within each line, how often a token is immediately followed by
# another token (the trailing "<END>" marker included).
for line in tokenized_sentences:
    for previous, current in zip(line, line[1:]):
        followers = first_order.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1

# Normalize every row of counts so the successors of a word sum to 1.
first_order_prob = {}
for previous, followers in first_order.items():
    total = sum(followers.values())
    first_order_prob[previous] = {
        word: count / total for word, count in followers.items()
    }

print(dict(list(first_order_prob.items())[:4]))

{'two': {'roads': 0.1111111111111111, 'miles': 0.05555555555555555, 'towns': 0.05555555555555555, 'converging': 0.05555555555555555, 'or': 0.05555555555555555, 'oldbelievers': 0.05555555555555555, 'legs': 0.05555555555555555, 'footsteps': 0.05555555555555555, '<END>': 0.1111111111111111, 'village': 0.05555555555555555, 'winds': 0.05555555555555555, 'weeks': 0.05555555555555555, 'of': 0.05555555555555555, 'as': 0.05555555555555555, 'at': 0.05555555555555555, 'oclock': 0.05555555555555555}, 'roads': {'diverged': 1.0}, 'diverged': {'in': 1.0}, 'in': {'a': 0.08383233532934131, 'the': 0.2754491017964072, 'leaves': 0.005988023952095809, 'fire': 0.005988023952095809, 'ice': 0.005988023952095809, 'rain': 0.041916167664670656, 'front': 0.017964071856287425, 'every': 0.017964071856287425, 'it': 0.03592814371257485, 'winter': 0.011976047904191617, 'somewhere': 0.005988023952095809, 'march': 0.005988023952095809, 'warren': 0.011976047904191617, '<END>': 0.011976047904191617, 'wentworth': 0.0059880

#### Second-order transitions

In [6]:
second_order = {}

# Count, within each line, how often a pair of consecutive tokens is
# followed by a given third token.
for line in tokenized_sentences:
    for previous_previous, previous, current in zip(line, line[1:], line[2:]):
        key = (previous_previous, previous)
        followers = second_order.setdefault(key, {})
        followers[current] = followers.get(current, 0) + 1

# Normalize every row of counts so the successors of a pair sum to 1.
second_order_prob = {}
for key, followers in second_order.items():
    total = sum(followers.values())
    second_order_prob[key] = {
        word: count / total for word, count in followers.items()
    }

print(dict(list(second_order_prob.items())[:5]))

{('two', 'roads'): {'diverged': 1.0}, ('roads', 'diverged'): {'in': 1.0}, ('diverged', 'in'): {'a': 1.0}, ('in', 'a'): {'yellow': 0.07142857142857142, 'wood': 0.07142857142857142, 'window': 0.07142857142857142, 'packing': 0.07142857142857142, 'byroad': 0.07142857142857142, 'family': 0.07142857142857142, 'new': 0.07142857142857142, 'row': 0.07142857142857142, 'time': 0.07142857142857142, 'town': 0.07142857142857142, 'book': 0.07142857142857142, 'smother': 0.07142857142857142, 'glass': 0.14285714285714285}, ('a', 'yellow'): {'wood': 1.0}}


#### Cumulative probability method

In [7]:
import random

def cumulative_prob(prob_dict):
    """Draw one key from ``prob_dict`` by walking its cumulative distribution.

    A uniform random number ``r`` is drawn with ``random.random()`` and the
    probabilities are accumulated in dictionary order; the first key whose
    running total reaches ``r`` is returned.

    Args:
        prob_dict: Mapping of item -> probability. The values are expected
            to sum to (approximately) 1.

    Returns:
        The sampled key. If the running total never reaches ``r`` (the
        values sum to slightly less than 1 because of rounding), the last
        key is returned instead of ``None``.

    Raises:
        ValueError: If ``prob_dict`` is empty.
    """
    if not prob_dict:
        raise ValueError("cannot sample from an empty probability distribution")

    r = random.random()
    cumulative = 0.0

    for word, probability in prob_dict.items():
        cumulative += probability
        if r <= cumulative:
            return word

    # Rounding can leave the accumulated total just below r; without this
    # fallback the function would fall off the end and return None.
    return word

#### Generation and inference

In [8]:
def markov_model(initial_prob, first_order_prob, second_order_prob):
    """Generate one line of words from the second-order Markov model.

    The first word is sampled from ``initial_prob``, the second from
    ``first_order_prob[first_word]``, and every later word from
    ``second_order_prob[(second_to_last, last)]``. Generation stops when the
    ``"<END>"`` marker is sampled or when the current context has no recorded
    successors.

    Args:
        initial_prob: Mapping of word -> probability of starting a line.
        first_order_prob: Mapping of word -> {next_token: probability}.
        second_order_prob: Mapping of (word, word) -> {next_token: probability}.

    Returns:
        A list of generated words. The ``"<END>"`` marker is never included.
    """
    end_token = "<END>"

    w1 = cumulative_prob(initial_prob)
    if w1 == end_token:
        # A marker-only line in the training data would make "<END>" a
        # possible opener; treat that as an empty sentence.
        return []

    sentence = [w1]

    # Without recorded successors the sentence cannot be extended.
    if w1 not in first_order_prob:
        return sentence

    w2 = cumulative_prob(first_order_prob[w1])
    # One-word training lines make "<END>" a valid successor of the opening
    # word; stop here instead of emitting the marker as if it were a word.
    if w2 == end_token:
        return sentence
    sentence.append(w2)

    while True:
        key = (sentence[-2], sentence[-1])
        if key not in second_order_prob:
            break

        next_word = cumulative_prob(second_order_prob[key])
        if next_word == end_token:
            break

        sentence.append(next_word)

    return sentence

In [12]:
# Sample ten lines from the model and print them with 1-based numbering.
for number in range(1, 11):
    words = markov_model(initial_prob, first_order_prob, second_order_prob)
    print(number, " ".join(words))

1 ive said why shouldnt they be married
2 itd make my position stronger i think were all mad tell me about her does she look like me
3 its raining
4 he said the dead had souls but when ive done it what was either cloud or smoke
5 from the ground
6 legitimately my demand upon her
7 theirs interlock
8 that prove
9 but it doesnt seem as if by eye pairs out of beaten ways
10 up a cheering song of how
