In [42]:
# Fetch the training corpus from Google Drive with the gdown CLI; per the cell
# output below, the file is saved as robert_frost.txt in the working directory.
!gdown 1Renfi4W_TL3e9VslX-nUs58EY6h3Oaey

Downloading...
From: https://drive.google.com/uc?id=1Renfi4W_TL3e9VslX-nUs58EY6h3Oaey
To: /content/robert_frost.txt
  0% 0.00/56.3k [00:00<?, ?B/s]100% 56.3k/56.3k [00:00<00:00, 79.7MB/s]


In [43]:
import numpy as np
import string
import re
import random
import pandas as pd

In [44]:
def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    left untouched and keep their original order.
    """
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletions)

In [45]:
def add_to_transition_dict(transition_dict, condition, next_word):
    """Record that ``next_word`` was observed after ``condition``.

    ``transition_dict`` maps a condition (any hashable key) to the list of
    words seen after it. The list is created on first use and ``next_word`` is
    appended to it; the dictionary is modified in place and nothing is returned.
    """
    followers = transition_dict.setdefault(condition, [])
    followers.append(next_word)

In [46]:
    # Containers for the three parts of the Markov model. They start empty and
    # are filled through add_to_transition_dict while the corpus is read.
    # NOTE(review): these top-level statements are indented; the notebook ran, so
    # IPython evidently accepts this, but a plain Python script would raise
    # IndentationError - consider dedenting.
    initial_probabilities = {} # first word of a line -> its occurrences (normalised to a probability later)
    first_order_transitions = {} # first word of a line -> words observed in second position
    second_order_transitions = {} # (previous two words) -> words observed next, including '<END>'

In [47]:
# (Re)create the model tables here so the cell is safe to re-run: once the
# values have been normalised they are no longer lists, and appending to them
# on a second run would fail.
initial_probabilities = {}     # first word -> occurrences, then probability
first_order_transitions = {}   # first word -> observed second words
second_order_transitions = {}  # (word[i-2], word[i-1]) -> observed word[i]

try:
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the corpus is UTF-8 - confirm against the downloaded file).
    with open('robert_frost.txt', 'r', encoding='utf-8') as file:
        for line in file:
            processed_line = remove_punctuation(line.strip().lower())
            words = processed_line.split()
            if not words:
                continue  # blank line (or punctuation only): nothing to learn

            # Sentinel marking the end of a line; after this append the list
            # always holds at least two tokens, so words[1] is safe.
            words.append('<END>')

            add_to_transition_dict(initial_probabilities, words[0], words[0])
            add_to_transition_dict(first_order_transitions, words[0], words[1])
            for i in range(2, len(words)):
                add_to_transition_dict(
                    second_order_transitions, (words[i - 2], words[i - 1]), words[i]
                )
except FileNotFoundError:
    print("Error: File not found.")
else:
    # Normalize initial probabilities: occurrences of each first word divided
    # by the number of non-empty lines.
    initial_total = sum(len(v) for v in initial_probabilities.values())
    for key in list(initial_probabilities):
        initial_probabilities[key] = len(initial_probabilities[key]) / initial_total

In [48]:
def word_probabilities(word_list):
    """Turn a list of observed words into a relative-frequency table.

    Returns a dict mapping each distinct word to ``count / len(word_list)``,
    with keys in order of first appearance. An empty list yields an empty dict.
    """
    total_words = len(word_list)

    tallies = {}
    for token in word_list:
        if token in tallies:
            tallies[token] += 1
        else:
            tallies[token] = 1

    return {token: tally / total_words for token, tally in tallies.items()}

In [49]:
# Inspect the raw first-order table before normalisation: at this point each
# first word still maps to the plain list of second words observed after it.
first_order_transitions

{'two': ['roads',
  'roads',
  'miles',
  'oldbelievers',
  'winds',
  'weeks',
  'of',
  'at'],
 'and': ['sorry',
  'be',
  'looked',
  'having',
  'both',
  'that',
  'miles',
  'miles',
  'would',
  'dropped',
  'further',
  'when',
  'tell',
  'the',
  'caught',
  'put',
  'threw',
  'birds',
  'suddenly',
  'scurf',
  'sorry',
  'since',
  'whats',
  'tell',
  'many',
  'blew',
  'stamped',
  'sometimes',
  'some',
  'then',
  'came',
  'this',
  'then',
  'politician',
  'thatd',
  'rode',
  'if',
  'from',
  'i',
  'he',
  'full',
  'experts',
  'built',
  'both',
  'thats',
  'spoke',
  'anyway',
  'had',
  'the',
  'how',
  'taken',
  'lie',
  'left',
  'stroked',
  'the',
  'a',
  'me',
  'a',
  'between',
  'wont',
  'hes',
  'his',
  'nothing',
  'better',
  'kick',
  'carried',
  'thought',
  'swollen',
  'swollen',
  'hold',
  'all',
  'fell',
  'set',
  'sit',
  'bring',
  'push',
  'that',
  'those',
  'sproutlands',
  'perhaps',
  'see',
  'dangle',
  'disappeared',
  

In [50]:
# Convert the raw follower lists into probability tables, in place.
# Only lists are converted, so re-running this cell leaves already-normalised
# entries untouched instead of recomputing "probabilities" over their keys.
for condition_word, next_words in first_order_transitions.items():
    if isinstance(next_words, list):
        first_order_transitions[condition_word] = word_probabilities(next_words)

# Same conversion for the second-order table; word_probabilities is the helper
# defined in this notebook (the previous name used here was never defined).
for condition_pair, next_words in second_order_transitions.items():
    if isinstance(next_words, list):
        second_order_transitions[condition_pair] = word_probabilities(next_words)

In [51]:
def sample_word(probability_dict):
    """Draw one key from ``probability_dict`` at random, weighted by its value.

    A single uniform number in [0, 1) is drawn and the keys are walked in
    insertion order, accumulating their probabilities; the first key whose
    running total exceeds the draw is returned. If no key qualifies (for
    example because the values sum to slightly less than 1), the last key is
    returned.
    """
    threshold = random.random()
    running_total = 0

    for candidate in probability_dict:
        running_total += probability_dict[candidate]
        if running_total > threshold:
            return candidate

    # Fallback when the running total never exceeds the draw.
    return list(probability_dict)[-1]

In [52]:
def most_frequent_word(probability_dict):
    """Return the key with the highest value, or ``None`` for an empty mapping.

    When several keys share the highest value, the one that comes first in
    iteration order is returned.
    """
    if probability_dict:
        best_word, _ = max(probability_dict.items(), key=lambda pair: pair[1])
        return best_word
    return None

In [53]:
def generate_poetry(initial_probabilities, first_order_transitions, second_order_transitions,
                    num_lines=4, max_line_length=10):
    """Generate text line by line from the second-order Markov model.

    Args:
        initial_probabilities: dict mapping a first word to its probability.
        first_order_transitions: dict mapping a first word to a dict of
            second-word probabilities.
        second_order_transitions: dict mapping a ``(word, word)`` tuple to a
            dict of next-word probabilities.
        num_lines: number of lines to generate (default 4).
        max_line_length: cap on the number of tokens generated per line,
            counting a generated ``"<END>"`` marker (default 10). Every line
            starts with two tokens, so values below 2 have no further effect.

    Returns:
        The generated lines joined with newlines; the ``"<END>"`` marker is
        never included in the output.
    """
    poetry = []
    for _ in range(num_lines):
        # First word
        first_word = sample_word(initial_probabilities)
        line = [first_word]

        # Second word
        if first_word in first_order_transitions:
            line.append(sample_word(first_order_transitions[first_word]))
        else:
            # No recorded follower for this first word: fall back to the most
            # likely opening word, or end the line if there is none.
            line.append(most_frequent_word(initial_probabilities) or "<END>")

        # Subsequent words: condition on the previous two tokens until the end
        # marker is drawn or the length cap is reached.
        while line[-1] != "<END>" and len(line) < max_line_length:
            context = tuple(line[-2:])
            if context in second_order_transitions:
                line.append(sample_word(second_order_transitions[context]))
            else:
                line.append("<END>")  # unseen bigram: terminate the line

        # Strip the end marker only if it is actually there; a line stopped by
        # the length cap ends in a real word that must be kept.
        if line[-1] == "<END>":
            line.pop()
        poetry.append(" ".join(line))
    return "\n".join(poetry)


In [56]:
print(generate_poetry(initial_probabilities, first_order_transitions, second_order_transitions))

that made him throw the hoe
that sends light rustle rushes to their leaves
that need endless talktalk
well take me there and show it to me
