In [None]:
import nltk
import re
from numpy.random import choice
from functools import reduce

# Maps each poet's name to the path of the text file that is read for that poet.
poets = {
    "Shakespeare": "Shakespeare_Sonnets.txt",
    "Frost": "Robert_Frost.txt",
    "Whitman": "Walt_Whitman.txt",
}

def build_templates(poets, *, threshold=10, min_count=2, strip_punctuation=False):
    """Build a weighted distribution of part-of-speech templates for each poet.

    Every sufficiently long line of a poet's corpus is tokenized and tagged
    with NLTK; the resulting tag sequence, joined with "-", is the line's
    "template". Templates are counted per poet, rare ones are dropped, and
    the remaining counts are normalised into probabilities.

    Args:
        poets: Mapping of poet name -> path of the text file to read.
        threshold: Maximum template length (number of tags). Lines whose
            template is longer than this are ignored.
        min_count: Minimum number of occurrences a template needs in a
            poet's corpus to be kept. The default of 2 drops templates that
            occurred only once.
        strip_punctuation: When true, remove every character that is neither
            a word character nor whitespace from a line before tokenizing.

    Returns:
        Dict mapping poet name -> ``(candidates, probability_dist)`` where
        ``candidates`` is a list of template strings and
        ``probability_dist`` is the parallel list of their relative
        frequencies. If no template of a poet reaches ``min_count`` both
        lists are empty (such an entry cannot be sampled from).
    """
    templates = {}

    for poet, filename in poets.items():
        poet_template = {}

        with open(filename, "r") as f:
            for line in f:
                # Skip blank/very short lines. The length is measured on the
                # raw line, i.e. including its trailing newline.
                if len(line) <= 10:
                    continue

                line = line.strip()
                if strip_punctuation:
                    line = re.sub(r'[^\w\s]', '', line)

                text = nltk.word_tokenize(line)
                template = [tag for _, tag in nltk.pos_tag(text)]

                # If the template length exceeds the threshold, don't include it
                if len(template) > threshold:
                    continue

                template_str = "-".join(template)
                poet_template[template_str] = poet_template.get(template_str, 0) + 1

        # Filter out rare templates (by default: those seen only once)
        filtered_templates = {
            template: count
            for template, count in poet_template.items()
            if count >= min_count
        }

        # Total number of kept lines. sum() returns 0 for an empty dict,
        # whereas reduce() without an initial value raises TypeError.
        total_count = sum(filtered_templates.values())

        # Unique templates and their parallel probability distribution
        candidates = list(filtered_templates)
        probability_dist = [filtered_templates[t] / total_count for t in candidates]
        templates[poet] = (candidates, probability_dist)

    return templates

def random_weighted_template(templates, poet):
    """Draw one template for `poet` at random, weighted by its probability.

    `templates[poet]` must hold the candidate template strings at index 0
    and their parallel probabilities at index 1. The chosen template string
    is split on "-" and returned as a list of tags.
    """
    entry = templates[poet]
    options = entry[0]
    weights = entry[1]

    # Sample a single index into `options` according to `weights`.
    drawn = choice(len(options), size=1, p=weights)
    picked = options[drawn.item()]

    return picked.split("-")

# Build the per-poet template distributions once, when this cell is run;
# this reads every file listed in `poets`.
templates = build_templates(poets)

In [None]:
# Sample one template for Frost and show it as a list of tags.
print(random_weighted_template(templates, "Frost"))

In [None]:
def build_common_words(poets):
    """Count whitespace-separated tokens across all poets' files.

    Only lines longer than 10 characters (measured on the raw line,
    including its newline) are considered. Tokens are counted exactly as
    they appear: no case folding and no punctuation removal.

    Returns a pair ``(word_counts, words)``: the token -> count dict, and the
    list of tokens ordered from most to least frequent (ties keep the order
    in which the tokens were first seen).
    """
    tally = {}

    for path in poets.values():
        with open(path, "r") as handle:
            for raw in handle:
                if len(raw) <= 10:
                    continue
                # split() with no separator already discards surrounding whitespace
                for token in raw.split():
                    tally[token] = tally.get(token, 0) + 1

    # sorted() is stable, also with reverse=True, so equal counts keep first-seen order
    ranked = sorted(tally, key=lambda token: tally[token], reverse=True)

    return tally, ranked
        
# Count tokens over every file in `poets`; `words` is ordered by descending count.
word_counts, words = build_common_words(poets)

In [None]:
# Show the first 100 entries of `words`.
print(words[:100])