In [2]:
import nltk
import re
from numpy.random import choice
from functools import reduce
import operator

# Corpus registry: poet name -> path of the plain-text file holding that poet's work.
poets = {
    "Shakespeare": "Shakespeare_Sonnets.txt",
    "Frost": "Robert_Frost.txt",
    "Whitman": "Walt_Whitman.txt",
}

def build_templates(poets, threshold=10, verbose=True):
    """Build a weighted set of part-of-speech templates for each poet.

    Each line of a poet's corpus file whose raw length (including any
    trailing newline) is greater than 10 characters is tokenized and
    POS-tagged with NLTK. The tags, joined with "-", form the line's
    template. Templates with more than ``threshold`` tags are ignored, and
    templates that occur only once in a corpus are discarded.

    Args:
        poets: Mapping of poet name -> path of that poet's corpus text file.
        threshold: Maximum number of tags a template may contain; longer
            templates are ignored.
        verbose: When true, print each poet's surviving
            ``(template, count)`` pairs sorted by ascending count.

    Returns:
        Dict mapping poet name -> ``(candidates, probability_dist)``.
        ``candidates`` is a list of template strings and
        ``probability_dist`` is the parallel list of relative frequencies.
        Both lists are empty when no template occurred more than once.
    """
    templates = {}

    for poet, filename in poets.items():
        poet_template = {}

        with open(filename, "r") as f:
            # Iterate lazily; there is no need to hold the whole file in memory.
            for line in f:
                # Skip blank / very short lines. The length is measured on the
                # raw line, before stripping.
                if len(line) <= 10:
                    continue

                line = line.strip()
                # Uncomment to strip punctuation
                # line = re.sub(r'[^\w\s]', '', line)
                text = nltk.word_tokenize(line)
                template = [tag for _, tag in nltk.pos_tag(text)]

                # If the template length exceeds the threshold, don't include it
                if len(template) > threshold:
                    continue

                template_str = "-".join(template)
                poet_template[template_str] = poet_template.get(template_str, 0) + 1

        # Filter out rare templates that only occurred once in corpus
        filtered_templates = {
            template: count for template, count in poet_template.items() if count > 1
        }

        if verbose:
            print(sorted(filtered_templates.items(), key=operator.itemgetter(1)))

        # Total number of lines that produced a surviving template. sum() is
        # used instead of reduce() without an initializer, which raises
        # TypeError when nothing survived the filter.
        total_count = sum(filtered_templates.values())

        # Unique templates and their relative frequencies, in matching order.
        # With no surviving templates both lists are empty and no division
        # takes place.
        candidates = list(filtered_templates)
        probability_dist = [filtered_templates[c] / total_count for c in candidates]
        templates[poet] = (candidates, probability_dist)

    return templates

def random_weighted_template(templates, poet):
    """Draw one template for ``poet`` according to its stored probabilities.

    Args:
        templates: Mapping of poet name -> ``(candidates, probability_dist)``
            as returned by ``build_templates``.
        poet: Key into ``templates``.

    Returns:
        The chosen template as a list of tag strings, obtained by splitting
        the stored template on "-".
    """
    poet_entry = templates[poet]
    tag_strings = poet_entry[0]
    weights = poet_entry[1]

    # Sample an index rather than the string itself, then look the string up.
    positions = range(len(tag_strings))
    picked = choice(positions, 1, p=weights).item()

    return tag_strings[picked].split("-")

# Build the per-poet template distributions; this reads and POS-tags every
# corpus file listed in `poets` and prints each poet's surviving templates.
templates = build_templates(poets)

[('NN', 2)]
[('CD-IN-PRP$-NNS-VBZ-IN-DT-JJ-NNS-,', 2), ('RB-JJ-CC-NN-PRP-RB-VBP-DT-NN-,', 2), ('CC-VBD-RB-IN-DT-NN-IN-NN-.', 2), ('PRP-MD-RB-VB-VBN-CC-IN-DT-NN', 2), ('NNP-PRP$-NN-PRP-MD-VB-RP-,', 2), ('NN-IN-RB-VBG-JJ-NN-,', 2), ('CC-NN-WRB-DT-JJ-NN-VBZ-DT-NN-.', 2), ('CC-DT-MD-RB-VB-NN-IN-PRP$-NN', 2), ('TO-VB-PRP-,-WP-MD-VB-PRP-VB', 2), ('PRP-MD-RB-VB-PRP-VBN-IN-PRP-PRP', 2), ('RB-RBR-JJ-IN-DT-PRP-VBD-VBD-JJ-.', 2), ('PRP-VBP-IN-DT-JJ-NN-PRP-VBP', 2), ('DT-VBD-JJ-DT-NN-IN-,', 2), ('CC-VBD-DT-NN-CC-DT-JJ-NNS-,', 2), ('CC-DT-NN-IN-WDT-DT-NN-VBZ-,', 2), ('CC-DT-JJ-JJ-NNS-VBP-.', 2), ('NNP-VBD-NNS-DT-NNS-NN', 2), ('DT-NNS-VBP-RB-TO-DT-JJ-NN-:', 2), ('DT-NN-NN-VBZ-VBN-CD-NN', 2), ('IN-JJ-NN-CC-JJ-WRB-DT-NN-VBZ-:', 2), ('DT-NN-RB-TO-DT-NN-VBZ-VBN-.', 2), ('PRP-VBP-IN-DT-RB-JJ-NN', 2), ('IN-DT-VBD-NN-RB-RB-RB', 2), ('IN-DT-JJ-CC-JJ-NN', 2), ('DT-VBZ-DT-NN-RB-IN-DT-NN-.', 2), ('NNP-VBZ-:-DT-JJ-NNS-JJ-CC-NN-:', 2), ('DT-NN-VBZ-VBG-TO-VB', 2), ('CC-NN-CC-NN-CC-NN-IN-:', 2), ('PRP-VBP-PRP-VB-R

In [2]:
# Draw one template for Frost and show it as a list of POS tags.
# NOTE(review): no random seed is set in the visible cells, so the printed
# template is expected to differ from run to run.
print(random_weighted_template(templates, "Frost"))

['NNP', 'PRP', 'IN', 'DT', 'NN', 'MD', 'VB', ':']


In [3]:
def build_common_words(poets):
    """Tally whitespace-delimited tokens across every poet's corpus.

    Lines whose raw length (including any trailing newline) is 10 characters
    or fewer are skipped. Tokens are counted exactly as they appear after
    splitting on whitespace: no case folding and no punctuation stripping.

    Args:
        poets: Mapping of poet name -> path of that poet's corpus text file.

    Returns:
        ``(word_counts, words)`` where ``word_counts`` maps each token to its
        total occurrence count over all files, and ``words`` lists the tokens
        from most to least frequent (ties keep first-seen order).
    """
    tally = {}

    for _, corpus_path in poets.items():
        with open(corpus_path, "r") as corpus:
            for raw_line in corpus:
                # Guard clause: ignore blank / very short lines.
                if len(raw_line) <= 10:
                    continue

                for token in raw_line.strip().split():
                    tally[token] = tally.get(token, 0) + 1

    # Stable descending sort by count.
    ranked = sorted(tally, key=tally.get, reverse=True)

    return tally, ranked
        
# Count whitespace-separated tokens across all corpora in `poets`;
# `words` is ordered from most to least frequent.
word_counts, words = build_common_words(poets)

In [4]:
# Show the 100 most frequent tokens. Tokens are raw whitespace-split strings,
# so case variants ('the' / 'The') and tokens with trailing punctuation
# ('me,') are counted as separate entries.
print(words[:100])

['the', 'and', 'of', 'I', 'to', 'in', 'a', 'The', 'with', 'And', 'is', 'my', 'you', 'that', 'it', 'for', 'not', 'or', 'as', 'all', 'on', 'me', 'be', 'from', 'are', 'his', 'at', 'have', 'by', 'was', 'he', 'To', 'what', 'they', 'O', 'But', 'one', 'their', 'your', 'see', 'but', 'so', 'A', 'thy', 'no', 'we', 'out', 'this', 'them', 'will', 'do', 'more', 'than', 'her', 'there', 'me,', 'That', 'shall', 'him', 'You', 'its', 'had', 'if', 'For', 'up', 'thou', 'know', 'It', 'He', 'In', 'am', 'where', 'As', 'like', 'through', 'old', 'love', 'an', 'some', 'Of', 'these', 'now', 'any', 'when', 'you,', 'With', 'has', 'yet', 'who', 'were', 'those', 'Or', 'every', 'can', 'she', 'What', 'They', 'THE', 'make', 'would']
