In [10]:
import nltk
import re
from numpy.random import choice
from functools import reduce
import operator

poets = {"Shakespeare" : "Shakespeare_Sonnets.txt", "Frost" : "Robert_Frost.txt", "Whitman" : "Walt_Whitman.txt"}

def build_templates(poets):
    templates = {}
    
    # Threshold for template length. Templates are ignored if they are longer than this value.
    max_threshold = 20
    min_threshold = 3

    for poet, filename in poets.items():
        poet_template = {}

        with open(filename, "r") as f:
            for line in list(filter(lambda x: len(x) > 10, f)):
                line = line.strip()
                # Uncomment to strip punctuation
                # line = re.sub(r'[^\w\s]', '', line)
                text = nltk.word_tokenize(line)
                template = [word[1] for word in nltk.pos_tag(text)]

                # If the template length exceeds the threshold, don't include it
                if len(template) > max_threshold or len(template) < min_threshold:
                    continue
                    
                template_str = "-".join(template)
                
                count = poet_template.get(template_str, 0)
                poet_template[template_str] = count + 1
                                
        # Filter out rare templates that only occurred once in corpus
        filtered_templates = { template : val for template, val in poet_template.items() if val > 0 }
        sorted_x = sorted(filtered_templates.items(), key=operator.itemgetter(1))
        print(sorted_x)
        
        # Find the total number of processed lines by summing the counts in the filtered dictionary
        total_count = reduce(lambda a, b: a + b, filtered_templates.values())
        
        # Get a list of the unique templates
        candidates = list(filtered_templates.keys())
            
        # Compute the probability distribution and store it under the poet's name
        probability_dist = list(map(lambda x: filtered_templates[x] / total_count, candidates))
        templates[poet] = (candidates, probability_dist)
        
    return templates

def random_weighted_template(templates, poet):
    candidates, dist = templates[poet][0], templates[poet][1]
    template_index = choice(range(len(candidates)), 1, p=dist).item()

    return candidates[template_index].split("-")

templates = build_templates(poets)

[('IN-JJS-NNS-PRP-VBP-NN-,', 1), ('DT-VBZ-NN-POS-VBD-MD-RB-VB-,', 1), ('CC-IN-DT-NN-MD-IN-NN-NN-,', 1), ('PRP$-NN-NN-MD-VB-PRP$-NN-:', 1), ('CC-NN-,-VBD-TO-VB-JJ-JJ-NNS-,', 1), ('NNP-VBD-NN-POS-NN-IN-JJ-NN-,', 1), ('VBG-DT-NN-WRB-NN-VBZ-,', 1), ('NNP-PRP-JJ-NN-,-TO-VB-JJ-NNS-RB-JJ-:', 1), ('NN-IN-NN-RB-DT-NN-POS-JJ-NN-,', 1), ('CC-RB-NN-TO-DT-NN-NN-,', 1), ('IN-JJ-JJ-NN-JJS-JJ-NN-,', 1), ('CC-NN-NN-NN-NN-IN-NN-:', 1), ('NNP-DT-NN-,-CC-RB-DT-NN-VB-,', 1), ('TO-VB-DT-NN-POS-JJ-,-IN-DT-NN-CC-NN-.', 1), ('WRB-NN-NNS-MD-VB-JJ-NN-,', 1), ('CC-VB-JJ-NNS-IN-JJ-NN-POS-NN-,', 1), ('NNP-NN-POS-JJ-NN-RB-VBN-IN-RB-,', 1), ('MD-VB-DT-NN-MD-VB-IN-JJ-JJ-VBN-:', 1), ('RB-VBG-VBN-,-WRB-DT-JJ-NN-NNS-,', 1), ('WRB-PDT-DT-NN-IN-JJ-JJ-NNS-:', 1), ('TO-VB-,-IN-JJ-JJ-NN-JJ-NNS-,', 1), ('WRB-DT-JJ-NN-,-CC-JJ-NN-.', 1), ('WRB-RB-RBR-JJ-NN-MD-VB-NN-POS-NN-,', 1), ('IN-JJ-NN-NN-POS-JJ-NN-IN-NN', 1), ("NNP-NN-PRP$-NN-,-CC-VB-PRP$-JJ-NN-,-''", 1), ('VBG-PRP$-NN-IN-NN-NN-.', 1), ('DT-VBD-TO-VB-JJ-VBN-WRB-JJ-NN-JJ-,'

[('PRP-VBP-VBG-IN-TO-VB-DT-NN-NN-:', 1), ('PRP-MD-RB-VB-TO-VB-DT-NNS-RP', 1), ('(-CC-VB-TO-VB-DT-NN-JJ-,-PRP-MD-)-:', 1), ('PRP-VBP-VBG-IN-TO-VB-DT-JJ-NN', 1), ('DT-VBZ-NN-IN-DT-NN-.-PRP-VBZ-RB-JJ-.', 1), ('PRP-VBZ-WRB-PRP-VBZ-PRP-IN-PRP$-NN-.', 1), ('PRP-VBZ-IN-DT-JJ-,-NN-,-NN-NN-.', 1), ('NNS-VBP-RB-WP-DT-NN-NN-VBP', 1), ('WP-NN-DT-JJ-NN-IN-PRP', 1), ('DT-NNS-RP-IN-DT-JJ-NN', 1), ('NNP-NN-NNS-IN-DT-NNS-NN-.', 1), ('PRP-VBP-JJ-NN-,-CC-JJ-CC-JJ-,', 1), ('IN-CD-,-NN-,-VBP-JJ-CC-JJ-,', 1), ('IN-NN-IN-PRP-IN-RB-NNS-,', 1), ('CC-RB-,-IN-NN-IN-WRB-JJ-NNS-,', 1), ('JJ-JJ-NNS-IN-MD-VB-VBN-.', 1), ('PRP$-NNP-NNP', 1), ('NNP-NNP-,-WRB-PRP-VBZ-RB-IN-PRP-,', 1), ('VBZ-DT-JJ-NNS-IN-NN-NN', 1), ('RB-JJ-IN-NNS-MD-VB-:', 1), ('PRP-VBZ-DT-NN-,-DT-JJ-NN-:', 1), ('PRP-VBD-DT-JJ-NN-NN-.', 1), ('PRP$-NN-MD-RB-VB-PRP-VB-.', 1), ('PRP-NNS-CC-PRP-VBP-RB-TO-NN-:', 1), ('PRP-VBZ-JJ-DT-NNS-VBP-VBN-RB-,', 1), ('PRP-VBZ-JJ-PRP$-JJ-JJ-NN', 1), ('VBZ-RB-RB-IN-VBG-NN-.', 1), ('DT-NN-,-VBD-NNS-,', 1), ('DT-JJ-NN-,-DT

[('NN-PRP-VBP-,-DT-JJ-JJ-NN-,', 1), ('RB-JJ-DT-NN-NNP-,-DT-NN-NNP-.', 1), ('IN-NN-IN-JJ-TO-VB-PRP-VBG-,', 1), ('RB-VB-RB-CC-NN-RB-VBZ-JJ-IN-DT-NNP-,-PRP', 1), ('VB-DT-NNP-NN-VBZ-JJR-RB-,', 1), ('DT-NNP-RB-IN-DT-NNP-PRP-VBP-.', 1), ('IN-NNP-NN-IN-NN-,-NN-,-CC-NN-,', 1), ('NNP-,-IN-JJ-NN-NN-MD-IN-DT-NNS-NN-,', 1), ('DT-NNP-NNP-PRP-VBP-.', 1), ("IN-PRP-VBP-''-NNP-NNP-.", 1), ('IN-PRP-VBP-MD-IN-NN-,', 1), ('VBG-IN-PRP$-NNS-,-VBG-,-VBG-RB-,', 1), ('DT-NNP-NN-IN-PRP-IN-JJ-NN-,', 1), ('JJ-IN-NN-,-NN-,-CC-NN-,', 1), ('DT-NN-IN-NNS-IN-JJ-NNS-,', 1), ('IN-TO-PRP-VBG-IN-NN-PRP$-NNS-,', 1), ('IN-NN-VBG-TO-JJ-JJ-NNS-,', 1), ('CC-VBG-NN-,-WP-JJS-NN-.-PRP-VBD-,', 1), ('NNP-VBZ-RB-EX-VBZ-CC-CD-NN-IN-VBG-NNS-.', 1), ('CC-DT-VBZ-DT-NN-IN-NNP-,-DT-NN-IN-NNS-,', 1), ('DT-NN-IN-JJ-NNS-.', 1), ('VB-PRP-RB-,-RB-PRP-VBP-CD-,', 1), ('PRP-RB-JJ-NNP-RB-VBG-NN-,-CC-DT-RBR-CC-JJR-CD', 1), ('VBN-IN-PRP$-NN-IN-VBG-NN-,-IN-NN-,-NN-CC', 1), ('NN-,-NN-NN-POS-CC-NN-,', 1), ('(-CC-NNS-JJ-,-CC-RB-JJ-IN-JJ-,-IN-DT-JJ-,-)-D

In [19]:
print(random_weighted_template(templates, "Frost"))

['PRP', 'MD', 'VB', 'WP', 'VBZ', 'VBG', 'NNP', ',', 'RB', '.']


In [3]:
def build_common_words(poets):
    word_counts = {}
    
    for poet, filename in poets.items():
        with open(filename, "r") as f:
            for line in list(filter(lambda x: len(x) > 10, f)):
                line = line.strip()

                words = line.split()
                for word in words:
                    count = word_counts.get(word, 0)
                    word_counts[word] = count + 1
                    
    words = list(word_counts.keys())
    words.sort(key=lambda x: -word_counts[x])
                
    return word_counts, words
        
word_counts, words = build_common_words(poets)

In [4]:
print(words[:100])

['the', 'and', 'of', 'I', 'to', 'in', 'a', 'The', 'with', 'And', 'is', 'my', 'you', 'that', 'it', 'for', 'not', 'or', 'as', 'all', 'on', 'me', 'be', 'from', 'are', 'his', 'at', 'have', 'by', 'was', 'he', 'To', 'what', 'they', 'O', 'But', 'one', 'their', 'your', 'see', 'but', 'so', 'A', 'thy', 'no', 'we', 'out', 'this', 'them', 'will', 'do', 'more', 'than', 'her', 'there', 'me,', 'That', 'shall', 'him', 'You', 'its', 'had', 'if', 'For', 'up', 'thou', 'know', 'It', 'He', 'In', 'am', 'where', 'As', 'like', 'through', 'old', 'love', 'an', 'some', 'Of', 'these', 'now', 'any', 'when', 'you,', 'With', 'has', 'yet', 'who', 'were', 'those', 'Or', 'every', 'can', 'she', 'What', 'They', 'THE', 'make', 'would']


In [None]:
def generate_collocations(filename):
    collocations = {{}}
    
    with open(filename, "r") as f:
            for line in list(filter(lambda x: len(x) > 10, f)):
                line = line.strip()
                
            