In [3]:
import nltk
import re
from numpy.random import choice
from functools import reduce
import operator

# Corpus text file for each poet in the sonnet group, keyed by poet name.
sonnet_poets = {
    "Shakespeare": "Shakespeare_Sonnets.txt",
    "Frost": "Robert_Frost.txt",
    "Whitman": "Walt_Whitman.txt",
}

# Corpus text file for the limerick collection, keyed the same way.
limerick_poets = {
    "Limericks": "limericks.txt",
}

def build_templates(poets, max_threshold=20, min_threshold=1, min_count=1, lines_per_stanza=5):
    """Build part-of-speech (POS) line templates for each poet's corpus.

    Every line longer than five characters (newline included) is tokenized and
    POS-tagged with NLTK; the resulting sequence of tags is the line's
    "template".

    Args:
        poets: Mapping of poet name -> path of the text file holding that
            poet's corpus.
        max_threshold: Templates with more tags than this are ignored.
        min_threshold: Templates with fewer tags than this are ignored.
        min_count: Minimum number of occurrences a template needs within one
            poet's corpus to be kept as a candidate. The default of 1 keeps
            every template.
        lines_per_stanza: Number of line positions tracked in the ``'Lines'``
            map (5 matches a limerick).

    Returns:
        A dict holding, for each poet, a tuple ``(candidates, probability_dist)``
        where ``candidates`` is a list of "-"-joined tag strings and
        ``probability_dist`` the matching relative frequencies. The extra key
        ``'Lines'`` maps each line position ``1..lines_per_stanza`` to the list
        of templates (as tag lists) recorded at that position, accumulated
        over all poets.

    Side effects:
        Prints each poet's (template, count) pairs sorted by ascending count.
    """
    templates = {}
    row_map = {row: [] for row in range(1, lines_per_stanza + 1)}

    for poet, filename in poets.items():
        poet_template = {}

        with open(filename, "r") as f:
            line_num = 1
            for line in f:
                # Skip blank / very short lines (the length includes the newline).
                if len(line) <= 5:
                    continue

                line = line.strip()
                # Uncomment to strip punctuation
                # line = re.sub(r'[^\w\s]', '', line)
                text = nltk.word_tokenize(line)
                template = [tagged[1] for tagged in nltk.pos_tag(text)]

                # A line that yields no tokens (whitespace only) is not a verse
                # line, so it must not consume a line position.
                if not template:
                    continue

                # Claim this line's position *before* the length check: a line
                # that is rejected below still occupies its slot in the stanza,
                # otherwise every following line would be filed one row early.
                row = line_num
                line_num = (line_num % lines_per_stanza) + 1

                # If the template length is outside the thresholds, don't include it
                if len(template) > max_threshold or len(template) < min_threshold:
                    continue

                template_str = "-".join(template)
                poet_template[template_str] = poet_template.get(template_str, 0) + 1
                row_map[row].append(template)

        # Keep only templates that occurred at least `min_count` times in this corpus
        filtered_templates = {
            template: count
            for template, count in poet_template.items()
            if count >= min_count
        }
        sorted_x = sorted(filtered_templates.items(), key=operator.itemgetter(1))
        print(sorted_x)

        # Total number of processed lines; sum() is 0 for an empty corpus,
        # where reduce() without an initial value would raise TypeError.
        total_count = sum(filtered_templates.values())

        # Get a list of the unique templates
        candidates = list(filtered_templates.keys())

        # Compute the probability distribution and store it under the poet's name
        probability_dist = [filtered_templates[template] / total_count for template in candidates]
        templates[poet] = (candidates, probability_dist)

    templates['Lines'] = row_map
    return templates

def random_weighted_template(templates, poet):
    """Draw one template for ``poet`` at random, weighted by its probability.

    ``templates[poet]`` is read as a ``(candidates, distribution)`` pair; the
    selected "-"-joined candidate is returned split into a list of tags.
    """
    entry = templates[poet]
    candidates = entry[0]
    weights = entry[1]

    # Sample a single index according to the weights, then unwrap it to a scalar.
    indices = range(len(candidates))
    picked = choice(indices, 1, p=weights)

    chosen = candidates[picked.item()]
    return chosen.split("-")

# Build the POS template table from the limerick corpus.
templates = build_templates(poets=limerick_poets)

[('DT-JJ-JJ-NN-VBN-NNP', 1), ('VBN-TO-VB-DT-NN-VBN-NNP-.', 1), ("``-CC-,-''-PRP-VBD-,-``-PRP-MD-VB", 1), ('WP-DT-JJ-NN', 1), ('VB-IN-NNP-VB-NNP-NNP-.', 1), ('EX-VBD-DT-JJ-NN-RB-JJ', 1), ('PRP-RB-VBD-WRB-PRP-VBD-VBN-:', 1), ('PRP-MD-VB-TO-DT-NN', 1), ('CC-VB-RB-IN-NN-,', 1), ('IN-IN-PRP-MD-VBN-RB-VBN-.', 1), ('DT-NN-IN-NN-,-NNP-NNP-,', 1), ('VBN-RP-IN-NNP-CC-NNP-POS-,', 1), ('VBN-IN-DT-NN-,', 1), ('NNP-PRP$-NN-,-``-PRP-VBZ-NN', 1), ("PRP-VBP-VBG-PRP-:-:-IN-NNS-.-''", 1), ('DT-NN-,-WP-VBD-IN-NNP-NNP-,', 1), ("VBN-CD-NNS-IN-PRP$-NN-''", 1), ('PRP-VBD-,-IN-DT-NN-,', 1), ('``-DT-NN-NN-:-RB-PRP', 1), ("RB-VBN-PRP-,-RB-WRB-PRP-VBP-JJ-.-''-''", 1), ('DT-NN-,-RB-JJ-,', 1), ('CD-NN-VBD-TO-PRP$-NN-,', 1), ('``-DT-NN-MD-MD', 1), ('NN-IN-PRP-MD-:', 1), ("CC-DT-NN-MD-RB-MD-DT-MD-,-MD-PRP-.-''", 1), ('RB-VBZ-TO-DT-NN-,', 1), ('DT-NN-WDT-VBZ-DT-JJR', 1), ('IN-DT-NN-IN-DT-JJ-NN-:', 1), ('CC-DT-NN-IN-PRP-VBZ', 1), ('JJ-NNS-IN-NNS-,', 1), ('CC-DT-VBZ-WRB-DT-NN-VBZ-IN-.', 1), ('DT-JJ-JJ-NN-IN-NNP', 1), ('

In [4]:
# Show the templates grouped by line position, as stored under the 'Lines' key.
print(templates["Lines"])

{1: [['DT', 'JJ', 'JJ', 'NN', 'VBN', 'NNP'], ['EX', 'VBD', 'DT', 'JJ', 'NN', 'RB', 'JJ'], ['DT', 'NN', 'IN', 'NN', ',', 'NNP', 'NNP', ','], ['DT', 'NN', ',', 'WP', 'VBD', 'IN', 'NNP', 'NNP', ','], ['DT', 'NN', ',', 'RB', 'JJ', ','], ['RB', 'VBZ', 'TO', 'DT', 'NN', ','], ['CC', 'DT', 'VBZ', 'WRB', 'DT', 'NN', 'VBZ', 'IN', '.'], ['``', 'PRP', 'VBP', 'RB', 'VB', ':', 'PRP', 'RB', 'VBD', 'IN', 'DT', 'NN', '.', "''"], ['TO', 'VB', 'JJ', 'NNS', 'IN', 'PRP$', 'NN', '.'], ['PRP', 'VBD', 'RP', 'IN', 'NN', 'CC', 'NN', '.']], 2: [['VBN', 'TO', 'VB', 'DT', 'NN', 'VBN', 'NNP', '.'], ['PRP', 'RB', 'VBD', 'WRB', 'PRP', 'VBD', 'VBN', ':'], ['VBN', 'RP', 'IN', 'NNP', 'CC', 'NNP', 'POS', ','], ['VBN', 'CD', 'NNS', 'IN', 'PRP$', 'NN', "''"], ['CD', 'NN', 'VBD', 'TO', 'PRP$', 'NN', ','], ['DT', 'NN', 'WDT', 'VBZ', 'DT', 'JJR'], ['DT', 'JJ', 'JJ', 'NN', 'IN', 'NNP'], ['EX', 'VBD', 'DT', 'JJ', 'NN', 'IN', 'NNP', ','], ['DT', 'NN', 'IN', 'PRP$', 'NN', 'VBD', 'NNP', 'NNP']], 3: [['``', 'CC', ',', "''", 'PRP',

In [2]:
# Draw one limerick template at random, weighted by its probability.
print(random_weighted_template(templates, "Limericks"))

['DT', 'NN', 'WDT', 'VBZ', 'DT', 'JJR']


In [9]:
def build_common_words(poets):
    """Tally whitespace-separated words across every corpus file in ``poets``.

    Only lines longer than 10 characters are read. Words are counted exactly
    as they appear after splitting on whitespace.

    Returns:
        ``(word_counts, words)``: the word -> count dict, and the distinct
        words ordered from most to least frequent.
    """
    tally = {}

    for path in poets.values():
        with open(path, "r") as handle:
            long_lines = [raw for raw in handle if len(raw) > 10]

        for raw in long_lines:
            for token in raw.strip().split():
                tally[token] = tally.get(token, 0) + 1

    # Stable descending sort: equally frequent words keep first-seen order.
    ranked = sorted(tally, key=lambda token: tally[token], reverse=True)

    return tally, ranked
        
# Count words over the limerick corpus. The name `poets` used here before is
# not defined at notebook level, so the cell failed on a fresh kernel.
word_counts, words = build_common_words(limerick_poets)

In [10]:
# Show the first 100 entries of the frequency-ordered word list.
print(words[:100])

['a', 'A', 'He', 'young', 'named', 'to', 'he', 'said,', 'the', 'Phoebe', 'was', 'by', 'with', 'certain', 'fellow', 'Bee-Bee', 'Wished', 'wed', 'woman', 'Phoebe.', '"But,"', '"I', 'must', 'see', 'What', 'clerical', 'fee', 'Be', 'before', 'be', 'Bee-Bee.', 'There', 'man', 'so', 'benighted', 'never', 'knew', 'when', 'slighted;', 'would', 'go', 'party', 'And', 'eat', 'just', 'as', 'hearty,', 'As', 'if', "he'd", 'been', 'really', 'invited.', 'maiden', 'at', 'college,', 'Miss', 'Breeze,', 'Weighed', 'down', 'B.A.s', 'and', "Lit.D's,", 'Collapsed', 'from', 'strain,', 'Said', 'her', 'doctor,', '"It\'s', 'plain', 'You', 'are', 'killing', 'yourself', '---', 'degrees!"', 'painter,', 'who', 'lived', 'in', 'Great', 'Britain,', 'Interrupted', 'two', 'girls', 'their', "knittin'", 'sigh,', '"That', 'park', 'bench--well', 'I', 'Just', 'painted', 'it,', 'right', 'where', "you're", 'sittin.\'"']


In [None]:
def generate_collocations(filename):
    """Walk the lines of ``filename`` that are longer than 10 characters.

    NOTE(review): work in progress — each line is only stripped so far;
    nothing is stored in ``collocations`` and nothing is returned yet.
    """
    # Was `{{}}`: a set literal containing a dict, which raises
    # TypeError (dicts are unhashable) as soon as the function is called.
    collocations = {}
    
    with open(filename, "r") as f:
            for line in list(filter(lambda x: len(x) > 10, f)):
                line = line.strip()
                
            