In [26]:
from HMM import unsupervised_HMM
from HMM_helper import sample_sentence, parse_observations
import numpy as np
import random
import pickle
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
# you have to install pyphen
# import pyphen
from nltk.corpus import cmudict
import json


### NOTES:


#### 1) We're currently using their HMM file (now using pset6 solutions)
#### 2) Rewrite the read_files function to include ours (eg. so all_words instead of poems) and keep the syllables and rhymes
#### 3) LOAD OUR HMM MODEL - (don't need to anymore because of 1)
#### 4) First sonnet: their HMM, using poems
#### 5) Second sonnet: their HMM, using all_words


In [2]:
f = open(os.getcwd()+'/data/shakespeare.txt')

all_words = []
lines = f.readlines()
new_lines = []
sonnets=[]
sonnet=[]
poems = []


for line in lines:
    line = line.strip()
    line = line.lower()
    newPunct = "!#$%&()*'+,./:;<=>?@[\]^_`{|}~"
#     line = ''.join(c for c in line if c not in newPunct)
    for punct in newPunct:
        line = line.replace(punct, '')
    if line.isdigit():
        sonnets.append(sonnet)
        sonnet = []
    elif line.strip() == '':
        pass
    else:
        all_words.append(nltk.word_tokenize(line)) # this is all tokens in each line
        sonnet.append(line)
sonnets.append(sonnet)
del sonnets[0]

In [3]:
def strip_punct(s):
    '''
    This function strips the punctuation of any given string. 
    Input: 
        s: string to strip punctuation from 
    Output: 
        string stripped of punctuation
    '''
    # newPunct = punctuation.replace("'", "")
    # newPunct = punctuation.replace("-", "")
    newPunct = '''!"#$%&()*+,./:;<=>?@[\]^_`{|}~'''
    return ''.join(c for c in s if c not in newPunct)

This lower cell reads in the text file, then returns the list of words and the number of syllables in either the line or whole poem. 

In [4]:
def read_files(sep='poem'):
    '''
    This function reads the shakespeare and syllable files given to us.
    Input: 
        sep: either 'line' or 'poem'. If line, shakeLines is a separate entry 
            per line, and if poem, shakeLines is a separate entry per poem
    Output: 
        shakeLines: A 2D list with each element being a list of the words in the
            line or poem
        syllables: a dictionary with each key being a word and it's value 
            being how many syllables it has. The changed number of syllables
            with the word being at the end of a line is currently ignored.
    '''

    # format: each line is an individual list of words in that line
    shakeLines = []
    # read in the shakespeare poems 
    if sep == 'line':
        with open("./data/shakespeare.txt") as poems:
            for index, line in enumerate(poems):
                # super jank way to get rid of line numbers, but it works!
                if line != "\n" and len(line) != 23 and len(line) != 22 and len(line) != 21:
                    line = line.lower()
                    shakeLines.append(re.findall(r"[\w']+", strip_punct(line.rstrip("\n"))))
    
    # format: each poem is an individual list of words in that poem
    if sep == 'poem':
        file = open("./data/shakespeare.txt")
        data = file.read()
        paragraph = data.split("\n\n\n")
        for poem in paragraph:
            poem = poem.replace('\n', ' ')
            poem = poem.lstrip()
            poem = poem.split(' ', 1)[1]
            poem = poem.lower()
            shakeLines.append(re.findall(r"[\w']+", strip_punct(poem.rstrip("\n"))))


    # format: dictionary of how many syllables each word is
    # note that syllable differences at end of lines are ignored
    syllables = {}
    with open("./data/Syllable_dictionary.txt") as syllDict:
        for line in syllDict:
            split = line.split()
            if len(split) == 3:
                (key, end, val) = line.split()
            else:
                (key, val) = line.split()

            # they're ordered by syllable length, so sometimes the E is last
            try:
                syllables[key] = int(val)
            except:
                syllables[key] = int(end)

    rhymes = {}
    if sep == 'line':
        # print(shakeLines)
        # sonnet format: abab cdcd efef gg
        for j in range(len(shakeLines)-1):
            line = shakeLines[j]
            last_word = line[-1]
            i = (j % 14) + 1
            # abab
            if i == 1 or i == 2 or i == 5 or i == 6 or i == 9 or i == 10:
                rhymes = make_rhyming_dictionary(rhymes, last_word, shakeLines[j+2][-1])
            # elif i == 3 or i == 4 or i == 7 or i == 8 or i == 11 or i == 12:
                # rhymes = make_rhyming_dictionary(rhymes, last_word, shakeLines[j-2][-1])
            elif i == 13:
                rhymes = make_rhyming_dictionary(rhymes, last_word, shakeLines[j+1][-1])
            # elif i == 14:
            #     rhymes = make_rhyming_dictionary(rhymes, last_word, shakeLines[j-1][-1])

    return shakeLines, syllables, rhymes


In [5]:
def make_rhyming_dictionary(dictionary, word1, word2):
    if word1 in dictionary:
        lst = dictionary.get(word1)
        if word2 not in lst:
            new_lst = dictionary[word1]
            new_lst.append(word2)
            dictionary[word1] = new_lst
    else:
        dictionary[word1] = [word2]

    if word2 in dictionary:
        lst = dictionary.get(word2)
        if word1 not in lst:
            new_lst = dictionary[word2]
            new_lst.append(word1)
            dictionary[word2] = new_lst
    else:
        dictionary[word2] = [word1]

    return dictionary

In [6]:
shakeLines, syllables, rhymes = read_files(sep='line')

This next function returns the sequences in the lines (not sure what this is to be honest), a dictionary with their encoding and the features (tokens).

In [7]:
def featurize(lines):
    '''
    This function returns the feature representation of a set of lines.
    Input: 
        lines: An iterable object with each element being a list of strings
    Output: 
        possiblePOS: the list of possible parts of speech, where the index of 
            each POS being the its number in the 
        POSlookup:  A 2D array being POS, [word, frequency] for the given POS
        features: The feature representation of the input
    '''
    possiblePOS = []
    POSlookup = []
    features = []
    for obs in lines:
        # POS is a list of tuples being (word, POS)
        POS = pos_tag(obs)
        poemFeatures = []
        # if it's a new POS, add it to the list
        for pair in POS: 
            if pair[1] not in possiblePOS:
                possiblePOS.append(pair[1])
                POSlookup.append([])
                POSlookup[possiblePOS.index(pair[1])].append([pair[0], 1])
            else: 
                firstCol = [row[0] for row in POSlookup[possiblePOS.index(pair[1])]]
                if pair[0] not in firstCol:
                    POSlookup[possiblePOS.index(pair[1])].append([pair[0], 1])
                else:
                    index = firstCol.index(pair[0])
                    POSlookup[possiblePOS.index(pair[1])][index][1] += 1
            # we are simply indexing using the order in which they appear

            poemFeatures.append(possiblePOS.index(pair[1]))
            # print(POSlookup[possiblePOS.index(pair[1])])
            


        features.append(poemFeatures)
    return possiblePOS, POSlookup, features


This function below calls featurize. Syllables is provided by calling 'read_files' above

example:
poems, syllables, _ = read_files(sep='poem')
lines, syllables, rhymes = read_files(sep='line')

In [8]:
def generate_words(emission, POSlookup, syllables, reverse=False, lastWord=None):
	'''
	This function generates a string given the emissions and the probabilities 
	of a word being emitted given a certain
	Input:
		emission: The list of emission, which represents the POS of the word
		POSlookup: A 2D array being POS, [word, frequency] for the given POS
		syllables: The dictionary of words and number of syllables each word has
		reverse: Whether to start from beginning or end of line
		rhymes: Dictionary of different rhymes
	Output:
		emStr: The sentence generated
	'''
	done = False
	if reverse:
		assert(lastWord is not None)
		while not done:
			emStr = lastWord
			try:
				syllableCount = syllables[lastWord]
			except:
				syllableCount = 2
				print(lastWord)
			for obs in emission:
				emRate = [row[1] for row in POSlookup[obs]]
				emWords = [row[0] for row in POSlookup[obs]]
				emRate = np.array(emRate)
				emRate = emRate/sum(emRate)

				index = np.random.choice(np.arange(len(emRate)), p=emRate)
				newWord = emWords[index]
				try:
					syllableCount += syllables[newWord]
				except:
					syllableCount += 2
					print(newWord)
				emStr = newWord + ' ' + emStr
				if syllableCount == 10:
					done = True
					break
	else:
		while not done:
			emStr = ''
			syllableCount = 0
			for obs in emission: 
				emRate = [row[1] for row in POSlookup[obs]]
				emWords = [row[0] for row in POSlookup[obs]]
				emRate = np.array(emRate)
				emRate = emRate/sum(emRate)

				index = np.random.choice(np.arange(len(emRate)), p=emRate)
				newWord = emWords[index]
				syllableCount += syllables[newWord]
				emStr = emStr + newWord + ' '
				if syllableCount == 10:
					done = True
					break
	return emStr



In [9]:
# HMM = unsupervised_HMM(features, 25, 100)
# emission, states = HMM.generate_emission(10)

#### This is the main function 

In [22]:
def generate_sonnet(poems, lines, syllables, rhymes=None):
	POSList, POSlookup, features = featurize(poems)
	HMM = unsupervised_HMM(features, 25, 100)
	emission, states = HMM.generate_emission(10)
	if rhymes is None:
		sonnet = ""
		for i in range(14):
			line = generate_words(emission, POSlookup, syllables)
			sonnet = sonnet + line + "\n"

	else:
		# abab cdcd efef gg
		sonnet = ["" for x in range(14)]
		line_idx = [0, 1, 4, 5, 8, 9, 12]
		for i in line_idx:
			# choose a random word in the dictionary
			key, val = random.choice(list(rhymes.items()))
			# choose a random word that rhymes with the previous one
			pair = np.random.choice(val)
			sonnet[i] += str(key)
			if i < 12:
				sonnet[i+2] += str(pair)
			else:
				sonnet[i+1] += str(pair)
		for i in range(len(sonnet)):
			line = generate_words(emission, POSlookup, syllables, True, sonnet[i])
			sonnet[i] = line
		sonnet = "\n".join(sonnet)
	print(sonnet)
	return sonnet


In [23]:
poems, syllables, _ = read_files(sep='poem')
lines, syllables, rhymes = read_files(sep='line')

In [24]:
sonnet2 = generate_sonnet(all_words, lines, syllables, rhymes)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
childrens
form if childrens another of seemed bind
turn which self of towers all of dressed longer
show of dignifies this in annexed kind
that odour of looks this thou born stronger
devise that needs the with gazed everywhere
do which glass as eyes this of buried stand
which guest upon summers all than i life
mightst that face of i a after made land
which purpose of lines this of been dearer
which world in looks a of slandered contains
that hate though numbers this though thy begin
petty in parts the as famoused remains
have worse barren as doth the since seen glad
i that thee in bonds that of falsehood sad


In [17]:
sonnet1 = sonnet

In [19]:
sonnet2

NameError: name 'sonnet2' is not defined

In [31]:
sonnet1[0]

'the little love-god lying once asleep'

In [32]:
with open('sonnet1.txt', 'w') as f:
    f.write(str(sonnet1))