In [1]:
from sklearn.model_selection import KFold
import keras.preprocessing.text
import os
import numpy as np
from IPython.display import HTML
import random
from HMM import unsupervised_HMM
import nltk

Using TensorFlow backend.


## Tokenize

In [2]:
# Characters removed before tokenizing: digits and punctuation. The apostrophe
# and hyphen are not in this set, so tokens such as "perceiv'st" or
# "well-tuned" stay intact.
TOKEN_FILTERS = '0123456789!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'

# Read the whole corpus; the context manager closes the handle once it is read.
with open('data/dataset.txt', 'r') as file:
    s = file.read()

# Flat list of lower-cased word tokens for the entire corpus.
txt_list = keras.preprocessing.text.text_to_word_sequence(s,
                                                          filters=TOKEN_FILTERS,
                                                          lower=True,
                                                          split=' ')

# Fit the vocabulary on those tokens. Every element of txt_list is treated as
# its own text, so texts_to_sequences returns one (short) list per word.
Tokenizer = keras.preprocessing.text.Tokenizer(num_words=None,
                                               filters=TOKEN_FILTERS,
                                               lower=True,
                                               split=" ",
                                               char_level=False,
                                               oov_token=None)
Tokenizer.fit_on_texts(txt_list)
sequences = Tokenizer.texts_to_sequences(txt_list)

# Vocabulary statistics exposed by the fitted tokenizer.
word_docs = Tokenizer.word_docs
word_index = Tokenizer.word_index    # word -> id (the notebook subtracts 1 to get zero-based ids)
word_counts = Tokenizer.word_counts

## Load Dictionary

In [3]:
import json


def _load_json(path):
    """Parse the JSON file at `path` and close the handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


# Rhyme groups need strictly more than 4 words (i.e. at least this many) to be kept.
MIN_RHYME_GROUP_SIZE = 5

syllable_dict = _load_json('data/my_syllable_dict.json')
word_to_syllable_dict = _load_json('data/word_to_syllable_dict.json')
rhyme_dict = _load_json('data/my_rhyme_dict.json')
word_to_rhyme_dict = _load_json('data/word_to_rhyme_dict.json')

# Inverse of the tokenizer vocabulary: id -> word.
reverse_dict = {index: word for word, index in word_index.items()}

# Keep only the rhyme groups that are large enough ...
filtered_rhyme_dict = {
    rhyme: rhyming_words
    for rhyme, rhyming_words in rhyme_dict.items()
    if len(rhyming_words) >= MIN_RHYME_GROUP_SIZE
}
# ... and only the words whose rhyme group survived that filter.
filtered_word_to_rhyme_dict = {
    word: rhyme
    for word, rhyme in word_to_rhyme_dict.items()
    if rhyme in filtered_rhyme_dict
}

## Generate Sequence

In [4]:
def _encode_words(words, Tokenizer):
    '''Map each word to its zero-based tokenizer id, skipping unencodable words.'''
    ids = []
    for encoded in Tokenizer.texts_to_sequences(words):
        # One list comes back per word; it is empty when the tokenizer produced
        # no id for that word, in which case there is nothing to keep.
        if encoded:
            ids.append(encoded[0] - 1)
    return ids


def generate_sequence(file_name, seq_type, Tokenizer):
    '''
    Read a text file and turn it into sequences of zero-based token ids.

    file_name: path of the text file to read (used for both sequence types)
    seq_type:  'poem' -> one sequence per poem, poems being separated by
                         number tokens or tokens containing '*'
               'line' -> one sequence per text line holding more than one token
    Tokenizer: fitted tokenizer exposing texts_to_sequences()

    returns X: a list of list, tokenized (each id is the tokenizer id minus 1)
    raises ValueError: if seq_type is neither 'poem' nor 'line'
    '''
    if seq_type not in ('poem', 'line'):
        raise ValueError("seq_type must be 'poem' or 'line', got %r" % (seq_type,))

    with open(file_name, 'r') as file:
        s = file.read()

    if seq_type == 'poem':
        # Digits and '*' are kept by this filter because they mark poem boundaries.
        s_list = keras.preprocessing.text.text_to_word_sequence(s,
                                                       filters='!"#$%&()+,./:;<=>?@[\\]^_`{|}~\t\n',
                                                       lower=True,
                                                       split=' ')
        poem_list = []
        poem = []  # defined up front so text that does not open with '1' still works
        for token in s_list:
            if token == '1':
                # header of the first poem: start collecting, nothing to flush
                poem = []
            elif token.isdigit() or '*' in token:
                # any other header closes the poem collected so far
                poem_list.append(poem)
                poem = []
            else:
                poem.append(token)
        poem_list.append(poem)
        return [_encode_words(poem, Tokenizer) for poem in poem_list]

    # seq_type == 'line'
    line_list = []
    for raw_line in s.split('\n'):
        tokens = keras.preprocessing.text.text_to_word_sequence(raw_line,
                                                   filters='*!"#$%&()+,./:;<=>?@[\\]^_`{|}~\t\n',
                                                   lower=True,
                                                   split=' ')
        # Lines with zero or one token (blank lines, lone headers) are dropped.
        if len(tokens) > 1:
            line_list.append(tokens)
    return [_encode_words(line, Tokenizer) for line in line_list]

In [5]:
# Tokenize the corpus line by line and store every line back to front:
# generate_line builds a line starting from its final word and reverses the
# result, so the model is trained on reversed lines.
X = [Xi[::-1] for Xi in generate_sequence('data/dataset.txt', 'line', Tokenizer)]
# print(X)

## Train

In [6]:
def cross_validation(X, fold, state, iteration, lamda_A, lamda_O):
    '''
    Estimate how well an HMM with `state` hidden states fits held-out lines.

    The data is shuffled (on a copy, the caller's list keeps its order) and
    split into `fold` folds. Only the odd-numbered folds are trained and
    scored; for each of them the mean of hmm.probability_alphas over the
    held-out sequences is computed. The largest and the average of those
    per-fold means are printed.

    X:         list of sequences of non-negative observation ids
    fold:      number of KFold splits
    state:     number of hidden states
    iteration: number of training iterations passed to unsupervised_HMM
    lamda_A, lamda_O: passed through to unsupervised_HMM unchanged

    returns None (results are printed)
    '''
    print('hidden state: %d' % state)

    # Shuffle a private copy instead of reordering the caller's list.
    X = list(X)
    random.shuffle(X)

    # Largest observation id in the data (floor of 0); max_ob + 1 is handed to
    # unsupervised_HMM - presumably as the number of distinct observations.
    # Empty sequences are ignored here rather than crashing max().
    max_ob = max([0] + [max(x) for x in X if x])

    # fold split and train
    kf = KFold(n_splits=fold)
    max_prob = 0
    sum_prob = 0
    sum_count = 0
    for count, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Even-numbered folds are skipped, so only every other fold is evaluated.
        if count % 2 == 0:
            continue
        print('fold %d' % count)

        X_train = [X[i] for i in train_index]
        hmm = unsupervised_HMM(X_train, state, iteration, max_ob+1, lamda_A, lamda_O)

        # calculate validation probability: mean over the held-out sequences
        prob = sum(hmm.probability_alphas(X[i]) for i in test_index) / len(test_index)
        if prob > max_prob:
            max_prob = prob
        sum_prob += prob
        sum_count += 1

    print('Max Probability in Fold: %s' % max_prob)
    print('Avg Probability in Fold: %s' % (sum_prob / sum_count))

In [7]:
def reverse_check(word_index, emission):
    '''
    Translate emitted observation ids back into words.

    word_index: mapping word -> id
    emission:   iterable of ids, each one less than the matching word_index id

    returns the list of words, in the order of `emission`
    '''
    index_to_word = {idx: word for word, idx in word_index.items()}
    return [index_to_word[token + 1] for token in emission]

In [8]:
def train_hmm(X, state, iteration, lamda_A, lamda_O):
    '''
    Train an HMM on all sequences in X and return it.

    X:         list of sequences of non-negative observation ids (not modified)
    state:     number of hidden states
    iteration: number of training iterations passed to unsupervised_HMM
    lamda_A, lamda_O: passed through to unsupervised_HMM unchanged

    returns whatever unsupervised_HMM returns (the trained model)
    '''
    # Shuffle a private copy instead of reordering the caller's list.
    shuffled = list(X)
    random.shuffle(shuffled)

    # Largest observation id in the data (floor of 0); max_ob + 1 is handed to
    # unsupervised_HMM - presumably as the number of distinct observations.
    # Empty sequences are ignored here rather than crashing max().
    max_ob = max([0] + [max(x) for x in shuffled if x])

    return unsupervised_HMM(shuffled, state, iteration, max_ob+1, lamda_A, lamda_O)

In [25]:
# Run the cross-validation once for each candidate number of hidden states.
num_state = list(range(2, 11, 2))
for state in num_state:
    cross_validation(X, fold=10, state=state, iteration=50, lamda_A=10000, lamda_O=10000)

hidden state: 2
fold 1
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 3
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 5
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 7
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 9
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Max Probability in Fold: 1.7656846045499672e-17
Avg Probability in Fold: 4.114041274237296e-18
hidden state: 4
fold 1
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 3
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 5
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 7
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
fold 9
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Max Probability in Fold: 1.5624144523047992e-18
Avg Probability in Fold: 8.472265069379482e-19
hidden s

In [14]:
# Train the model used for poem generation on every line of the corpus.
hmm = train_hmm(X, state=8, iteration=100, lamda_A=10000, lamda_O=10000)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100


## Generate Poem

In [10]:
def _sample_word(hmm, state, candidates):
    '''
    Draw one of `candidates` in proportion to its emission probability in `state`.

    Candidates missing from word_index are ignored and duplicates count once.
    If the remaining words carry no probability mass in this state the draw is
    uniform. Returns (zero-based observation id, word).

    raises ValueError: if no candidate is present in word_index
    '''
    usable = [w for w in dict.fromkeys(candidates) if w in word_index]
    if not usable:
        raise ValueError('none of the candidate words is in word_index')
    ids = [word_index[w] - 1 for w in usable]
    weights = np.array([hmm.O[state][i] for i in ids], dtype=float)
    total = weights.sum()
    if total > 0:
        weights = weights / total
    else:
        weights = np.full(len(ids), 1.0 / len(ids))
    pick = np.random.choice(len(ids), p=weights)
    return ids[pick], usable[pick]


def generate_line(hmm, temp_word_to_rhyme_dict, rhyme_list=None):
    '''
    Generate one ten-syllable line and, when rhyme_list is None, a second line
    that rhymes with it.

    A line is produced from its last word backwards: the rhyming word is drawn
    first, further words are added while more than 3 syllables remain, and a
    final word with exactly the remaining syllable count closes the line. The
    word list is reversed before it is returned.

    Reads the notebook globals word_index, reverse_dict, word_to_syllable_dict,
    syllable_dict and rhyme_dict; none of them is modified.

    hmm: trained model exposing L and D (sizes handed to np.random.choice for
         states and observations), A (transition rows) and O (emission rows)
    temp_word_to_rhyme_dict: word -> rhyme key for the rhymes still available
                             (not modified; an updated copy is returned)
    rhyme_list: words the line may end on; None starts a new rhyme pair

    returns (final_emission, final_words, temp_word_to_rhyme_dict)
        final_emission / final_words hold one list per generated line (two
        when rhyme_list is None, otherwise one), each in reading order. The
        returned dict no longer contains words of the rhyme that was used.
    raises ValueError: if no usable word exists for the rhyming or final position
    '''
    # choose starting state
    states = [np.random.randint(hmm.L)]

    if rhyme_list is None:
        # New rhyme: draw among the words that still have an unused rhyme.
        candidates = [w for w in temp_word_to_rhyme_dict if w in word_to_syllable_dict]
        x0, x_word = _sample_word(hmm, states[0], candidates)
        rhyme = temp_word_to_rhyme_dict[x_word]
        # Build the partner list as a new list so rhyme_dict itself stays intact.
        new_rhyme_list = [w for w in rhyme_dict[rhyme] if w != x_word]
        # Retire this rhyme in a new dict so the caller's dict stays intact.
        temp_word_to_rhyme_dict = {
            w: r for w, r in temp_word_to_rhyme_dict.items() if r != rhyme
        }
    else:
        # Second line of a pair: it has to end on one of the given words.
        new_rhyme_list = None
        candidates = [w for w in rhyme_list if w in word_to_syllable_dict]
        x0, x_word = _sample_word(hmm, states[0], candidates)

    emission = [x0]
    words = [x_word]

    # generate middle word
    # The syllable count of a word is the last entry stored for it.
    remain_syllable = 10 - int(word_to_syllable_dict[x_word][-1])
    while remain_syllable > 3:
        y_temp = np.random.choice(hmm.L, p=hmm.A[states[-1]])
        states.append(y_temp)
        # Emit from the state just entered. Redraw until the word has a known
        # syllable count that leaves at least one syllable for the final word.
        while True:
            x_temp = np.random.choice(hmm.D, p=hmm.O[y_temp])
            x_word = reverse_dict[x_temp+1]
            if x_word not in word_to_syllable_dict:
                continue
            syllables = int(word_to_syllable_dict[x_word][-1])
            if syllables < remain_syllable:
                break
        emission.append(x_temp)
        words.append(x_word)
        remain_syllable -= syllables

    # generate last word pool: words listed under the remaining syllable
    # count, plus those listed under the matching 'E<count>' key when present
    E_syllable = 'E' + str(remain_syllable)
    word_pool = list(syllable_dict.get(E_syllable, [])) + list(syllable_dict[str(remain_syllable)])

    y_temp = np.random.choice(hmm.L, p=hmm.A[states[-1]])
    states.append(y_temp)
    # Drawing directly from the pool gives the same distribution as redrawing
    # until a pool word comes up, without the risk of looping forever.
    x_temp, x_word = _sample_word(hmm, y_temp, word_pool)
    emission.append(x_temp)
    words.append(x_word)

    # Words were produced back to front; flip them into reading order.
    final_emission = [emission[::-1]]
    final_words = [words[::-1]]

    if new_rhyme_list is not None:
        # Generate the rhyming partner line and return both lines together.
        emission_1, words_1, temp_word_to_rhyme_dict = generate_line(hmm, temp_word_to_rhyme_dict, new_rhyme_list)
        final_words.append(words_1[0])
        final_emission.append(emission_1[0])

    return final_emission, final_words, temp_word_to_rhyme_dict

In [15]:
# Position of each generated line in the printed poem. Lines are generated as
# seven rhyming pairs (a1 a2 b1 b2 ...); this order interleaves the first six
# pairs into three alternately rhymed quatrains and keeps the last pair as a
# closing couplet.
SONNET_LINE_ORDER = [0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 13]

# Work on a copy: generate_line retires every rhyme it uses, and the filtered
# dictionary has to stay complete so this cell can be run again.
temp_word_to_rhyme_dict = dict(filtered_word_to_rhyme_dict)

kaigoo = []
for i in range(7):
    # each call returns two lines that rhyme with each other
    emission, words, temp_word_to_rhyme_dict = generate_line(hmm, temp_word_to_rhyme_dict)
    kaigoo.append(words[0])
    kaigoo.append(words[1])

kaigoo_2 = [kaigoo[position] for position in SONNET_LINE_ORDER]
for k in kaigoo_2:
    print(' '.join(k))

idea plead heart a breathe veil sharpened
issueless score weighs prayers quickly look for
nothing disdaineth inquire imprisoned
deceased eyes a seeing why spring more
cuckoo expressed and love this resty bends
legacy mask disabled awards
celestial that yield thou write daily spends
counterfeit or care the to art we wards
dignified the monsters side direct gate
entertain perceiv'st in and if although
applying because my smell make mind hate
offenders kindle true mind's wondrous so
well-tuned some sweet for that sessions hide
evermore beggared my zealous abide
