In [1]:
import pandas as pd
import numpy as np
import pickle
import sys

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import string
from keras.models import load_model

from sklearn.feature_extraction.text import CountVectorizer

from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

Using TensorFlow backend.


In [2]:
# Load the pickled corpus from the local file "Bender"; the preview cell
# further down shows it is a list of dialogue-line strings.
# NOTE(review): pickle.load can execute arbitrary code -- only unpickle a
# file that comes from a trusted source.
with open("Bender","rb") as pickle_in:
    Bender = pickle.load(pickle_in)

In [13]:
def remove_punctuation(text):
    """Normalise raw text for vectorising and character modelling.

    Non-breaking spaces (``\\xa0``) become ordinary spaces, every character
    found in ``string.punctuation`` is dropped, and each remaining character
    is lower-cased.  Ordinary spaces are kept.
    """
    spaced = text.replace('\xa0', ' ')
    kept = (symbol for symbol in spaced if symbol not in string.punctuation)
    return "".join(symbol.lower() for symbol in kept)

In [14]:
# Bag-of-words vocabulary over the Bender lines; tokens are runs of ASCII
# letters taken after remove_punctuation has cleaned each line.
vectorizer = CountVectorizer(preprocessor = remove_punctuation, 
                             token_pattern=r'\b[a-zA-Z]+\b')
doc_word = vectorizer.fit_transform(Bender)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back on older releases.  Either way `bow` is a list.
if hasattr(vectorizer, "get_feature_names_out"):
    bow = vectorizer.get_feature_names_out().tolist()
else:
    bow = vectorizer.get_feature_names()

In [15]:
Bender[:5]

[' Bite my shiny metal ass.',
 ' Shinier than yours, meatbag!',
 " Listen, buddy, I'm in a hurry here. Let's try for a two-fer!",
 ' Great choice!  Bring it on, baby!',
 " C'mon, c'mon! Kill me already! By the way, my name's Bender!"]

In [16]:
# Concatenate every line into one training string.  No separator is added;
# join() avoids the quadratic cost of growing a string with += in a loop.
raw_text = "".join(Bender)

In [17]:
raw_text = remove_punctuation(raw_text)

In [18]:
# Character vocabulary: every distinct character of the corpus, in sorted
# order, mapped to its position in that ordering.
chars = sorted(set(raw_text))
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}

In [19]:
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

Total Characters:  185154
Total Vocab:  47


In [20]:
# Slide a fixed-width window over the corpus: each window of `seq_length`
# characters (as integer ids) is an input, the character right after it is
# the target.
seq_length = 100
dataX, dataY = [], []
for offset in range(n_chars - seq_length):
    window = raw_text[offset:offset + seq_length]
    target = raw_text[offset + seq_length]
    dataX.append([char_to_int[symbol] for symbol in window])
    dataY.append(char_to_int[target])
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

Total Patterns:  185054


In [21]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer ids by the vocabulary size
X = X / float(n_vocab)
# one hot encode the output variable; pin the width to the vocabulary size so
# the output columns always line up with the character ids, even if the
# highest id never occurs as a target
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [None]:
# LSTM model 1: one 256-unit LSTM over the input window, 20% dropout, then a
# softmax layer with one unit per column of y.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
model.fit(X, y, epochs=30, batch_size=128, callbacks=callbacks_list)

In [None]:
# Persist the trained network with Keras' own serializer instead of pickle:
# pickling a Keras model is not reliably supported (the Keras FAQ advises
# against it).  Reload with load_model("text_model.h5").
model.save("text_model.h5")

In [None]:
# load the network weights
# NOTE(review): the "-bigger" suffix matches the checkpoint template that is
# defined further down for the two-layer LSTM, so this file presumably holds
# that model's weights -- confirm it fits the `model` defined at this point.
filename = "weights-improvement-06-2.0628-bigger.hdf5"
model.load_weights(filename)
# compile again with the same loss/optimizer that the model-definition cell uses
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [None]:
# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern eligible
start = numpy.random.randint(0, len(dataX))
# copy the window: generation modifies `pattern`, and aliasing dataX[start]
# would let those modifications leak into the training data
pattern = list(dataX[start])

In [None]:
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters greedily: always take the most probable next character
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window by building a new list; appending in place would also
    # modify the dataX entry that `pattern` may still alias
    pattern = pattern[1:] + [index]
print ("\nDone.")

In [24]:
#Much larger Attempt
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"

In [None]:
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# model.fit(X, y, epochs=100, batch_size=64, callbacks=callbacks_list, initial_epoch = 6)

In [None]:
# Checkpoint file loaded in the next cell: weights-improvement-06-2.0628-bigger.hdf5

In [None]:
model = load_model('weights-improvement-06-2.0628-bigger.hdf5')

### 90 Epoch model (13 hours!)
However, the generator is mostly too repetitive. Some research shows that this is a common issue, and analyzing the code I can see how the model could get into loops, since it always chooses the highest-probability letter next.

In [25]:
# load the network weights
filename = "90epochmodel.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [27]:
# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern eligible
start = numpy.random.randint(0, len(dataX))
# copy the window: generation modifies `pattern`, and aliasing dataX[start]
# would let those modifications leak into the training data
pattern = list(dataX[start])

In [28]:
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
bender_str = ''
# generate characters greedily: always take the most probable next character
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    bender_str += result
    sys.stdout.write(result)
    # slide the window by building a new list; appending in place would also
    # modify the dataX entry that `pattern` may still alias
    pattern = pattern[1:] + [index]
print ("\nDone.")

Seed:
" u can vent tonight on your blog we do suck shut up zoidberg hes right leela good comeback leela me i "
 dont know what i was saying the recond i dont know what i was saying the recond i dont know what i 
Done.


In [None]:
# Reload the pickled corpus.  The context manager closes the file handle,
# which a bare open() call would leave open.
# NOTE(review): only unpickle files that come from a trusted source.
with open("Bender","rb") as pickle_in:
    Bender = pickle.load(pickle_in)

In [None]:
len(Bender)

In [None]:
# Bag-of-words vocabulary over the Bender lines; tokens are runs of ASCII
# letters taken after remove_punctuation has cleaned each line.
vectorizer = CountVectorizer(preprocessor = remove_punctuation, 
                             token_pattern=r'\b[a-zA-Z]+\b')
doc_word = vectorizer.fit_transform(Bender)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back on older releases.  Either way `bow` is a list.
if hasattr(vectorizer, "get_feature_names_out"):
    bow = vectorizer.get_feature_names_out().tolist()
else:
    bow = vectorizer.get_feature_names()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back on older releases.  Either way `bow` is a list.
if hasattr(vectorizer, "get_feature_names_out"):
    bow = vectorizer.get_feature_names_out().tolist()
else:
    bow = vectorizer.get_feature_names()

In [None]:
def fuzzymatch(test, bow):
    """Find the vocabulary word most similar to ``test``.

    Parameters
    ----------
    test : str
        Word to look up.
    bow : iterable of str
        Candidate words.

    Returns
    -------
    tuple
        ``(match, max_ratio)``: the first candidate with the highest
        ``fuzz.ratio`` score, and that score.  ``('', 0)`` is returned when
        ``bow`` is empty or no candidate scores above 0.
    """
    match = ''
    max_ratio = 0
    for word in bow:
        # score each candidate exactly once; the scorer is the costly part
        ratio = fuzz.ratio(word, test)
        if ratio > max_ratio:
            max_ratio = ratio
            match = word
    return (match, max_ratio)

In [None]:
fuzzy_bender = [fuzzymatch(word,bow)[0] for word in bender_str.split()]

In [None]:
# Re-join the matched words, each one followed by a single space.
fuzzy_str = ''.join(matched + ' ' for matched in fuzzy_bender)

In [None]:
fuzzy_str

### Add some randomness
Instead of simply choosing argmax, we now have numpy select an element from the arglist at random, weighted by the probabilities predicted by the model, so it still favors certain letters but has a small chance of choosing another. This produces the variability that is much desired, but now fewer of the words are coherent. Ultimately I decided this was not worth the effort.

In [None]:
# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern eligible
start = numpy.random.randint(0, len(dataX))
# copy the window: generation modifies `pattern`, and aliasing dataX[start]
# would let those modifications leak into the training data
pattern = list(dataX[start])

In [None]:
oldpattern = list(pattern)

In [None]:
len(oldpattern)

In [None]:
pattern = list(oldpattern)

In [None]:
pattern[-10:]

In [None]:
fuzzymatch('zerou',bow)

In [None]:
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
bender_str = ''
# generate characters by sampling from the predicted distribution instead of
# always taking the argmax, to avoid repetitive loops
for i in range(200):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # np.random.choice requires p to sum to 1 within a tight tolerance;
    # lower-precision model output (e.g. float32) can miss it, so upcast and
    # renormalise before sampling
    probs = np.asarray(prediction[0], dtype=np.float64)
    probs = probs / probs.sum()
    index = int(np.random.choice(len(probs), p=probs))
    result = int_to_char[index]
    sys.stdout.write(result)
    bender_str += result
    # slide the window by building a new list rather than appending in place
    pattern = pattern[1:] + [index]
print ("\nDone.")

In [None]:
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
bender_str = ''
# generate characters by sampling from the predicted distribution instead of
# always taking the argmax, to avoid repetitive loops
for i in range(200):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # np.random.choice requires p to sum to 1 within a tight tolerance;
    # lower-precision model output (e.g. float32) can miss it, so upcast and
    # renormalise before sampling
    probs = np.asarray(prediction[0], dtype=np.float64)
    probs = probs / probs.sum()
    index = int(np.random.choice(len(probs), p=probs))
    result = int_to_char[index]
    sys.stdout.write(result)
    bender_str += result
    # slide the window by building a new list rather than appending in place
    pattern = pattern[1:] + [index]
print ("\nDone.")

In [None]:
def lastpattern(pattern, separator=0):
    """Collect the ids of the final word in ``pattern``, last character first.

    Walks ``pattern`` from its end and stops at the first ``separator`` id
    (default 0 -- presumably the id of the space character; confirm against
    ``char_to_int``).  If no separator is present the whole pattern is used.

    Returns
    -------
    list
        The trailing ids in last-to-first order; callers that need reading
        order should reverse the result.  Empty when ``pattern`` is empty or
        ends with ``separator``.
    """
    word = []
    for i in pattern[::-1]:
        if i == separator:
            break
        word.append(i)
    # Both exits now return the same orientation.  Previously the
    # no-separator path flipped the list back, so the order of the result
    # depended on whether the input happened to contain a separator.
    return word

In [None]:
def lastword(pattern):
    """Decode a sequence of character ids into a string via ``int_to_char``."""
    return ''.join(int_to_char[code] for code in pattern)

In [None]:
def update_pattern(pattern):
    """Replace the last word of ``pattern`` with its closest vocabulary word.

    The trailing word (the ids after the final separator, as found by
    ``lastpattern``) is decoded with ``lastword``, fuzzy-matched against
    ``bow`` and re-encoded with ``char_to_int``.

    Parameters
    ----------
    pattern : list of int
        Character ids.

    Returns
    -------
    list of int
        A new list; ``pattern`` itself is not modified.  Its length differs
        from the input when the match and the replaced word differ in length.
        A copy of ``pattern`` is returned unchanged when there is no trailing
        word or nothing matches.
    """
    # lastpattern hands back the ids after the final separator last-to-first;
    # flip them into reading order before decoding
    tail = lastpattern(pattern)[::-1]
    if not tail:
        return list(pattern)
    last_word = fuzzymatch(lastword(tail), bow)[0]
    if not last_word:
        # an empty match must not wipe the pattern (a [:-0] slice is empty)
        return list(pattern)
    replacement = [char_to_int[value] for value in last_word]
    # drop exactly the word being replaced, not len(replacement) characters,
    # so a shorter or longer match neither leaves debris nor eats the prefix
    return list(pattern[:len(pattern) - len(tail)]) + replacement

In [None]:
bender_str = 'forget your teo nt viradi uo me whth carh you mayirr a metsiedl oafat i siink ids bogneng  i lnvo my ttuef foo seilctatixe brrtcd we eont meanae oy fafe i make mnre pilpiog'

In [None]:
def fuzzylast(pattern):
    """Return the vocabulary word closest to the last word of ``pattern``.

    ``pattern`` is a whitespace-separated string; its final token is matched
    against ``bow`` with ``fuzzymatch``.  An empty or whitespace-only string
    has no last word, so ``''`` is returned instead of raising ``IndexError``.
    """
    words = pattern.split()
    if not words:
        return ''
    return fuzzymatch(words[-1], bow)[0]

In [None]:
bob = fuzzylast(bender_str)

In [None]:
print(bob)

In [None]:
bender_str

In [None]:
words = bender_str.split()

In [None]:
# Snap every generated token to its nearest vocabulary word; each word is
# followed by a single space.
new_words = ''.join(fuzzymatch(token, bow)[0] + ' ' for token in words)

In [None]:
new_words

In [None]:
fuzzymatch('teo',bow)

In [None]:
len(bow)