In [1]:
!ls 

ExtensionLive.png  front  kernel.ipynb	LICENSE  mle  README.md


In [2]:

nyt = 'mle/nyt.txt'

In [3]:
import multiprocessing, string
multiprocessing.cpu_count()

4

In [4]:

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# def tokenize(text):
#     tokens = []; word = ""
#     for char in text.lower():
#         if (char in string.whitespace) or (char in string.punctuation):
#             if word:
#                 tokens.append(word.strip(' '))
#                 word = ""
#         if char in string.punctuation:
#             tokens.append(char)
#         else:
#             word += char
#     return [word for word in tokens]

# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', '')
    # split into tokens by white space
    tokens = doc.split()
    #tokens = tokenize(doc)
    # remove punctuation from each token
    #table = str.maketrans('', '', string.punctuation)
    #tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    #tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    #tokens = [word.lower() for word in tokens]
    return tokens

# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [5]:
# load document
#in_filename = republic
in_filename = nyt
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:50])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


Sheryl Sandberg was seething.

Inside Facebook’s Menlo Park, Calif., headquarters, top executives gathered in the glass-walled conference room of its founder, Mark Zuckerberg. It was September 2017, m
Total Tokens: 5269
Unique Tokens: 2016


In [6]:
# organize into sequences of tokens
length = 50
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 5219


In [7]:
# save sequences to file
#out_filename = 'republic_sequences.txt'
out_filename = 'mle/sequences.txt'

In [8]:
save_doc(sequences, out_filename)

In [9]:
!head sequences.txt -n 2



# Word rNN (Training)

In [10]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Lambda
from keras import backend as K
K.clear_session()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# load
in_filename = 'sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
lines[:5]



In [12]:
# integer encode sequences of words
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(lines)


In [13]:
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
vocab_size

1936

In [14]:
# separate into input and output
sequences = array(sequences)
sequences.shape

(5219, 50)

In [15]:
type(sequences)

numpy.ndarray

In [16]:
len(sequences[0]), len(sequences[1]),len(sequences[4])

(50, 50, 50)

In [17]:
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
y
X.shape

(5219, 49)

In [18]:
seq_length = X.shape[1]
seq_length

49

In [None]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=200)
 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 49, 50)            96800     
_________________________________________________________________
lstm_3 (LSTM)                (None, 49, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 1936)              195536    
Total params: 443,236
Trainable params: 443,236
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/2

Epoch 74/200
Epoch 75/200
Epoch 76/200

In [None]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# load cleaned text sequences
in_filename = 'mle/sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

In [24]:
# select a seed text
seed_text = lines[randint(0,len(lines))]

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

the company was ntk network — had been campaigns, of scrambled “our data — had been campaigns, of scrambled “our data — had been campaigns, of scrambled “our data — had been campaigns, of scrambled “our data — had been campaigns, of scrambled “our data — had been campaigns, of


In [None]:
from keras.utils import plot_model

In [None]:
plot_model(model,'model.png')