In [1]:
"""Import from parent directory."""
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)

In [39]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Activation, Dense, Embedding

In [7]:
def chr2val(ch):
    ch = ch.lower()
    if ch.isalpha():
        return 1 + (ord(ch) - ord('a'))
    else:
        return 0
    
def val2chr(v):
    if v == 0:
        return ' '
    else:
        return chr(ord('a') + v - 1)

In [63]:
text = ""
with open("datasets/alllines.txt") as f:
    for lines in f:
        text += lines[1:-2] + "\n"
    
text_num = np.array([chr2val(c) for c in text])
print(text[:100])
print(text_num[:100])

ACT I
SCENE I. London. The palace.
Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAN
[ 1  3 20  0  9  0 19  3  5 14  5  0  9  0  0 12 15 14  4 15 14  0  0 20
  8  5  0 16  1 12  1  3  5  0  0  5 14 20  5 18  0 11  9 14  7  0  8  5
 14 18 25  0  0 12 15 18  4  0 10 15  8 14  0 15  6  0 12  1 14  3  1 19
 20  5 18  0  0 20  8  5  0  5  1 18 12  0 15  6  0 23  5 19 20 13 15 18
  5 12  1 14]


In [64]:
with open("datasets/shakespeare.txt", 'w') as f:
    f.write(text)

In [12]:
[min(text_num), max(text_num)]

[0, 26]

In [50]:
len_vocab = 27
sentence_len = 80
# n_chars = len(text_num)//sentence_len*sentence_len
num_chunks = len(text_num)-sentence_len

def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    
    slice_size = batch_size * seq_length
    n_batches = len(int_text) // slice_size
    x = int_text[: n_batches*slice_size]
    y = int_text[1: n_batches*slice_size + 1]

    x = np.split(np.reshape(x,(batch_size,-1)),n_batches,1)
    y = np.split(np.reshape(y,(batch_size,-1)),n_batches,1)
    return x, y

x = np.zeros((num_chunks, sentence_len))
y = np.zeros(num_chunks)
for i in range(num_chunks):
    x[i,:] = text_num[i:i+sentence_len]
    y[i] = text_num[i+sentence_len]

# x = np.reshape(x, (num_chunks, sentence_len, 1))

In [51]:
x.shape, y.shape

((4360927, 80), (4360927,))

In [52]:
"".join([ val2chr(int(x[0][i])) for i in range(80) ])

'act i scene i  london  the palace  enter king henry  lord john of lancaster  the'

In [53]:
len_vocab = 27
sentence_len = 80
# n_chars = len(text_num)//sentence_len*sentence_len
num_chunks = len(text_num)-sentence_len

x = np.zeros((num_chunks, sentence_len))
y = np.zeros((num_chunks, sentence_len))
for i in range(num_chunks):
    x[i,:] = text_num[i:i+sentence_len]
    y[i,:] = text_num[i+1:i+sentence_len+1]
y = y.reshape(y.shape+(1,))

In [54]:
x.shape, y.shape

((4360927, 80), (4360927, 80, 1))

In [59]:
"""
For language modeling, we built a dataset from The Complete
Works of William Shakespeare [32]...

On this data we train a stacked character-level LSTM language
model, which after reading each character in a line,
predicts the next character [22]. The model takes a series of
characters as input and embeds each of these into a learned
8 dimensional space. The embedded characters are then
processed through 2 LSTM layers, each with 256 nodes.
Finally the output of the second LSTM layer is sent to a
softmax output layer with one node per character. The full
model has 866,578 parameters, and we trained using an
unroll length of 80 characters.
"""

num_characters = 27
num_nodes = 256
unroll_length = 80

model = Sequential()
model.add(Embedding(num_characters, 8, input_length=80))
model.add(LSTM(num_nodes, return_sequences=True, input_shape=(unroll_length, 8), unroll=True))
model.add(LSTM(num_nodes, return_sequences=True, unroll=True))
model.add(TimeDistributed(Dense(num_characters)))
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd')

In [60]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 80, 8)             216       
_________________________________________________________________
lstm_15 (LSTM)               (None, 80, 256)           271360    
_________________________________________________________________
lstm_16 (LSTM)               (None, 80, 256)           525312    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 80, 27)            6939      
Total params: 803,827
Trainable params: 803,827
Non-trainable params: 0
_________________________________________________________________


In [61]:
np.random.choice(3,10,p=[0.99, 0.01, 0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [62]:
for i in range(10):
    model.fit(x,y, batch_size=128, epochs=1)
    sentence = []
    idx = np.random.choice(len(x),1)
    x_test = x[idx]
    if idx==len(x)-1:
        idx -= 1
#     sentence.append(val2chr(idx[0]))
    for i in range(100):
        p = model.predict(x_test)
        idx2 = np.random.choice(27,1,p=p.ravel())
        x_test = np.hstack([x_test[:,1:], idx2[None,:]])
        sentence.append(val2chr(idx2[0]))

    print(''.join(sentence))
    print('-'*20)
    print(''.join([val2chr(int(v)) for v in x[idx+1,:].tolist()[0]]))
    print('='*40)

Epoch 1/1
  43520/4360927 [..............................] - ETA: 10:31:44 - loss: 4.0313

KeyboardInterrupt: 

In [44]:
for i in range(10):
    sentence = []
    letter = [np.random.choice(len_vocab,1)[0]] #choose a random letter
    for i in range(100):
        sentence.append(val2chr(letter[-1]))
        p = model.predict(np.array(letter)[None,:])
        letter.append(np.random.choice(27,1,p=p[0][-1])[0])
    print(''.join(sentence))
    print('='*100)
    model.fit(x,y, batch_size=128, epochs=1)

ValueError: Error when checking : expected embedding_2_input to have shape (80,) but got array with shape (1,)

In [70]:
import tensorflow as tf
data = tf.keras.datasets.cifar10.load_data()

In [80]:
training_data, testing_data = data
X_train, y_train = training_data
X_test, y_test = testing_data

In [83]:
X_train.shape, y_train.shape

((50000, 32, 32, 3), (50000, 1))

In [84]:
X_test.shape, y_test.shape

((10000, 32, 32, 3), (10000, 1))

In [86]:
type(X_test)

numpy.ndarray