In [4]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [5]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [6]:
from tensorflow import keras

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Data

In [8]:
path_to_file = "Shakespeare.txt"
text = open(path_to_file, 'r').read()

In [16]:
print(text[94:200])

the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own


In [10]:
len(text)

93577

# Data Processing

In [17]:
vocab = sorted(set(text))

In [20]:
char_2_ind = {char: ind for ind,char in enumerate(vocab)}

In [22]:
char_2_ind['V']

30

In [28]:
ind_2_char = np.array(vocab)

In [29]:
ind_2_char[30]

'V'

In [30]:
encoded_text = np.array([char_2_ind[c] for c in text])

In [49]:
encoded_text.shape

(93577,)

In [45]:
tokenizer = Tokenizer(char_level=True, lower = False)

In [46]:
tokenizer.fit_on_texts(vocab)

In [53]:
seq_len = 120

In [54]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [55]:
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [58]:
for i in sequences.take(500):
    print(ind_2_char[i.numpy()])

['F' 'R' 'O' 'M' ' ' 'f' 'a' 'i' 'r' 'e' 's' 't' ' ' 'c' 'r' 'e' 'a' 't'
 'u' 'r' 'e' 's' ' ' 'w' 'e' ' ' 'd' 'e' 's' 'i' 'r' 'e' ' ' 'i' 'n' 'c'
 'r' 'e' 'a' 's' 'e' ',' '\n' 'T' 'h' 'a' 't' ' ' 't' 'h' 'e' 'r' 'e' 'b'
 'y' ' ' 'b' 'e' 'a' 'u' 't' 'y' "'" 's' ' ' 'r' 'o' 's' 'e' ' ' 'm' 'i'
 'g' 'h' 't' ' ' 'n' 'e' 'v' 'e' 'r' ' ' 'd' 'i' 'e' ',' '\n' 'B' 'u' 't'
 ' ' 'a' 's' ' ' 't' 'h' 'e' ' ' 'r' 'i' 'p' 'e' 'r' ' ' 's' 'h' 'o' 'u'
 'l' 'd' ' ' 'b' 'y' ' ' 't' 'i' 'm' 'e' ' ' 'd' 'e']
['c' 'e' 'a' 's' 'e' ',' '\n' 'H' 'i' 's' ' ' 't' 'e' 'n' 'd' 'e' 'r' ' '
 'h' 'e' 'i' 'r' ' ' 'm' 'i' 'g' 'h' 't' ' ' 'b' 'e' 'a' 'r' ' ' 'h' 'i'
 's' ' ' 'm' 'e' 'm' 'o' 'r' 'y' ':' '\n' 'B' 'u' 't' ' ' 't' 'h' 'o' 'u'
 ',' ' ' 'c' 'o' 'n' 't' 'r' 'a' 'c' 't' 'e' 'd' ' ' 't' 'o' ' ' 't' 'h'
 'i' 'n' 'e' ' ' 'o' 'w' 'n' ' ' 'b' 'r' 'i' 'g' 'h' 't' ' ' 'e' 'y' 'e'
 's' ',' '\n' 'F' 'e' 'e' 'd' "'" 's' 't' ' ' 't' 'h' 'y' ' ' 'l' 'i' 'g'
 'h' 't' "'" 's' 't' ' ' 'f' 'l' 'a' 'm' 'e' ' ' 'w']
['i' 't' 'h

['e' 'd' ' ' 's' 'h' 'a' 'l' 'l' ' ' 'o' "'" 'e' 'r' '-' 'r' 'e' 'a' 'd'
 ',' '\n' 'A' 'n' 'd' ' ' 't' 'o' 'n' 'g' 'u' 'e' 's' ' ' 't' 'o' ' ' 'b'
 'e' ' ' 'y' 'o' 'u' 'r' ' ' 'b' 'e' 'i' 'n' 'g' ' ' 's' 'h' 'a' 'l' 'l'
 ' ' 'r' 'e' 'h' 'e' 'a' 'r' 's' 'e' '\n' 'W' 'h' 'e' 'n' ' ' 'a' 'l' 'l'
 ' ' 't' 'h' 'e' ' ' 'b' 'r' 'e' 'a' 't' 'h' 'e' 'r' 's' ' ' 'o' 'f' ' '
 't' 'h' 'i' 's' ' ' 'w' 'o' 'r' 'l' 'd' ' ' 'a' 'r' 'e' ' ' 'd' 'e' 'a'
 'd' ';' '\n' 'Y' 'o' 'u' ' ' 's' 't' 'i' 'l' 'l' ' ']
['s' 'h' 'a' 'l' 'l' ' ' 'l' 'i' 'v' 'e' '-' '-' 's' 'u' 'c' 'h' ' ' 'v'
 'i' 'r' 't' 'u' 'e' ' ' 'h' 'a' 't' 'h' ' ' 'm' 'y' ' ' 'p' 'e' 'n' '-'
 '-' '\n' 'W' 'h' 'e' 'r' 'e' ' ' 'b' 'r' 'e' 'a' 't' 'h' ' ' 'm' 'o' 's'
 't' ' ' 'b' 'r' 'e' 'a' 't' 'h' 'e' 's' ',' ' ' 'e' 'v' 'e' 'n' ' ' 'i'
 'n' ' ' 't' 'h' 'e' ' ' 'm' 'o' 'u' 't' 'h' 's' ' ' 'o' 'f' ' ' 'm' 'e'
 'n' '.' '\n' 'I' ' ' 'g' 'r' 'a' 'n' 't' ' ' 't' 'h' 'o' 'u' ' ' 'w' 'e'
 'r' 't' ' ' 'n' 'o' 't' ' ' 'm' 'a' 'r' 'r' 'i' 'e']
['d' ' ' 't

In [59]:
def create_seq_targets(seq):
    input_txt = seq[:-1]
    target_txt = seq[1:]
    return input_txt, target_txt

In [60]:
dataset = sequences.map(create_seq_targets)

In [63]:
for input_txt, target_txt in dataset.take(1):
    print(ind_2_char[input_txt.numpy()])
    print(ind_2_char[target_txt.numpy()])

['F' 'R' 'O' 'M' ' ' 'f' 'a' 'i' 'r' 'e' 's' 't' ' ' 'c' 'r' 'e' 'a' 't'
 'u' 'r' 'e' 's' ' ' 'w' 'e' ' ' 'd' 'e' 's' 'i' 'r' 'e' ' ' 'i' 'n' 'c'
 'r' 'e' 'a' 's' 'e' ',' '\n' 'T' 'h' 'a' 't' ' ' 't' 'h' 'e' 'r' 'e' 'b'
 'y' ' ' 'b' 'e' 'a' 'u' 't' 'y' "'" 's' ' ' 'r' 'o' 's' 'e' ' ' 'm' 'i'
 'g' 'h' 't' ' ' 'n' 'e' 'v' 'e' 'r' ' ' 'd' 'i' 'e' ',' '\n' 'B' 'u' 't'
 ' ' 'a' 's' ' ' 't' 'h' 'e' ' ' 'r' 'i' 'p' 'e' 'r' ' ' 's' 'h' 'o' 'u'
 'l' 'd' ' ' 'b' 'y' ' ' 't' 'i' 'm' 'e' ' ' 'd']
['R' 'O' 'M' ' ' 'f' 'a' 'i' 'r' 'e' 's' 't' ' ' 'c' 'r' 'e' 'a' 't' 'u'
 'r' 'e' 's' ' ' 'w' 'e' ' ' 'd' 'e' 's' 'i' 'r' 'e' ' ' 'i' 'n' 'c' 'r'
 'e' 'a' 's' 'e' ',' '\n' 'T' 'h' 'a' 't' ' ' 't' 'h' 'e' 'r' 'e' 'b' 'y'
 ' ' 'b' 'e' 'a' 'u' 't' 'y' "'" 's' ' ' 'r' 'o' 's' 'e' ' ' 'm' 'i' 'g'
 'h' 't' ' ' 'n' 'e' 'v' 'e' 'r' ' ' 'd' 'i' 'e' ',' '\n' 'B' 'u' 't' ' '
 'a' 's' ' ' 't' 'h' 'e' ' ' 'r' 'i' 'p' 'e' 'r' ' ' 's' 'h' 'o' 'u' 'l'
 'd' ' ' 'b' 'y' ' ' 't' 'i' 'm' 'e' ' ' 'd' 'e']


In [66]:
batch_size = 128

In [67]:
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [73]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int32, tf.int32)>

In [70]:
vocab_size = len(vocab)
vocab_size

60

In [115]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [116]:
def sparse_cat_crossentropy(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits = True)

In [117]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [118]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, batch_input_shape = [batch_size, None]))
    model.add(GRU(rnn_neurons, return_sequences=True, stateful = True, recurrent_initializer='glorot_uniform'))
    model.add(Dense(vocab_size))
    
    model.compile('adam', loss=sparse_cat_crossentropy)
    
    return model

In [119]:
model = create_model(vocab_size=vocab_size, embed_dim=64, rnn_neurons=1024, batch_size = batch_size)

In [120]:
model.fit(dataset, epochs=50)

Train for 6 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x205313e3208>

In [121]:
model.save_weights("shakespeare_model_weights")

In [122]:
model_pred = create_model(vocab_size=vocab_size, embed_dim=64, rnn_neurons=1024, batch_size = 1)
model_pred.load_weights("shakespeare_model_weights")
model_pred.build(tf.TensorShape([1,None]))

In [123]:
def generate_text(model, start_seed, gen_size = 500, temp = 1.0):
    input_eval = [char_2_ind[l] for l in start_seed]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    
    model.reset_states()
    
    for i in range(gen_size):
        predictions = model(input_eval)
        
        predictions = tf.squeeze(predictions, 0)
        
        predictions = predictions/temp
        
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1,0].numpy()
        
        input_eval = tf.expand_dims([predicted_id], 0)
        
        text_generated.append(ind_2_char[predicted_id])
        
    return(start_seed + "".join(text_generated))

In [125]:
print(generate_text(model_pred, "Romeo:", gen_size=100))

Romeo:
You hor at as hearine ow theme speiver sseleds bert?
Se, losk hoor hem thea aks finduod thungith of
