<a href="https://colab.research.google.com/github/Alex-Ukraine/tf2-fcc/blob/master/tenth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%tensorflow_version 2.x
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [5]:
# Fetch the Shakespeare corpus and keep the local path that Keras returns for it.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [6]:
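# Optional (Colab only): uncomment to upload your own text file instead of
# using the Shakespeare corpus downloaded above.
#from google.colab import files
#path_to_file = list(files.upload().keys())[0]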

In [7]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [8]:
# Show the first 250 characters of the corpus as a quick sanity check.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [9]:
# The vocabulary is every distinct character in the corpus, in sorted order.
vocab = sorted(set(text))
# Lookup tables: character -> integer id, and integer id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
    """Encode a string as a NumPy array holding one vocabulary id per character.

    Each character is looked up in the module-level ``char2idx`` table, so a
    character missing from that table raises ``KeyError``.
    """
    encoded = []
    for character in text:
        encoded.append(char2idx[character])
    return np.array(encoded)

# Encode the entire corpus once; later cells build the training dataset from this array.
text_as_int = text_to_int(text)

In [10]:
# Let's look at how the first 13 characters of the text are encoded.
print("Text: ", text[:13])
print("Encoded: ", text_to_int(text[:13]))

Text:  First Citizen
Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [11]:
def int_to_text(ints):
    """Decode a sequence of vocabulary ids back into a string.

    Args:
        ints: Either an object exposing ``.numpy()`` (the result of that call
            is used for the lookup) or something that can index the
            module-level ``idx2char`` table directly, such as a NumPy array.

    Returns:
        The characters selected from ``idx2char``, joined into one string.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # No .numpy() method: the input is already usable as an index.
        # Only this case is tolerated; any other failure (or a
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        pass
    return ''.join(idx2char[ints])

# Round-trip check: decoding the first 13 encoded ids should print the original text.
print(int_to_text(text_as_int[:13]))

First Citizen


In [12]:
# Number of input characters in each training example.
seq_length = 100
# Each example consumes seq_length + 1 characters (the input plus its
# one-step-shifted target), so this is how many examples fit in the corpus.
examples_per_epochs = len(text) // (seq_length + 1)

# Stream the encoded corpus as a dataset of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(
    text_as_int
)

In [13]:
# Group the character stream into chunks of seq_length + 1 ids, dropping the
# final chunk if it is shorter than that.
# NOTE(review): this rebinds `sequence`, shadowing the `sequence` module that
# the first cell imports from keras.preprocessing.
sequence = char_dataset.batch(seq_length+1, drop_remainder=True)

In [14]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by a single position.

    The input is everything except the last element and the target is
    everything except the first, e.g. "hello" -> ("hell", "ello").
    """
    return chunk[:-1], chunk[1:]

dataset = sequence.map(split_input_target) # we use map to apply the above function to every entry

In [15]:
# Decode and print the first two (input, target) pairs; each target is its
# input shifted forward by one character.
for x, y in dataset.take(2):
    print("\n\nEXAMPLE\n")
    print("INPUT")
    print(int_to_text(x))
    print("\nOUTPUT")
    print(int_to_text(y))



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [16]:
# Model and training hyperparameters.
BATCH_SIZE = 64          # sequences per training batch
VOCAB_SIZE = len(vocab)  # number of distinct characters in the corpus
EMBEDDING_DIM = 256      # output dimension of the embedding layer
RNN_UNITS = 1024         # units in the LSTM layer

# Size of the buffer that Dataset.shuffle draws from.
BUFFER_SIZE = 10_000

# Shuffle the examples, then group them into fixed-size batches; an incomplete
# final batch is dropped.
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the number of units in the final Dense layer.
        embedding_dim: Output dimension of the embedding layer.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension of the input; the sequence dimension
            is left as None.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final Dense layer has
        no activation, so it emits raw scores for every time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Build the training model with the training batch size and print its layers.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Run the untrained model on a single batch to confirm the output shape.
# The loop variables are intentionally left in the notebook namespace; the
# following cells inspect example_batch_predictions.
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [19]:
# Inspect the raw output for the whole batch: one entry per sequence in the batch.
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 4.9541723e-03 -1.0022405e-03  1.5850919e-03 ...  1.9426997e-03
    2.3741531e-03 -5.0052702e-03]
  [ 3.8164728e-03 -9.6819876e-04  9.7900303e-04 ... -5.9615886e-03
    2.4710065e-03 -2.0958320e-03]
  [-4.0609809e-03 -8.8512944e-04  5.4686912e-03 ... -7.9739336e-03
    1.2142274e-03 -4.7166697e-03]
  ...
  [-1.4881484e-03  6.9137621e-03  5.9468206e-03 ...  6.0877441e-03
   -5.7464559e-03 -1.1508878e-02]
  [-3.2905736e-03  6.8410570e-03  6.5180073e-03 ...  3.0175792e-03
    1.1985451e-03 -7.8533879e-03]
  [-8.6078038e-03  5.0928015e-03  9.4198156e-03 ...  3.5288848e-04
   -3.8719759e-04 -9.8840632e-03]]

 [[ 3.9410675e-03  7.9534324e-03  2.7395627e-03 ...  6.3772040e-04
   -3.6351061e-03 -9.7288203e-04]
  [-3.3701081e-03  5.7613477e-03  6.2526381e-03 ... -3.5000539e-03
   -3.1556520e-03 -4.2213071e-03]
  [-1.1801691e-02  6.0356050e-03 -2.6416157e-03 ... -3.1260122e-04
    1.0641909e-03 -5.4476243e-03]
  ...
  [ 7.4175959e-03  1.7577818e-02  1.0221340e-02 ...  5.0198101e

In [20]:
# Narrow down to the first sequence in the batch: one row of scores per time step.
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[ 0.00495417 -0.00100224  0.00158509 ...  0.0019427   0.00237415
  -0.00500527]
 [ 0.00381647 -0.0009682   0.000979   ... -0.00596159  0.00247101
  -0.00209583]
 [-0.00406098 -0.00088513  0.00546869 ... -0.00797393  0.00121423
  -0.00471667]
 ...
 [-0.00148815  0.00691376  0.00594682 ...  0.00608774 -0.00574646
  -0.01150888]
 [-0.00329057  0.00684106  0.00651801 ...  0.00301758  0.00119855
  -0.00785339]
 [-0.0086078   0.0050928   0.00941982 ...  0.00035289 -0.0003872
  -0.00988406]], shape=(100, 65), dtype=float32)


In [21]:
# Narrow down further to the first time step: one score per vocabulary character.
time_pred = pred[0]
print(len(time_pred))
print(time_pred)

65
tf.Tensor(
[ 0.00495417 -0.00100224  0.00158509  0.00487944 -0.00266748  0.00031202
 -0.00294612  0.00060726 -0.00178237  0.00650831  0.00168259  0.00251239
 -0.00065604 -0.00437894 -0.00332212  0.00061358 -0.0022222   0.00124658
  0.00099915 -0.0011928   0.00092941 -0.00073848 -0.00148657 -0.00654347
  0.00207511 -0.00017395  0.00668215 -0.00337454 -0.00432866 -0.00066134
  0.00057479 -0.00353349  0.00066513 -0.00144657  0.00201985 -0.00261644
 -0.00210377  0.00070904  0.00364265 -0.00233948 -0.00592984  0.0018767
  0.00014709  0.00518904 -0.00147786  0.00182355  0.00183402  0.00393316
  0.00207234  0.00475033 -0.00060931  0.00194855  0.00468348  0.00272603
 -0.00311457  0.00136928  0.01004859 -0.00662069 -0.00102763  0.00144665
 -0.00086631 -0.00443836  0.0019427   0.00237415 -0.00500527], shape=(65,), dtype=float32)


In [22]:
# Draw one character id per time step, treating each row of `pred` as logits.
sampled_indices = tf.random.categorical(pred, num_samples=1)

# Flatten the single-sample column into a 1-D array of ids, then decode it.
sampled_indices = np.reshape(sampled_indices, -1)
predicted_chars = int_to_text(sampled_indices)

predicted_chars

'!VsqdLG3$$:Ppq:NDlHeX3rMHRPZ,c:fxRe?LGJW$uEUfPmJzIOPqSmuo&N-nT:3!LIzbek!aYeuchwOUkH&Qhi;Ve:JAiRZT;mz'

In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalized scores
    rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [24]:
# Configure training: Adam optimizer with the logits-aware loss function defined above.
model.compile(optimizer='adam', loss=loss)

In [25]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'
# Checkpoint file name template; "{epoch}" is a placeholder left in the path
# for the callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) at each checkpoint.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [26]:
# Train for 40 epochs; the checkpoint callback writes the model weights during training.
history = model.fit(data, epochs=40, callbacks=[checkpoint_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
