In [1]:
import numpy as np 
import tensorflow as tf

In [2]:
# Read the lyrics corpus as raw bytes and decode it explicitly as UTF-8, so the
# result does not depend on the platform's default encoding or newline handling.
# The context manager closes the file handle as soon as the read completes.
with open('kanye_verses.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The length of the text is the number of characters in it.
print('Length of text: {} characters'.format(len(text)))
# Preview the first 250 characters as a sanity check on the load.
print(text[:250])

Length of text: 260341 characters
Let the suicide doors up
I threw suicides on the tour bus
I threw suicides on the private jet
You know what that mean, I'm fly to death
I step in Def Jam buildin' like I'm the shit
Tell 'em give me fifty million or I'ma quit
Most rappers' taste level


In [44]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print(len(chars), 'unique characters:')
print(chars)

# Lookup tables between characters and integer ids:
#   char_dict : character -> id (its position in `chars`)
#   idx2char  : id -> character (NumPy array, so it can be indexed by id arrays)
char_dict = dict(zip(chars, range(len(chars))))
idx2char = np.array(chars)
print(char_dict)

96 unique characters:
['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '·', 'Á', 'é', 'í', 'ñ', 'ó', 'ā', '\u200b', '–', '‘', '’', '“', '”', '…']
{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, ';': 27, '?': 28, 'A': 29, 'B': 30, 'C': 31, 'D': 32, 'E': 33, 'F': 34, 'G': 35, 'H': 36, 'I': 37, 'J': 38, 'K': 39, 'L': 40, 'M': 41, 'N': 42, 'O': 43, 'P': 44, 'Q': 45, 'R': 46, 'S': 47, 'T': 48, 'U': 49, 'V': 50, 'W': 51, 'X': 52, 'Y': 53, 'Z': 54, 'a':

In [4]:
# Sequence of the text: encode every character of the corpus as its integer id.
text_in_num = np.array(list(map(char_dict.__getitem__, text)))

# Show the first 13 characters next to the ids they were mapped to.
print(text[:13])
print(text_in_num[:13])
print('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_in_num[:13]))

# Last expression of the cell: display the full id array.
text_in_num

Let the suici
[40 59 74  1 74 62 59  1 73 75 63 57 63]
'Let the suici' ---- characters mapped to int ---- > [40 59 74  1 74 62 59  1 73 75 63 57 63]


array([40, 59, 74, ..., 75, 14, 14])

In [5]:
# Convert to trainable data.
# seq_len is the number of input characters per example; the corpus is counted
# in chunks of seq_len + 1 ids (one extra id per chunk).
seq_len = 50

# Number of whole (seq_len + 1)-sized chunks that fit in the encoded corpus.
example_per_epoc = len(text_in_num) // (1 + seq_len)

example_per_epoc


5104

In [6]:
# Create training examples / targets.
# Start from a dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_in_num)

# Decode and print the first five elements to confirm the round trip.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

L
e
t
 
t


In [7]:
# Group the character stream into fixed-size chunks of seq_len + 1 ids.
# drop_remainder=True discards a final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

# Print the first five chunks decoded back to text (repr keeps '\n' visible).
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))

'Let the suicide doors up\nI threw suicides on the to'
'ur bus\nI threw suicides on the private jet\nYou know'
" what that mean, I'm fly to death\nI step in Def Jam"
" buildin' like I'm the shit\nTell 'em give me fifty "
"million or I'ma quit\nMost rappers' taste level ain'"


In [8]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, giving a dataset of (input, target) pairs.
dataset = sequences.map(split_input_target)

In [9]:
# Decode the first (input, target) pair to show that the target is the input
# shifted by one character. The loop variables are reused by later cells.
for input_example, target_example in dataset.take(1):
    input_text = ''.join(idx2char[input_example.numpy()])
    target_text = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_text))
    print('Target data:', repr(target_text))

Input data:  'Let the suicide doors up\nI threw suicides on the t'
Target data: 'et the suicide doors up\nI threw suicides on the to'


In [10]:
# Walk through the first five timesteps of the example pair: at each step the
# model receives the input id and is expected to produce the target id.
for step, (input_id, target_id) in enumerate(zip(input_example[:5], target_example[:5])):
    input_char = repr(idx2char[input_id])
    target_char = repr(idx2char[target_id])
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_id, input_char))
    print("  expected output: {} ({:s})".format(target_id, target_char))

Step    0
  input: 40 ('L')
  expected output: 59 ('e')
Step    1
  input: 59 ('e')
  expected output: 74 ('t')
Step    2
  input: 74 ('t')
  expected output: 1 (' ')
Step    3
  input: 1 (' ')
  expected output: 74 ('t')
Step    4
  input: 74 ('t')
  expected output: 62 ('h')


In [12]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer. tf.data is designed to work with possibly
# infinite sequences, so it does not shuffle the whole dataset in memory;
# it shuffles elements within a buffer of this many entries instead.
BUFFER_SIZE = 10000

# NOTE: `dataset` is rebound from itself, so this cell is not idempotent --
# running it a second time would shuffle and batch the already-batched dataset.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset



<BatchDataset shapes: ((64, 50), (64, 50)), types: (tf.int32, tf.int32)>

In [14]:
# Length of the vocabulary in chars (number of entries in `chars`).
vocab_size = len(chars)

# The embedding dimension (passed to build_model as embedding_dim).
embedding_dim = 256

# Number of RNN units (passed to build_model as rnn_units).
rnn_units = 1024

In [20]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct ids; used both as the Embedding input
            size and as the number of units of the final Dense layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the input shape; the sequence
            dimension is left as None.

    Returns:
        An uncompiled tf.keras.Sequential model. The Dense layer is created
        without an activation argument.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every timestep; stateful=True
    # is why the batch size has to be fixed in the input shape above.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the training model; its fixed batch dimension matches the
# BATCH_SIZE used when batching the dataset.
model = build_model(
    vocab_size=len(chars),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [21]:
# Smoke test: run the model (not yet fitted at this point in the notebook) on a
# single batch and print the output shape. The loop variables and the
# predictions are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 50, 96) # (batch_size, sequence_length, vocab_size)


In [22]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           24576     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_1 (Dense)              (64, None, 96)            98400     
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Sample one id per timestep from the logits of the first sequence in the
# batch, then drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [24]:
# Display the sampled ids (one per timestep of the first sequence).
sampled_indices

array([46,  1, 82,  9, 79, 85, 70, 40, 65, 44, 66, 37, 66, 33, 68, 90, 90,
       36,  1, 75, 16, 48, 43, 15, 58, 90, 26, 87, 17, 67, 67, 67,  7, 56,
       51, 13,  9, 33, 36, 27,  1, 19, 64, 94, 64, 40, 85, 27, 56, 12],
      dtype=int64)

In [25]:
# Decode the first input sequence of the batch and the ids sampled for it, to
# see what the model produces before it has been fitted.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 "to air shit out\nNow what the fuck they gon' say no"

Next Char Predictions: 
 "R ·)yípLkPlIlEn––H u0TO/d–:ó1mmm'bW-)EH; 3j”jLí;b,"


In [26]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    from_logits=True tells Keras that `logits` are unnormalized scores rather
    than probabilities. The result of the Keras loss function is returned
    unchanged, without any reduction applied here.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

# Evaluate the loss on the example batch; the mean over all returned values is
# printed as a single scalar.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 50, 96)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.5648255


In [27]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer='adam', loss=loss)

In [29]:
import os
import time

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder left in the path
# for the checkpoint callback to fill in (the run above produced "ckpt_10").
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores only the weights
# rather than the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [30]:
# Number of passes over the training dataset.
EPOCHS=10

In [31]:
# Train the model for EPOCHS epochs with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Show the path of the latest checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_10'

In [33]:
# Rebuild the same architecture with a batch size of 1 for text generation.
# NOTE: this rebinds `model`, so the training model is no longer reachable
# under that name after this cell runs.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load the weights from the latest checkpoint in checkpoint_dir.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with input shape (batch=1, sequence length unspecified).
model.build(tf.TensorShape([1, None]))

In [34]:
# Print the summary of the batch-size-1 model used for generation.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            24576     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_2 (Dense)              (1, None, 96)             98400     
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________


In [52]:
def generate_text(model, start_string, num_generate=300, temperature=1.0):
    """Generate text by repeatedly sampling the next character from the model.

    The start string is fed in once; after that, each sampled character is fed
    back in as the next single-character input.

    Args:
        model: Callable that maps an id batch of shape (1, sequence) to logits
            of shape (1, sequence, vocab) and exposes reset_states().
        start_string: Non-empty seed text. Every character must be a key of
            the module-level char_dict.
        num_generate: Number of characters to generate (default 300).
        temperature: Positive value the logits are divided by before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text (default 1.0).

    Returns:
        start_string followed by the num_generate sampled characters.

    Raises:
        ValueError: If start_string is empty, num_generate is negative, or
            temperature is not positive.
        KeyError: If start_string contains a character missing from char_dict.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if num_generate < 0:
        raise ValueError('num_generate must be non-negative')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Fail early, naming every offending character, instead of surfacing a
    # bare KeyError for only the first one.
    unknown = sorted({ch for ch in start_string if ch not in char_dict})
    if unknown:
        raise KeyError(
            'start_string contains characters outside the vocabulary: {!r}'.format(unknown))

    # Convert the start string to ids (vectorizing) and add a batch dimension.
    input_eval = tf.expand_dims([char_dict[ch] for ch in start_string], 0)

    # Characters sampled so far.
    text_generated = []

    # Here batch size == 1. Reset the model's state before generating so this
    # call starts from a fresh state.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension: (1, sequence, vocab) -> (sequence, vocab).
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution defined by the scaled
        # logits and keep the sample for the last timestep: the next character.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model; the
        # previous hidden state is carried inside the model.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [54]:
# Generate and print a sample that continues the given seed text.
print(generate_text(model, start_string="The time is coming"))

The time is coming?
Who be donetimis Spicasul like, hit into all favout
No peepiced, Ig, why you make now I got fly a said yoe
You seen makin I us
Fuck is *Since bitch (blaz))
Couldn't used to know in
We child na passedes
This yand to bitches on that old, he'll have need and make it
Told you know here? Many?
Lis Phil
