In [1]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
# Show which GPU devices TensorFlow detects in this environment.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
# Concatenate every file in data/ into a single corpus string.
# os.listdir returns entries in arbitrary order, so sort them to make the
# corpus (and everything derived from it) reproducible across runs/machines.
books = sorted(os.listdir('data'))
book_texts = []
for book in books:
    # Decode explicitly as UTF-8 (the encoding the corpus is later split with)
    # instead of relying on the platform's default encoding.
    with open(os.path.join('data', book), 'r', encoding='utf-8') as file:
        book_texts.append(file.read())
# Join once rather than growing the string with repeated concatenation.
text = "".join(book_texts)

In [3]:
# Split the raw corpus on blank lines (double newline) into chunks called pages.
# This relies on `text` still containing its newlines, i.e. it must run before
# the later cell that replaces "\n" with spaces.
pages = text.split("\n\n")

In [4]:
# Number of pages obtained from the split.
len(pages)

3430

In [5]:
# Length, in characters, of the longest page.
max(len(page) for page in pages)

4473

In [6]:
# Average page length in characters.
np.mean(list(map(len, pages)))

2846.140524781341

In [7]:
# Peek at one page to inspect the raw text and its line breaks.
pages[100]

'"T"It was Jory," his steward Vayon Poole replied. "She\'s not been harmed." \n"Thank the gods," Ned said. His men had been searching for Arya for four days now, but the queen\'s \nmen had been out hunting as well. "Where is she? Tell Jory to bring her here at once." \n"I am sorry, my lord," Poole told him. "The guards on the gate were Lannister men, and they informed \nthe queen when Jory brought her in. She\'s being taken directly before the king . . ." \n"Damn that woman!" Ned said, striding to the door. "Find Sansa and bring her to the audience chamber. \nHer voice may be needed." He descended the tower steps in a red rage. He had led searches himself for \nthe first three days, and had scarcely slept an hour since Arya had disappeared. This morning he had \nbeen so heartsick and weary he could scarcely stand, but now his fury was on him, filling him with \nstrength. \nMen called out to him as he crossed the castle yard, but Ned ignored them in his haste. He would have \nrun, but h

In [8]:
# Flatten the corpus: every newline becomes a single space.
# NOTE: this overwrites `text`, so the blank-line page split above no longer
# works on it after this cell has run.
text = text.replace("\n", " ")

In [9]:
# Total corpus length in characters.
len(text)

9769120

In [10]:
# The vocabulary is the set of distinct corpus characters, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(f"vocab size: {len(vocab)}")

vocab size: 94


In [11]:
# Lookup layer that maps each character to an integer id (no mask token).
ids_from_chars = preprocessing.StringLookup(mask_token=None, vocabulary=list(vocab))

In [12]:
# Inverse lookup (id -> character). It reuses the forward layer's vocabulary
# so that both directions agree on the id assignment.
lookup_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = preprocessing.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=lookup_vocabulary,
)

In [13]:
# Sample sentence used to sanity-check the character <-> id lookups.
s = "Gared urged as the woods began to grow dark around them."
# Split the string into its individual UTF-8 characters.
chars = tf.strings.unicode_split(s,input_encoding="UTF-8")
print(chars)

tf.Tensor(
[b'G' b'a' b'r' b'e' b'd' b' ' b'u' b'r' b'g' b'e' b'd' b' ' b'a' b's'
 b' ' b't' b'h' b'e' b' ' b'w' b'o' b'o' b'd' b's' b' ' b'b' b'e' b'g'
 b'a' b'n' b' ' b't' b'o' b' ' b'g' b'r' b'o' b'w' b' ' b'd' b'a' b'r'
 b'k' b' ' b'a' b'r' b'o' b'u' b'n' b'd' b' ' b't' b'h' b'e' b'm' b'.'], shape=(56,), dtype=string)


In [14]:
# Map each character of the sample sentence to its integer id.
ids = ids_from_chars(chars)
print(ids)

tf.Tensor(
[33 57 74 61 60  1 77 74 63 61 60  1 57 75  1 76 64 61  1 79 71 71 60 75
  1 58 61 63 57 70  1 76 71  1 63 74 71 79  1 60 57 74 67  1 57 74 71 77
 70 60  1 76 64 61 69 12], shape=(56,), dtype=int64)


In [15]:
# Round-trip check: map the ids back to characters (this overwrites `chars`).
chars = chars_from_ids(ids)
print(chars)

tf.Tensor(
[b'G' b'a' b'r' b'e' b'd' b' ' b'u' b'r' b'g' b'e' b'd' b' ' b'a' b's'
 b' ' b't' b'h' b'e' b' ' b'w' b'o' b'o' b'd' b's' b' ' b'b' b'e' b'g'
 b'a' b'n' b' ' b't' b'o' b' ' b'g' b'r' b'o' b'w' b' ' b'd' b'a' b'r'
 b'k' b' ' b'a' b'r' b'o' b'u' b'n' b'd' b' ' b't' b'h' b'e' b'm' b'.'], shape=(56,), dtype=string)


In [16]:
# Join the individual characters back into one byte string.
tf.strings.reduce_join(chars,axis=-1).numpy()

b'Gared urged as the woods began to grow dark around them.'

In [17]:
def text_from_ids(ids):
    """Convert ids to characters and join them along the last axis.

    Args:
        ids: ids accepted by the ``chars_from_ids`` lookup layer.

    Returns:
        A string tensor with the characters joined along the last axis.
    """
    decoded_chars = chars_from_ids(ids)
    joined = tf.strings.reduce_join(decoded_chars, axis=-1)
    return joined

In [18]:
# Encode the whole corpus: split it into UTF-8 characters, then map each
# character to its integer id.
text_ids = ids_from_chars(tf.strings.unicode_split(text,input_encoding="UTF-8"))

In [19]:
# Shape of the encoded corpus tensor.
text_ids.shape

TensorShape([9769120])

In [20]:
# Dataset built by slicing the id tensor along its first axis.
ids_dataset = tf.data.Dataset.from_tensor_slices(text_ids)

In [21]:
seq_length = 512
# Each chunk holds one extra id, so that dropping the last id (input) or the
# first id (target) leaves sequences of exactly seq_length ids.
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length

# Group the id stream into fixed-size chunks; the incomplete tail is dropped.
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)

In [22]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        sequence: any sliceable sequence.

    Returns:
        A tuple ``(sequence[:-1], sequence[1:])``: the input is everything but
        the last element, and the target is everything but the first.
    """
    return sequence[:-1], sequence[1:]

In [23]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [24]:
# Decode and print the first (input, target) pair as a sanity check.
for input_example, target_example in dataset.take(1):
    labelled_examples = (("Input: ", input_example), ("Target: ", target_example))
    for label, example in labelled_examples:
        print(label, text_from_ids(example).numpy())

Input:  b'PROLOGUE  "We should start back," Gared urged as the woods began to grow dark around them. "The wildlings are  dead."  "Do the dead frighten you?" Ser Waymar Royce asked with just the hint of a smile.  Gared did not rise to the bait. He was an old man, past fifty, and he had seen the lordlings come and go.  "Dead is dead," he said. "We have no business with the dead."  "Are they dead?" Royce asked softly. "What proof have we?"  "Will saw them," Gared said. "If he says they are dead, that\'s proof enough for '
Target:  b'ROLOGUE  "We should start back," Gared urged as the woods began to grow dark around them. "The wildlings are  dead."  "Do the dead frighten you?" Ser Waymar Royce asked with just the hint of a smile.  Gared did not rise to the bait. He was an old man, past fifty, and he had seen the lordlings come and go.  "Dead is dead," he said. "We have no business with the dead."  "Are they dead?" Royce asked softly. "What proof have we?"  "Will saw them," Gared said. "If 

In [25]:
BATCH_SIZE = 128    # sequences per training batch
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer

# Build the training pipeline from `sequences` instead of rebinding `dataset`
# from itself: the original form batched an already-batched dataset whenever
# this cell was re-run, silently changing the element shape.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
dataset

<PrefetchDataset shapes: ((128, 512), (128, 512)), types: (tf.int64, tf.int64)>

In [26]:
# Model hyperparameters.
# NOTE(review): vocab_size counts only the distinct corpus characters, while
# the model below is sized from ids_from_chars.get_vocabulary() instead.
# Confirm whether this variable is still needed.
vocab_size = len(vocab)
embedding_dim = 256  # passed to the Embedding layer as its output dimension
rnn_units = 512  # passed to the GRU layer as its number of units

In [27]:
class TextGeneration(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense logits.

    Args:
        vocab_size: number of distinct ids; used as the embedding input size
            and as the number of output logits per position.
        embedding_dim: output dimension of the embedding layer.
        rnn_units: number of units of the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # Fixed: the original called super().__init__(self), passing the
        # instance as a stray positional argument to Model.__init__.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        # No activation: the layer outputs raw logits.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of id sequences.

        Args:
            inputs: ids fed to the embedding layer.
            states: optional initial GRU state; when None, the GRU's own
                initial state is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The logits, or ``(logits, states)`` when ``return_state`` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        if return_state:
            return x, states
        return x

In [28]:
# Size the model from the lookup layer's vocabulary so that every id the
# layer can emit has an embedding row and an output logit.
lookup_vocab_size = len(ids_from_chars.get_vocabulary())
model = TextGeneration(
    vocab_size=lookup_vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [29]:
# from_logits=True because the model's final Dense layer has no activation,
# so its outputs are raw logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

### Checkpoints

In [30]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'
# File name template; "{epoch}" is a placeholder left for the callback to fill.
checkpoint_prefix = os.path.join(checkpoint_dir,"ckpt_{epoch}")

# Callback that saves only the model weights (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

In [31]:
# Number of epochs passed to model.fit.
EPOCHS = 20

In [32]:
# Train the model for EPOCHS epochs with the checkpoint callback attached.
# The return value of fit is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
class OneStep(tf.keras.Model):
    """Samples the next character from a trained model, one step at a time.

    Args:
        model: callable as ``model(inputs=ids, states=states,
            return_state=True)`` and returning ``(logits, states)``.
        chars_from_ids: lookup layer mapping ids back to characters.
        ids_from_chars: lookup layer mapping characters to ids.
        temperature: strictly positive divisor applied to the logits before
            sampling.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        # Dividing logits by zero yields inf/NaN and a negative value inverts
        # the distribution, so reject both up front.
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build a dense mask that is -inf at the id of '[UNK]' and 0 elsewhere,
        # so that adding it to the logits prevents '[UNK]' from being sampled.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # One entry per id of the lookup vocabulary.
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample one next character for each input string.

        Args:
            inputs: string tensor; each string is split into UTF-8 characters.
            states: optional model state from the previous step.

        Returns:
            A tuple ``(predicted_chars, states)`` with the sampled characters
            and the model state to feed into the next step.
        """
        # Strings -> characters -> ids; the lookup result is converted to a
        # dense tensor before being fed to the model.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)

        # Keep only the logits of the last position, scaled by temperature.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature

        # Exclude the masked ids from sampling.
        predicted_logits = predicted_logits + self.prediction_mask

        # Draw one id per row from the resulting distribution.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        predicted_chars = self.chars_from_ids(predicted_ids)

        return predicted_chars, states

In [34]:
# Wrap the trained model for sampling, using OneStep's default temperature.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [39]:
# Generate text one character at a time, starting from a seed string.
# `time` is imported with the other imports at the top of the notebook.
SEED_TEXT = 'Jon'   # prompt the generation starts from
NUM_GENERATED = 1000  # number of characters sampled after the seed

start = time.time()
states = None  # no previous state on the first step
next_char = tf.constant([SEED_TEXT])
results = [next_char]

for _step in range(NUM_GENERATED):
    # Feed the previous output and state back in to sample the next character.
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    results.append(next_char)

# Concatenate the seed and all sampled characters element-wise.
results = tf.strings.join(results)
end = time.time()
print(results[0].numpy().decode("utf-8"), "\n\n"+ "_"*80)
print("\nRun time: ", end - start)

Jon agreed. Those are horses and I would not  eat the loose of the other Landing? Yet they were owh if I drew some line. He saw a bull in hobbing, "I'm not condemned, and that will have  a farpetifice alone. Certain of your head," said Late. The Light-gave again, even Halfmaer’s son. The Old Tongue did have  the clothing as it think. Their out the dirt and the cages. How long home, nor to stop his.  The second friend, he thinks  of our trees say, and Bronn brings the greatsword feral want, a goodfrey in the ships too. What afternoon was  she'd best be boiling into fiery liege  lot at each pity. All turn to the clanks, Dany thought.  And it is still apart. The  knight belonged a hundred Prince Column and Dulk, the man no more to be able. The crown had  had the fellow. This settling throble greater had supposed to be a baly from hip.  When he was made a dozen Dany thought, squeal, Janos Wylts.  “Trusted that too my widow, and some rankled sigil of your father, Lord Snow?” avoid  on a few