In [3]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)


def _version_at_least(version, minimum):
    """Return True if ``version`` is at least ``minimum``.

    ``version`` is a version string such as "2.10.0" or "0.22.2.post1";
    ``minimum`` is a ``(major, minor)`` tuple of ints. Only the leading
    "major.minor" digits are compared, and they are compared as integers.

    Comparing the raw strings instead is lexicographic and gives wrong
    answers, e.g. "0.9" >= "0.20" is True and "10.0" >= "2.0" is False.

    Raises ValueError if ``version`` does not start with "<digits>.<digits>".
    """
    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError("unrecognized version string: {!r}".format(version))
    return (int(match.group(1)), int(match.group(2))) >= tuple(minimum)


# Scikit-Learn ≥0.20 is required
import sklearn
assert _version_at_least(sklearn.__version__, (0, 20))

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _version_at_least(tf.__version__, (2, 0))

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
import tensorflow as tf

filepath = "/content/pokedex.txt"

# The corpus contains non-ASCII characters (é, curly quotes, dashes, ...), so
# decode it explicitly as UTF-8 rather than relying on the platform's default
# text encoding, which is not UTF-8 everywhere.
with open(filepath, encoding="utf-8") as f:
    pokedex_text = f.read()

In [5]:
"".join(sorted(set(pokedex_text.lower())))

'\n !%(),-./0123456789:;<>?abcdefghijklmnopqrstuvwxyz\xa0°é–—’“”−'

In [6]:
# Character-level tokenizer: with char_level=True every character (rather than
# every word) becomes a token with its own integer ID.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,
)
# Build the character -> ID vocabulary from the full corpus.
tokenizer.fit_on_texts(pokedex_text)

In [7]:
# Round-trip check, step 1: encode a sample word into one integer ID per character.
sample_text = "First"
tokenizer.texts_to_sequences([sample_text])

[[18, 4, 9, 5, 3]]

In [8]:
# Round-trip check, step 2: decode the IDs back to text. The decoded characters
# come back space-separated.
sample_ids = [18, 4, 9, 5, 3]
tokenizer.sequences_to_texts([sample_ids])

['f i r s t']

In [9]:
# number of distinct characters (one vocabulary entry per character)
max_id = len(tokenizer.word_index)

# total number of characters
# NOTE(review): fit_on_texts() was given the raw string rather than a list of
# texts, so presumably each character was counted as one "document" -- confirm
# against the Tokenizer documentation.
dataset_size = tokenizer.document_count

In [10]:
# Report the vocabulary size and the corpus length, one labelled value per line.
for label, value in (("max_id:", max_id), ("dataset_size:", dataset_size)):
    print(label, value)

max_id: 60
dataset_size: 1442842


In [11]:
import numpy as np

# Encode the whole corpus as one sequence of character IDs. The text is passed
# as a one-element list, so the result holds exactly one sequence.
corpus_sequences = tokenizer.texts_to_sequences([pokedex_text])

# Shift every ID down by 1 and unpack the single row into a 1-D array.
# (Presumably tokenizer IDs start at 1, so this makes them 0-based -- confirm.)
[encoded] = np.array(corpus_sequences) - 1

In [12]:
# Sanity check: decode the first 100 encoded characters back to text.
# Adding 1 undoes the shift applied when `encoded` was built.
first_ids = encoded[:100] + 1
print(tokenizer.sequences_to_texts([first_ids]))

['a   s t r a n g e   s e e d   w a s   p l a n t e d   o n   i t s   b a c k   a t   b i r t h .   t h e   p l a n t   s p r o u t s   a n d   g r o w s   w i t h   t h i s   p o k é m o n . \n i t   c']


In [13]:
# Use the first 90% of the encoded characters as the training sequence.
train_size = (dataset_size * 90) // 100
train_ids = encoded[:train_size]

# One dataset element per character ID.
dataset = tf.data.Dataset.from_tensor_slices(train_ids)

In [14]:
n_steps = 100

# Each window holds n_steps inputs plus one extra character, because the
# target is the input shifted 1 character ahead.
window_length = n_steps + 1

# shift=1 slides the window one character at a time (next character, not next
# word); drop_remainder=True discards trailing windows shorter than window_length.
dataset = dataset.window(
    window_length,
    shift=1,
    drop_remainder=True,
)

In [15]:
def _window_to_tensor(window):
    """Batch one nested window dataset into a single tensor of window_length IDs."""
    return window.batch(window_length)


# window() yields a dataset of datasets; flat_map turns it into a flat dataset
# with one tensor per window.
dataset = dataset.flat_map(_window_to_tensor)

In [16]:
batch_size = 32


def _split_input_target(windows):
    """Split a batch of windows into (inputs, targets).

    Inputs are all characters but the last; targets are all characters but the
    first, i.e. the inputs shifted one character ahead.
    """
    return windows[:, :-1], windows[:, 1:]


# Shuffle the windows with a 10000-element buffer, then group them into batches.
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
dataset = dataset.map(_split_input_target)

In [17]:
def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input IDs (depth max_id); leave the target IDs as integers."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


dataset = dataset.map(_one_hot_inputs)

# Keep one batch prepared ahead of the consumer.
dataset = dataset.prefetch(1)

In [18]:
# NOTE(review): this rebinds the name `keras`, which an earlier cell imported
# via `from tensorflow import keras` -- confirm both refer to the same Keras.
import keras

# Two stacked GRU layers that return the full output sequence, so a prediction
# is made at every time step. Only input dropout is enabled.
first_gru = keras.layers.GRU(
    128,
    return_sequences=True,
    dropout=0.2,
    input_shape=[None, max_id],  # any sequence length, one-hot vectors of size max_id
)
second_gru = keras.layers.GRU(128, return_sequences=True, dropout=0.2)

# Softmax over the max_id possible characters, applied at each time step.
char_probabilities = keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation="softmax")
)

model = keras.models.Sequential([first_gru, second_gru, char_probabilities])

# Targets are integer character IDs, hence the sparse categorical loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
import joblib

# Persist the trained model to disk with joblib.
# NOTE(review): joblib serializes via pickle; Keras has its own model.save()
# API, and pickle files must only ever be loaded from trusted sources.
model_path = "pokedex_gpu.pkl"
joblib.dump(model, model_path)

['pokedex_gpu.pkl']

In [20]:
def preprocess(texts):
    """Turn a list of strings into the one-hot tensor the model is called with.

    Each text is encoded with the notebook's `tokenizer`, the IDs are shifted
    down by 1 (the same shift applied to the training data), and the result is
    one-hot encoded with depth `max_id`.

    Assumes all texts encode to the same length so they form a rectangular
    array -- the notebook only ever passes a single text.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    shifted_ids = np.array(token_ids) - 1
    return tf.one_hot(shifted_ids, max_id)

In [21]:
# Generate 200 characters greedily, starting from a seed string.
text = "this pokemon"
for _ in range(200):
    # Re-encode the whole text so far and run it through the model.
    X_new = preprocess([text])
    # Highest-scoring character ID at every time step.
    Y_pred = np.argmax(model(X_new), axis=-1)
    # Decode (adding 1 to undo the ID shift) and keep only the final character:
    # the prediction that follows the last character of `text`.
    decoded = tokenizer.sequences_to_texts(Y_pred + 1)[0]
    text += decoded[-1]

print(text)

this pokemon was born in a a special move. its body is coated in a selious pokémon. it secures its prey in its back and streams its opponents’ stands.
it s in its body in ice from its head in the air to protect t
