In [1]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)


def _version_tuple(version):
    """Return the leading ``(major, minor)`` of a version string as ints.

    Version strings must not be compared directly: string comparison is
    lexicographic, so "0.3" >= "0.20" is True and "10.0" >= "2.0" is False.
    Comparing integer tuples gives the intended numeric ordering.

    Raises:
        ValueError: if ``version`` does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError("unrecognised version string: %r" % (version,))
    return tuple(int(part) for part in match.groups())


# Scikit-Learn ≥0.20 is required
import sklearn
assert _version_tuple(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _version_tuple(tf.__version__) >= (2, 0)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import tensorflow as tf

# Location of the raw corpus inside the mounted Drive.
filepath = "/content/drive/MyDrive/Colab Notebooks/pokedex.txt"

# The corpus contains non-ASCII characters (é, curly quotes, dashes), so decode
# it explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(filepath, encoding="utf-8") as f:
    pokedex_text = f.read()

In [4]:
"".join(sorted(set(pokedex_text.lower())))

'\n !%(),-./0123456789:;<>?abcdefghijklmnopqrstuvwxyz\xa0°é–—’“”−'

In [5]:
# Character-level tokenizer: each distinct character is mapped to an integer id.
# fit_on_texts is handed the raw string itself (not a list of strings), so it
# walks over the corpus one character at a time.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(pokedex_text)

In [6]:
# Sanity check: encode a sample word into its per-character ids.
tokenizer.texts_to_sequences(["First"])

[[18, 4, 9, 5, 3]]

In [7]:
# Round trip: decode the ids back to text (characters come back lower-cased
# and separated by spaces).
tokenizer.sequences_to_texts([[18, 4, 9, 5, 3]])

['f i r s t']

In [8]:
# Vocabulary size and corpus length, reused by the data pipeline and the model.
max_id = len(tokenizer.word_index) # number of distinct characters
# The tokenizer was fitted on a plain string, so every character was counted
# as one "document".
dataset_size = tokenizer.document_count # total number of characters

In [9]:
# Show the vocabulary size and the corpus length computed above.
print("max_id:", max_id)
print("dataset_size:", dataset_size)

max_id: 60
dataset_size: 1442842


In [10]:
import numpy as np

# Encode the whole corpus as one sequence of character ids, then shift the ids
# down by one so they run from 0 to max_id - 1 (the tokenizer's ids are 1-based).
encoded = np.array(tokenizer.texts_to_sequences([pokedex_text])[0]) - 1

In [11]:
# Decode the first 100 ids (shifted back by +1 to undo the earlier -1) to
# confirm the encoding round-trips to readable text.
print(tokenizer.sequences_to_texts([encoded[:100] + 1]))

['a   s t r a n g e   s e e d   w a s   p l a n t e d   o n   i t s   b a c k   a t   b i r t h .   t h e   p l a n t   s p r o u t s   a n d   g r o w s   w i t h   t h i s   p o k é m o n . \n i t   c']


In [12]:
# Use the first 90% of the encoded characters as the training stream.
train_size = dataset_size * 90 // 100
# One dataset element per character id.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [13]:
# Every training window holds n_steps input characters plus one extra, because
# the target sequence is the input shifted one character ahead.
n_steps = 100
window_length = n_steps + 1
# shift=1 slides the window forward one character at a time (next character,
# not next word); drop_remainder=True discards windows shorter than
# window_length.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [14]:
# window() produces a dataset of nested datasets; batching each nested dataset
# to its full length and flattening turns every window into a single tensor.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [15]:
batch_size = 32
# Shuffle the windows through a 10000-element buffer, then group them into
# batches.
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(batch_size)
# Split each batch of windows into (inputs, targets): inputs are all characters
# but the last, targets are the same windows shifted one character ahead.
dataset = dataset.map(lambda chunk: (chunk[:, :-1], chunk[:, 1:]))

In [16]:
# One-hot encode the input ids (max_id categories); targets remain integer ids,
# as expected by a sparse categorical loss.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

# Prepare one batch ahead while the current one is being consumed.
dataset = dataset.prefetch(buffer_size=1)

In [18]:
import keras

# Character-level language model: two stacked GRU layers followed by a softmax
# over the max_id characters, applied at every time step.
# Only input dropout is enabled; recurrent_dropout=0.2 was tried earlier and is
# left off (presumably for training speed; TODO confirm).
model = keras.models.Sequential()
model.add(keras.layers.GRU(128, return_sequences=True, dropout=0.2,
                           input_shape=[None, max_id]))
model.add(keras.layers.GRU(128, return_sequences=True, dropout=0.2))
model.add(keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation="softmax")))

# Targets are integer character ids, hence the sparse variant of the loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
history = model.fit(dataset, epochs=1)



In [21]:
import joblib

# Persist the trained model by pickling it with joblib.
# NOTE(review): Keras documents model.save(...) as its native persistence API;
# pickles depend on the installed library versions and must only be loaded
# from trusted sources. Consider switching if this file is shared.
joblib.dump(model, "pokedex_cpu.pkl")

['pokedex_cpu.pkl']

In [22]:
def preprocess(texts):
    """Turn a list of strings into the one-hot tensor the model consumes.

    Each string is converted to character ids with the fitted ``tokenizer``,
    the ids are shifted down by one (matching the encoding used for training),
    and the result is one-hot encoded with ``max_id`` categories.

    Args:
        texts: list of strings; they need to encode to sequences of equal
            length so the ids form a regular array.

    Returns:
        The tensor produced by ``tf.one_hot`` for the shifted character ids.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    return tf.one_hot(char_ids - 1, max_id)

In [23]:
# Greedy text generation: start from a seed and, 200 times over, append the
# character the model considers most likely to come next.
text = "this pokemon"
for _ in range(200):
    # Re-encode the whole text so far and predict a character for every step.
    X_new = preprocess([text])
    Y_pred = np.argmax(model(X_new), axis=-1)
    # Only the prediction at the last time step extends the text; +1 maps the
    # class index back to the tokenizer's character id.
    text += tokenizer.sequences_to_texts(Y_pred[:, -1:] + 1)[0]

print(text)

this pokemon are said to be the product of areas where they can be made of its body.
they live in mountains on its body is control the forest. they live in mountains on its body is control the forest. they live i
