In [None]:
import numpy as np
import pandas as pd
import time

import warnings
warnings.filterwarnings('ignore')

import os

import tensorflow as tf

In [2]:
# Read the whole corpus, closing the file handle deterministically, and
# lower-case it so the vocabulary does not distinguish letter case.
with open('dinos.txt', 'r') as f:
    data = f.read().lower()
# The unique characters in the file, sorted so their order is reproducible
# across runs (plain set iteration order is not guaranteed).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters ' % (data_size, vocab_size))

There are 19909 total characters and 27 unique characters in your data.


In [3]:
# Index -> character table in sorted character order, and its inverse.
ix_to_char = dict(enumerate(sorted(chars)))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}
print(ix_to_char)

{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


![](http://datascience-enthusiast.com/figures/dinos3.png)

In [4]:
def build_data(vocab_size, Tx = 40):
    """
    Build one-hot training tensors from the names listed in "dinos.txt".

    Every non-blank line of the file is lower-cased, stripped and used as one
    training example. X[i] holds the one-hot encoded characters of name i and
    Y[i] holds the same sequence shifted left by one time-step, so Y[i, t] is
    the character that follows X[i, t]. All positions after the end of a name
    are marked with index 0 (the newline character in the notebook's printed
    ix_to_char mapping), which acts as both end-of-name marker and padding.

    Characters are looked up in the module-level ``char_to_ix`` mapping.

    Arguments:
    vocab_size -- number of distinct characters (size of the one-hot axis)
    Tx -- sequence length, number of time-steps (or characters) in one training example

    Returns:
    X -- numpy array of shape (m, Tx, vocab_size) with the training inputs
    Y -- numpy array of shape (m, Tx, vocab_size) with the training labels

    Raises:
    ValueError -- if a name has more than Tx characters
    KeyError -- if a name contains a character missing from ``char_to_ix``
    """

    # Build list of all dinosaur names (training examples)
    with open("dinos.txt") as f:
        examples = [line.lower().strip() for line in f]
    # Blank lines contain nothing to learn from and cannot be encoded as an
    # input/target pair, so they are dropped up front.
    examples = [name for name in examples if name]

    m = len(examples)
    X = np.zeros((m, Tx, vocab_size))
    Y = np.zeros((m, Tx, vocab_size))

    for i, name in enumerate(examples):
        n = len(name)
        if n > Tx:
            raise ValueError(
                "name %r has %d characters, which exceeds Tx=%d" % (name, n, Tx)
            )

        # Explicit integer dtype keeps the empty slice used for one-character
        # names valid as an index array.
        name_ids = np.array([char_to_ix[ch] for ch in name], dtype=int)
        steps = np.arange(n)

        # Inputs: one-hot of every character, then padding with index 0.
        X[i, steps, name_ids] = 1
        X[i, n:, 0] = 1

        # Targets: the next character at each step, then index 0 from the
        # position of the last real character onwards.
        Y[i, steps[:-1], name_ids[1:]] = 1
        Y[i, n - 1:, 0] = 1

    print('number of training examples:', m)

    return X, Y

In [5]:
# Encode every name as one-hot input/target arrays with 40 time-steps each.
X, Y = build_data(vocab_size, Tx = 40)


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=false
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'
   KMP_REDUCTION_BARRIER='1,1'
   KMP_REDUCTION_BAR

number of training examples: 1536


In [6]:
# Sanity check: expected layout is (number of names, Tx, vocab_size).
Y.shape

(1536, 40, 27)

In [7]:
from keras.models import Sequential
from keras.layers import Input, Dense, GRU  # Embedding

In [8]:
class MyModel(tf.keras.Model):
  """Character-level sequence model: a GRU followed by a Dense projection.

  The Dense layer has no activation, so the model outputs one vector of
  unnormalised scores (logits) of size ``vocab_size`` per time-step.
  """

  def __init__(self, vocab_size, rnn_units):
    """Create the layers.

    Arguments:
    vocab_size -- number of output units of the Dense layer
    rnn_units -- number of units in the GRU layer
    """
    # The base initializer takes no positional model argument; passing
    # `self` again is rejected by newer Keras versions.
    super().__init__()
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the GRU and the Dense projection over ``inputs``.

    Arguments:
    inputs -- input sequences fed directly to the GRU (no embedding layer)
    states -- initial GRU state; when None the GRU's own initial state is used
    return_state -- when True, also return the final GRU state
    training -- forwarded to the layers

    Returns:
    The Dense output for every time-step, or a ``(output, states)`` pair
    when ``return_state`` is True.
    """
    x = inputs
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [9]:
# Number of RNN units
rnn_units = 256

# vocab_size (the number of distinct characters) is reused as the size of the
# model's output layer.
model = MyModel(
    vocab_size=vocab_size,
    rnn_units=rnn_units)
# Build with the (time-steps, features) dimensions of the training inputs so
# that summary() can report the layer parameter counts.
model.build((None, X.shape[1], X.shape[2]))
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    multiple                  218880    
_________________________________________________________________
dense (Dense)                multiple                  6939      
Total params: 225,819
Trainable params: 225,819
Non-trainable params: 0
_________________________________________________________________


In [10]:
# The model's Dense layer has no activation, so the loss must treat its
# outputs as logits.
loss = tf.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)
history = model.fit(
    X,
    Y,
    epochs=300,
    batch_size=64,
    verbose=2,
)

Epoch 1/300


2022-01-03 19:24:04.555378: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


24/24 - 4s - loss: 1.7370
Epoch 2/300
24/24 - 2s - loss: 0.9344
Epoch 3/300
24/24 - 2s - loss: 0.8558
Epoch 4/300
24/24 - 2s - loss: 0.8144
Epoch 5/300
24/24 - 2s - loss: 0.7734
Epoch 6/300
24/24 - 2s - loss: 0.7251
Epoch 7/300
24/24 - 2s - loss: 0.6700
Epoch 8/300
24/24 - 2s - loss: 0.6268
Epoch 9/300
24/24 - 2s - loss: 0.6029
Epoch 10/300
24/24 - 2s - loss: 0.5860
Epoch 11/300
24/24 - 2s - loss: 0.5721
Epoch 12/300
24/24 - 2s - loss: 0.5614
Epoch 13/300
24/24 - 2s - loss: 0.5517
Epoch 14/300
24/24 - 2s - loss: 0.5422
Epoch 15/300
24/24 - 2s - loss: 0.5349
Epoch 16/300
24/24 - 2s - loss: 0.5280
Epoch 17/300
24/24 - 2s - loss: 0.5222
Epoch 18/300
24/24 - 2s - loss: 0.5161
Epoch 19/300
24/24 - 2s - loss: 0.5116
Epoch 20/300
24/24 - 2s - loss: 0.5058
Epoch 21/300
24/24 - 2s - loss: 0.5010
Epoch 22/300
24/24 - 2s - loss: 0.4969
Epoch 23/300
24/24 - 2s - loss: 0.4929
Epoch 24/300
24/24 - 2s - loss: 0.4887
Epoch 25/300
24/24 - 2s - loss: 0.4847
Epoch 26/300
24/24 - 2s - loss: 0.4808
Epoch 2

In [11]:
class OneStep(tf.keras.Model):
  """Wraps a sequence model to sample one next character per call."""

  def __init__(self, model):
    """Store the wrapped model; it is called with ``inputs``, ``states`` and
    ``return_state`` keyword arguments."""
    super().__init__()
    self.model = model


  def generate_one_step(self, inputs, states=None, Tx=1, vocab_size=27):
    """Sample the character that follows ``inputs``.

    Arguments:
    inputs -- string of characters, each of which must be a key of the
              module-level ``char_to_ix`` mapping
    states -- recurrent state returned by a previous call, or None to start fresh
    Tx -- unused; kept so existing callers that pass it keep working
    vocab_size -- size of the one-hot axis fed to the model

    Returns:
    (predicted_chars, states) -- the sampled character(s) as a string and the
    model state to pass to the next call
    """
    # Convert the characters to token IDs and one-hot encode them as a single
    # batch entry of shape (1, len(inputs), vocab_size).
    name_ids = [char_to_ix[ch] for ch in inputs]
    input_x = np.zeros((1, len(inputs), vocab_size))
    # The depth must follow vocab_size, otherwise the assignment below fails
    # whenever a non-default vocab_size is passed.
    input_x[0,:,:] = tf.one_hot(name_ids, depth=vocab_size)

    input_x = tf.constant(input_x)

    # Run the model.
    predicted_logits, states = self.model(inputs=input_x, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = ''.join(ix_to_char[index] for index in predicted_ids.numpy())

    # Return the characters and model state.
    return predicted_chars, states

# Wrap the fitted model so names can be sampled one character at a time.
one_step_model = OneStep(model)

In [12]:
start = time.time()

# Generate 10 dinosaur names beginning with 'm'
for i in range(10):
    # Each name starts from a fresh recurrent state.
    states = None
    next_char = 'm'
    result = [next_char]
    # Sample at most 20 further characters; a newline ends the name early.
    for n in range(20):
        next_char, states = one_step_model.generate_one_step(next_char, states=states)
        if next_char=='\n':
            break
        result.append(next_char)
    # The pieces are plain Python strings, so they can be joined directly
    # without going through a TensorFlow string tensor.
    result = ''.join(result)
    result = result[0].upper() + result[1:]
    print(result)

end = time.time()
print('\nRun time:', end - start)

Mendozasaurus
Mendozasaurus
Mamenchisaurus
Mtapaiasaurus
Microceratus
Mourocosaurus
Mei
Microraptor
Microraptor
Merrudiceratops

Run time: 0.6334753036499023
