In [1]:
import tensorflow as tf
import numpy as np
import os

In [2]:
url = "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

ds = tf.keras.utils.get_file(origin=url, cache_dir=".")

In [3]:
with open ("./datasets/names.txt", "r") as f:
  names = f.read().splitlines()
  print(len(names))
  print(names[:3])

32033
['emma', 'olivia', 'ava']


In [4]:
text_vec_layer = tf.keras.layers.TextVectorization(split="character", output_mode="int")
text_vec_layer.adapt(names)

In [5]:
text_vec_layer(names[:3])

<tf.Tensor: shape=(3, 6), dtype=int64, numpy=
array([[ 3, 12, 12,  2,  0,  0],
       [10,  6,  5, 20,  5,  2],
       [ 2, 20,  2,  0,  0,  0]])>

# Step 2: Write a function to produce training examples

In [6]:
def get_training_examples(idxs, context_size):
  # padding:
  zeros = tf.zeros([idxs.shape[0], 3], dtype=tf.int64)
  idxs = tf.concat([zeros, idxs], 1)
  zeros2 = tf.zeros([idxs.shape[0], 1], dtype=tf.int64)
  idxs = tf.concat([idxs, zeros2], -1)

  # new tensor of context_size + 1 columns
  tensor = tf.concat([idxs[:,i:context_size + i + 1] for i in range(0, idxs.shape[1] - context_size)], 0)

  # remove rows with 2 ending 0's
  mask = tf.reduce_any(tf.math.not_equal(tensor[:, -2:], 0), axis=1)
  tensor_filtered = tensor[mask]

  # create a tuple of tensors
  tuple_ = (tensor_filtered[:,:-1], tensor_filtered[:,-1])

  return tuple_

In [7]:
get_training_examples(text_vec_layer(names[:2]), 3)

(<tf.Tensor: shape=(12, 3), dtype=int64, numpy=
 array([[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  3],
        [ 0,  0, 10],
        [ 0,  3, 12],
        [ 0, 10,  6],
        [ 3, 12, 12],
        [10,  6,  5],
        [12, 12,  2],
        [ 6,  5, 20],
        [ 5, 20,  5],
        [20,  5,  2]])>,
 <tf.Tensor: shape=(12,), dtype=int64, numpy=array([ 3, 10, 12,  6, 12,  5,  2, 20,  0,  5,  2,  0])>)

# Step 3: Build the model

In [145]:
def get_model(context_size, embedding_size, hidden_size, vocab_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=context_size),
      tf.keras.layers.Reshape((context_size * embedding_size,)),
      tf.keras.layers.Dense(units=hidden_size, activation="tanh", kernel_initializer="he_uniform"),
      tf.keras.layers.Dense(units=vocab_size, kernel_initializer="he_uniform")
  ])
  return model

# Step 4: Create a function that samples from the model

In [24]:
def sample_from_model(model, num_samples, context_size, vocabulary):

  vocabulary = np.array(vocabulary)
  final_result = []

  for _ in range(num_samples):

    context = tf.zeros([1, context_size], dtype=tf.int64)
    result = []

    while True:
      logits = model.predict(context, verbose=0)
      select = tf.random.categorical(logits=logits, num_samples=1)
      context = tf.reshape(tf.concat([context[0][1:], select[0]], axis=0), [1,context_size])
      if int(select[0]) == 0:
        break
      result.append(int(select[0]))

    result = vocabulary[result]
    result = "".join(result)
    final_result.append(result)

  return final_result

# Step 5: Train the model

In [146]:
import random
random.seed(42)
random.shuffle(names)

In [147]:
perc_10 = int(round((len(names) / 100 * 10), 0)) # len(names) // 100 * 10
perc_80 = int(round((len(names) / 100 * 80), 0)) # len(names) // 100 * 80
perc_10, perc_80

(3203, 25626)

In [148]:
test = perc_80 + perc_10 * 2
len(names), test

(32033, 32032)

In [149]:
train_ds = names[:perc_80]
val_ds = names[perc_80:-perc_10]
test_ds = names[-perc_10:]

len(train_ds), len(val_ds), len(test_ds)

(25626, 3204, 3203)

In [150]:
X_train, y_train = get_training_examples(text_vec_layer(train_ds), 3)
X_val, y_val = get_training_examples(text_vec_layer(val_ds), 3)
X_test, y_test = get_training_examples(text_vec_layer(test_ds), 3)

In [151]:
X_train.shape

TensorShape([182580, 3])

In [152]:
arr = X_train[:25000] == 0
np.unique(np.asarray(arr).astype(int))

array([1])

In [153]:
model = get_model(context_size=3, embedding_size=10, hidden_size=200, vocab_size=text_vec_layer.vocabulary_size())

In [154]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), loss="binary_crossentropy")
escb = tf.keras.callbacks.EarlyStopping(patience=5, min_delta=0.01, monitor="val_loss", restore_best_weights=True)

getrained voor 20 epochs, dan 5 epochs dan 3 epochs

In [155]:
model.fit(X_train, y_train, batch_size=256, callbacks=[escb], validation_data=(X_val, y_val), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7aac7bcd9ff0>

In [156]:
model.evaluate(X_test, y_test)



9.789724349975586

# Step 6: Sample from model

In [157]:
t1= sample_from_model(model, 10, 3, text_vec_layer.get_vocabulary())
t1

['phgt[UNK]noleyewyqlmlavoiyiznainotgssfelwtygatdmeeyxmh',
 '',
 'rn[UNK]rwuypblseshnf[UNK]xdwtixjflxoeqdmvviowwh[UNK]lqgax[UNK]',
 'vdtpjkzqrbslb',
 'd',
 'ligloloigegabrgggdyxbfvbqdykhuibzqdstwmnbknwwrrbujrjochgwggrnkgofrtpvqgzb[UNK]uxkrqewbqgu[UNK]j[UNK]ge[UNK]jbx',
 'xgvq',
 'zrdy[UNK]fsweijsobkwmdbxmu',
 'oohhnudbxybn[UNK]nxmwd[UNK]mqkhnate',
 'ammgyawumajzsfopm[UNK]hozveio']