# Keras Layers

In [1]:
import tensorflow as tf

### `StringLookup`

In [10]:
# Tokens the layer knows about; any other string is out-of-vocabulary (OOV).
vocab = ["a", "b", "c", "d"]

# output_mode='int' is the default: each token is returned as its integer index.
layer = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    output_mode='int',
)

data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])

# "z" is not in `vocab`, so it is treated as OOV and encoded as index 0.
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[1, 3, 4],
       [4, 0, 2]])>

In [9]:
# Show the full vocabulary held by the layer, including any special tokens.
layer.get_vocabulary()

['[UNK]', 'a', 'b', 'c', 'd']

- Note that the OOV token `"[UNK]"` has been added to the vocabulary at index 0.

In [5]:
vocab = ["a", "b", "c", "d"]

# Reserve two indices (0 and 1) for out-of-vocabulary tokens.
layer = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    num_oov_indices=2,
)

# With num_oov_indices=2, every vocabulary index is shifted up by 2
# so that 0 and 1 stay free for OOV tokens.
layer(vocab)

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([2, 3, 4, 5])>

In [7]:
data = tf.constant(
    [
        ["a", "y", "d"],
        ["m", "z", "b"],
    ]
)

# The OOV tokens are spread over the two reserved OOV indices:
# here 'y' and 'z' are encoded as 1, while 'm' is encoded as 0.
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[2, 1, 5],
       [0, 1, 3]])>

In [14]:
# Character set used for the round-trip demo below.
characters = [
    '2', '3', '4', '5', '6', '7', '8',
    'b', 'c', 'd', 'e', 'f', 'g',
    'm', 'n', 'p', 'w', 'x', 'y',
]

# Forward lookup: characters -> integer indices.
# mask_token is not passed here; setting mask_token=None means no mask term is added.
char_to_num = tf.keras.layers.StringLookup(vocabulary=characters)

# Inverse lookup: integer indices -> original characters.
# The vocabulary is taken from the forward layer so both layers share the same indices.
# invert=True is only valid when output_mode is "int"; it makes the layer map
# indices to vocabulary items instead of vocabulary items to indices.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True,
)

# Round trip: encode '4' to its index, then decode it back to the character.
num_to_char(char_to_num('4'))

<tf.Tensor: shape=(), dtype=string, numpy=b'4'>