In [None]:
# Word vectorization and embeddings, following the guide at https://www.tensorflow.org/text/guide/word_embeddings

In [None]:
import tensorflow as tf
from keras.layers import Embedding, TextVectorization
from keras import Input, Model
import numpy as np


# Upper bound on vocabulary entries; shared by the vectorization layer
# (max tokens) and the embedding layer (number of embedding rows).
VOCAB_SIZE = 10_000
# Fixed number of token ids produced per input string by the vectorizer.
SEQUENCE_LENGTH = 100
# Size of each embedding vector.
EMBEDDING_DIM = 2

In [None]:
# Preprocessing layer that turns raw strings into integer token-id sequences.
# - the vocabulary is capped at VOCAB_SIZE tokens
# - "strip_punctuation" removes punctuation only (per the Keras docs it does
#   not lowercase, unlike the layer's default standardization)
# - every output sequence has exactly SEQUENCE_LENGTH ids
vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)

In [None]:
# Learn the vocabulary from a tiny toy corpus before the layer is used.
toy_corpus = ["hello", "bye", "hello bye man wow woof"]
vectorize_layer.adapt(toy_corpus)

In [None]:
# Sanity check of the fitted vectorizer. Invoke the layer itself (__call__)
# rather than its `call` implementation, so Keras runs its usual input
# handling before the layer logic. "hhh" was not in the adapt corpus, so it
# should come back as the out-of-vocabulary id.
vectorize_layer(["hhh hello bye hello"])

In [None]:
# Embedding model: one raw string per sample -> token ids -> embedding vectors.
inputs = Input(shape=(1,), dtype=tf.string)
vectorized = vectorize_layer(inputs)

# The layer is named so its weights can be looked up later via get_layer("embedding").
embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    name="embedding",
)
embeddings = embedding_layer(vectorized)

text_embedding_model = Model(inputs, embeddings)

In [None]:
# Attach an optimizer and loss to the model.
# NOTE(review): nothing visible in this notebook trains the model; presumably
# this prepares for a later fit() call — confirm or drop.
text_embedding_model.compile(
    optimizer="adam",
    loss="mean_squared_error",
)

In [None]:
# Smoke-test the model on a single sample. Pass an explicit (batch, 1) string
# tensor matching Input((1,), dtype=tf.string): a bare Python list of str is
# not accepted as batch data by every Keras version.
text_embedding_model.predict(tf.constant([["h"]]))

In [None]:
# Saving weights and vocabulary
# Project weights and vocabulary: http://projector.tensorflow.org/
# also possible in tensorboard directly
embedding_weights = text_embedding_model.get_layer("embedding").get_weights()[0]
vocabulary = vectorize_layer.get_vocabulary()

# Write one row per vocabulary entry: the vector goes to embedding_weights.tsv
# and the matching token to vocabulary.tsv, so the two files stay line-aligned.
# UTF-8 is set explicitly so non-ASCII tokens do not depend on the platform's
# default locale encoding.
with open("embedding_weights.tsv", "w", encoding="utf-8") as out_w, \
        open("vocabulary.tsv", "w", encoding="utf-8") as out_v:
    for index, word in enumerate(vocabulary):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = embedding_weights[index]
        out_w.write("\t".join(str(x) for x in vec) + "\n")
        out_v.write(word + "\n")