In [4]:
# Reference: TensorFlow word-embeddings guide (text vectorization + Embedding layer): https://www.tensorflow.org/text/guide/word_embeddings

In [96]:
import tensorflow as tf
from keras.layers import Embedding, TextVectorization
from keras import Input, Model
import numpy as np


# Tunable configuration shared by the vectorization layer and the embedding model.
VOCAB_SIZE = 10_000    # max_tokens for TextVectorization and input_dim of the Embedding
SEQUENCE_LENGTH = 100  # every vectorized sample is padded/truncated to this many ids
EMBEDDING_DIM = 2      # size of each learned embedding vector

In [106]:
# Text -> integer-id layer. Punctuation is stripped but, with this standardize
# mode, text is not lower-cased.
vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)

In [107]:
# Build the layer's vocabulary from a small toy corpus.
adapt_corpus = ["hello", "bye", "hello bye man wow woof"]
vectorize_layer.adapt(adapt_corpus)



In [115]:
# Sanity check: vectorize one sample. The layer object is invoked directly
# (Layer.__call__) instead of its raw .call() method, so Keras performs its usual
# build/input handling. "hhh" was not in the adapted corpus, so it is expected to
# map to the out-of-vocabulary id.
vectorize_layer(["hhh hello bye hello"])

<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)>

In [99]:
# Embedding model: raw string -> integer token ids -> one EMBEDDING_DIM-sized
# vector per token position.
inputs = Input(shape=(1,), dtype=tf.string)
vectorized = vectorize_layer(inputs)

# The layer is named so its weights can be looked up with get_layer("embedding").
embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    name="embedding",
)
embeddings = embedding_layer(vectorized)

text_embedding_model = Model(inputs=inputs, outputs=embeddings)

In [100]:
# Configure the model with an MSE loss and the Adam optimizer.
text_embedding_model.compile(loss="mean_squared_error", optimizer="adam")

In [101]:
# Smoke test: run a single sample string through the whole model.
sample_texts = ["h"]
text_embedding_model.predict(sample_texts)



array([[[ 0.03912521, -0.04143658],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.03891527],
        [ 0.01337456,  0.038

In [105]:
# Saving weights and vocabulary
# Writes two row-aligned TSV files: line i of embedding_weights.tsv is the vector
# for the token on line i of vocabulary.tsv.
# NOTE(review): no fit() call is visible in this section, so these may still be
# the initial (untrained) weights — confirm training happens before export.
embedding_weights = text_embedding_model.get_layer("embedding").get_weights()[0]
vocabulary = vectorize_layer.get_vocabulary()

# Explicit UTF-8: without it open() falls back to the platform's locale encoding,
# which can fail on tokens outside that code page.
with open("embedding_weights.tsv", "w", encoding="utf-8") as out_w, \
        open("vocabulary.tsv", "w", encoding="utf-8") as out_v:
    for index, word in enumerate(vocabulary):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = embedding_weights[index]
        out_w.write("\t".join(str(x) for x in vec) + "\n")
        out_v.write(word + "\n")