In [3]:
import tensorflow as tf
import keras_nlp
import keras

# Sample sentences
sentences = [
    'i love machine learning',
    'i love deep learning',
    'i enjoy deep learning',
    'i enjoy programming',
    'i love programming'
]

# Create a TextVectorization layer
# max_tokens: The maximum size of the vocabulary.
# output_sequence_length: The length of the output sequences.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=10, 
    output_sequence_length=4
)

# Adapt the layer to your data
# This is where the vocabulary is built
vectorize_layer.adapt(sentences)

# Convert sentences to integer sequences
vectorized_data = vectorize_layer(tf.constant(sentences))

print("Vocabulary:")
print(vectorize_layer.get_vocabulary())
print("\nVectorized Data (Sequences):")
print(vectorized_data)

Vocabulary:
['', '[UNK]', 'i', 'love', 'learning', 'programming', 'enjoy', 'deep', 'machine']

Vectorized Data (Sequences):
tf.Tensor(
[[2 3 8 4]
 [2 3 7 4]
 [2 6 7 4]
 [2 6 5 0]
 [2 3 5 0]], shape=(5, 4), dtype=int64)


In [4]:
from keras.layers import Embedding
    
# Get the vocabulary size and embedding dimension
vocab_size = vectorize_layer.vocabulary_size()
embedding_dim = 64  # This is a hyperparameter you can tune

# Create a simple model
model = keras.Sequential([
    vectorize_layer,  # The TextVectorization layer we created earlier
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
    )
])

# Print a summary of the model
model.summary()

In [5]:
# Create a dummy input to get the output of the embedding layer
sample_input = tf.constant(sentences)
# Pass the input through the model to get the embedding vectors
word_embeddings = model(sample_input)

# Print the shape and first few embedding vectors
print("Shape of word embeddings:", word_embeddings.shape)
print("Embedding vectors:\n", word_embeddings)

Shape of word embeddings: (5, 4, 64)
Embedding vectors:
 tf.Tensor(
[[[-0.02249515 -0.04291451  0.00474689 ... -0.02096723 -0.04835026
   -0.0268787 ]
  [-0.04161547 -0.01314483  0.01210267 ... -0.01529636 -0.0094137
   -0.02867313]
  [-0.04098078 -0.0174413   0.02731076 ... -0.01953267 -0.01705965
    0.01653096]
  [-0.02212741 -0.00045705  0.00820044 ...  0.04049009 -0.02096406
   -0.03435572]]

 [[-0.02249515 -0.04291451  0.00474689 ... -0.02096723 -0.04835026
   -0.0268787 ]
  [-0.04161547 -0.01314483  0.01210267 ... -0.01529636 -0.0094137
   -0.02867313]
  [ 0.00825231  0.03822804  0.02113021 ...  0.02787054 -0.03367592
    0.03322575]
  [-0.02212741 -0.00045705  0.00820044 ...  0.04049009 -0.02096406
   -0.03435572]]

 [[-0.02249515 -0.04291451  0.00474689 ... -0.02096723 -0.04835026
   -0.0268787 ]
  [ 0.04473734  0.0164606  -0.02382165 ... -0.00968386  0.01655009
   -0.0035928 ]
  [ 0.00825231  0.03822804  0.02113021 ...  0.02787054 -0.03367592
    0.03322575]
  [-0.02212741 -0

In [7]:
# We need labels for classification
labels = tf.constant([0, 0, 1, 1, 0]) # 0 for "learning", 1 for "programming"

# A simple classification model
classification_model = keras.Sequential([
    vectorize_layer,
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim
    ),
    keras.layers.GlobalAveragePooling1D(), # A layer to reduce the dimensions
    keras.layers.Dense(1, activation='sigmoid') # A dense layer for binary classification
])

classification_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

classification_model.summary()

callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor="val_loss"),
    keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.2, monitor="val_loss")
]

# Train the model
history = classification_model.fit(
    tf.constant(sentences),
    labels,
    epochs=100,
    callbacks=callbacks,
    validation_split=0.2
)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 597ms/step - accuracy: 0.2500 - loss: 0.6955 - val_accuracy: 1.0000 - val_loss: 0.6832 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.2500 - loss: 0.6929 - val_accuracy: 1.0000 - val_loss: 0.6834 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5000 - loss: 0.6904 - val_accuracy: 1.0000 - val_loss: 0.6835 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.7500 - loss: 0.6879 - val_accuracy: 1.0000 - val_loss: 0.6837 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6854 - val_accuracy: 1.0000 - val_loss: 0.6839 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/