A self-supervised learning approach using TensorFlow to fill in a missing word in a sentence

In [None]:
import tensorflow as tf
import numpy as np


Define a list of sentences

In [None]:

# Toy corpus: every sentence is later tokenised and used as training text.
sentences = [
    "I like to eat pizza",
    "She enjoys reading books",
    "He plays basketball on weekends",
    "We went to the beach last summer",
]

Define a tokenizer

In [None]:

# Build the word -> integer ID vocabulary from the corpus.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts=sentences)

Convert the sentences to sequences of tokens

In [None]:

# One list of integer word IDs per sentence, in the same order as `sentences`.
sequences = tokenizer.texts_to_sequences(texts=sentences)

Pad the sequences to a fixed length

In [None]:

# The longest tokenised sentence determines the common sequence length.
max_length = max(map(len, sequences))

# Shorter sentences are filled up at the end ('post') so all rows have `max_length` entries.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
)

Define the model for self-supervised learning

In [None]:

# One slot per word known to the tokenizer, plus one extra slot (the `+ 1`).
vocab_size = len(tokenizer.word_index) + 1

# Integer word IDs in, one probability distribution over the vocabulary out.
inputs = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)
# The LSTM returns only its final state, i.e. one vector summarising the sentence.
context = tf.keras.layers.LSTM(units=128)(embedding)
hidden = tf.keras.layers.Dense(units=128, activation='relu')(context)
output = tf.keras.layers.Dense(units=vocab_size, activation='softmax')(hidden)
model = tf.keras.models.Model(inputs=inputs, outputs=output)

Define a masked sparse categorical crossentropy loss function (targets equal to 0 are ignored)

In [None]:

def masked_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical crossentropy averaged over non-zero targets only.

    Args:
        y_true: Integer class IDs. Entries equal to 0 are excluded from the
            average.
        y_pred: Predicted class probabilities; the last axis indexes classes.

    Returns:
        Scalar tensor: the mean crossentropy over the entries whose target is
        non-zero, or 0 when every target is 0.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    # Build the mask in the loss's own dtype so the product below does not
    # hit a dtype mismatch when the loss is not float32.
    mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
    # y_true may carry a trailing axis of size 1 (e.g. (batch, 1)) while the
    # loss does not (e.g. (batch,)); without this reshape the product would
    # broadcast to (batch, batch) and distort the result.
    mask = tf.reshape(mask, tf.shape(loss))

    # divide_no_nan yields 0 instead of NaN when no target survives the mask.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

Compile the model

In [None]:

# Adam optimiser with the custom masked loss defined in this notebook.
model.compile(
    optimizer='adam',
    loss=masked_categorical_crossentropy,
)

Define a dataset for self-supervised learning

In [None]:

# Self-supervised training pairs: for every real (non-padding) word of every
# sentence, blank that word out (ID 0) and ask the model to recover it.
# A dataset of inputs alone gives `model.fit` no targets to train against.
masked_inputs = []
target_words = []
for token_ids in padded_sequences:
    # Non-zero entries are real words; zeros are padding added by pad_sequences.
    for position in np.flatnonzero(token_ids):
        blanked = token_ids.copy()
        blanked[position] = 0
        masked_inputs.append(blanked)
        target_words.append(token_ids[position])

masked_inputs = np.asarray(masked_inputs, dtype=np.int32)
target_words = np.asarray(target_words, dtype=np.int32)

# Each element is an (input, target) tuple, which is what `model.fit` expects.
dataset = tf.data.Dataset.from_tensor_slices((masked_inputs, target_words))
dataset = dataset.shuffle(1000).batch(32)

Train the model for self-supervised learning

In [None]:

# Ten passes over the batched dataset.
model.fit(
    x=dataset,
    epochs=10,
)

Generate a new sentence with a missing word

In [None]:

sentence = "I like to ____ pizza"
blank_marker = "____"

# Encode word by word. Tokenising the whole sentence at once would silently
# drop the blank (the tokenizer's default filters strip "_"), shifting every
# later word one slot to the left and losing the blank's position.
sequence = []
for token_text in sentence.split():
    if token_text == blank_marker:
        # ID 0 marks the missing word; the tokenizer never assigns it to a word.
        sequence.append(0)
    else:
        sequence.extend(tokenizer.texts_to_sequences([token_text])[0])

# Right-pad with zeros to the fixed input length of the model.
input_sequence = np.zeros((1, max_length), dtype=np.int32)
input_sequence[0, :len(sequence)] = sequence

Predict the missing word

In [None]:

# The model emits one probability per vocabulary ID for the single input row,
# so `preds[0]` is a vector indexed by word ID.
preds = model.predict(input_sequence)

# Take the most probable real word. Slicing by the sentence length would cut
# the vocabulary axis and restrict the choice to the first few IDs.
# ID 0 is skipped because it is not a word; `+ 1` undoes the shift
# introduced by the `[1:]` slice.
pred_idx = int(np.argmax(preds[0, 1:])) + 1

Convert the predicted index to a word

In [None]:

# `index_word` has no entry for ID 0 (it is not a word), so fall back to a
# placeholder instead of raising KeyError if that ID is predicted.
word = tokenizer.index_word.get(int(pred_idx), "<unknown>")

Print the predicted word

In [None]:

# Report the vocabulary word looked up for the predicted index.
print("Predicted word:", word)
