In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
sentences=[
    "the cat sat on the mat",
    "the dog sat on the log ",
    "cats and dogs are get pets",
    "dogs are better than cats ",
    "the mat is on the floor"
]

tokenizer=Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words=len(tokenizer.word_index)+1
print("Total words:", total_words)

Total words: 18


In [31]:
def create_cbow_pairs(sentences,window_size=2):
    input_data=[]
    output_data=[]

    for sentence in sentences:
        words= sentence.split()
        for i, word in enumerate(words):
            start_ind = max(0,i - window_size)
            end_ind = min(len(words),i+window_size+1)
            context= [words[j] for j in range(start_ind,end_ind)if j!=i]

            input_data.append(context)
            output_data.append(word)

    return input_data, output_data
    

In [32]:
input_data, output_data = create_cbow_pairs(sentences) 

input_sequences= tokenizer.texts_to_sequences(input_data)
output_sequences= tokenizer.texts_to_sequences(output_data)

In [33]:

# Convert output to one-hot encoding
output_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=total_words)

max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=10))  # Embedding layer
model.add(Flatten())  # Flatten the output of embedding layer
model.add(Dense(total_words, activation='softmax'))  # Output layer

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(input_sequences, output_sequences, epochs=100, verbose=1)


# Function to predict word from context
def predict_word(context):
    context_seq = tokenizer.texts_to_sequences([context])
    context_seq = pad_sequences(context_seq, maxlen=max_length, padding='post')
    predicted = model.predict(context_seq)
    return tokenizer.index_word[np.argmax(predicted)]


# Example prediction
context_example = ["the", "cat", "on", "the"]
predicted_word = predict_word(context_example)
print(f"Predicted word for context {context_example}: {predicted_word}")

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0345 - loss: 2.8853
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.0690 - loss: 2.8811
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.0690 - loss: 2.8769
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0690 - loss: 2.8727
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1379 - loss: 2.8685
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.1379 - loss: 2.8642
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1379 - loss: 2.8600
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.1379 - loss: 2.8557
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m