In [1]:
# ---------------------------------------------------------
# Continuous Bag of Words (CBOW) Model Implementation
# ---------------------------------------------------------

# -----------------------------
# a. Import required libraries
# -----------------------------
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    GlobalAveragePooling1D,
    Input,
    Lambda,
)
from tensorflow.keras.models import Model




In [2]:

# -----------------------------
# b. Data Preparation
# -----------------------------
# Toy corpus: three short sentences. Sentence length matters downstream,
# because the pair-generation cell only uses positions that have a full
# context window on both sides.
corpus = [
    "the quick brown fox jumped over the lazy dog",   # 9 words
    "I love natural language processing",             # 5 words
    "word embeddings capture semantic meaning",       # 5 words
]

In [3]:
# Fit a tokenizer on the corpus to build the word -> integer-id vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Convert text to sequences of word indices (one list of ids per sentence)
sequences = tokenizer.texts_to_sequences(corpus)

# Forward and reverse lookup tables. Word ids start at 1, so one extra slot
# is added to the vocabulary size to account for the unused id 0.
word2id = tokenizer.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))
vocab_size = 1 + len(word2id)

print("Vocabulary size:", vocab_size)
print("Word to index mapping:", word2id)

Vocabulary size: 19
Word to index mapping: {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumped': 5, 'over': 6, 'lazy': 7, 'dog': 8, 'i': 9, 'love': 10, 'natural': 11, 'language': 12, 'processing': 13, 'word': 14, 'embeddings': 15, 'capture': 16, 'semantic': 17, 'meaning': 18}


In [4]:
# -----------------------------
# c. Generate CBOW (context → target) training pairs
# -----------------------------
window_size = 2   # number of context words taken on EACH side of the target
data = []         # list of (context_ids, target_id) tuples

for seq in sequences:
    # Only positions with a full window on both sides can be targets, so a
    # sentence shorter than 2 * window_size + 1 tokens contributes no pairs.
    for i in range(window_size, len(seq) - window_size):
        # Left neighbours, nearest first: seq[i-1], seq[i-2], ...
        # (the order does not affect the model, which averages the embeddings)
        left = seq[i - window_size:i][::-1]
        # Right neighbours, nearest first: seq[i+1], seq[i+2], ...
        right = seq[i + 1:i + 1 + window_size]
        data.append(([*left, *right], seq[i]))

# Fail early with a clear message instead of an opaque IndexError on data[0].
if not data:
    raise ValueError(
        "No CBOW pairs were generated: at least one sentence needs "
        f"{2 * window_size + 1} or more tokens for window_size={window_size}."
    )

# Show sample
c, t = data[0]
print("\nSample CBOW pair:", [id2word[w] for w in c], "->", id2word[t])

# Prepare input & output
contexts = np.array([x for x, _ in data])       # shape: (num_pairs, 2 * window_size)
targets = np.array([y for _, y in data])        # shape: (num_pairs,)
targets = to_categorical(targets, vocab_size)   # one-hot targets


Sample CBOW pair: ['quick', 'the', 'fox', 'jumped'] -> brown


In [5]:
# -----------------------------
# d. Build CBOW Model
# -----------------------------
# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation is repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

embedding_dim = 10   # size of each learned word vector

# Input is context words: 2 * window_size word ids per sample
input_layer = Input(shape=(window_size * 2,))
# One embedding_dim-sized vector per context word
embedding = Embedding(vocab_size, embedding_dim)(input_layer)

# Average embeddings over the context-word axis (axis 1). The built-in pooling
# layer computes the same mean as a Lambda wrapping tf.reduce_mean, but it is a
# regular serialisable layer, so the model can be saved and reloaded without
# deserialising arbitrary Python code.
x = GlobalAveragePooling1D()(embedding)

# Output predicts target word: a softmax distribution over the vocabulary
output_layer = Dense(vocab_size, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

In [6]:
# -----------------------------
# e. Compile the model
# -----------------------------
# The targets are one-hot vectors, hence the (non-sparse) categorical
# cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer architecture
model.summary()

In [7]:
# -----------------------------
# f. Train the model
# -----------------------------
# Fit on all (context, target) pairs for 50 epochs; the value returned by
# fit() is kept in `history`.
history = model.fit(
    x=contexts,
    y=targets,
    epochs=50,
    verbose=1,
)
print("\nCBOW model training completed successfully!")


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - accuracy: 0.1429 - loss: 2.9396
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1429 - loss: 2.9366
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5714 - loss: 2.9335
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5714 - loss: 2.9305
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5714 - loss: 2.9275
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5714 - loss: 2.9245
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5714 - loss: 2.9214
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5714 - loss: 2.9184
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [8]:
# -----------------------------
# g. Test prediction with known words
# -----------------------------
# Each test sentence is a training sentence with its centre word removed, so
# the remaining words form the context the model should complete.
test_sentences = [
    "i love language processing",
    "the quick fox jumped",
    "word embeddings semantic meaning"
]

# Convert test sentences to sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
print("\nTest sequences:", test_sequences)

# The model's input width; derived from window_size instead of hard-coding 4
context_len = 2 * window_size

# Predict center word
for sentence, seq in zip(test_sentences, test_sequences):
    if len(seq) < context_len:
        # Report the skip instead of silently producing no output
        print(f"\nSkipping {sentence!r}: need {context_len} known words, got {len(seq)}")
        continue

    ctx = np.array(seq[:context_len])
    ctx_words = [id2word[i] for i in ctx]

    prediction = model.predict(ctx.reshape(1, -1))
    # Id 0 is not assigned to any word (word ids start at 1), so it is left out
    # of the argmax; otherwise the id2word lookup could raise a KeyError.
    predicted_id = int(np.argmax(prediction[0, 1:])) + 1
    predicted_word = id2word[predicted_id]

    print("\nContext words:", ctx_words)
    print("Predicted center word:", predicted_word)



Test sequences: [[9, 10, 12, 13], [1, 2, 4, 5], [14, 15, 17, 18]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

Context words: ['i', 'love', 'language', 'processing']
Predicted center word: natural
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

Context words: ['the', 'quick', 'fox', 'jumped']
Predicted center word: over
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

Context words: ['word', 'embeddings', 'semantic', 'meaning']
Predicted center word: capture
