In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Sample text corpus
corpus = """Natural language processing enables computers to understand human languageDeep learning models like CBOW help us learn word representations.
Word embeddings capture semantic meaning in vector space.
This makes NLP applications like translation and sentiment analysis possible.
The CBOW model predicts a word based on its context words."""
# Split into sentences and lowercase
sentences = [s.strip().lower() for s in corpus.split('.') if s.strip()]
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
index_word[0] = "<PAD>"
print("Vocabulary:", word_index)
print("Sequences:", sequences)
print("Vocab Size:", vocab_size)


Vocabulary: {'word': 1, 'like': 2, 'cbow': 3, 'natural': 4, 'language': 5, 'processing': 6, 'enables': 7, 'computers': 8, 'to': 9, 'understand': 10, 'human': 11, 'languagedeep': 12, 'learning': 13, 'models': 14, 'help': 15, 'us': 16, 'learn': 17, 'representations': 18, 'embeddings': 19, 'capture': 20, 'semantic': 21, 'meaning': 22, 'in': 23, 'vector': 24, 'space': 25, 'this': 26, 'makes': 27, 'nlp': 28, 'applications': 29, 'translation': 30, 'and': 31, 'sentiment': 32, 'analysis': 33, 'possible': 34, 'the': 35, 'model': 36, 'predicts': 37, 'a': 38, 'based': 39, 'on': 40, 'its': 41, 'context': 42, 'words': 43}
Sequences: [[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2, 3, 15, 16, 17, 1, 18], [1, 19, 20, 21, 22, 23, 24, 25], [26, 27, 28, 29, 2, 30, 31, 32, 33, 34], [35, 3, 36, 37, 38, 1, 39, 40, 41, 42, 43]]
Vocab Size: 44


In [6]:
# Cell 2: Generate training data (context -> target)
def generate_cbow_pairs(sequences, window_size=2):
    contexts, targets = [], []
    context_len = window_size * 2
    
    for seq in sequences:
        for i, target in enumerate(seq):
            context = []
            for j in range(i - window_size, i + window_size + 1):
                if j == i:
                    continue
                if 0 <= j < len(seq):
                    context.append(seq[j])
                else:
                    context.append(0)  # padding
            contexts.append(context)
            targets.append(target)
    
    return np.array(contexts), np.array(targets)

# Generate dataset
X, y = generate_cbow_pairs(sequences, window_size=2)
print("Context shape:", X.shape)
print("Target shape:", y.shape)
print("Example:")
for i in range(5):
    print([index_word[idx] for idx in X[i]], "->", index_word[y[i]])


Context shape: (47, 4)
Target shape: (47,)
Example:
['<PAD>', '<PAD>', 'language', 'processing'] -> natural
['<PAD>', 'natural', 'processing', 'enables'] -> language
['natural', 'language', 'enables', 'computers'] -> processing
['language', 'processing', 'computers', 'to'] -> enables
['processing', 'enables', 'to', 'understand'] -> computers


In [8]:
# Cell 3: Build and Train CBOW model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense, Input

embedding_dim = 50
context_len = X.shape[1]

# Define CBOW model
model = Sequential()
model.add(Input(shape=(context_len,)))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=context_len))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))  # Average embeddings
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train
history = model.fit(X, y, epochs=80, batch_size=16, verbose=2)







Epoch 1/80
3/3 - 2s - 508ms/step - accuracy: 0.0426 - loss: 3.7840
Epoch 2/80
3/3 - 0s - 38ms/step - accuracy: 0.1064 - loss: 3.7758
Epoch 3/80
3/3 - 0s - 40ms/step - accuracy: 0.1064 - loss: 3.7693
Epoch 4/80
3/3 - 0s - 56ms/step - accuracy: 0.1702 - loss: 3.7627
Epoch 5/80
3/3 - 0s - 44ms/step - accuracy: 0.2553 - loss: 3.7560
Epoch 6/80
3/3 - 0s - 40ms/step - accuracy: 0.3191 - loss: 3.7495
Epoch 7/80
3/3 - 0s - 40ms/step - accuracy: 0.3830 - loss: 3.7428
Epoch 8/80
3/3 - 0s - 34ms/step - accuracy: 0.5106 - loss: 3.7359
Epoch 9/80
3/3 - 0s - 36ms/step - accuracy: 0.5957 - loss: 3.7291
Epoch 10/80
3/3 - 0s - 43ms/step - accuracy: 0.6170 - loss: 3.7223
Epoch 11/80
3/3 - 0s - 57ms/step - accuracy: 0.6383 - loss: 3.7150
Epoch 12/80
3/3 - 0s - 41ms/step - accuracy: 0.6596 - loss: 3.7078
Epoch 13/80
3/3 - 0s - 60ms/step - accuracy: 0.6596 - loss: 3.7002
Epoch 14/80
3/3 - 0s - 35ms/step - accuracy: 0.6809 - loss: 3.6927
Epoch 15/80
3/3 - 0s - 33ms/step - accuracy: 0.6809 - loss: 3.6845
Epo

In [10]:
# Cell 4: Output - Predictions and embeddings

# Get learned embeddings
embeddings = model.get_layer("embedding").get_weights()[0]
print("Embedding matrix shape:", embeddings.shape)

# Function to predict missing word from context
def predict_word(context_words):
    context_indices = [word_index.get(w, 0) for w in context_words]
    context_indices = np.array(context_indices).reshape(1, -1)
    probs = model.predict(context_indices, verbose=0)[0]
    pred_idx = np.argmax(probs)
    return index_word[pred_idx], float(probs[pred_idx])

# Example prediction
context = ["deep", "models", "cbow", "help"]  # must match context length = 4
pred_word, prob = predict_word(context)
print("Context:", context)
print("Predicted Word:", pred_word, "with probability:", prob)

# Nearest words in embedding space
def nearest_words(word, top_k=5):
    if word not in word_index:
        return []
    w_idx = word_index[word]
    vec = embeddings[w_idx]
    norms = np.linalg.norm(embeddings, axis=1)
    sims = embeddings.dot(vec) / (norms * np.linalg.norm(vec) + 1e-9)
    top = np.argsort(-sims)[1: top_k + 1]
    return [(index_word[i], float(sims[i])) for i in top]

print("Nearest words to 'learning':", nearest_words("learning"))


Embedding matrix shape: (44, 50)
Context: ['deep', 'models', 'cbow', 'help']
Predicted Word: like with probability: 0.06344014406204224
Nearest words to 'learning': [('understand', 0.5812535285949707), ('models', 0.5486053824424744), ('languagedeep', 0.42523548007011414), ('human', 0.3501772880554199), ('and', 0.29222819209098816)]
