#### Implement the Continuous Bag of Words (CBOW) model for the given text document (document 1)

In [82]:
# Import libraries

import random
import re

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed, to_categorical

### a. Data preparation

In [83]:
# Download the tokenizer models used by word_tokenize (a no-op once cached).
# Recent NLTK releases load 'punkt_tab' instead of 'punkt', so both are
# requested to keep the notebook runnable across NLTK versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load text file.
# The encoding is pinned so the read does not depend on the platform default
# (e.g. cp1252 on Windows, which cannot decode some UTF-8 byte sequences).
# Undecodable bytes become U+FFFD, which the cleanup regex below removes
# together with every other non-letter character.
with open("CBOW.txt", "r", encoding="utf-8", errors="replace") as file:
    text = file.read().lower()

# Remove special characters and digits (only letters and whitespace survive)
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Tokenize text into words
tokens = word_tokenize(text)

print("Total tokens:", len(tokens))
print("Sample tokens:", tokens[:15])

# Create a vocabulary: the distinct tokens in sorted order, so that the
# word <-> index mapping is the same on every run
vocab = sorted(set(tokens))
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# Create the word -> index mapping and its inverse (index -> word)
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}


Total tokens: 177
Sample tokens: ['the', 'speed', 'of', 'transmission', 'is', 'an', 'important', 'point', 'of', 'difference', 'between', 'the', 'two', 'viruses', 'influenza']
Vocabulary size: 92


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aashlesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### b. Generate Training Data

In [84]:
window_size = 2  # number of context words taken on EACH side of the target
data = []

# Create (context, target) pairs.
# The context is sliced using window_size instead of hard-coded +/-1 and +/-2
# offsets, so changing window_size really changes the context width.
# Tokens closer than window_size to either end of the text are skipped as
# targets because they do not have a full context on both sides.
for i in range(window_size, len(tokens) - window_size):
    context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
    target = tokens[i]
    data.append((context, target))

print("Total training pairs:", len(data))
# Show pair 100 when it exists; on shorter texts fall back to the last pair
# instead of raising IndexError (and print nothing if there are no pairs).
if data:
    print("Example pair:", data[min(100, len(data) - 1)])


Total training pairs: 173
Example pair: (['in', 'contrast', 'we', 'are'], 'while')


In [85]:
# Convert words to one-hot vectors
def one_hot_encode(word):
    """Return the one-hot encoding of `word`.

    The result is a NumPy vector of length `vocab_size` that is 1 at the
    position `word2idx[word]` and 0 everywhere else. A word that is missing
    from `word2idx` raises KeyError.
    """
    position = word2idx[word]
    encoded = np.zeros(vocab_size)
    encoded[position] = 1
    return encoded

# Prepare training data
# Assemble the training matrices, one row per (context, target) pair:
#   X - element-wise sum of the one-hot vectors of the context words
#   Y - one-hot vector of the target word
X = np.array([
    np.sum([one_hot_encode(w) for w in context], axis=0)
    for context, _ in data
])
Y = np.array([one_hot_encode(target) for _, target in data])

print("Input shape:", X.shape)
print("Output shape:", Y.shape)


Input shape: (173, 92)
Output shape: (173, 92)


### c. Train Model

In [86]:
# Training configuration, gathered in one place so it is easy to tune.
SEED = 42             # seed for Python, NumPy and backend random generators
HIDDEN_UNITS = 32     # width of the hidden (embedding) layer
LEARNING_RATE = 0.01  # Adam step size
EPOCHS = 100          # full passes over the training pairs

# Seed every random generator before any weights are created so that weight
# initialisation and batch shuffling repeat across Restart & Run All.
set_random_seed(SEED)

# The input shape is declared with an explicit Input object; passing
# input_dim to the first Dense layer is deprecated in Keras 3 and warns.
model = Sequential([
    Input(shape=(vocab_size,)),
    Dense(HIDDEN_UNITS, activation='relu'),
    Dense(vocab_size, activation='softmax'),  # probability per vocabulary word
])

# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

history = model.fit(X, Y, epochs=EPOCHS, verbose=1)


Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0058 - loss: 4.5182     
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1618 - loss: 4.2551 
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2081 - loss: 3.9644 
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2081 - loss: 3.6093 
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1965 - loss: 3.2999 
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2601 - loss: 2.9806 
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4104 - loss: 2.6313 
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5145 - loss: 2.2818
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[

### d. Output

In [87]:
# Pick a random (context, target) pair from the training data and compare the
# model's prediction with the true target word.
idx = random.randrange(len(data))
test_context, actual_target = data[idx]

# Encode the context the same way as the training inputs: the element-wise
# sum of the context words' one-hot vectors.
context_vec = np.sum([one_hot_encode(w) for w in test_context], axis=0)
# predict works on batches, so the single vector is reshaped to one row.
pred = model.predict(context_vec.reshape(1, -1))
# The index with the highest predicted probability maps back to a word.
pred_word = idx2word[int(np.argmax(pred))]

print(f"\nRandom example index: {idx}")
print("Context words:", test_context)
print("Actual target word:", actual_target)
print("Predicted target word:", pred_word)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step

Random example index: 56
Context words: ['virus', 'the', 'interval', 'is']
Actual target word: serial
Predicted target word: serial
