In [1]:
# --- a. Data Preparation ---

import numpy as np
import re
from collections import defaultdict

In [2]:
# Load the text file
# Read the whole corpus as UTF-8, then normalise it to lower case.
with open("CBOW.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
text = raw_text.lower()

In [3]:
# Clean and tokenize
# Delete every character that is not a lowercase ASCII letter or whitespace,
# then split the remaining text on runs of whitespace.
non_letter_pattern = re.compile(r'[^a-z\s]')
text = non_letter_pattern.sub('', text)
words = text.split()
print("Total words:", len(words))
print("Sample words:", words[:20])

Total words: 177
Sample words: ['the', 'speed', 'of', 'transmission', 'is', 'an', 'important', 'point', 'of', 'difference', 'between', 'the', 'two', 'viruses', 'influenza', 'has', 'a', 'shorter', 'median', 'incubation']


In [4]:
# Create vocabulary and mappings
# The vocabulary is the alphabetically sorted list of distinct tokens; a
# word's index is its position in that list.
vocab = sorted(set(words))
vocab_size = len(vocab)

word_to_index = dict(zip(vocab, range(vocab_size)))
index_to_word = dict(enumerate(vocab))

print("Vocabulary size:", vocab_size)
print("Example mapping:", list(word_to_index.items())[:10])

Vocabulary size: 92
Example mapping: [('a', 0), ('an', 1), ('and', 2), ('appear', 3), ('appearance', 4), ('are', 5), ('at', 6), ('be', 7), ('before', 8), ('between', 9)]


In [5]:
def generate_cbow_data(tokens, w2i, vsize, wsize):
    """Build CBOW (context -> target) training pairs from a token sequence.

    For every position ``i`` the context consists of the tokens within
    ``wsize`` positions on either side of ``i`` (clipped at the ends of the
    sequence, and excluding position ``i`` itself). The input row is a
    bag-of-words *count* vector of that context; the target row is the
    one-hot vector of ``tokens[i]``.

    Args:
        tokens: Indexable sequence of word tokens.
        w2i: Mapping from each token to its column index in ``[0, vsize)``.
        vsize: Vocabulary size, i.e. the width of every generated row.
        wsize: Number of context positions taken on each side of the target.

    Returns:
        Tuple ``(X, Y)`` of float32 arrays, both of shape
        ``(n_samples, vsize)``. Positions whose context is empty are skipped,
        so ``n_samples`` can be smaller than ``len(tokens)``; when it is zero
        the arrays still have shape ``(0, vsize)``.

    Raises:
        KeyError: If a token is missing from ``w2i``.
    """
    X, Y = [], []
    n_tokens = len(tokens)
    for i, t in enumerate(tokens):
        # Clip the window to the valid index range instead of bounds-checking
        # every candidate position.
        start = max(0, i - wsize)
        stop = min(n_tokens, i + wsize + 1)
        ctx = [tokens[j] for j in range(start, stop) if j != i]
        if not ctx:
            continue

        x = np.zeros(vsize)
        for w in ctx:
            # A word occurring several times in the window is counted each time.
            x[w2i[w]] += 1

        y = np.zeros(vsize)
        y[w2i[t]] = 1

        X.append(x)
        Y.append(y)

    # Reshape explicitly so that an empty result is (0, vsize) rather than (0,).
    shape = (len(X), vsize)
    X = np.asarray(X, dtype='float32').reshape(shape)
    Y = np.asarray(Y, dtype='float32').reshape(shape)
    print(f"✅ {len(X)} samples | Input Shape:{X.shape} | Output Shape:{Y.shape}")
    return X, Y

# Call the function
# Two context positions on each side of every target word.
WINDOW_SIZE = 2
X_cbow, Y_cbow = generate_cbow_data(
    tokens=words,
    w2i=word_to_index,
    vsize=vocab_size,
    wsize=WINDOW_SIZE,
)


✅ 177 samples | Input Shape:(177, 92) | Output Shape:(177, 92)


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# --- d. CBOW Model Training ---
EMBEDDING_DIM = 100
EPOCHS = 100
SEED = 42

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling repeat on "Restart Kernel -> Run All".
set_random_seed(SEED)

# Two dense layers: a linear projection of the context count vector down to
# EMBEDDING_DIM (its kernel is what gets read back as the word embeddings),
# followed by a softmax over the vocabulary for the target word.
model = Sequential([
    Input(shape=(vocab_size,), name="Input_Layer"),
    Dense(EMBEDDING_DIM, activation='linear', name='Embedding'),
    Dense(vocab_size, activation='softmax', name='Output')
])

# Targets are one-hot rows, hence categorical (not sparse) cross-entropy.
model.compile(optimizer=Adam(0.01), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [8]:
# Fit on every generated (context, target) pair; verbose=1 shows a progress
# bar with loss and accuracy for each epoch.
print(f"\nTraining CBOW model for {EPOCHS} epochs...")
model.fit(
    x=X_cbow,
    y=Y_cbow,
    epochs=EPOCHS,
    verbose=1,
)
print("\n✅ CBOW model training complete.")


Training CBOW model for 100 epochs...
Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0226 - loss: 4.5377  
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.3842 - loss: 3.6771
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4802 - loss: 2.9262 
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5424 - loss: 2.2530 
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7345 - loss: 1.5981
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8927 - loss: 1.0563
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9322 - loss: 0.6803 
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9718 - loss: 0.4342
Epoch 9/100
[

In [9]:
# --- Extract & Display Word Embeddings ---
# The first weight array of the 'Embedding' layer has one row per vocabulary
# word; that row is used as the word's embedding vector.
embedding_layer = model.get_layer('Embedding')
embeddings = embedding_layer.get_weights()[0]
print("Embedding matrix shape:", embeddings.shape)

word = 'learning'
if word not in word_to_index:
    print(f"'{word}' not in vocabulary.")
else:
    word_vector = embeddings[word_to_index[word]]
    print(f"\n'{word}' embedding (first 5 dims):", word_vector[:5])

# --- Predict Target Word ---
def predict(context, model, w2i, i2w, vocab_size):
    """Return the most probable target word for a bag of context words.

    Args:
        context: Iterable of context words. Words absent from ``w2i`` are
            silently ignored.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row.
        w2i: Mapping from word to column index.
        i2w: Mapping from column index back to word.
        vocab_size: Width of the input vector.

    Returns:
        Tuple ``(word, score)``: the word with the highest score in the first
        output row, and that score.
    """
    bag = np.zeros((1, vocab_size))
    known_words = (token for token in context if token in w2i)
    for token in known_words:
        # Count repeated context words, as the training inputs do.
        bag[0, w2i[token]] += 1

    scores = model.predict(bag, verbose=0)[0]
    best = np.argmax(scores)
    return i2w[best], scores[best]

# --- Example ---
# Ask the trained model for the word most likely to sit among these
# context words.
context = ['shorter', 'incubation', 'period']
word, p = predict(
    context=context,
    model=model,
    w2i=word_to_index,
    i2w=index_to_word,
    vocab_size=vocab_size,
)
print(f"\nContext: {context}\nPredicted: '{word}' (p={p:.4f})")


Embedding matrix shape: (92, 100)

'learning' embedding (first 5 dims): [-0.00434319 -0.49688897 -0.02037597  0.23936544  0.21846247]

Context: ['shorter', 'incubation', 'period']
Predicted: 'median' (p=0.9434)
