In [1]:
# a. Data preparation
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K

In [3]:
# Load the corpus from disk and normalise it to lower case.
with open("CBOW.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

# Fit a tokenizer on the corpus to obtain the word -> integer id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2idx = tokenizer.word_index
# One slot more than the number of known words: the Keras Tokenizer numbers
# words from 1 and never assigns id 0, so ids span 0..len(word2idx).
vocab_size = 1 + len(word2idx)


In [4]:
# Encode the corpus as integer word ids. A single text goes in, so the
# tokenizer hands back a one-element list; keep that element as `seq`.
encoded_texts = tokenizer.texts_to_sequences([text])
seq = encoded_texts[0]

In [5]:
# b. Generate training data
# Every position that has `window` tokens on both sides becomes one example:
# the surrounding 2 * window token ids are the input, the centre id is the label.
window = 2
if len(seq) < 2 * window + 1:
    # Without at least one full window there are no examples to train on;
    # fail here with a clear message instead of producing an empty array.
    raise ValueError(
        f"Corpus has {len(seq)} tokens; need at least {2 * window + 1} "
        f"to build a CBOW example with window={window}."
    )

X, y = [], []
for i in range(window, len(seq) - window):
    # `window` tokens before position i followed by `window` tokens after it.
    # list() keeps `+` a concatenation even if `seq` is not a plain list.
    context = list(seq[i - window:i]) + list(seq[i + 1:i + window + 1])
    target = seq[i]
    X.append(context)
    y.append(target)
# X has shape (num_examples, 2 * window); y has shape (num_examples,).
X, y = np.array(X), np.array(y)

In [6]:
# c. Define & train CBOW model
# Architecture: embed each context token into 10 dimensions, average the
# context embeddings along the sequence axis, then score every vocabulary
# entry with a softmax layer to predict the centre word.
model = Sequential([
    # The context width is read from the training data so the layer cannot
    # drift out of sync with how X was built.
    Embedding(vocab_size, 10, input_length=X.shape[1]),
    Lambda(lambda x: K.mean(x, axis=1)),
    Dense(vocab_size, activation='softmax')
])
# y holds integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(X, y, epochs=200, verbose=0)

<keras.callbacks.History at 0x192aa989300>

In [7]:
# d. Output
# The first layer is the Embedding layer; its only weight is the embedding
# matrix, indexed by word id.
embeddings = model.layers[0].get_weights()[0]

query_word = 'model'
query_index = word2idx.get(query_word)
if query_index is None:
    # Do not fall back to row 0: that row belongs to no word, and printing it
    # under the query word's label would be misleading.
    print(f"'{query_word}' is not in the vocabulary; no embedding to show.")
else:
    print(f"Word Embedding for '{query_word}':", embeddings[query_index])

Word Embedding for 'model': [-0.02550179 -0.0236535   0.02393511 -0.02052165  0.029468    0.00899116
 -0.00842275 -0.02160802 -0.04466204 -0.02733266]
