In [1]:
# Step 1: Import libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K

In [2]:
# Step 2: Data Preparation
corpus = [
    "The quick brown fox jumped over the lazy dog",
    "I love deep learning and neural networks",
    "Natural language processing is fascinating"
]

# Convert to lowercase
corpus = [sentence.lower().split() for sentence in corpus]
print("Tokenized Corpus:", corpus)

Tokenized Corpus: [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'], ['i', 'love', 'deep', 'learning', 'and', 'neural', 'networks'], ['natural', 'language', 'processing', 'is', 'fascinating']]


In [3]:
# Step 3: Build Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(sentence) for sentence in corpus])
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id) + 1
print("\nVocabulary:", word2id)



Vocabulary: {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumped': 5, 'over': 6, 'lazy': 7, 'dog': 8, 'i': 9, 'love': 10, 'deep': 11, 'learning': 12, 'and': 13, 'neural': 14, 'networks': 15, 'natural': 16, 'language': 17, 'processing': 18, 'is': 19, 'fascinating': 20}


In [4]:
# Step 4: Generate Training Data (CBOW)
window_size = 2
data = []

for sentence in corpus:
    sentence_ids = [word2id[word] for word in sentence]
    for idx, target in enumerate(sentence_ids):
        start = max(0, idx - window_size)
        end = min(len(sentence_ids), idx + window_size + 1)
        context = [sentence_ids[i] for i in range(start, end) if i != idx]
        for word in context:
            data.append((context, target))

print("\nSample (context -> target):")
for i in range(3):
    ctx_words = [id2word[w] for w in data[i][0]]
    tgt_word = id2word[data[i][1]]
    print(f"Context: {ctx_words} → Target: {tgt_word}")


Sample (context -> target):
Context: ['quick', 'brown'] → Target: the
Context: ['quick', 'brown'] → Target: the
Context: ['the', 'brown', 'fox'] → Target: quick


In [5]:
# Step 5: Prepare Input and Output Arrays
X = []
y = []

for context, target in data:
    X.append(context)
    y.append(target)

# Pad / reshape context
max_context_len = 2 * window_size
X = np.array([np.pad(x, (0, max_context_len - len(x)), constant_values=0) for x in X])
y = np.array(y)

In [6]:
# Step 6: Define CBOW Model
embedding_dim = 10
input_layer = Input(shape=(max_context_len,))
embedding = Embedding(vocab_size, embedding_dim, input_length=max_context_len)(input_layer)
mean = Lambda(lambda x: K.mean(x, axis=1))(embedding)
output = Dense(vocab_size, activation='softmax')(mean)

model = Model(inputs=input_layer, outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()






In [7]:
# Step 7: Train the model
history = model.fit(X, y, epochs=50, verbose=0)
print("\nModel training completed!")


Model training completed!


In [8]:
# Step 8: Output word embeddings
weights = model.get_layer('embedding').get_weights()[0]
print("\nWord Embedding for 'deep':")
print(weights[word2id['deep']])


Word Embedding for 'deep':
[-0.09790551 -0.14152099  0.02024945 -0.06452678  0.09479229 -0.0211451
  0.07026905  0.08406613 -0.03262172 -0.01925941]
