In [None]:
#Running a shallow neural network program to demonstrate CBOW.

In [19]:
# Ignore deprecation warnings for cleaner output.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential  # Sequential model
from tensorflow.keras.layers import Dense, Embedding, Flatten  # Layers for neural network
from tensorflow.keras.preprocessing.text import Tokenizer  # Tokenizer for text preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences

# Toy training corpus: a handful of short sentences sharing the word "great"
sentences = [
    "he is a great scholar",
    "a great scholar writes great papers",
    "the scholar is very great",
    "great ideas come from great minds",
]

# Build the vocabulary (word -> integer index) from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_to_index = tokenizer.word_index

# Vocabulary size plus one: the Keras Tokenizer assigns word indices starting
# at 1, so one extra slot is needed for the unused index 0.
total_words = 1 + len(word_to_index)

# Define the context window size (number of words taken on each side of the target)
window_size = 2

# Encode every sentence with the fitted tokenizer instead of a raw str.split().
# The tokenizer normalizes text (default settings lowercase it and strip
# punctuation) when it builds word_index, so splitting the raw string could
# produce tokens that are missing from the vocabulary and raise KeyError.
encoded_sentences = tokenizer.texts_to_sequences(sentences)

# Create input-output pairs for CBOW
input_data = []  # Context word indices, one list per training sample
output_data = []  # Target word index for each training sample
for token_ids in encoded_sentences:
    # Only positions with a complete window on both sides are used as targets,
    # so every context holds exactly 2 * window_size indices.
    for i in range(window_size, len(token_ids) - window_size):
        left_context = token_ids[i - window_size:i]
        right_context = token_ids[i + 1:i + window_size + 1]
        input_data.append(left_context + right_context)
        output_data.append(token_ids[i])

# Fail early with a clear message instead of an obscure shape error later on
if not input_data:
    raise ValueError(
        "No CBOW training pairs were generated: at least one sentence must "
        f"contain {2 * window_size + 1} or more words for window_size={window_size}."
    )

# Convert to a 2-D integer array; maxlen pins the width to the context size
# the model expects (shorter rows would be zero-padded at the end).
input_data = pad_sequences(input_data, maxlen=window_size * 2, padding='post')

# One-hot encode output data for categorical labels:
# row k of the identity matrix is the one-hot vector for word index k.
output_data = np.array(output_data)
output_data = np.eye(total_words)[output_data]

# Build CBOW model
# Each training sample is a vector of 2 * window_size context-word indices.
context_length = window_size * 2

model = Sequential()
# Declare the input shape explicitly; the Embedding `input_length` argument
# is deprecated in recent Keras releases.
model.add(tf.keras.Input(shape=(context_length,)))
# Embedding layer: maps each word index to a 10-dimensional vector
model.add(Embedding(input_dim=total_words, output_dim=10))
# Average the context embeddings into a single vector. CBOW treats the context
# as an unordered bag of words; flattening (concatenating) the embeddings
# would instead make the prediction depend on word position.
model.add(tf.keras.layers.GlobalAveragePooling1D())
# Output layer: probability distribution over the vocabulary for the target word
model.add(Dense(total_words, activation='softmax'))

# Compile model with optimizer, loss, and metrics
# (categorical cross-entropy matches the one-hot encoded targets)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on input-output pairs
model.fit(input_data, output_data, epochs=20, verbose=1)

# Look up the learned vector for the word "great".
# The first layer's first weight array is indexed by word index (one row per word).
embedding_matrix = model.layers[0].get_weights()[0]
great_index = tokenizer.word_index['great']  # Vocabulary index of "great"
great_embedding = embedding_matrix[great_index]  # Row holding its embedding
print("Embedding for 'great':", great_embedding)

Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 725ms/step - accuracy: 0.1667 - loss: 2.6119
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.3333 - loss: 2.6053
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5000 - loss: 2.5986
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.5000 - loss: 2.5919
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5000 - loss: 2.5852
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5000 - loss: 2.5785
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.6667 - loss: 2.5718
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.6667 - loss: 2.5651
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2