# Implement the Continuous Bag of Words (CBOW) model.

a) Data Preparation. 

In [1]:
# Toy corpus: three short sentences that share the phrase "deep learning".
# Each string is treated as one sentence when training pairs are generated.
corpus = [
    "I like to learn deep learning",
    "Deep learning is interesting",
    "I enjoy studying deep learning",
]

b) Generate Training data.

In [2]:
# Tokenizer splits text into word tokens and maps each word to an integer index
from tensorflow.keras.preprocessing.text import Tokenizer
# skipgrams generates (word, neighbouring word) training pairs from a sequence of word indices
from tensorflow.keras.preprocessing.sequence import skipgrams

# Tokenize the corpus: build the word -> index vocabulary and its inverse.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# Generate training data
# Fixed seed so skipgrams() shuffles the pairs the same way on every run.
SKIPGRAM_SEED = 42
# +1 because Tokenizer word indices start at 1; index 0 is never assigned to a word.
vocab_size = len(word2idx) + 1
target_words, context_words = [], []

# NOTE(review): skipgrams() yields individual (word, neighbour) pairs, i.e.
# skip-gram style samples. A CBOW setup would instead feed all neighbours of a
# word together as one input -- confirm which formulation is intended.
for tokenized in tokenizer.texts_to_sequences(corpus):
    # window_size=1 pairs each word with its immediate neighbours only;
    # negative_samples=0 keeps just the true pairs, so the labels are discarded.
    pairs, _ = skipgrams(
        tokenized,
        vocabulary_size=vocab_size,
        window_size=1,
        negative_samples=0,
        seed=SKIPGRAM_SEED,
    )
    # A sentence with fewer than two tokens produces no pairs, and
    # zip(*[]) cannot be unpacked into two names -- skip it.
    if not pairs:
        continue
    target, context = zip(*pairs)
    target_words.extend(target)
    context_words.extend(context)

c) Train Model.

In [3]:
# TensorFlow is a machine-learning library for building and training deep neural networks
import tensorflow as tf
# Sequential is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.
from tensorflow.keras.models import Sequential
# Embedding maps integer word indices to dense vectors, Reshape changes a tensor's shape, and Dense is a fully connected layer
from tensorflow.keras.layers import Embedding, Dense, Reshape

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling (and therefore the learned embeddings)
# are the same on every Restart & Run All.
TRAIN_SEED = 42
tf.keras.utils.set_random_seed(TRAIN_SEED)

embedding_dim = 100

# NOTE(review): the network receives ONE word index (input_length=1) and is
# trained to predict a neighbouring word, which is the skip-gram formulation.
# CBOW would take all context words as input, average their embeddings and
# predict the centre word -- confirm this matches the notebook's stated goal.
model = Sequential()
# Look up a trainable embedding_dim-sized vector for the single input word.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
# Drop the length-1 sequence axis: (batch, 1, embedding_dim) -> (batch, embedding_dim).
model.add(Reshape((embedding_dim,)))
# Softmax over the whole vocabulary gives a probability for each candidate word.
model.add(Dense(units=vocab_size, activation='softmax'))

# Adam is an adaptive-learning-rate variant of stochastic gradient descent.
# The sparse loss takes integer class labels, so no one-hot encoding is needed.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.summary()

# Train the model
# Convert the Python lists to tensors: newer Keras releases do not accept plain
# lists in fit(). x is shaped (num_pairs, 1) to match input_length=1.
x_train = tf.reshape(tf.constant(target_words, dtype=tf.int32), (-1, 1))
y_train = tf.constant(context_words, dtype=tf.int32)

# An epoch is one full pass over the training data.
# Keeping the History object makes the loss curve available afterwards and
# stops its repr from being echoed as the cell output.
history = model.fit(x=x_train, y=y_train, epochs=50)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 100)            1100      
                                                                 
 reshape (Reshape)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 11)                1111      
                                                                 
Total params: 2211 (8.64 KB)
Trainable params: 2211 (8.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/

<keras.src.callbacks.History at 0x23bb986a590>

d) Output.

In [4]:
word_to_lookup = "deep"

# The vocabulary was built with Tokenizer() defaults, which lowercase the text,
# so normalise the query the same way before looking it up.
lookup_key = word_to_lookup.lower()
if lookup_key not in word2idx:
    raise KeyError(
        f"'{word_to_lookup}' is not in the vocabulary; known words: {sorted(word2idx)}"
    )
word_idx = word2idx[lookup_key]

# The first layer is the Embedding layer; its only weight is the embedding
# matrix, and row `word_idx` of it is the learned vector for this word.
word_embedding = model.layers[0].get_weights()[0][word_idx]

print(f"Embedding for '{word_to_lookup}': {word_embedding}")

Embedding for 'deep': [ 8.41328874e-02  5.61344139e-02 -5.13058566e-02 -7.88685083e-02
 -9.91607383e-02  2.61893086e-02  2.06635892e-02 -6.09128252e-02
 -3.73820066e-02  5.98195046e-02 -5.57249784e-03  8.60996023e-02
  2.02384237e-02 -9.22028869e-02  6.75316453e-02  9.99911204e-02
  2.28100065e-02 -4.97749075e-02 -6.98095113e-02  7.34467953e-02
 -5.73070869e-02 -2.31165849e-02 -4.10012715e-03 -4.91816700e-02
  6.94095641e-02  2.69928086e-03  4.69533214e-03  9.61602200e-03
 -3.08772596e-03 -5.56841753e-02  3.45945656e-02 -7.15998039e-02
  9.93780419e-02  8.83758292e-02 -5.64689375e-02  7.85832927e-02
  1.78814139e-02  6.97733238e-02 -7.09630502e-03 -8.90499502e-02
  9.29317400e-02  3.82730849e-02 -4.85376781e-03  1.84024461e-02
  2.18894016e-02  2.34995678e-05  8.21887329e-02  1.04050953e-02
 -7.20224436e-03  1.20419599e-02  1.87101327e-02  6.81096166e-02
 -7.08519947e-03 -8.06856677e-02  1.00942455e-01  9.44088697e-02
  4.73482609e-02 -3.77868675e-02  5.39150536e-02  4.03142301e-03
  8