<div style="border-radius:10px; padding: 15px; background-color: yellow; font-size:130%; text-align:left">

<h2 align="left"><font color=black>Implement the Continuous Bag of Words (CBOW) Model. Stages can be:</font></h2>
a. Data preparation  
b. Generate training data  
c. Train model  
d. Output
<h4 align="right"><font color=black>-Tushar Bhagat Roll_No. 07</font></h4>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Sample data
corpus = [
    "I love machine learning",
    "Deep learning is a subset of machine learning",
    "Natural language processing is fascinating",
    "I enjoy learning new things",
    "Machine learning can be applied in various fields"
]

# Step 1: Data preparation
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
total_words = len(word_index) + 1  # Add 1 for padding

print("Word Index:", word_index)
print("Total Words:", total_words)


Word Index: {'learning': 1, 'machine': 2, 'i': 3, 'is': 4, 'love': 5, 'deep': 6, 'a': 7, 'subset': 8, 'of': 9, 'natural': 10, 'language': 11, 'processing': 12, 'fascinating': 13, 'enjoy': 14, 'new': 15, 'things': 16, 'can': 17, 'be': 18, 'applied': 19, 'in': 20, 'various': 21, 'fields': 22}
Total Words: 23


In [2]:
# Step 2: Generate training data
def generate_cbow_data(corpus, window_size=2):
    input_data = []
    output_data = []
    
    for sentence in corpus:
        tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
        for i in range(window_size, len(tokenized_sentence) - window_size):
            context = []
            for j in range(i - window_size, i + window_size + 1):
                if j != i:  # Skip the target word
                    context.append(tokenized_sentence[j])
            input_data.append(context)
            output_data.append(tokenized_sentence[i])
    
    return np.array(input_data), np.array(output_data)

input_data, output_data = generate_cbow_data(corpus)
print("Input Data:", input_data)
print("Output Data:", output_data)


Input Data: [[ 6  1  7  8]
 [ 1  4  8  9]
 [ 4  7  9  2]
 [ 7  8  2  1]
 [10 11  4 13]
 [ 3 14 15 16]
 [ 2  1 18 19]
 [ 1 17 19 20]
 [17 18 20 21]
 [18 19 21 22]]
Output Data: [ 4  7  8  9 12  1 17 18 19 20]


In [3]:
# Step 3: Train model
# One-hot encoding of the output data
output_data = tf.keras.utils.to_categorical(output_data, num_classes=total_words)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=10, input_length=input_data.shape[1]))
model.add(Flatten())
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(input_data, output_data, epochs=100, verbose=1)


Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 3.1428
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0000e+00 - loss: 3.1363
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 3.1298
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0000e+00 - loss: 3.1233
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0000e+00 - loss: 3.1168
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0000e+00 - loss: 3.1102
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.1000 - loss: 3.1037
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2000 - loss: 3.0972
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1fbde20e950>

In [6]:
# Step 4: Output
def predict_word(context):
    context_seq = tokenizer.texts_to_sequences([context])[0]
    context_seq = pad_sequences([context_seq], maxlen=input_data.shape[1], padding='post')
    predicted = model.predict(context_seq)
    predicted_word_index = np.argmax(predicted, axis=-1)[0]
    for word, index in word_index.items():
        if index == predicted_word_index:
            return word

# Test prediction
context_words = ["I", "love", "language", "machine"]
predicted_word = predict_word(context_words)
print(f"Context: {context_words}, Predicted Word: {predicted_word}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Context: ['I', 'love', 'language', 'machine'], Predicted Word: learning
