In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA 


# Define the corpus
corpus = [
    'The cat sat on the mat',
    'The dog ran in the park',
    'The bird sang in the tree'
]

# Convert the corpus to a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
print("After converting our words in the corpus into vector of integers:")
print(sequences)


After converting our words in the corpus into vector of integers:
[[1, 3, 4, 5, 1, 6], [1, 7, 8, 2, 1, 9], [1, 10, 11, 2, 1, 12]]


In [None]:
# Define the parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 10
window_size = 2

# Generate the context-target pairs
contexts = []
targets = []
for sequence in sequences:
    for i in range(window_size, len(sequence) - window_size):
        context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
        target = sequence[i]
        contexts.append(context)
        targets.append(target)

# Convert the contexts and targets to numpy arrays
X = np.array(contexts)
y = to_categorical(targets, num_classes=vocab_size)

# Define the CBOW model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=2 * window_size))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Dense(units=vocab_size, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, verbose=0)


In [None]:
# Extract the embeddings
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]

# Perform PCA to reduce the dimensionality of the embeddings
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Visualize the embeddings
plt.figure(figsize=(5, 5))
for word, idx in tokenizer.word_index.items():
    x, y = reduced_embeddings[idx]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.title("Word Embeddings Visualized")
plt.show()


In [5]:
# Import libraries
import tensorflow as tf
import keras
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Our small dataset (a list of sentences)
sentences = [
    'The cat sat on the mat',
    'The dog ran in the park',
    'The bird sang in the tree'
]

# Step 1: Convert words to numbers
# We create a "dictionary" to assign a unique number to each word
word_to_number = {}
number = 1
for sentence in sentences:
    for word in sentence.lower().split():
        if word not in word_to_number:
            word_to_number[word] = number
            number += 1

# Convert sentences to lists of numbers
number_sequences = []
for sentence in sentences:
    sequence = [word_to_number[word] for word in sentence.lower().split()]
    number_sequences.append(sequence)

print("Sentences as numbers:", number_sequences)

# Step 2: Create training data
vocab_size = len(word_to_number) + 1  # Total unique words (+1 for indexing)
embedding_size = 10  # Size of word vectors
window_size = 2  # Number of words to look at before and after the target word

contexts = []  # Input: surrounding words
targets = []   # Output: middle word

for sequence in number_sequences:
    for i in range(window_size, len(sequence) - window_size):
        # Context: words before and after the target (e.g., [word1, word2, word4, word5])
        context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
        # Target: the middle word (e.g., word3)
        target = sequence[i]
        contexts.append(context)
        targets.append(target)

# Convert to arrays for training
X = np.array(contexts)  # Input data
y = np.zeros((len(targets), vocab_size))  # Output data (one-hot encoded)
for i, target in enumerate(targets):
    y[i, target] = 1  # Set the target word's position to 1

# Step 3: Build the model
model = Sequential([
    # Layer 1: Convert words to vectors (embeddings)
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=2 * window_size),
    # Layer 2: Average the vectors of context words
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    # Layer 3: Predict the target word
    Dense(units=vocab_size, activation='softmax')
])

# Step 4: Compile and train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)  # Train for 100 rounds

# Step 5: Get the word vectors
embeddings = model.layers[0].get_weights()[0]  # Get the learned word vectors

# Step 6: Reduce vectors to 2D for visualization
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(embeddings)

# Step 7: Plot the words
plt.figure(figsize=(6, 6))
for word, idx in word_to_number.items():
    x, y = reduced_vectors[idx]
    plt.scatter(x, y)
    plt.text(x + 0.05, y, word, fontsize=10)
plt.title("Word Vectors Visualized")
plt.show()

Sentences as numbers: [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 1, 9], [1, 10, 11, 8, 1, 12]]


NameError: name 'Sequential' is not defined