In [2]:
# Using the model to predict embeddings for the same input (self-supervised)
model.fit(X, X, epochs=10, batch_size=32)


Epoch 1/10


ValueError: Dimensions must be equal, but are 808 and 64 for '{{node compile_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](compile_loss/mean_squared_error/Cast, functional_1/dense_1_2/Relu)' with input shapes: [?,808], [?,64].

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Embedding, Input, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('gutenberg')

# Load sample text corpus from NLTK
corpus = nltk.corpus.gutenberg.sents('austen-emma.txt')
sentences = [' '.join(sent) for sent in corpus[:1000]]  # Using first 1000 sentences for this example

# Preprocess text: Tokenize characters and pad sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
maxlen = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=maxlen)
vocab_size = len(tokenizer.word_index) + 1

# Define CNN architecture for character embeddings
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen)(input_layer)
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dense_layer = Dense(128, activation='relu')(pooling_layer)
output_layer = Dense(vocab_size, activation='softmax')(dense_layer)  # Update to match vocabulary size

# Build and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')  # Use appropriate loss function
model.summary()

# Adjust data for training the model
y = np.expand_dims(X, -1)  # Adjust target to match expected output shape

# Using the model to predict embeddings for the same input (self-supervised)
model.fit(X, y, epochs=10, batch_size=32)

# Extract sentence embeddings
embedding_model = Model(inputs=model.input, outputs=model.get_layer('dense').output)
sentence_embeddings = embedding_model.predict(X)

# Reduce dimensionality for visualization using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(sentence_embeddings)

# Plot PCA results
plt.figure(figsize=(10, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1])
plt.title('PCA of Sentence Embeddings')
plt.show()

# Reduce dimensionality for visualization using t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(sentence_embeddings)

# Plot t-SNE results
plt.figure(figsize=(10, 6))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1])
plt.title('t-SNE of Sentence Embeddings')
plt.show()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Epoch 1/10


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 808), output.shape=(None, 48)