In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [2]:
import numpy as np
import tensorflow as tf
from scipy.spatial.distance import cdist

# Load GloVe word vectors
def load_word_vectors(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map

# Load data and create vocabulary
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split()
            data.append(line)
    words = set([word for line in data for word in line])
    word_to_idx = {word: idx for idx, word in enumerate(words)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return data, word_to_idx, idx_to_word

# Generate batches for training
def generate_batches(data, word_to_idx, window_size, batch_size):
    X, y = [], []
    while True:
        for line in data:
            for i, word in enumerate(line):
                for j in range(max(i - window_size, 0), min(i + window_size + 1, len(line))):
                    if i != j:
                        X.append(word_to_idx[word])
                        y.append(word_to_idx[line[j]])
                        if len(X) == batch_size:
                            yield np.array(X), np.array(y)
                            X, y = [], []

# Build the CBOW model
class CBOWModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size):
        super(CBOWModel, self).__init__()
        self.embeddings = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=1)
        self.mean = tf.keras.layers.Lambda(lambda x: tf.keras.backend.mean(x, axis=1))
        self.dense = tf.keras.layers.Dense(units=vocab_size, activation='softmax')

    def call(self, inputs):
        x = self.embeddings(inputs)
        x = self.mean(x)
        x = self.dense(x)
        return x

# Train the CBOW model
def train_cbow_model(data_file, glove_file, window_size=2, embedding_size=100, batch_size=128, num_epochs=50, learning_rate=0.01, save_path=None):
    # Load data and word vectors
    data, word_to_idx, idx_to_word = load_data(data_file)
    words, word_to_vec_map = load_word_vectors(glove_file)
    vocab_size = len(word_to_idx)

    # Generate batches for training
    batch_generator = generate_batches(data, word_to_idx, window_size, batch_size)

    # Build and compile the model
    model = CBOWModel(vocab_size, embedding_size)
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')

    # Train the model
    history = model.fit(batch_generator, epochs=num_epochs, steps_per_epoch=len(data) * (len(data[0]) - 2*window_size) // batch_size)

    # Save the model
    if save_path is not None:
        model.save(save_path)

    # Return the trained model
    return model, word_to_idx, idx_to_word

In [6]:
train_cbow_model("Frankenstein.txt", "glove.6B.100d.txt", save_path="model.h5")

ValueError: Exception encountered when calling layer 'cbow_model' (type CBOWModel).

Input 0 of layer "dense" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (128,)

Call arguments received by layer 'cbow_model' (type CBOWModel):
  • inputs=tf.Tensor(shape=(128,), dtype=int32)