In [1]:
import numpy as np
import tensorflow as tf
import operator
from tensorflow import keras
from keras.utils import np_utils

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
import keras.backend as K
import matplotlib.pyplot as plt
from keras.models import load_model
from cbow_model import cbow_model

In [2]:
file_name = 'ALICE.txt'
# Read the text one line per list entry; the context manager guarantees the
# file handle is closed even if reading fails.
with open(file_name) as corpus_file:
    corpus = corpus_file.readlines()

In [3]:
# Keep only lines that contain at least two spaces (a rough proxy for
# "three or more words")
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# Remove punctuation in text and fit tokenizer on entire corpus.
# The backslash is written as '\\' so the literal contains no invalid escape
# sequence; the resulting filter characters are unchanged.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + "'")
tokenizer.fit_on_texts(corpus)

# Convert text to sequence of integer values.
# NOTE: `corpus` is rebound from raw text lines to lists of word ids here, so
# this cell must not be re-run without first re-running the loading cell.
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus)  # Total number of words in the corpus
# Number of word ids: every distinct word plus one extra slot for id 0, which
# is used as padding when the context windows are built
V = len(tokenizer.word_index) + 1

n_samples, V

(27165, 2557)

In [4]:
# Word -> integer id mapping learned by the tokenizer
my_dict = tokenizer.word_index

# Vocabulary words in the mapping's iteration order
vocab = list(my_dict.keys())

print(len(vocab))

2556


In [5]:
file_name = 'ALICE.txt'
# Read the raw text lines; the context manager closes the file handle
with open(file_name) as corpus_file:
    corpus_lst = corpus_file.readlines()
# Keep only lines that contain at least two spaces (a rough proxy for
# "three or more words")
corpus_lst = [sentence for sentence in corpus_lst if sentence.count(" ") >= 2]

In [6]:
# Parameters
window_size = 2  # context words taken on each side of the target word
window_size_corpus = 2 * window_size  # twice the one-sided window

# Fix the global numpy seed so results are reproducible
np.random.seed(42)

In [7]:
# NOTE: the assignment below rebinds `cbow_model` -- the name imported from the
# `cbow_model` module in the import cell -- to a model loaded from 'my_model.h5'.
# The commented-out call is kept for reference only (presumably it builds a
# fresh, untrained model); once this cell has run, the name no longer refers to
# the imported object.
#cbow_model = cbow_model(vocab_size = V, embedding_dim = 100 , window_size = window_size)
cbow_model = load_model('my_model.h5')
# Print the layer-by-layer summary of the loaded model
cbow_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            255700    
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2557)              258257    
                                                                 
Total params: 513,957
Trainable params: 513,957
Non-trainable params: 0
_________________________________________________________________


In [8]:
from keras.preprocessing import sequence

# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    """Build CBOW training pairs from integer-encoded sentences.

    For every word position the input row holds the `window_size` word ids to
    its left followed by the `window_size` word ids to its right (the word
    itself is skipped). Positions that fall outside the sentence are filled
    with 0. The matching output row is the one-hot encoding of the word itself
    over `V` classes.

    Args:
        corpus: Iterable of sentences, each a sequence of integer word ids in
            the range [0, V).
        window_size: Number of context words taken on each side of the target.
        V: Number of classes of the one-hot target.

    Returns:
        A tuple `(X, y)` where `X` is an integer array of shape
        `(n_words, 2 * window_size)` and `y` is a float32 array of shape
        `(n_words, V)`. An empty corpus yields arrays with zero rows but the
        same number of columns.

    Raises:
        IndexError: If a word id is outside the range of the `V` classes.
    """
    context_width = 2 * window_size
    all_in = []
    targets = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            # Window positions around `index`, skipping the word itself and
            # padding with 0 wherever the window runs past the sentence
            all_in.append([
                sentence[i] if 0 <= i < L else 0
                for i in range(index - window_size, index + window_size + 1)
                if i != index
            ])
            targets.append(word)

    # The explicit reshape keeps the result two-dimensional even when there
    # are no samples at all
    X = np.array(all_in, dtype=int).reshape(len(all_in), context_width)

    # One-hot encode every target in a single step instead of allocating one
    # small array per word
    target_ids = np.array(targets, dtype=int)
    y = np.zeros((len(target_ids), V), dtype='float32')
    y[np.arange(len(target_ids)), target_ids] = 1.0

    return (X, y)

In [9]:
# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

((27165, 4), (27165, 2557))

In [82]:
# Define the vocabulary
vocabulary = vocab

# Define a context of words
context = 'In another moment down went'.split()
context

['In', 'another', 'moment', 'down', 'went']

In [84]:
# Convert the context to a vector
context_vector = np.zeros((len(vocabulary),))
for word in context:
    if word in vocabulary:
        context_vector[vocabulary.index(word)] += 1

# Normalize the context vector
context_vector /= np.linalg.norm(context_vector)

# Reshape the context vector to have a shape of (1, len(vocabulary))
context_vector = context_vector.reshape(1, len(vocabulary))

array([[0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Define the context window
context = ["this", "is", "a", "test"]

# Convert the context to a vector
context_vector = np.zeros((len(vocabulary),))
for word in context:
    if word in vocabulary:
        context_vector[vocabulary.index(word)] += 1

# Normalize the context vector
context_vector /= np.linalg.norm(context_vector)

# Reshape the context vector to have a shape of (1, len(vocabulary))
context_vector = context_vector.reshape(1, len(vocabulary))

# Predict the target word
prediction = cbow_model.predict(context_vector)[0]

# Get the index of the predicted word
predicted_index = np.argmax(prediction)

# Get the predicted word from the vocabulary
predicted_word = vocabulary[predicted_index]

print("The predicted next word is:", predicted_word)

NameError: name 'vocabulary' is not defined

In [52]:
from keras.models import load_model
import numpy as np
import re

# Load the CBOW model from an HDF5 file
model = load_model('my_model.h5')

# Embedding matrix of the layer named 'embedding' (see the model summary).
# Row i is the vector of the word with tokenizer id i; row 0 is the padding id.
word_embeddings = model.get_layer('embedding').get_weights()[0]

# Word -> id mapping; vocabulary membership must be tested against this
# mapping, not against the rows of the embedding matrix
word_index = tokenizer.word_index

# Define a misspelled word to correct
misspelled_word = 'zhe'

# Check if the misspelled word is in the vocabulary (the vocabulary is
# lower-case, so compare case-insensitively)
if misspelled_word.lower() in word_index:
    # If the misspelled word is already in the vocabulary, there's nothing to correct
    print(f'{misspelled_word} is already in the vocabulary')
else:
    # If the misspelled word is not in the vocabulary, try to correct it
    # by finding the most similar word in the vocabulary
    # First, remove any non-alphabetic characters from the misspelled word
    misspelled_word = re.sub('[^a-zA-Z]', '', misspelled_word).lower()

    # Check if the cleaned-up misspelled word is still not in the vocabulary
    if misspelled_word not in word_index:
        # If the cleaned-up misspelled word is still not in the vocabulary,
        # we can't correct it: it has no embedding to compare against
        print(f'{misspelled_word} is not in the vocabulary and can\'t be corrected')
    else:
        # Get the embedding for the misspelled word
        misspelled_word_id = word_index[misspelled_word]
        misspelled_word_embedding = word_embeddings[misspelled_word_id]

        # Calculate the cosine similarity between the misspelled word and all
        # other words in the vocabulary; the lower bound on the denominator
        # avoids a division by zero for all-zero embedding rows
        norm_products = np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(misspelled_word_embedding)
        cosine_similarities = np.dot(word_embeddings, misspelled_word_embedding) / np.maximum(norm_products, 1e-12)

        # Never propose the padding row or the word itself
        cosine_similarities[[0, misspelled_word_id]] = -np.inf

        # Get the most similar word in the vocabulary to the misspelled word
        most_similar_word_index = int(np.argmax(cosine_similarities))
        most_similar_word = tokenizer.index_word[most_similar_word_index]

        # Print the corrected word and its cosine similarity to the misspelled word.
        # The score gets its own name so the `cosine_similarity` function
        # imported from sklearn is not overwritten.
        similarity_score = cosine_similarities[most_similar_word_index]
        print(f'{misspelled_word} corrected to {most_similar_word} with cosine similarity {similarity_score}')

zhe is not in the vocabulary and can't be corrected


In [8]:
# Load the CBOW model from the h5 file ('my_model.h5') and bind it to the
# module-level name `model`, which the correct_spelling function reads
model = load_model('my_model.h5')

# Define a function to correct a misspelled word
def correct_spelling(word, topn=5, embeddings=None, word_index=None):
    """Return the vocabulary word whose embedding is closest to that of `word`.

    Closeness is the cosine similarity between rows of the embedding matrix.
    The word itself and ids without a vocabulary entry (such as the padding
    id 0) are never returned.

    Args:
        word: Word to look up; it is lower-cased before the lookup.
        topn: Number of nearest neighbours to collect; the best one is returned.
        embeddings: Optional 2-D array whose row i is the vector of word id i.
            Defaults to the weights of the 'embedding' layer of the global
            `model`.
        word_index: Optional mapping from word to integer id. Defaults to the
            global `tokenizer.word_index`.

    Returns:
        The neighbouring word with the highest cosine similarity.

    Raises:
        KeyError: If `word` is not in the vocabulary, so it has no embedding.
        ValueError: If no neighbour is available (e.g. `topn` < 1 or a
            one-word vocabulary).
    """
    if embeddings is None:
        embeddings = model.get_layer('embedding').get_weights()[0]
    if word_index is None:
        word_index = tokenizer.word_index

    # Convert the word to lowercase
    word = word.lower()
    if word not in word_index:
        raise KeyError(f"'{word}' is not in the vocabulary")

    embeddings = np.asarray(embeddings, dtype=float)
    word_id = word_index[word]
    query = embeddings[word_id]

    # Cosine similarity of every row with the query row; the lower bound on
    # the denominator avoids a division by zero for all-zero rows
    norm_products = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query)
    similarities = embeddings @ query / np.maximum(norm_products, 1e-12)

    # Find the nearest neighbors of the word, best first, as (word, score)
    index_to_word = {idx: vocab_word for vocab_word, idx in word_index.items()}
    ranked_ids = [
        int(idx) for idx in np.argsort(similarities)[::-1]
        if int(idx) != word_id and int(idx) in index_to_word
    ]
    neighbors = [
        (index_to_word[idx], float(similarities[idx]))
        for idx in ranked_ids[:max(topn, 0)]
    ]
    if not neighbors:
        raise ValueError(f"no neighbouring word available for '{word}'")

    # Extract the corrected word with the highest similarity score
    corrected_word = neighbors[0][0]

    return corrected_word