In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [2]:
import numpy as np
import tensorflow as tf
from scipy.spatial.distance import cdist

# Load GloVe word vectors
def load_word_vectors(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map

# Load data and create vocabulary
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split()
            data.append(line)
    words = set([word for line in data for word in line])
    word_to_idx = {word: idx for idx, word in enumerate(words)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return data, word_to_idx, idx_to_word

# Generate batches for training
def generate_batches(data, word_to_idx, window_size, batch_size):
    X, y = [], []
    while True:
        for line in data:
            for i, word in enumerate(line):
                for j in range(max(i - window_size, 0), min(i + window_size + 1, len(line))):
                    if i != j:
                        X.append(word_to_idx[word])
                        y.append(word_to_idx[line[j]])
                        if len(X) == batch_size:
                            yield np.array(X), np.array(y)
                            X, y = [], []

# Build the CBOW model
class CBOWModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size):
        super(CBOWModel, self).__init__()
        self.embeddings = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=1)
        self.mean = tf.keras.layers.Lambda(lambda x: tf.keras.backend.mean(x, axis=1))
        self.dense = tf.keras.layers.Dense(units=vocab_size, activation='softmax')
        

    def call(self, inputs):
        x = self.embeddings(inputs)
        x = self.mean(x)
        x = self.dense(x.reshape(x,))
        return x

# Train the CBOW model
def train_cbow_model(data_file, glove_file, window_size=2, embedding_size=100, batch_size=128, num_epochs=50, learning_rate=0.01, save_path=None):
    # Load data and word vectors
    data, word_to_idx, idx_to_word = load_data(data_file)
    words, word_to_vec_map = load_word_vectors(glove_file)
    vocab_size = len(word_to_idx)

    # Generate batches for training
    batch_generator = generate_batches(data, word_to_idx, window_size, batch_size)

    # Build and compile the model
    model = CBOWModel(vocab_size, embedding_size)
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')

    # Train the model
    history = model.fit(batch_generator, epochs=num_epochs, steps_per_epoch=len(data) * (len(data[0]) - 2*window_size) // batch_size)

    # Save the model
    if save_path is not None:
        model.save(save_path)

    # Return the trained model
    return model, word_to_idx, idx_to_word

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from spellchecker import SpellChecker

In [16]:
# Load the GloVe word embeddings
def load_embeddings(embedding_file):
    embeddings_index = {}
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [17]:
embedding_file = 'glove.6B.100d.txt'
embeddings_index = load_embeddings(embedding_file)

In [18]:
# Define a function to predict the next word given a context
def predict_next_word(context):
    context_words = context.split()
    context_embedding = np.zeros((len(embeddings_index[next(iter(embeddings_index))]),))
    for word in context_words:
        if word in embeddings_index:
            context_embedding += embeddings_index[word]
    context_embedding /= len(context_words)
    
    similarities = {}
    for word in embeddings_index.keys():
        if word not in context_words:
            word_embedding = embeddings_index[word]
            sim = cosine_similarity([context_embedding], [word_embedding])[0][0]
            similarities[word] = sim
            
    top_3_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
    return [w[0] for w in top_3_words]


In [None]:
# Example usage
context = "The cat sat on the"
predictions = predict_next_word(context)
print(predictions)

In [52]:
def spell_checker(text):
    spell = SpellChecker()
    spell.word_frequency.load_text_file('Frankenstein.txt')

    # find those words that may be misspelled
    misspelled = spell.unknown(['asd', 'theaa', 'funny'])
    return misspelled


In [53]:
spell_checker("waht the fuc")

{'theaa'}

In [24]:
"waht the fuc".split()

['waht', 'the', 'fuc']

In [54]:
import Levenshtein

# Define the two strings
str1 = 'hello'
str2 = 'hlalo'

# Calculate the Levenshtein distance between the two strings
dist = Levenshtein.distance(str1, str2)
print(dist)


2
