Imports

In [2]:
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import random
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Masking, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.models import load_model
import json

Load training words

In [3]:

with open('words_250000_train.txt', 'r') as f:
    words = f.read().splitlines()

train_words, val_words = train_test_split(words, test_size=0.1, random_state=42)

Preprocess Data

In [10]:
def character_frequency(word_list):
    return Counter(''.join(word_list))

def generate_training_examples(word_list):
    training_examples = []
    for i in range(5):  # Generate 5 training examples for each word
        for word in word_list:
            chars = list(word)
            # Randomly select characters to remove
            indices_to_remove = random.sample(range(len(chars)), random.randint(1, len(chars)))
            # Collect the removed characters
            removed_chars = [chars[i] for i in indices_to_remove]
            for i in indices_to_remove:
                chars[i] = '_'
            highest_freq_char = max(removed_chars, key=lambda char: freq_counter[char])
            # Add the modified word and highest frequency character to the training examples
            training_examples.append((' '.join(chars), highest_freq_char))
    return training_examples

# Calculate character frequency from training words
freq_counter = character_frequency(train_words)

# Generate training examples for training and validation sets
training_examples = generate_training_examples(train_words)
val_examples = generate_training_examples(val_words)

print(training_examples[0:10])

[('_ a _ _ a _ _', 'n'), ('t e t r _ x i a l', 'a'), ('n o _ _ _ _ d', 'e'), ('s c a r l e t l i _ e d', 'n'), ('_ e _ _ _ _ t o', 'i'), ('u p t h r u s _ s', 't'), ('p t _ _ _ _ o _ _ g _ _ a l', 'e'), ('_ _ _ _ _ _ _ _ _', 'a'), ('_ _ _ _ _ _ _ _', 'e'), ('_ o _ _ _ _ _ _ _ _ _ _ _ _', 'e')]


In [5]:
# Create a tokenizer for characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_words + ['_'])  # Include underscore in tokenizer fitting
vocab_size = len(tokenizer.word_index) + 1

# Encode current state and next letter
def encode_state(state, tokenizer):
    state = state.replace(' ', '')
    return tokenizer.texts_to_sequences([list(state)])[0]

def encode_letter(letter, tokenizer):
    return tokenizer.texts_to_sequences([[letter]])[0][0]

# Encode and pad training data
X_train = [encode_state(state, tokenizer) for state, _ in training_examples]
y_train = [encode_letter(next_letter, tokenizer) for _, next_letter in training_examples]
X_train = pad_sequences(X_train, maxlen=32, padding='post')
X_train = np.array(X_train)
y_train = np.array(y_train)

# Encode and pad validation data
X_val = [encode_state(state, tokenizer) for state, _ in val_examples]
y_val = [encode_letter(next_letter, tokenizer) for _, next_letter in val_examples]
X_val = pad_sequences(X_val, maxlen=32, padding='post')
X_val = np.array(X_val)
y_val = np.array(y_val)


Define Model

In [6]:
# Define the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=None),
    Masking(mask_value=0.0),
    Bidirectional(LSTM(32, return_sequences=False)),
    # Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1400      
                                                                 
 masking (Masking)           (None, None, 50)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               21248     
 l)                                                              
                                                                 
 dense (Dense)               (None, 28)                1820      
                                                                 
Total params: 24,468
Trainable params: 24,468
Non-trainable params: 0
_________________________________________________________________


Train

In [9]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Save

In [11]:
model.save('rnn.keras')

tokenizer_json = tokenizer.to_json()
with open('tokenizer-rnn.json', 'w') as file:
    file.write(tokenizer_json)

Load and Predict

In [None]:
model = load_model('rnn.keras')
with open('tokenizer-rnn.json', 'r') as file:
    tokenizer_json = file.read()
tokenizer = tokenizer_from_json(tokenizer_json)

In [16]:
def guess(current_word, guessed_letters, model, tokenizer):
    features = encode_state(' '.join(current_word), tokenizer)
    features = pad_sequences([features], padding='post')
    probabilities = model.predict(features)[0]
    
    # Sort letters by probability
    sorted_indices = np.argsort(probabilities)[::-1]
    
    for index in sorted_indices:
        letter = tokenizer.index_word[index]
        if letter not in guessed_letters:
            return letter

# Example usage
current_word = '_ p p l _'
guessed_letters = {'r', 'b', 'n', 'u', 'i'}
next_guess = guess(current_word, guessed_letters, model, tokenizer)
print(f'Next guess: {next_guess}')

Next guess: e
