In [2]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import random

In [3]:

with open('words_250000_train.txt', 'r') as f:
    words = f.read().splitlines()

train_words, val_words = train_test_split(words, test_size=0.1, random_state=42)

In [4]:
def character_frequency(word_list):
    return Counter(''.join(word_list))

def generate_training_examples(word_list):
    training_examples = []
    for i in range(5):  # Generate 5 training examples for each word
        for word in word_list:
            chars = list(word)
            # Randomly select characters to remove
            indices_to_remove = random.sample(range(len(chars)), random.randint(1, len(chars)))
            # Collect the removed characters
            removed_chars = [chars[i] for i in indices_to_remove]
            for i in indices_to_remove:
                chars[i] = '_'
            highest_freq_char = max(removed_chars, key=lambda char: freq_counter[char])
            # Add the modified word and highest frequency character to the training examples
            training_examples.append((' '.join(chars), highest_freq_char))
    return training_examples

# Calculate character frequency from training words
freq_counter = character_frequency(train_words)

# Generate training examples for training and validation sets
training_examples = generate_training_examples(train_words)
val_examples = generate_training_examples(val_words)

print(training_examples[0:10])

[('c a s t _ _ _', 'a'), ('t _ _ _ _ x _ a _', 'e'), ('_ _ _ _ _ a d', 'e'), ('_ c a _ l e _ _ _ n e _', 'i'), ('_ _ _ _ _ _ _ _', 'e'), ('u _ _ h _ u _ t _', 'r'), ('p t e r y l o l o g _ c a l', 'i'), ('_ h a h z a d a _', 's'), ('_ r a s u _ _ s', 'e'), ('n _ n _ _ q _ _ s i t i _ n', 'e')]


In [5]:
def state_to_features(state):
    features = np.zeros(26)
    for char in state.replace(' ', ''):
        if char != '_':
            features[ord(char) - ord('a')] = 1
    return features

X = np.array([state_to_features(state) for state, _ in training_examples])
y = np.array([ord(next_letter) - ord('a') for _, next_letter in training_examples])

In [6]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.42


In [9]:
def guess(current_word, guessed_letters):
    features = state_to_features(current_word)
    probabilities = model.predict_proba([features])[0]
    sorted_indices = np.argsort(probabilities)[::-1]
    
    for index in sorted_indices:
        letter = chr(index + ord('a'))
        if letter not in guessed_letters:
            return letter

# Example usage
current_word = '_ p p _ e'
guessed_letters = {'t', 'r', 'i', 'o', 'u'}
next_guess = guess(current_word, guessed_letters)
print(f'Next guess: {next_guess}')

Next guess: e
