# Requirements

## Get all candidate words

In [1]:
import numpy as np
from collections import Counter


WORD_FILE_LOC = 'wordle-answers-alphabetical.txt'

# Read the candidate answers, one word per line. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open(WORD_FILE_LOC, "r", encoding="utf-8") as f:
    word_list = f.read().splitlines()

## Data manipulation functions

In [2]:
def standardized(a: np.ndarray):
    """Return the z-scores of ``a``, i.e. ``(a - mean(a)) / std(a)``.

    Special cases:
      * fewer than two elements: the scalar ``0`` is returned (there is
        nothing to compare the single value against);
      * all elements equal (standard deviation of zero): an all-zero array
        of the same shape is returned instead of dividing by zero, which
        would otherwise yield NaNs and a RuntimeWarning.
    """
    if len(a) <= 1:
        return 0
    spread = np.std(a)
    if spread == 0:
        # Every value sits exactly on the mean, so every z-score is 0.
        return np.zeros(np.shape(a))
    return (a - np.mean(a)) / spread


def word_to_array(w:str) -> np.array:
    """Split the word ``w`` into a 1-D NumPy array of its characters."""
    letters = list(w)
    return np.array(letters)


def words_to_matrix(words:list) -> np.ndarray:
    """Convert a list of words into an array with one character array per word.

    Each word is passed through ``word_to_array`` and the results are stacked
    with ``np.array`` (presumably all words have the same length, giving a
    2-D character matrix).
    """
    rows = []
    for word in np.array(words):
        rows.append(word_to_array(word))
    return np.array(rows)


def array_to_word(a: np.array) -> str:
    """Concatenate the characters in ``a`` back into a single string."""
    separator = ""
    return separator.join(a)


def matrix_to_words(b: np.ndarray) -> list:
    """Turn each row of the character matrix ``b`` into a word string."""
    return list(map(array_to_word, b))

# Sanity check: converting the word list to a matrix and back must round-trip.
assert(matrix_to_words(words_to_matrix(word_list)) == word_list)

## Frequency-based scoring functions

In [4]:
def mtrx_get_pos_ltr_frq(words_mtrx: np.ndarray) -> np.array:
    """Count letter occurrences separately for each column of ``words_mtrx``.

    A ``Counter`` is built from every column (axis 0), so a matrix of
    5-letter words yields an array of 5 counters, one per letter position.
    """
    column_axis = 0
    return np.apply_along_axis(Counter, column_axis, words_mtrx)


def aggregate_frequencies(freq_array: np.array) -> Counter:
    """Merge a sequence of counters into one by adding them together."""
    combined = Counter()
    for position_counts in freq_array:
        combined = combined + position_counts
    return combined


def array_get_pos_frq_wtd_score(word_arr, pos_ltr_frq) -> int:
    """Score a word by summing, for each position, how often its letter occurs there.

    ``word_arr`` and ``pos_ltr_frq`` are walked in lockstep: the i-th letter
    is looked up in the i-th frequency table.
    """
    score = 0
    for letter, position_counts in zip(word_arr, pos_ltr_frq):
        score += position_counts[letter]
    return score


def array_get_frq_wtd_score(word_arr, ltr_frq) -> int:
    """Score a word by summing the overall (position-agnostic) frequency of each of its letters."""
    score = 0
    for letter in word_arr:
        score += ltr_frq[letter]
    return score


def mtrx_get_pos_frq_wtd_score(words_mtrx, pos_ltr_frq: np.array = np.array([])) -> np.array:
    """Return the position-specific frequency score of every row of ``words_mtrx``.

    ``pos_ltr_frq`` is the array of per-position counters to score against.
    If it is empty, or its counters hold no counts at all, the frequencies
    are derived from ``words_mtrx`` itself; pass it in to avoid recomputing.
    """
    total_observations = sum([sum(counts.values()) for counts in pos_ltr_frq])
    if not total_observations:
        pos_ltr_frq = mtrx_get_pos_ltr_frq(words_mtrx)

    def score_row(word_arr):
        return array_get_pos_frq_wtd_score(word_arr, pos_ltr_frq)

    row_axis = 1
    return np.apply_along_axis(score_row, row_axis, words_mtrx)


def mtrx_get_frq_wtd_score(words_mtrx, ltr_frq=Counter()) -> np.array:
    """Return the position-agnostic frequency score of every row of ``words_mtrx``.

    ``ltr_frq`` is the overall letter counter to score against. If it holds
    no counts, the frequencies are derived from ``words_mtrx`` itself; pass
    it in to avoid recomputing.
    """
    total_observations = sum(ltr_frq.values())
    if not total_observations:
        per_position = mtrx_get_pos_ltr_frq(words_mtrx)
        ltr_frq = aggregate_frequencies(per_position)

    def score_row(word_arr):
        return array_get_frq_wtd_score(word_arr, ltr_frq)

    row_axis = 1
    return np.apply_along_axis(score_row, row_axis, words_mtrx)


def mtrx_get_blended_score(words_mtrx, pos_ltr_frq=None,
                           position_specific_weight = 0.5
                          ):
    """Return, for every row of ``words_mtrx``, a blend of the two frequency scores.

    The position-specific and position-agnostic scores are each standardized
    and then combined as
    ``w * position_specific + (1 - w) * position_agnostic`` where ``w`` is
    ``position_specific_weight``.

    ``pos_ltr_frq`` is the array of per-position counters (as produced by
    ``mtrx_get_pos_ltr_frq``). When it is ``None`` or empty (an empty
    ``Counter()`` is still accepted for backward compatibility) the
    frequencies are derived from ``words_mtrx``; pass it in to avoid
    recomputing.
    """
    # The previous check called ``.values()`` on this argument, which only
    # works for a Counter and raised AttributeError for a real array of
    # per-position counters. ``len`` works for both.
    if pos_ltr_frq is None or len(pos_ltr_frq) == 0:
        pos_ltr_frq = mtrx_get_pos_ltr_frq(words_mtrx)
    ltr_frq = aggregate_frequencies(pos_ltr_frq)
    pos_frq_wtd_score = mtrx_get_pos_frq_wtd_score(words_mtrx,pos_ltr_frq)
    frq_wtd_score = mtrx_get_frq_wtd_score(words_mtrx,ltr_frq)
    return( position_specific_weight * standardized(pos_frq_wtd_score)
            + (1-position_specific_weight) * standardized(frq_wtd_score)
          )

def word_matches_fixed_loc(word, letters_fix_loc: list) -> bool:
    """Return True if ``word`` has every required letter at its required index.

    ``letters_fix_loc`` holds ``(letter, index)`` pairs; an empty list matches
    any word.
    """
    position_checks = [word[index] == letter for letter, index in letters_fix_loc]
    return all(position_checks)


def word_matches_excluded_loc(word, letters_exc_loc: list) -> bool:
    """Return True if ``word`` avoids every forbidden letter at its forbidden index.

    ``letters_exc_loc`` holds ``(letter, index)`` pairs; an empty list matches
    any word.
    """
    position_checks = [word[index] != letter for letter, index in letters_exc_loc]
    return all(position_checks)


def letter_count_matches_min(letter_count: Counter, letters_min_freq: dict) -> bool:
    """Return True if every letter in ``letters_min_freq`` occurs at least the required number of times."""
    minimum_checks = [
        letter_count[letter] >= required
        for letter, required in letters_min_freq.items()
    ]
    return all(minimum_checks)


def letter_count_matches_max(letter_count: Counter, letters_max_freq: dict) -> bool:
    """Return True if no letter in ``letters_max_freq`` occurs more often than its allowed maximum."""
    maximum_checks = [
        letter_count[letter] <= allowed
        for letter, allowed in letters_max_freq.items()
    ]
    return all(maximum_checks)


def generate_feedback(true_word, guessed_word):
    """Compare a guess against the true word and return the resulting constraints.

    The result is a dict with four entries:
      * ``letters_min_freq``: Counter of letters known to occur at least that
        many times (the smaller of the guessed and actual counts, when > 0);
      * ``letters_max_freq``: Counter of letters whose exact count is now
        known because the guess used the letter more often than the true
        word does (0 means the letter is absent);
      * ``letters_fix_loc``: ``(letter, index)`` pairs where the guess matched;
      * ``letters_exc_loc``: ``(letter, index)`` pairs where it did not.

    Both words must have the same length.
    """
    assert len(true_word)==len(guessed_word)
    actual_counts = Counter(true_word)
    guess_counts = Counter(guessed_word)

    # Sort every guessed letter into "right spot" or "not in this spot".
    letters_fix_loc, letters_exc_loc = [], []
    for pos, ltr in enumerate(guessed_word):
        bucket = letters_fix_loc if ltr == true_word[pos] else letters_exc_loc
        bucket.append((ltr, pos))

    letters_min_freq = Counter()
    letters_max_freq = Counter()
    for ltr, times_guessed in guess_counts.items():
        times_actual = actual_counts[ltr]  # 0 when the letter is not in the true word
        # A guess can reveal at most as many occurrences as it contains.
        confirmed = min(times_guessed, times_actual)
        if confirmed > letters_min_freq[ltr]:  # only record a minimum that adds information
            letters_min_freq[ltr] = confirmed
        # Over-guessing a letter exposes its exact count in the true word.
        if times_guessed > times_actual:
            letters_max_freq[ltr] = times_actual

    feedback = {
        'letters_min_freq': letters_min_freq,
        'letters_max_freq': letters_max_freq,
        'letters_fix_loc': letters_fix_loc,
        'letters_exc_loc': letters_exc_loc,
    }
    return feedback


def filter_mtrx_from_feedback(words_mtrx: np.ndarray,
                              letters_min_freq: Counter = Counter(),
                              letters_max_freq: Counter = Counter(),
                              letters_fix_loc: list = [],
                              letters_exc_loc: list = [],
                             ) -> np.array:
    """Return a boolean mask marking the rows of ``words_mtrx`` consistent with the feedback.

    Constraints (all optional; an empty constraint rules nothing out):
      * ``letters_min_freq``: e.g. ``{'e': 1}`` if "e" must appear at least once;
      * ``letters_max_freq``: e.g. ``{'b': 1}`` if "b" can't appear twice;
      * ``letters_fix_loc``: e.g. ``[('a', 3)]`` for "a" in the 4th position;
      * ``letters_exc_loc``: e.g. ``[('c', 3)]`` if the 4th position CANNOT be "c".

    The defaults are shared objects; they are only read here, never modified.
    """
    words_passing = np.ones(shape=words_mtrx.shape[0], dtype=bool)
    for row, word_array in enumerate(words_mtrx):
        letter_count = Counter(word_array)
        # Checks run in the same order as before and stop at the first failure.
        if not word_matches_fixed_loc(word_array, letters_fix_loc):
            verdict = False
        elif not word_matches_excluded_loc(word_array, letters_exc_loc):
            verdict = False
        elif not letter_count_matches_min(letter_count, letters_min_freq):
            verdict = False
        else:
            verdict = letter_count_matches_max(letter_count, letters_max_freq)
        words_passing[row] = verdict
    return words_passing

# Simulation

## Actual word

In [5]:
# Character-matrix form of the full candidate word list.
all_words_matrix = words_to_matrix(word_list)
# Pick the hidden answer at random from the candidate words.
true_word = np.random.choice(word_list)
print(f"Don't tell anyone, but the true word is '{true_word}'.")

Don't tell anyone, but the true word is 'aisle'.


## Guesses

In [6]:
MAX_GUESSES = 6  # Wordle allows six guesses per puzzle

remaining_words_mtrx = all_words_matrix
round_num = 1
solved = False

# Rounds are numbered from 1, so the bound must be inclusive; the previous
# `round_num < 6` stopped after only five guesses.
while not solved and round_num <= MAX_GUESSES:

    print(f"===== Round {round_num}: {remaining_words_mtrx.shape[0]} possible words remaining. =====")

    blended_word_scores = mtrx_get_blended_score(remaining_words_mtrx)
    # Rank the candidates once (highest score first) and reuse the ordering
    # for both the printed shortlist and the actual guess.
    ranking = np.argsort(-blended_word_scores)

    print("\nTop words using blended frequencies:")
    print("\n".join(matrix_to_words(remaining_words_mtrx[ranking[:5], :])))
    try_word = array_to_word(remaining_words_mtrx[ranking[0]])
    print(f"\nLet's try: '{try_word}'")
    if try_word==true_word:
        solved=True
        print("Correct!")
    else:
        feedback = generate_feedback(true_word, try_word)
        print("We get the feedback:")
        print(feedback)

        # Keep only the candidates that are still consistent with the feedback.
        remaining_words_mtrx = \
            remaining_words_mtrx[filter_mtrx_from_feedback(words_mtrx=remaining_words_mtrx, **feedback),:]

        round_num += 1

if not solved:
    print(f"You lose. The word was '{true_word}.'")

===== Round 1: 2315 possible words remaining. =====

Top words using blended frequencies:
erase
tease
slate
cease
lease

Let's try: 'erase'
We get the feedback:
{'letters_min_freq': Counter({'e': 1, 'a': 1, 's': 1}), 'letters_max_freq': Counter({'e': 1, 'r': 0}), 'letters_fix_loc': [('e', 4)], 'letters_exc_loc': [('e', 0), ('r', 1), ('a', 2), ('s', 3)]}
===== Round 2: 11 possible words remaining. =====

Top words using blended frequencies:
taste
caste
baste
haste
paste

Let's try: 'taste'
We get the feedback:
{'letters_min_freq': Counter({'a': 1, 's': 1, 'e': 1}), 'letters_max_freq': Counter({'t': 0}), 'letters_fix_loc': [('s', 2), ('e', 4)], 'letters_exc_loc': [('t', 0), ('a', 1), ('t', 3)]}
===== Round 3: 1 possible words remaining. =====

Top words using blended frequencies:
aisle

Let's try: 'aisle'
Correct!
