In [211]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [204]:
#load data
guesses_df = pd.read_csv('valid_guesses.csv')
solutions_df = pd.read_csv('valid_solutions.csv')
#ignore_index gives the combined frame a fresh 0..n-1 index; otherwise the row
#labels of the two source frames overlap and label-based lookups become ambiguous
all_words = pd.concat([guesses_df, solutions_df], ignore_index=True)

In [114]:
#split data so each char has its own column
#(each frame is built once from lists of characters; creating a pd.Series per row is far slower)
guesses_char_df = pd.DataFrame([list(w) for w in guesses_df['word']], index=guesses_df.index)
solutions_char_df = pd.DataFrame([list(w) for w in solutions_df['word']], index=solutions_df.index)

In [115]:
#function for splitting words into one column per character position
def split_char_df(word_df):
    """Split the ``'word'`` column of ``word_df`` into one column per character.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Frame with a ``'word'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``word_df``; column ``i`` holds the
        character at position ``i`` of each word. Words shorter than the
        longest one are padded with missing values.
    """
    # Build the frame in one go from a list of character lists. Creating a
    # pd.Series per row (via apply) is orders of magnitude slower.
    char_rows = [list(word) for word in word_df['word']]
    return pd.DataFrame(char_rows, index=word_df.index)

In [116]:
#compute the distribution of characters at each position of the word
def get_char_dist(word_df):
    """Return the relative frequency of every character at every word position.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Frame with a ``'word'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Indexed by character (sorted), one column per position (labelled with
        the position, as produced by ``split_char_df``). Each cell is the
        number of words with that character at that position divided by the
        number of rows in ``word_df``; characters that never occur at a
        position get 0.
    """
    char_df = split_char_df(word_df)
    n_rows = len(char_df)
    # Key each distribution by its position explicitly: since pandas 2.0 the
    # result of value_counts() is named 'count', so relying on the Series name
    # would label every column 'count' and break lookups such as dist_df[i].
    dist_by_pos = {
        pos: char_df[pos].value_counts() / n_rows
        for pos in char_df.columns
    }
    # A letter that is absent at a position has probability 0, not NaN.
    return pd.concat(dist_by_pos, axis=1).sort_index().fillna(0.0)
    

In [123]:
#a word's value is the sum, over its positions, of the probability of its letter at that position
def get_word_value(word, dist_df):
    """Score ``word`` against a per-position character distribution.

    ``dist_df`` is indexed by character with one column per position; the
    score adds up ``dist_df[position].loc[letter]`` for every letter of
    ``word``. An empty word scores 0.
    """
    return sum(
        dist_df[position].loc[letter]
        for position, letter in enumerate(word)
    )

In [124]:
# per-position character distribution over all valid guesses,
# then the score of one sample word under that distribution
guesses_dist = get_char_dist(guesses_df)
get_word_value('tears', guesses_dist)

0.7009477338838321

In [53]:
# score every valid guess and print each word that beats the best score so far
word_value = {}
best_value = 0
best_word = ''
for word in guesses_df['word'].to_list():
    # the distribution computed above is named guesses_dist (guess_dist was never defined)
    word_value[word] = get_word_value(word, guesses_dist)
    if word_value[word]>best_value:
        best_word = word
        best_value = word_value[word]
        print('{} is best w/ value {}'.format(word,word_value[word]))

aahed is best w/ value 0.5048325044571643
abacs is best w/ value 0.5415220043164117
abbas is best w/ value 0.5416158393544149
abbes is best w/ value 0.644646711081918
abies is best w/ value 0.6922210753495355
acnes is best w/ value 0.7026367645678896
acres is best w/ value 0.7223421225485596
aedes is best w/ value 0.7720746926902505
aunes is best w/ value 0.7838040724406493
aures is best w/ value 0.8035094304213193
babes is best w/ value 0.8355071783804072
bales is best w/ value 0.8784836257858685
banes is best w/ value 0.8868349441681523
bares is best w/ value 0.9065403021488223
sales is best w/ value 0.9219292483813456
sanes is best w/ value 0.9302805667636295
sores is best w/ value 0.9366613493478464


In [143]:
def get_best_word(word_df, dist_df, verbose=False):
    """Return the word in ``word_df['word']`` with the highest ``get_word_value``.

    Words are scanned in order and a word only replaces the current best when
    its score is strictly greater, so the first of several tied words wins.
    Returns ``''`` when no word scores above 0. With ``verbose`` set, every
    new best word is printed as it is found.
    """
    top_word, top_score = '', 0
    for candidate in word_df['word'].to_list():
        score = get_word_value(candidate, dist_df)
        if not score > top_score:
            continue
        top_word, top_score = candidate, score
        if verbose:
            print('{} is best w/ value {}'.format(candidate, score))
    return top_word

In [127]:
# the function is get_best_word; best_word is only a string variable (or undefined on a fresh kernel)
get_best_word(guesses_df, guesses_dist)

'sores'

In [128]:
# example feedback in the shape trim_dataset reads: 'grey' is a string of letters,
# 'yellow' and 'green' map a position in the word to a letter
response = {'grey': 'ugh', 'yellow':{0:'t'}, 'green':{1:'o'}}

In [191]:
def trim_dataset(word_df, response, verbose=False):
    """Keep only the words in ``word_df`` that are consistent with ``response``.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Frame with a ``'word'`` column of strings. It is not modified.
    response : dict
        ``'green'``: dict mapping position -> letter that must be at that position.
        ``'yellow'``: dict mapping position -> letter that must occur in the
        word, but not at that position.
        ``'grey'``: iterable of letters that must not occur in the word.
    verbose : bool, default False
        Print the number of remaining words after each filtering stage.

    Returns
    -------
    pandas.DataFrame
        The rows of ``word_df`` that satisfy every constraint.
    """
    if verbose:
        print("{} words to begin".format(len(word_df)))

    # green: the letter must sit exactly at this position
    for pos, green_char in response['green'].items():
        word_df = word_df[word_df['word'].str[pos] == green_char]

    if verbose:
        print("{} words after green".format(len(word_df)))

    # yellow: the letter must be present somewhere, but not at this position.
    # regex=False matches the letter literally; na=False treats missing words
    # as non-matching instead of producing an unusable NaN mask.
    for pos, yellow_char in response['yellow'].items():
        words = word_df['word']
        has_char = words.str.contains(yellow_char, regex=False, na=False)
        word_df = word_df[has_char & (words.str[pos] != yellow_char)]

    if verbose:
        print("{} words after yellow".format(len(word_df)))

    # grey: the letter must not appear anywhere in the word
    for grey_char in response['grey']:
        has_char = word_df['word'].str.contains(grey_char, regex=False, na=False)
        word_df = word_df[~has_char]

    if verbose:
        print("{} words after grey".format(len(word_df)))

    return word_df


In [192]:
# apply the example response to the valid guesses, printing the count after each filter stage
trimmed_df = trim_dataset(guesses_df, response, verbose=True)
trimmed_df

10657 words to begin
1817 words after green
242 words after yellow
180 words after grey


Unnamed: 0,word
993,boart
994,boats
1011,boets
1027,boite
1039,bolts
...,...
10313,wonts
10325,wootz
10330,worts
10624,zoist


In [193]:
def generate_response(word, target_word):
    """Build the colour feedback for guessing ``word`` against ``target_word``.

    Returns ``False`` when the guess equals the target. Otherwise returns a
    dict with three keys:

    * ``'green'``  - position -> letter, for letters that match the target at
      that position;
    * ``'yellow'`` - position -> letter, for letters that occur somewhere in
      the target but not at that position;
    * ``'grey'``   - string of the guess letters that do not occur in the
      target at all, in guess order.
    """
    if word == target_word:
        return False

    grey_letters = []
    yellow, green = {}, {}
    for pos, letter in enumerate(word):
        if letter not in target_word:
            grey_letters.append(letter)
        elif target_word[pos] == letter:
            green[pos] = letter
        else:
            yellow[pos] = letter

    return {'grey': ''.join(grey_letters), 'yellow': yellow, 'green': green}

In [194]:
# feedback for guessing 'tough' when the target word is 'point'
generate_response('tough', 'point') 

{'grey': 'ugh', 'yellow': {0: 't'}, 'green': {1: 'o'}}

In [213]:
def play_game(word_df, target_word, verbose=False, max_guesses=7):
    """Simulate guessing ``target_word`` by repeatedly picking the best-scoring word.

    Each round recomputes the character distribution of the remaining
    candidates, guesses the highest-valued word, and trims the candidates
    with the resulting feedback.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Candidate words in a ``'word'`` column. It is not modified.
    target_word : str
        The word to be found.
    verbose : bool, default False
        Print each guess, its feedback and the remaining candidates.
    max_guesses : int, default 7
        Maximum number of guesses to make before giving up.

    Returns
    -------
    list of int
        Number of candidate words before the first guess, followed by the
        number remaining after each unsuccessful guess. The list stops
        growing once the target is guessed, the candidates are exhausted, or
        ``max_guesses`` guesses have been made.
    """
    counts = [len(word_df)]

    # counts already holds one entry before the first guess, so this
    # condition allows exactly max_guesses guesses
    while len(counts) <= max_guesses:
        if len(word_df) == 0:
            # every candidate was eliminated (target_word is not among the
            # candidates); there is nothing left to build a distribution from
            if verbose:
                print('No candidate words remain')
            break

        #determine the 'optimal' word
        dist_df = get_char_dist(word_df)
        guess = get_best_word(word_df, dist_df)

        if verbose:
            print('Guessing {}'.format(guess))

        #generate response (False means the guess was correct)
        response = generate_response(guess, target_word)

        if not response:
            if verbose:
                print('Word {} guessed correctly!'.format(guess))
            return counts

        if verbose:
            print(response)

        #trim dataset
        word_df = trim_dataset(word_df, response, verbose=verbose)
        counts.append(len(word_df))

        if verbose:
            print(word_df['word'].to_list())

    return counts
    

In [205]:
# play one full game against the target 'point', printing every guess and trimming step
play_game(all_words, 'point', verbose=True)

Guessing sores
{'grey': 'sres', 'yellow': {}, 'green': {1: 'o'}}
12972 words to begin
2096 words after green
2096 words after yellow
520 words after grey
['bobac', 'bobak', 'bobol', 'bocca', 'bocci', 'bodhi', 'boffo', 'bogan', 'boggy', 'boing', 'boink', 'bolix', 'bombo', 'bonny', 'bonza', 'booai', 'booay', 'boody', 'boofy', 'boogy', 'booky', 'boomy', 'boong', 'boppy', 'bothy', 'botty', 'boult', 'bovid', 'bowat', 'boxla', 'boxty', 'boyau', 'boyla', 'coact', 'coady', 'coala', 'coaly', 'coapt', 'coati', 'cobby', 'cobia', 'cobza', 'cocci', 'cocco', 'cocky', 'codon', 'cogon', 'cohab', 'cohog', 'coign', 'colby', 'colic', 'colin', 'colly', 'colog', 'colza', 'comal', 'combi', 'combo', 'comby', 'comix', 'commo', 'commy', 'compo', 'compt', 'conga', 'congo', 'conia', 'conin', 'conky', 'conto', 'convo', 'cooch', 'cooky', 'cooly', 'coomb', 'coomy', 'coopt', 'copal', 'copay', 'coppy', 'coqui', 'cotan', 'cotta', 'couta', 'couth', 'covin', 'cowal', 'cowan', 'coxal', 'coxib', 'coyau', 'coypu', 'dobby',

[12972, 520, 145, 14, 1]

In [206]:
# draw 250 target words; a fixed random_state makes the sample (and the results below) reproducible
guess_words = solutions_df.sample(250, random_state=42)['word'].to_list()

In [None]:
# play one game per sampled target word and collect the candidate counts returned by play_game;
# appending inside the loop keeps the results gathered so far if the run is interrupted
word_dist = []
for word in tqdm(guess_words):
    word_dist.append(play_game(all_words, word))

 98%|█████████▊| 246/250 [21:57<00:22,  5.60s/it]