In [110]:
import sys
import pandas as pd
import plotly.express as px
from math import log2

In [111]:
# globals
# Paths of the two word-list files, relative to the working directory.
# get_word_list() reads ALLOWED_WORDS_FILE when called with a truthy argument
# and POSSIBLE_WORDS_FILE otherwise.
# NOTE(review): presumably "possible" holds the candidate answers and "allowed"
# every accepted guess -- confirm against the data files.
POSSIBLE_WORDS_FILE = './data/possible_words.txt'
ALLOWED_WORDS_FILE = './data/allowed_words.txt'

In [112]:
def get_word_list(all):  # returns a list of strings
    """Load one of the two word lists from disk.

    Args:
        all: when truthy, read ``ALLOWED_WORDS_FILE``; otherwise read
            ``POSSIBLE_WORDS_FILE``. The name shadows the ``all`` builtin but
            is kept because callers pass it by keyword.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace (including the newline).

    Raises:
        OSError: if the selected file cannot be opened.
    """
    # loading either the allowed or possible words
    path = ALLOWED_WORDS_FILE if all else POSSIBLE_WORDS_FILE

    # Read through the handle owned by the `with` block, so the file is opened
    # exactly once and is closed as soon as the lines have been consumed.
    with open(path) as handle:
        return [line.strip() for line in handle]


In [113]:
# returns true if the word passed as a parameter matches the given pattern, false otherwise

# Reminder of how Wordle works when matching the pattern of an allowed word against the word to guess:
#   If the allowed word has multiple occurrences of the same letter and one is green
#   and the other is black, it means that the word to guess has that letter at that
#   index but does not have any other occurrence of that letter.
#   The same applies to yellow letters: if one occurrence is yellow and
#   the other is black, it means that there is no second occurrence of that letter in the
#   word we're guessing.
#
# This is why we have to count the number of occurrences of each letter: ignoring
# the counts leads to mistakes and bugs.


def matches_pattern(word, possible_word, pattern):
    """Tell whether ``possible_word`` is consistent with ``pattern`` for the guess ``word``.

    A candidate answer is consistent with a pattern exactly when guessing
    ``word`` against that candidate would be coloured with that pattern, so
    the feedback is recomputed with the Wordle rules and compared cell by cell:

    * a letter in the right position is 'green';
    * a letter in the wrong position is 'yellow' only while the candidate
      still has copies of it that are not already claimed by a green cell or
      by an earlier yellow cell (yellows are handed out left to right);
    * every other letter is 'black'.

    Working from letter counts keeps repeated letters correct: e.g. for the
    guess "speed" and the candidate "abide" only the first 'e' is yellow, the
    second one is black.

    Args:
        word: the guessed word.
        possible_word: the candidate answer being tested.
        pattern: a sequence of 'black' / 'yellow' / 'green', one per letter
            of ``word``.

    Returns:
        True if ``pattern`` is exactly the feedback ``word`` would receive
        against ``possible_word``. False otherwise, including when the pattern
        is empty or when the three arguments do not all have the same length.
    """
    # An empty pattern carries no feedback; it is reported as "no match".
    if not pattern:
        return False

    # Feedback is defined position by position, so all three must line up.
    if not (len(word) == len(possible_word) == len(pattern)):
        return False

    # Letters of the candidate that are not consumed by a green cell: only
    # these copies can justify a yellow cell.
    unmatched = {}
    for guessed, actual in zip(word, possible_word):
        if guessed != actual:
            unmatched[actual] = unmatched.get(actual, 0) + 1

    for guessed, actual, color in zip(word, possible_word, pattern):
        if guessed == actual:
            expected = 'green'
        elif unmatched.get(guessed, 0) > 0:
            # consume one spare copy so later duplicates cannot reuse it
            unmatched[guessed] -= 1
            expected = 'yellow'
        else:
            expected = 'black'

        if color != expected:
            return False

    return True

In [114]:
def count_words_left_from_pattern(word, pattern, possible_words):
    """Collect the candidates that remain once ``pattern`` is observed for ``word``.

    Args:
        word: the guessed word.
        pattern: the colour pattern to test the candidates against.
        possible_words: iterable of candidate words.

    Returns:
        A ``(count, words)`` tuple: the list of candidates accepted by
        ``matches_pattern`` (in input order) and how many there are.
    """
    survivors = [
        candidate
        for candidate in possible_words
        if matches_pattern(word, candidate, pattern)
    ]
    return (len(survivors), survivors)

In [115]:
# in wordle a pattern assigns to each letter of the word one of the colors green,
# yellow or black (grey), respectively meaning that the letter is in the right position,
# that it occurs at least once in the word but in the wrong position, or that it is not in the word at all.


def compute_patterns(word_length=5):  # returns a list of tuples (each tuple represents a specific pattern)
    """Enumerate every colour pattern for a word of ``word_length`` letters.

    Args:
        word_length: number of cells in each pattern. Defaults to 5, the
            length used by the rest of this notebook.

    Returns:
        A list of ``3 ** word_length`` tuples of 'black' / 'yellow' / 'green'.
        The last cell varies fastest, so the list starts with the all-black
        pattern and ends with the all-green one.
    """
    # function-scope import keeps this cell self-contained
    from itertools import product

    colors = ('black', 'yellow', 'green')

    # product(..., repeat=n) yields the same tuples, in the same order, as n
    # nested loops over `colors`.
    return list(product(colors, repeat=word_length))

In [116]:
# returns a string used by the chart to display each pattern in the x-axis labels

def pattern_to_chart_label_string(pattern):
    """Render a colour pattern as a string of square emoji for the chart labels.

    Args:
        pattern: iterable of colour names ('black', 'green', 'yellow').

    Returns:
        One emoji per recognised colour, in order; any other value adds
        nothing to the label.
    """
    squares = {'black': '⬛', 'green': '🟩', 'yellow': '🟨'}
    return ''.join(squares.get(color, '') for color in pattern)

In [117]:
# probability in this case is defined as the number of words left
# for a specific pattern over the total number of words
def probability(words_total, words_left):
    """Return the fraction of all words that are still left.

    Args:
        words_total: total number of words.
        words_left: number of words left for a pattern.

    Returns:
        ``words_left`` divided by ``words_total``.
    """
    share = words_left / words_total
    return share

In [118]:
# surprise is defined as the logarithm base 2 of one over the probability
def surprise(probability):
    """Return the surprise, in bits, of an outcome with the given probability.

    Args:
        probability: probability of the outcome; must be non-zero.

    Returns:
        The base-2 logarithm of ``1 / probability``.
    """
    inverse = 1 / probability
    return log2(inverse)

In [119]:
# entropy is the expected value of the surprise over the patterns related to a word.
#
# In our case the entropy is defined as:
# E(surprise) = ∑ p(x) * log2(1/p(x)) = - ∑ p(x) * log2(p(x))
#
# where p(x) is the probability of getting a pattern x and
# log2(1/p(x)) is the surprise we get given that pattern

def calculate_entropy(probabilities, surprises):
    """Return the expected surprise: the sum of ``probability * surprise``.

    Args:
        probabilities: iterable of probabilities, one per pattern.
        surprises: iterable of the matching surprise values.

    Returns:
        The accumulated sum of the pairwise products; ``0`` when there are
        no pairs. Extra items in the longer iterable are ignored.
    """
    expected = 0

    # plain left-to-right accumulation, one pair at a time
    for pair in zip(probabilities, surprises):
        expected = expected + pair[1] * pair[0]

    return expected

In [120]:
word = "cults"
patterns = compute_patterns()
possible_words = get_word_list(all=True)

# one record per pattern that still leaves at least one word: the pattern's
# chart label, how many words are left and the words themselves
words_left = []

for pattern in patterns:
    count, words = count_words_left_from_pattern(word, pattern, possible_words)
    if count <= 0:
        # patterns that leave no word are not charted
        continue
    words_left.append(dict(
        pattern=pattern_to_chart_label_string(pattern),
        value=count,
        words=words,
    ))

# data frame for plotly express, largest groups first
df = pd.DataFrame(words_left, columns=['pattern', 'value', 'words'])
df = df.sort_values('value', ascending=False)

# bar chart: one bar per pattern, bar height = number of words left
fig = px.bar(
    df,
    x='pattern',
    y='value',
    text='value',
    hover_data=['pattern', 'value', 'words'],
    title=f'Words left distribution for word: "{word.upper()}"',
    height=1000,
)

# x axis: titled "Pattern", tick labels hidden
fig.update_xaxes(
    title_text="Pattern",
    title_font=dict(size=16),
    title_standoff=25,
    tickangle=45,
    tickfont={"size": 10},
    showticklabels=False,
)

# y axis: no grid and no zero line
fig.update_yaxes(
    title_text="Words left",
    title_font=dict(size=16),
    showgrid=False,
    zeroline=False,
)

# hover box on a white background
fig.update_layout(
    autosize=True,
    hovermode="x",
    hoverlabel={"bgcolor": "white"},
)

# bar colour and outline width
fig.update_traces(
    marker_line_width=1.5,
    marker_color='rgb(37,106,236)',
)

fig.show()