In this part of the code, we define several indexes of a word. With these indexes we can further analyze how difficult a word is to guess. 

# Index

We will define several indexes of a word: I1, I2, I3, I4. They are: repeatedness of characters, percentage of vowels, rank of characters, and separation on the keyboard. 

In [5]:
import numpy as np
import pandas as pd
import scipy.stats as stats

## I1
Repeatedness of characters. We define it as the information entropy of the characters in a word.

$\displaystyle \mathrm {H} (X):=-\sum _{x\in {\mathcal {X}}}p(x)\log p(x)=\mathbb {E} [-\log p(X)]$

In [6]:
# charCount tallies the characters of the word s; calc_I1 turns that tally into I1.
def charCount(s):
    """Tally how often each character occurs in ``s``.

    Returns a dict mapping each distinct character to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for letter in s:
        # A character seen for the first time starts from zero.
        counts[letter] = counts.get(letter, 0) + 1
    return counts

def calc_I1(s):
    """Return I1 for the word ``s``: the entropy of its character counts.

    The raw per-character counts are handed to ``stats.entropy``, which
    normalizes them into a probability distribution before computing the
    entropy (natural logarithm by default).
    """
    occurrences = list(charCount(s).values())
    return stats.entropy(occurrences)


# I1 gets bigger when the repeatedness of a word is lower
# (fewer repeated characters -> higher entropy).
print(calc_I1('mummy'), calc_I1('apple'), calc_I1('audio'))

0.9502705392332347 1.3321790402101223 1.6094379124341005


## I2 
Percentage of vowels. We define it as the proportion of characters in a word that are vowels.


In [7]:
def calc_I2(s):
    """Return I2 for the word ``s``: the fraction of its characters that are vowels.

    Only the lowercase letters a, e, i, o, u count as vowels. The result is
    a float between 0 and 1; an empty word yields 0.0.
    """
    if not s:
        # Nothing to count, and it avoids dividing by zero below.
        return 0.0
    vowels = frozenset('aeiou')
    vowel_count = sum(1 for c in s if c in vowels)
    # Divide by the actual word length so words that are not exactly five
    # letters long still get a true fraction.
    return vowel_count / len(s)

# I2 is the fraction (0 to 1) of vowel characters in a word.
print(calc_I2('mummy'), calc_I2('apple'), calc_I2('audio'))

0.2 0.4 0.8


## I3 
Expected number of yellow hits (I3Y) and green hits (I3G).

In [8]:
def calc_rank(path='./dataset/wordle_wordbank.txt', word_len=5):
    """Tally letter frequencies over a word-bank file (one word per line).

    Args:
        path: text file to read; defaults to the Wordle word bank.
        word_len: number of letter positions to tally; defaults to 5.

    Returns:
        A tuple ``(overall, by_position)``:
        * ``overall`` maps each letter to the number of words that contain
          it (a letter repeated inside one word counts once for that word).
        * ``by_position`` is a list of ``word_len`` dicts; entry ``i`` maps
          each letter to the number of words with that letter at index ``i``.
        All dicts are sorted by letter. Letters that never occur are absent.

    Side effect: prints the number of words read.
    """
    from collections import Counter

    with open(path, encoding='utf-8') as f:
        # Lower-case once so both tallies see the same letters, and drop
        # blank lines so they are not counted as words.
        words = [line.strip().lower() for line in f if line.strip()]
    print(len(words))

    overall = Counter()
    by_position = [Counter() for _ in range(word_len)]
    for word in words:
        overall.update(set(word))
        # zip stops after word_len letters, so an over-long word cannot
        # index past the per-position tables.
        for position_counts, letter in zip(by_position, word):
            position_counts[letter] += 1

    return (
        dict(sorted(overall.items())),
        [dict(sorted(counts.items())) for counts in by_position],
    )
 
# Show the overall and per-position letter counts for the word bank.
print(calc_rank())

2309
({'a': 906, 'b': 266, 'c': 446, 'd': 370, 'e': 1053, 'f': 206, 'g': 299, 'h': 377, 'i': 646, 'j': 27, 'k': 202, 'l': 645, 'm': 298, 'n': 548, 'o': 672, 'p': 345, 'q': 29, 'r': 835, 's': 617, 't': 667, 'u': 456, 'v': 148, 'w': 193, 'x': 37, 'y': 416, 'z': 35}, [{'a': 140, 'b': 173, 'c': 198, 'd': 111, 'e': 72, 'f': 135, 'g': 115, 'h': 69, 'i': 34, 'j': 20, 'k': 20, 'l': 87, 'm': 107, 'n': 37, 'o': 41, 'p': 141, 'q': 23, 'r': 105, 's': 365, 't': 149, 'u': 33, 'v': 43, 'w': 82, 'y': 6, 'z': 3}, {'a': 304, 'b': 16, 'c': 40, 'd': 20, 'e': 241, 'f': 8, 'g': 11, 'h': 144, 'i': 201, 'j': 2, 'k': 10, 'l': 200, 'm': 38, 'n': 87, 'o': 279, 'p': 61, 'q': 5, 'r': 267, 's': 16, 't': 77, 'u': 185, 'v': 15, 'w': 44, 'x': 14, 'y': 22, 'z': 2}, {'a': 306, 'b': 56, 'c': 56, 'd': 75, 'e': 177, 'f': 25, 'g': 67, 'h': 9, 'i': 266, 'j': 3, 'k': 12, 'l': 112, 'm': 61, 'n': 137, 'o': 243, 'p': 57, 'q': 1, 'r': 163, 's': 80, 't': 111, 'u': 165, 'v': 49, 'w': 26, 'x': 12, 'y': 29, 'z': 11}, {'a': 162, 'b': 

In [9]:
# Letter-frequency tables, hard-coded from the calc_rank() output shown above.
# charRank maps each letter to the number of words that contain it.
# charRankByPos[i] maps each letter to the number of words that have it at
# position i; letters never seen at a position are absent from that dict
# (e.g. 'x' at position 0, 'q' at position 3).
charRank, charRankByPos =   {'a': 906, 'b': 266, 'c': 446, 'd': 370, 'e': 1053, 'f': 206, 'g': 299, 'h': 377, 'i': 646, \
                             'j': 27, 'k': 202, 'l': 645, 'm': 298, 'n': 548, 'o': 672, 'p': 345, 'q': 29, 'r': 835,\
                                  's': 617, 't': 667, 'u': 456, 'v': 148, 'w': 193, 'x': 37, 'y': 416, 'z': 35}, \
                                    [{'a': 140, 'b': 173, 'c': 198, 'd': 111, 'e': 72, 'f': 135, 'g': 115, 'h': 69, \
                                      'i': 34, 'j': 20, 'k': 20, 'l': 87, 'm': 107, 'n': 37, 'o': 41, 'p': 141, 'q': 23,\
                                          'r': 105, 's': 365, 't': 149, 'u': 33, 'v': 43, 'w': 82, 'y': 6, 'z': 3},\
                                              {'a': 304, 'b': 16, 'c': 40, 'd': 20, 'e': 241, 'f': 8, 'g': 11, 'h': 144, 'i': 201, 'j': 2, 'k': 10, 'l': 200, 'm': 38, 'n': 87, 'o': 279, 'p': 61, 'q': 5, 'r': 267, 's': 16, 't': 77, 'u': 185, 'v': 15, 'w': 44, 'x': 14, 'y': 22, 'z': 2}, {'a': 306, 'b': 56, 'c': 56, 'd': 75, 'e': 177, 'f': 25, 'g': 67, 'h': 9, 'i': 266, 'j': 3, 'k': 12, 'l': 112, 'm': 61, 'n': 137, 'o': 243, 'p': 57, 'q': 1, 'r': 163, 's': 80, 't': 111, 'u': 165, 'v': 49, 'w': 26, 'x': 12, 'y': 29, 'z': 11}, {'a': 162, 'b': 24, 'c': 150, 'd': 69, 'e': 318, 'f': 35, 'g': 76, 'h': 28, 'i': 158, 'j': 2, 'k': 55, 'l': 162, 'm': 68, 'n': 182, 'o': 132, 'p': 50, 'r': 150, 's': 171, 't': 139, 'u': 82, 'v': 45, 'w': 25, 'x': 3, 'y': 3, 'z': 20}, {'a': 63, 'b': 11, 'c': 31, 'd': 118, 'e': 422, 'f': 26, 'g': 41, 'h': 137, 'i': 11, 'k': 113, 'l': 155, 'm': 42, 'n': 130, 'o': 58, 'p': 56, 'r': 212, 's': 36, 't': 253, 'u': 1, 'w': 17, 'x': 8, 'y': 364, 'z': 4}]
# Number of words in the word bank (the count printed by calc_rank() above).
totWords = 2309
# I3Y is expectation of yellow hits (same letter doesn't count twice)
def calc_I3Y(s):
    """Return I3Y for the word ``s``: the expected number of yellow hits.

    For every distinct letter of ``s``, add the share of word-bank words
    that contain that letter (``charRank[letter] / totWords``). A letter
    repeated in ``s`` contributes only once.
    """
    distinct_letters = set(s)
    expectation = 0
    for letter in distinct_letters:
        share = charRank[letter] / totWords
        expectation = expectation + share
    return expectation

# I3Y measures how easily a guess shares at least one character with the word
# (higher means its letters are more common in the word bank).
print(calc_I3Y('mummy'), calc_I3Y('apple'), calc_I3Y('audio'))

# I3G is expectation of green hits
def calc_I3G(s):
    """Return I3G for the word ``s``: the expected number of green hits.

    For each position ``i`` of ``s``, add the share of word-bank words that
    have the same letter at position ``i``
    (``charRankByPos[i][letter] / totWords``).

    Raises:
        IndexError: if ``s`` has more positions than ``charRankByPos``.
    """
    I3 = 0
    for i, c in enumerate(s):
        # charRankByPos only lists letters that occur at a position, so a
        # letter never seen there (e.g. 'x' in first place) contributes 0
        # instead of raising KeyError.
        I3 += charRankByPos[i].get(c, 0) / totWords
    return I3

# Expected green hits for the same three sample words.
print(calc_I3G('mummy'), calc_I3G('apple'), calc_I3G('audio'))


0.5067128627111304 1.2771762667821567 1.3209181463837159
0.33997401472498917 0.364660025985275 0.2667821567778259
