In this part of the code, we define several indices (attributes) of words. With these indices we can further analyze how difficult the words are to guess.

# Attributes

We will define several attributes of a word: I1, I2, I3, I4. These are, respectively: repetition of characters, percentage of vowels, rank of characters, and separation of the characters on the keyboard.

In [37]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

## I1
Repetition of characters. We define it as the information entropy of the distribution of characters in a word.

$\displaystyle \mathrm {H} (X):=-\sum _{x\in {\mathcal {X}}}p(x)\log p(x)=\mathbb {E} [-\log p(X)]$

In [38]:
# Helper for I1: tally how often each character occurs. s is the word.
def charCount(s):
    """Return a dict mapping each character of ``s`` to its occurrence count.

    Keys appear in the order in which the characters first occur in ``s``.
    """
    counts = {}
    for letter in s:
        counts[letter] = counts.get(letter, 0) + 1
    return counts

def calc_A1(s):
    """Return I1 for the word ``s``: the entropy of its character counts.

    The per-character counts come from ``charCount``; ``stats.entropy``
    normalizes them to probabilities before computing the entropy, so the
    raw counts can be passed directly.
    """
    counts = charCount(s)
    return stats.entropy(list(counts.values()))


# I1 grows as the characters of a word repeat less.
print(*(calc_A1(word) for word in ('mummy', 'apple', 'audio')))

0.9502705392332347 1.3321790402101223 1.6094379124341005


## I2 
Percentage of vowels. We define it as the fraction of the characters in a word that are vowels.


In [39]:
def calc_A2(s):
    """Return I2 for the word ``s``: the fraction of its characters that are vowels.

    Only the lowercase letters a, e, i, o, u count as vowels (matching is
    case-sensitive). The vowel count is divided by the length of ``s``, so
    the result is a true fraction for words of any length; for 5-letter
    words this equals the previous fixed division by 5.

    Returns 0.0 for an empty word.
    """
    if not s:
        # Avoid dividing by zero; an empty word contains no vowels.
        return 0.0
    vowels = ('a', 'e', 'i', 'o', 'u')
    vowel_count = sum(1 for c in s if c in vowels)
    return vowel_count / len(s)

# I2 is the share of vowel characters in a word.
print(*(calc_A2(word) for word in ('mummy', 'apple', 'audio')))

0.2 0.4 0.8


## I3
Expected number of yellow hits (I3Y) and green hits (I3G).

In [40]:
def calc_rank(path='./dataset/Wordle_Wordbank.txt', word_len=5):
    """Count letter frequencies over a word list, overall and per position.

    Reads one word per line from ``path``, prints the number of lines read,
    and tallies the letters of every word. Words are lowercased once up
    front so that both tables use the same keys.

    Args:
        path: Word-list file with one word per line. Defaults to the
            Wordle word bank.
        word_len: Number of letter positions to tabulate (default 5).

    Returns:
        A tuple ``(overall, by_pos)``:
        ``overall`` maps each letter to the number of words that contain it
        at least once; ``by_pos`` is a list of ``word_len`` dicts where
        ``by_pos[i]`` maps each letter to the number of words having that
        letter at position ``i``. Every dict is sorted by letter; a letter
        that never occurs has no entry.

    Raises:
        IndexError: If a word has more than ``word_len`` characters.
    """
    with open(path, encoding='utf-8') as f:
        words = [line.strip().lower() for line in f]
    print(len(words))

    overall = {}
    by_pos = [{} for _ in range(word_len)]
    for word in words:
        # set() so that a repeated letter counts once per word.
        for c in set(word):
            overall[c] = overall.get(c, 0) + 1
        for i, c in enumerate(word):
            by_pos[i][c] = by_pos[i].get(c, 0) + 1
    return dict(sorted(overall.items())), [dict(sorted(d.items())) for d in by_pos]
 
# calc_rank() itself prints the number of lines read; this then prints the
# returned (overall, per-position) letter-frequency tables.
print(calc_rank())

2309
({'a': 906, 'b': 266, 'c': 446, 'd': 370, 'e': 1053, 'f': 206, 'g': 299, 'h': 377, 'i': 646, 'j': 27, 'k': 202, 'l': 645, 'm': 298, 'n': 548, 'o': 672, 'p': 345, 'q': 29, 'r': 835, 's': 617, 't': 667, 'u': 456, 'v': 148, 'w': 193, 'x': 37, 'y': 416, 'z': 35}, [{'a': 140, 'b': 173, 'c': 198, 'd': 111, 'e': 72, 'f': 135, 'g': 115, 'h': 69, 'i': 34, 'j': 20, 'k': 20, 'l': 87, 'm': 107, 'n': 37, 'o': 41, 'p': 141, 'q': 23, 'r': 105, 's': 365, 't': 149, 'u': 33, 'v': 43, 'w': 82, 'y': 6, 'z': 3}, {'a': 304, 'b': 16, 'c': 40, 'd': 20, 'e': 241, 'f': 8, 'g': 11, 'h': 144, 'i': 201, 'j': 2, 'k': 10, 'l': 200, 'm': 38, 'n': 87, 'o': 279, 'p': 61, 'q': 5, 'r': 267, 's': 16, 't': 77, 'u': 185, 'v': 15, 'w': 44, 'x': 14, 'y': 22, 'z': 2}, {'a': 306, 'b': 56, 'c': 56, 'd': 75, 'e': 177, 'f': 25, 'g': 67, 'h': 9, 'i': 266, 'j': 3, 'k': 12, 'l': 112, 'm': 61, 'n': 137, 'o': 243, 'p': 57, 'q': 1, 'r': 163, 's': 80, 't': 111, 'u': 165, 'v': 49, 'w': 26, 'x': 12, 'y': 29, 'z': 11}, {'a': 162, 'b': 

In [41]:
# Letter-frequency tables hard-coded as literals.
# charRank maps letter -> count; charRankByPos[i] maps letter -> count for
# position i (5 positions). Some position tables have no entry for certain
# letters (e.g. position 0 has no 'x'), so lookups there can miss.
# NOTE(review): the values match the calc_rank() output printed above and were
# presumably pasted from it -- regenerate them if the word bank changes.
charRank, charRankByPos =   {'a': 906, 'b': 266, 'c': 446, 'd': 370, 'e': 1053, 'f': 206, 'g': 299, 'h': 377, 'i': 646, \
                             'j': 27, 'k': 202, 'l': 645, 'm': 298, 'n': 548, 'o': 672, 'p': 345, 'q': 29, 'r': 835,\
                                  's': 617, 't': 667, 'u': 456, 'v': 148, 'w': 193, 'x': 37, 'y': 416, 'z': 35}, \
                                    [{'a': 140, 'b': 173, 'c': 198, 'd': 111, 'e': 72, 'f': 135, 'g': 115, 'h': 69, \
                                      'i': 34, 'j': 20, 'k': 20, 'l': 87, 'm': 107, 'n': 37, 'o': 41, 'p': 141, 'q': 23,\
                                          'r': 105, 's': 365, 't': 149, 'u': 33, 'v': 43, 'w': 82, 'y': 6, 'z': 3},\
                                              {'a': 304, 'b': 16, 'c': 40, 'd': 20, 'e': 241, 'f': 8, 'g': 11, 'h': 144, 'i': 201, 'j': 2, 'k': 10, 'l': 200, 'm': 38, 'n': 87, 'o': 279, 'p': 61, 'q': 5, 'r': 267, 's': 16, 't': 77, 'u': 185, 'v': 15, 'w': 44, 'x': 14, 'y': 22, 'z': 2}, {'a': 306, 'b': 56, 'c': 56, 'd': 75, 'e': 177, 'f': 25, 'g': 67, 'h': 9, 'i': 266, 'j': 3, 'k': 12, 'l': 112, 'm': 61, 'n': 137, 'o': 243, 'p': 57, 'q': 1, 'r': 163, 's': 80, 't': 111, 'u': 165, 'v': 49, 'w': 26, 'x': 12, 'y': 29, 'z': 11}, {'a': 162, 'b': 24, 'c': 150, 'd': 69, 'e': 318, 'f': 35, 'g': 76, 'h': 28, 'i': 158, 'j': 2, 'k': 55, 'l': 162, 'm': 68, 'n': 182, 'o': 132, 'p': 50, 'r': 150, 's': 171, 't': 139, 'u': 82, 'v': 45, 'w': 25, 'x': 3, 'y': 3, 'z': 20}, {'a': 63, 'b': 11, 'c': 31, 'd': 118, 'e': 422, 'f': 26, 'g': 41, 'h': 137, 'i': 11, 'k': 113, 'l': 155, 'm': 42, 'n': 130, 'o': 58, 'p': 56, 'r': 212, 's': 36, 't': 253, 'u': 1, 'w': 17, 'x': 8, 'y': 364, 'z': 4}]
# Divisor that turns the counts above into fractions.
# NOTE(review): equals the 2309 line count printed by calc_rank() -- confirm it
# stays in sync with the tables.
totWords = 2309
# I3Y: expected number of yellow hits; a repeated letter is counted only once.
def calc_A3Y(s):
    """Return I3Y for the word ``s``.

    Adds ``charRank[c] / totWords`` for every distinct character ``c`` of
    ``s``. Raises ``KeyError`` for a character that has no entry in
    ``charRank``.
    """
    distinct_letters = set(s)
    expectation = 0
    for letter in distinct_letters:
        expectation = expectation + charRank[letter] / totWords
    return expectation

# I3Y reflects how easily people can correctly guess a character of the word.
print(*(calc_A3Y(word) for word in ('mummy', 'apple', 'audio')))

# I3G: expected number of green hits (right letter at the right position).
def calc_A3G(s):
    """Return I3G for the word ``s``.

    Adds ``charRankByPos[i][c] / totWords`` for every character ``c`` at
    position ``i`` of ``s``. A letter that has no entry in the table for its
    position contributes 0 instead of raising ``KeyError``.

    Raises:
        IndexError: If ``s`` has more characters than ``charRankByPos`` has
            position tables.
    """
    I3 = 0
    for i, c in enumerate(s):
        # The per-position tables omit letters with a zero count
        # (e.g. position 0 has no 'x'), so fall back to 0 for those.
        I3 += charRankByPos[i].get(c, 0) / totWords
    return I3

# Sample I3G values for three words.
print(*(calc_A3G(word) for word in ('mummy', 'apple', 'audio')))


0.5067128627111304 1.2771762667821567 1.3209181463837159
0.33997401472498917 0.364660025985275 0.2667821567778259


# Check the correlation between Attributes and TP & H/T

In [44]:
# Load the Wordle results workbook, using its first column as the row index,
# and print it for a quick look.
data = pd.read_excel("./dataset/wordle_data.xlsx", index_col=0)
print(data)

            contest_num   word  result_num  hard_mode_num  trial_1  trial_2  \
date                                                                          
2022-12-31          560  manly       20380           1899        0        2   
2022-12-30          559  molar       21204           1973        0        4   
2022-12-29          558  havoc       20001           1919        0        2   
2022-12-28          557  impel       20160           1937        0        3   
2022-12-27          556  condo       20879           2012        0        2   
...                 ...    ...         ...            ...      ...      ...   
2022-01-11          206  drink      153880           3017        1        9   
2022-01-10          205  query      107134           2242        1        4   
2022-01-09          204  gorge       91477           1913        1        3   
2022-01-08          203  crank      101503           1763        1        5   
2022-01-07          202  slump       80630          

In [46]:
def get_result(df, func_name, row_name):
    """Return the Pearson correlation between a word attribute and a column.

    Args:
        df: Table with a 'word' column and a column labelled ``row_name``.
        func_name: Callable applied to each whitespace-stripped word; must
            return a number.
        row_name: Label of the column of ``df`` to correlate against
            (despite its name, it selects a column, not a row).

    Returns:
        The off-diagonal entry of the 2x2 matrix from ``np.corrcoef``.
    """
    stripped_words = (raw.strip() for raw in df['word'])
    attribute_values = [func_name(word) for word in stripped_words]
    outcomes = np.asarray(df[row_name])
    correlation_matrix = np.corrcoef(attribute_values, outcomes)
    # Entry [0][1] is the correlation of the attribute values with the outcomes.
    return correlation_matrix[0][1]

# Correlate every attribute with every outcome column of the dataset.
cols = ['trial_{}'.format(i) for i in range(1, 7)] + ['trial_x', 'H/T']
attribs = [calc_A1, calc_A2, calc_A3Y, calc_A3G]
# One entry per attribute function, keyed by the function's name; each list
# holds the correlations in the same order as cols.
dic = {
    attrib.__name__: [get_result(data, attrib, colname) for colname in cols]
    for attrib in attribs
}
# Rows are the outcome columns, columns are the attribute functions.
res = pd.DataFrame(dic, index=cols)

print(res)



          calc_A1   calc_A2  calc_A3Y  calc_A3G
trial_1  0.222949  0.085566  0.338521  0.280004
trial_2  0.362358  0.125828  0.580781  0.335103
trial_3  0.432937  0.006557  0.501621  0.172001
trial_4  0.098531 -0.104577 -0.116221 -0.311545
trial_5 -0.419158  0.034670 -0.521995 -0.259876
trial_6 -0.361306 -0.005921 -0.368545 -0.011690
trial_x -0.202787 -0.046894 -0.123720  0.099137
H/T     -0.081527  0.079484 -0.039814 -0.119932
