In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
# Load the word/frequency table.
# Columns whose header starts with "Unnamed" are skipped: the last cell of this
# notebook writes the frame back to this same path, and a row index saved by
# `to_csv` is read back as an "Unnamed: N" column that would otherwise
# accumulate on every re-run.
wordle_words = pd.read_csv(
    'Word Data/wordle_words.csv',
    usecols=lambda name: not str(name).startswith('Unnamed'),
)
wordle_words

Unnamed: 0,word,count
0,about,1226734006
1,other,978481319
2,which,810514085
3,their,782849411
4,there,701170205
...,...,...
39928,goovo,12711
39929,goova,12711
39930,goolh,12711
39931,gooek,12711


In [3]:
# Natural log of every word's corpus count, computed in a single vectorised
# NumPy call rather than a Python-level lambda per row.
wordle_words["log_count"] = np.log(wordle_words["count"])
# Each word's share of the total log-count mass.
# NOTE(review): this normalises the *log* counts, not the raw counts — confirm
# that is the intended definition of "probability".
wordle_words["probability"] = wordle_words["log_count"] / wordle_words["log_count"].sum()
wordle_words

Unnamed: 0,word,count,log_count,probability
0,about,1226734006,20.927621,0.000047
1,other,978481319,20.701512,0.000046
2,which,810514085,20.513179,0.000046
3,their,782849411,20.478451,0.000046
4,there,701170205,20.368261,0.000046
...,...,...,...,...
39928,goovo,12711,9.450223,0.000021
39929,goova,12711,9.450223,0.000021
39930,goolh,12711,9.450223,0.000021
39931,gooek,12711,9.450223,0.000021


In [4]:
# Split each word into one column per letter position: the column named
# 'first' holds character 0 of the word, 'second' character 1, and so on.
for letter_idx, position_name in enumerate(('first', 'second', 'third', 'fourth', 'fifth')):
    wordle_words[position_name] = wordle_words['word'].str[letter_idx]
wordle_words

Unnamed: 0,word,count,log_count,probability,first,second,third,fourth,fifth
0,about,1226734006,20.927621,0.000047,a,b,o,u,t
1,other,978481319,20.701512,0.000046,o,t,h,e,r
2,which,810514085,20.513179,0.000046,w,h,i,c,h
3,their,782849411,20.478451,0.000046,t,h,e,i,r
4,there,701170205,20.368261,0.000046,t,h,e,r,e
...,...,...,...,...,...,...,...,...,...
39928,goovo,12711,9.450223,0.000021,g,o,o,v,o
39929,goova,12711,9.450223,0.000021,g,o,o,v,a
39930,goolh,12711,9.450223,0.000021,g,o,o,l,h
39931,gooek,12711,9.450223,0.000021,g,o,o,e,k


In [5]:
# Tally, for each letter position, how many words carry each lowercase letter
# there. Result shape: {position column -> {letter -> count}}.
letters = list(string.ascii_lowercase)
cols = ['first', 'second', 'third', 'fourth', 'fifth']
counts_dict = {}
for col in cols:
    # One pass per column: value_counts() tallies every distinct character,
    # and reindexing on `letters` fixes the a..z order and fills letters that
    # never occur in this position with 0.
    counts = (
        wordle_words[col]
        .value_counts()
        .reindex(letters, fill_value=0)
        .to_dict()
    )
    counts_dict[col] = counts
counts_dict

{'first': {'a': np.int64(2822),
  'b': np.int64(2347),
  'c': np.int64(2445),
  'd': np.int64(1946),
  'e': np.int64(1436),
  'f': np.int64(1450),
  'g': np.int64(1921),
  'h': np.int64(1606),
  'i': np.int64(1176),
  'j': np.int64(846),
  'k': np.int64(1705),
  'l': np.int64(1751),
  'm': np.int64(2445),
  'n': np.int64(1658),
  'o': np.int64(1102),
  'p': np.int64(2172),
  'q': np.int64(170),
  'r': np.int64(1793),
  's': np.int64(3430),
  't': np.int64(2144),
  'u': np.int64(569),
  'v': np.int64(912),
  'w': np.int64(1015),
  'x': np.int64(258),
  'y': np.int64(416),
  'z': np.int64(398)},
 'second': {'a': np.int64(7387),
  'b': np.int64(435),
  'c': np.int64(887),
  'd': np.int64(546),
  'e': np.int64(4847),
  'f': np.int64(354),
  'g': np.int64(411),
  'h': np.int64(1399),
  'i': np.int64(4061),
  'j': np.int64(103),
  'k': np.int64(349),
  'l': np.int64(1787),
  'm': np.int64(687),
  'n': np.int64(1156),
  'o': np.int64(5526),
  'p': np.int64(714),
  'q': np.int64(66),
  'r': np

In [6]:
# Turn the nested tally into a table: one row per letter, one column per word
# position. Despite the variable name, the cells are the raw counts taken from
# counts_dict, not normalised probabilities.
column_probabilities = pd.DataFrame(counts_dict).rename_axis("letter")
# Make the row labels plain strings so a single character can be used as a key.
column_probabilities.index = column_probabilities.index.astype(str)
column_probabilities

Unnamed: 0_level_0,first,second,third,fourth,fifth
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,2822,7387,3340,3998,4395
b,2347,435,1118,761,342
c,2445,887,1486,1687,917
d,1946,546,1480,1543,1455
e,1436,4847,2552,5534,4538
f,1450,354,524,522,358
g,1921,411,1331,1156,765
h,1606,1399,757,835,1036
i,1176,4061,2918,3491,2103
j,846,103,255,216,65


In [7]:
def word_log_likelihood(word, freq_df):
    """Score a word by summing the natural logs of its per-position table entries.

    For each of the first five characters of ``word``, the value stored in
    ``freq_df`` at (row = that character, column = that position) is looked
    up, and the logs of the five values are added together.

    Parameters
    ----------
    word : str
        Word to score. Only characters 0-4 are read.
    freq_df : pandas.DataFrame
        Table indexed by single characters, with the columns 'first',
        'second', 'third', 'fourth' and 'fifth'.

    Returns
    -------
    numpy.float64
        Sum of the five log values. The logs are taken of whatever numbers
        ``freq_df`` holds, so the scale of the score follows the scale of the
        table.

    Raises
    ------
    IndexError
        If ``word`` has fewer than five characters.
    KeyError
        If a character of ``word`` is not a row label of ``freq_df``.
    """
    position_columns = ('first', 'second', 'third', 'fourth', 'fifth')
    total = 0
    for char_idx, column_name in enumerate(position_columns):
        total += np.log(freq_df.loc[word[char_idx], column_name])
    return total

# Spot-check the scorer on a single word against the per-position letter table.
word_log_likelihood("whizz", column_probabilities)

np.float64(34.25854862955148)

In [8]:
# Score every word against the per-position letter table and store the result
# next to the word, one value per row in row order.
wordle_words["log_likelihood"] = [
    word_log_likelihood(candidate, column_probabilities)
    for candidate in wordle_words["word"]
]
wordle_words

Unnamed: 0,word,count,log_count,probability,first,second,third,fourth,fifth,log_likelihood
0,about,1226734006,20.927621,0.000047,a,b,o,u,t,36.694864
1,other,978481319,20.701512,0.000046,o,t,h,e,r,36.753145
2,which,810514085,20.513179,0.000046,w,h,i,c,h,36.518640
3,their,782849411,20.478451,0.000046,t,h,e,i,r,38.590206
4,there,701170205,20.368261,0.000046,t,h,e,r,e,38.886777
...,...,...,...,...,...,...,...,...,...,...
39928,goovo,12711,9.450223,0.000021,g,o,o,v,o,37.977552
39929,goova,12711,9.450223,0.000021,g,o,o,v,a,38.469967
39930,goolh,12711,9.450223,0.000021,g,o,o,l,h,38.685304
39931,gooek,12711,9.450223,0.000021,g,o,o,e,k,39.507803


In [10]:
# Persist the enriched word table. This path is the same file the notebook
# loads at the top, so the default integer row index is left out
# (index=False): a written index would be read back as an extra "Unnamed: 0"
# column on the next run.
wordle_words.to_csv('Word Data/wordle_words.csv', index=False)
# The letter table's index holds the letters themselves, so it is kept.
column_probabilities.to_csv('Word Data/letter_probabilities.csv')