# Analysis of Wordle dataset

This notebook analyses every word in the Wordle dictionary in order to suggest the best possible words to use when guessing the Wordle answer.

## Install dependencies

In [51]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas



## Load data from files

In [80]:
from IPython.display import display
import pandas

def load_data_from_csv(file):
    """Read a headerless CSV file into a pandas DataFrame.

    Args:
        file: Path or file-like object passed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame whose columns are labelled 0, 1, ... because the first
        line of the file is read as data rather than as a header row.
    """
    frame = pandas.read_csv(file, header=None)
    return frame

la = load_data_from_csv('./data/wordle-la.txt')
ta = load_data_from_csv('./data/wordle-ta.txt')

dataset = pandas.concat([la, ta])
dataset_size = dataset.size

%store dataset

print(f"Loaded {dataset_size} rows")

Stored 'dataset' (DataFrame)
Loaded 12972 rows


## Count characters

In [81]:
data = pandas.DataFrame([list(d[1][0]) for d in dataset.iterrows()])
data = data.apply(pandas.value_counts)
data.rename(columns={0: 'Pos 1',1: 'Pos 2',2: 'Pos 3',3: 'Pos 4',4: 'Pos 5'}, inplace=True)
data['Total Count'] = data.sum(axis=1)

index_labels=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
data = pandas.DataFrame(data, index=index_labels)
%store data

Stored 'data' (DataFrame)


### Display character counts per position sorted alphabetically

In [78]:
# Render the per-letter, per-position counts; rows follow the a-z index.
display(data)

Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
a,737,2263,1236,1074,680,5990
b,909,81,335,243,59,1627
c,922,176,392,411,127,2028
d,685,84,390,471,823,2453
e,303,1628,882,2327,1522,6662
f,598,24,178,233,82,1115
g,638,76,364,423,143,1644
h,489,546,120,235,370,1760
i,165,1383,1051,880,280,3759
j,202,11,46,29,3,291


### Display character counts per position, sorted by overall popularity

In [None]:
pos_total = data.sort_values(by=['Total Count'], ascending=False)

%store pos_total

display(pos_total)

Stored 'pos_total' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
s,1565,93,533,516,3958,6665
e,303,1628,882,2327,1522,6662
a,737,2263,1236,1074,680,5990
o,262,2096,993,698,389,4438
r,628,940,1198,719,673,4158
i,165,1383,1051,880,280,3759
l,577,699,848,771,476,3371
t,815,239,616,898,727,3295
n,325,345,964,788,530,2952
u,189,1187,667,401,67,2511


### Display most popular letters sorted by first position in the word

In [None]:
pos_1 = data.sort_values(by=['Pos 1'], ascending=False)

%store pos_1

display(pos_1)

Stored 'pos_1' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
s,1565,93,533,516,3958,6665
c,922,176,392,411,127,2028
b,909,81,335,243,59,1627
p,859,231,364,418,147,2019
t,815,239,616,898,727,3295
a,737,2263,1236,1074,680,5990
m,693,188,511,402,182,1976
d,685,84,390,471,823,2453
g,638,76,364,423,143,1644
r,628,940,1198,719,673,4158


### Display most popular letters sorted by second position in the word

In [None]:
pos_2 = data.sort_values(by=['Pos 2'], ascending=False)

%store pos_2

display(pos_2)

Stored 'pos_2' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
a,737,2263,1236,1074,680,5990
o,262,2096,993,698,389,4438
e,303,1628,882,2327,1522,6662
i,165,1383,1051,880,280,3759
u,189,1187,667,401,67,2511
r,628,940,1198,719,673,4158
l,577,699,848,771,476,3371
h,489,546,120,235,370,1760
n,325,345,964,788,530,2952
y,181,271,213,108,1301,2074


### Display most popular letters sorted by third position in the word

In [None]:
pos_3 = data.sort_values(by=['Pos 3'], ascending=False)

%store pos_3

display(pos_3)

Stored 'pos_3' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
a,737,2263,1236,1074,680,5990
r,628,940,1198,719,673,4158
i,165,1383,1051,880,280,3759
o,262,2096,993,698,389,4438
n,325,345,964,788,530,2952
e,303,1628,882,2327,1522,6662
l,577,699,848,771,476,3371
u,189,1187,667,401,67,2511
t,815,239,616,898,727,3295
s,1565,93,533,516,3958,6665


### Display most popular letters sorted by fourth position in the word

In [None]:
pos_4 = data.sort_values(by=['Pos 4'], ascending=False)

%store pos_4

display(pos_4)

Stored 'pos_4' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
e,303,1628,882,2327,1522,6662
a,737,2263,1236,1074,680,5990
t,815,239,616,898,727,3295
i,165,1383,1051,880,280,3759
n,325,345,964,788,530,2952
l,577,699,848,771,476,3371
r,628,940,1198,719,673,4158
o,262,2096,993,698,389,4438
s,1565,93,533,516,3958,6665
k,376,95,272,503,259,1505


### Display most popular letters sorted by fifth position in the word

In [None]:
pos_5 = data.sort_values(by=['Pos 5'], ascending=False)

%store pos_5

display(pos_5)

Stored 'pos_5' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count
s,1565,93,533,516,3958,6665
e,303,1628,882,2327,1522,6662
y,181,271,213,108,1301,2074
d,685,84,390,471,823,2453
t,815,239,616,898,727,3295
a,737,2263,1236,1074,680,5990
r,628,940,1198,719,673,4158
n,325,345,964,788,530,2952
l,577,699,848,771,476,3371
o,262,2096,993,698,389,4438


In [86]:
data['Ranking'] = 0


for i, v in enumerate(pos_total.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

for i, v in enumerate(pos_1.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

for i, v in enumerate(pos_2.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

for i, v in enumerate(pos_3.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

for i, v in enumerate(pos_4.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

for i, v in enumerate(pos_5.index.tolist()):
    data.loc[v, 'Ranking'] += i + 1

pos_ranking = data.sort_values(by=['Ranking'], ascending=True)

%store pos_ranking

display(pos_ranking)

Stored 'pos_ranking' (DataFrame)


Unnamed: 0,Pos 1,Pos 2,Pos 3,Pos 4,Pos 5,Total Count,Ranking
a,737,2263,1236,1074,680,5990,19
e,303,1628,882,2327,1522,6662,31
r,628,940,1198,719,673,4158,37
s,1565,93,533,516,3958,6665,39
t,815,239,616,898,727,3295,41
o,262,2096,993,698,389,4438,46
l,577,699,848,771,476,3371,48
i,165,1383,1051,880,280,3759,52
n,325,345,964,788,530,2952,52
d,685,84,390,471,823,2453,65
