In [6]:
import pandas as pd
import numpy as np
import plotly.express as px 
from ast import literal_eval
from collections import Counter


In [5]:
# Read the two result tables that the "Weighted vs Non-Weighted Random Choice" section compares.
weighted, non_weighted = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/ali/Desktop/wordle_simulator/results/weighted.csv',
        '/home/ali/Desktop/wordle_simulator/results/non_weighted.csv',
    )
)

Weighted vs Non-Weighted Random Choice

In [9]:
# For each result set: print the overall win rate and mean number of tries,
# then draw a box plot of the n_tries column.
for df, name in (weighted, 'weighted'), (non_weighted, 'non_weighted'):
    # Scale to a percentage *before* rounding/formatting. Rounding the fraction
    # first and multiplying by 100 afterwards can print binary floating-point
    # artifacts (e.g. round(0.57, 2) * 100 -> 56.99999999999999).
    win_rate_percent = df['won'].sum() / len(df) * 100
    print(f"win rate: %{round(win_rate_percent):.1f}")
    print(f"mean_tries: {round(df['n_tries'].mean(), 2)}")
    fig = px.box(df['n_tries'], height = 500, width = 350, title = f'{name}_tries')
    fig.show()

win rate: %98.0
mean_tries: 4.01


win rate: %90.0
mean_tries: 4.73


In [16]:
# 'n_choices' stores a stringified list per row; keep only its last element
# (presumably the number of candidate words left at the final guess -- confirm
# against the simulator), then report the mean and show the spread.
for df, name in (weighted, 'weighted'), (non_weighted, 'non_weighted'):
    sample = df['n_choices'].map(lambda raw: literal_eval(raw)[-1])
    mean_last = round(sample.mean(), 2)
    print(f'mean_n_last_choices: {mean_last}')
    fig = px.box(
        sample,
        title = f'n_last_choices_{name}',
        width = 350,
        height = 500,
    )
    fig.show()

mean_n_last_choices: 10.23


mean_n_last_choices: 3.65


Optimal Words to Start With

In [4]:
# processing data
# 'guesses' and 'n_choices' are stringified lists: keep the first guess and the
# last n_choices entry of every row.
optimal = pd.read_csv('/home/ali/Desktop/wordle_simulator/results/optimal_word_raw.csv')
optimal['first_guess'] = optimal['guesses'].apply(lambda x: literal_eval(x)[0])
optimal['last_n_choices'] = optimal['n_choices'].apply(lambda x: literal_eval(x)[-1])

# grouping data
# One row per starting word. The win rate is the group mean of 'won', which is
# the same as sum/100 when a word has exactly 100 rows but stays correct for any
# other number of rows per word.
optimal_words = optimal.groupby('first_guess', as_index = False).agg({'n_tries':'mean', 'last_n_choices':'mean', 'won':'mean'})

In [73]:
# Export the ten best starting words under two rankings:
#   1. fewest mean tries;
#   2. highest win rate, ties broken by fewest mean tries.
# index=False is the documented way to omit the row index (index=None only
# worked because it is falsy), and the stray '//' in the first path is removed
# (it resolves to the same file on POSIX).
optimal_words.sort_values('n_tries').head(10).to_csv('/home/ali/Desktop/wordle_simulator/results/optimal_words(n_tries).csv', index=False)
optimal_words.sort_values(['won', 'n_tries'], ascending = [False, True]).head(10).to_csv('/home/ali/Desktop/wordle_simulator/results/optimal_words(max_win_rate).csv', index=False)


In [29]:
# Histogram of the per-starting-word mean number of tries.
px.histogram(
    optimal_words['n_tries'],
    title = 'n_tries Distribution',
    width = 500,
    height = 350,
)

In [43]:
# Empirical CDF of n_tries: sorted values on x, evenly spaced cumulative
# fractions running from 0 to 1 on y.
data_sorted = optimal_words['n_tries'].sort_values()
n_points = len(data_sorted)
p = np.arange(n_points) / (n_points - 1)
px.line(
    x = data_sorted,
    y = p,
    title = 'n_tries CDF',
    width = 500,
    height = 350,
)

In [31]:
# Histogram of the per-starting-word win rate.
px.histogram(
    optimal_words['won'],
    title = 'Win Rate Distribution',
    width = 500,
    height = 350,
)

In [75]:
# Histogram of the per-starting-word mean of the last n_choices entry.
px.histogram(
    optimal_words['last_n_choices'],
    title = 'last_n_choices Distribution',
    width = 500,
    height = 350,
)

Vowels Analysis 

In [None]:
# Number of distinct vowels (a, e, i, o, u; compared in lower case) in each starting word.
vowel_set = set('aeiou')
optimal_words['unique_vowels'] = optimal_words['first_guess'].map(
    lambda word: len({ch.lower() for ch in word} & vowel_set)
)

In [74]:
# Correlation of every numeric column with unique_vowels.
# optimal_words also holds the string column 'first_guess'; DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so restrict the frame to
# numeric columns explicitly (works on older pandas as well).
optimal_words.select_dtypes(include='number').corr()[['unique_vowels']].to_csv('/home/ali/Desktop/wordle_simulator/results/vowels_correlation.csv')
# Per unique-vowel count: number of starting words, mean n_tries and mean win rate.
optimal_words.groupby('unique_vowels', as_index=False).agg({ 'first_guess':'count','n_tries':'mean', 'won':'mean'}).to_csv('/home/ali/Desktop/wordle_simulator/results/unique_vowels_analytics.csv')





In [60]:
# Density contour of mean n_tries against the number of unique vowels.
px.density_contour(
    data_frame = optimal_words,
    x = 'unique_vowels',
    y = 'n_tries',
    width = 500,
    height = 350,
)

In [77]:
# Scatter of mean n_tries against the number of unique vowels, one point per starting word.
px.scatter(
    data_frame = optimal_words,
    x = 'unique_vowels',
    y = 'n_tries',
    title = 'unique_vowels Vs n_tries',
    width = 500,
    height = 350,
)

Wordle Difficulty Levels

In [54]:
#processing the Data
difficulty_level = pd.read_csv('/home/ali/Desktop/wordle_simulator/results/difficulty_level_raw.csv')
difficulty_level['answer'] = difficulty_level['guesses'].apply(lambda x: literal_eval(x)[-1])
difficulty_level['last_n_choices'] = difficulty_level['n_choices'].apply(lambda x: literal_eval(x)[-1])

#grouping data
difficulty_level_words = difficulty_level.groupby('answer', as_index = False).agg({'n_tries':'mean', 'last_n_choices':'mean', 'won':(lambda x: sum(x)/100)})

In [56]:
# Answers ordered from fewest to most mean tries.
difficulty_level_words.sort_values(by = 'n_tries', ascending = True)

Unnamed: 0,answer,n_tries,last_n_choices,won
5718,their,3.39,8.89,1.00
4511,raise,3.44,8.27,1.00
283,aside,3.47,5.49,1.00
5733,third,3.47,10.06,1.00
5104,since,3.48,10.33,1.00
...,...,...,...,...
1700,eaves,8.02,2.21,0.21
2241,gears,8.06,2.08,0.29
1687,eared,8.36,2.86,0.18
6496,zines,8.53,3.63,0.22


In [58]:
# Histogram of the per-answer mean number of tries.
px.histogram(
    difficulty_level_words['n_tries'],
    title = 'n_tries Distribution',
    width = 500,
    height = 350,
)

In [59]:
# Empirical CDF of the per-answer mean n_tries: sorted values on x, evenly
# spaced cumulative fractions running from 0 to 1 on y.
data_sorted = difficulty_level_words['n_tries'].sort_values()
n_points = len(data_sorted)
p = np.arange(n_points) / (n_points - 1)
px.line(
    x = data_sorted,
    y = p,
    title = 'n_tries CDF',
    width = 500,
    height = 350,
)

In [63]:
# Label each answer by its mean number of tries:
#   n_tries <= 4.25 -> 'easy', n_tries >= 5.11 -> 'hard', anything else -> 'medium'.
mean_tries = difficulty_level_words['n_tries']
difficulty_level_words['difficulty_level'] = np.select(
    [mean_tries <= 4.25, mean_tries >= 5.11],
    ['easy', 'hard'],
    default = 'medium',
)

In [67]:
# Write one CSV per difficulty level, each ordered from fewest to most mean tries.
for level in ('easy', 'medium', 'hard'):
    words_at_level = difficulty_level_words[difficulty_level_words['difficulty_level'] == level]
    words_at_level.sort_values('n_tries').to_csv(f'/home/ali/Desktop/wordle_simulator/results/{level}_words.csv')

In [99]:
# Mean win rate of the answers in each difficulty level.
difficulty_level_words.groupby('difficulty_level', as_index=False)['won'].mean()

Unnamed: 0,difficulty_level,won
0,easy,0.995238
1,hard,0.698119
2,medium,0.956347


Duplicated Letters Analysis

In [85]:
def duplicated_letters(text):
    """Return the characters that occur more than once in ``text``.

    Args:
        text: Iterable of hashable items (here: a word) to count.

    Returns:
        dict mapping each item seen at least twice to its number of
        occurrences, in order of first appearance. Empty when nothing repeats.
    """
    repeated = {}
    for letter, occurrences in Counter(text).items():
        if occurrences > 1:
            repeated[letter] = occurrences
    return repeated

# Per answer: how many distinct letters are repeated, and the total number of
# occurrences of those repeated letters.
repeats_per_answer = difficulty_level_words['answer'].map(duplicated_letters)
difficulty_level_words['duplicated_letters'] = repeats_per_answer.map(len)
difficulty_level_words['duplicated_letters_count'] = repeats_per_answer.map(lambda repeats: sum(repeats.values()))


In [121]:
# Correlation of every numeric column with n_tries.
# difficulty_level_words also holds the string columns 'answer' and
# 'difficulty_level'; DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0, so restrict the frame to numeric columns explicitly.
difficulty_level_words.select_dtypes(include='number').corr()['n_tries']





n_tries                     1.000000
last_n_choices             -0.146014
won                        -0.921320
duplicated_letters          0.145736
duplicated_letters_count    0.149457
Name: n_tries, dtype: float64

In [109]:
# Number of answers for each count of distinct repeated letters.
difficulty_level_words.loc[:, 'duplicated_letters'].value_counts()

0    4271
1    2090
2     151
Name: duplicated_letters, dtype: int64

In [111]:
# Number of answers for each total count of repeated-letter occurrences.
difficulty_level_words.loc[:, 'duplicated_letters_count'].value_counts()

0    4271
2    2025
4     148
3      64
5       4
Name: duplicated_letters_count, dtype: int64

In [132]:
# Scatter of mean n_tries against the number of distinct repeated letters, one point per answer.
px.scatter(
    data_frame = difficulty_level_words,
    x = 'duplicated_letters',
    y = 'n_tries',
    title = 'duplicated_letters Vs n_tries',
    width = 500,
    height = 350,
)

In [133]:
# Scatter of mean n_tries against the total repeated-letter occurrences, one point per answer.
px.scatter(
    data_frame = difficulty_level_words,
    x = 'duplicated_letters_count',
    y = 'n_tries',
    title = 'duplicated_letters_count Vs n_tries',
    width = 500,
    height = 350,
)

In [129]:
# Mean duplicated-letter metrics per difficulty level, ordered by duplicated_letters.
duplicate_columns = ['duplicated_letters', 'duplicated_letters_count']
(
    difficulty_level_words
    .groupby('difficulty_level', as_index=False)[duplicate_columns]
    .mean()
    .sort_values('duplicated_letters')
)

Unnamed: 0,difficulty_level,duplicated_letters,duplicated_letters_count
0,easy,0.221074,0.443698
2,medium,0.416488,0.844928
1,hard,0.460777,0.942879
