#### This notebook analyzes a `toughies.csv` file, which is generated by running the Django management command `get_toughie_info.py` against a production DB backup.

In [76]:
import pandas
import random
import csv

In [77]:
# Load the statistics exported to toughies.csv into a DataFrame.
toughies = pandas.read_csv('./toughies.csv')
# Display the number of rows that were loaded.
len(toughies)

109451

#### For better statistical significance, keep only the bingos that were asked at least 30 times.

In [78]:
# Keep only the rows whose 'asked' count is at least 30.
asked_often_enough = toughies['asked'] >= 30
better_toughies = toughies.loc[asked_often_enough]


In [79]:
# Order of dictionary updates (used as a sort key, oldest lexicon first):
def lexkey_assigner(row):
    """Return the sort key for the lexicon named in ``row['lexicon']``.

    'OWL2' maps to 1, 'America' to 2 and 'NWL18' to 3. Any other value
    yields None.
    """
    update_order = {'OWL2': 1, 'America': 2, 'NWL18': 3}
    return update_order.get(row['lexicon'])

# Compute one lexkey per row, attach it as a column, then order the frame by it.
lexkeys = better_toughies.apply(lexkey_assigner, axis=1)
better_toughies = better_toughies.assign(lexkey=lexkeys).sort_values('lexkey')


In [80]:
# Determine which words have NOT been asked yet.
# Start by loading every alphagram, one per line; lines whose length is not
# exactly 7 (resp. 8) are ignored, which also drops blank lines.
with open('./7s_alphs.csv') as sevens_file:
    alphas_7s = {line for line in sevens_file.read().split('\n') if len(line) == 7}

with open('./8s_alphs.csv') as eights_file:
    alphas_8s = {line for line in eights_file.read().split('\n') if len(line) == 8}


In [81]:
def additions_to_existing_alphagrams(filename) -> dict:
    """Map alphagrams that already had a word to all of their words.

    The CSV file at ``filename`` lists the words and alphagrams touched by
    one dictionary update. It must have a header row naming the columns
    ``word``, ``alpha`` and ``added``.

    Rows are grouped by ``alpha``. A group is kept only if at least one of
    its rows has an empty ``added`` field, i.e. at least one word that is
    not marked as an addition (a "non-+ word").

    Args:
        filename: path of the CSV file to read.

    Returns:
        A dict mapping each kept alphagram to the list of all its words,
        in file order.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a required column is missing from the header.
    """
    entries_by_alpha = {}
    # The csv module requires files to be opened with newline='' so that it
    # can handle line endings (including ones inside quoted fields) itself.
    with open(filename, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            entries_by_alpha.setdefault(row['alpha'], []).append(
                (row['word'], row['added']))

    # Keep only the alphagrams where at least one word is a non-+ word.
    return {
        alpha: [word for word, _ in entries]
        for alpha, entries in entries_by_alpha.items()
        if any(added == '' for _, added in entries)
    }

# These text files below were created with a query like the following
# (shown for length 7; presumably length=8 was used for the 8s files -- confirm):
#
#   select word, alphagram, lexicon_symbols from (
#        select alphagrams.alphagram from alphagrams where
#        contains_update_to_lex=1 and length=7 order by alphagrams.probability) q
#   inner join words w using (alphagram);
#
# NOTE(review): additions_to_existing_alphagrams reads columns named 'word',
# 'alpha' and 'added', so the header row of each file presumably uses those
# names rather than alphagram / lexicon_symbols -- confirm against the files.

# Judging by the file names, "first update" is OWL2 -> America and
# "second update" is America -> NWL18.
new_sevens_first_update = additions_to_existing_alphagrams('./7snew_owl2_america.txt')
new_sevens_second_update = additions_to_existing_alphagrams('./7snew_america_nwl18.txt')
new_eights_first_update = additions_to_existing_alphagrams('./8snew_owl2_america.txt')
new_eights_second_update = additions_to_existing_alphagrams('./8snew_america_nwl18.txt')


In [109]:
# Spot check: show every row recorded for the alphagram DEGINORR.
better_toughies.loc[better_toughies['Alphagram'] == 'DEGINORR']

Unnamed: 0,Alphagram,probability,asked,missed,difficulty,lexicon,lexupdate,lexkey
79022,DEGINORR,1322,64,10,0.15625,OWL2,,1


In [108]:
# Spot check: show every row recorded for the alphagram AEEINST
# (an alphagram can have one row per lexicon it was asked under).
better_toughies.loc[better_toughies['Alphagram'] == 'AEEINST']

Unnamed: 0,Alphagram,probability,asked,missed,difficulty,lexicon,lexupdate,lexkey
81793,AEEINST,26,43,7,0.162791,OWL2,,1
1901,AEEINST,27,325,68,0.209231,America,,2


In [85]:
# For each lexicon we can switch into: the alphagrams that gained words in
# that update. Their earlier asks no longer count, so they are forgotten
# when the first row of that lexicon is reached.
forget_on_entering = {
    'America': (new_sevens_first_update, new_eights_first_update),
    'NWL18': (new_sevens_second_update, new_eights_second_update),
}

asked_7s, asked_8s = set(), set()

# Start at the first lexicon.
previous_lexicon = 'OWL2'
for record in better_toughies.itertuples():
    # Position 0 of each tuple is the index; 1 holds the alphagram and
    # 6 holds the lexicon.
    current_lexicon = record[6]
    switched = current_lexicon != previous_lexicon
    if switched and current_lexicon in forget_on_entering:
        # Clear out questions that got new additions.
        updated_sevens, updated_eights = forget_on_entering[current_lexicon]
        asked_7s.difference_update(updated_sevens)
        asked_8s.difference_update(updated_eights)

    alphagram = record[1]
    if len(alphagram) == 7:
        asked_7s.add(alphagram)
    if len(alphagram) == 8:
        asked_8s.add(alphagram)

    previous_lexicon = current_lexicon

print(f'Asked {len(asked_7s)} out of {len(alphas_7s)} 7s')
print(f'Asked {len(asked_8s)} out of {len(alphas_8s)} 8s')

Asked 21011 out of 21108 7s
Asked 27672 out of 28029 8s


In [86]:
# Number of alphagrams still unasked, divided by 50 -- presumably the number
# of 50-question challenges needed to cover them (TODO confirm batch size).
print((len(alphas_7s) - len(asked_7s)) / 50)
print((len(alphas_8s) - len(asked_8s)) / 50)

1.94
7.14


### Determine a list of all bingos by difficulty!

In [105]:
# Default to taking results for newer lexica. This is because Aerolith may at
# first have been populated by people who were already really good at the
# bingos, but as time passed, more lower-rated players have been joining.
#
# The frame is sorted from oldest to newest lexicon, so for an alphagram that
# appears more than once the later (newer) row is the one being considered.
# The newer row replaces the stored one unless the stored row's asked count
# (tuple position 3) is at least 7 higher -- 7 is just a fudge factor.
# NOTE(review): the original inline comment read "if the new bingo was asked
# at least 7 more times", which is stricter than this condition; confirm
# which of the two was intended.
# This still might have some bias from early Aerolith users being better on
# average, if a question was asked a lot back in the day.
bingos = {}
for record in better_toughies.itertuples():
    alphagram = record[1]
    stored = bingos.get(alphagram)
    if stored is None or stored[3] < record[3] + 7:
        bingos[alphagram] = record

print(len(bingos))

48736


In [106]:
# Turn the alphagram -> row mapping into a DataFrame: the dict keys become the
# index and the fields of each stored row tuple become the columns.
bdf = pandas.DataFrame.from_dict(bingos, orient='index')

In [114]:
# Now we can ask some questions. For example, what are the hardest 1000 bingos
# with probability at most 15000?
# Sort by difficulty, hardest first, and keep only the first 1000 rows so the
# result matches the question being asked.
bdf[bdf['probability'] <= 15000].sort_values('difficulty', ascending=False).head(1000)

Unnamed: 0,Index,Alphagram,probability,asked,missed,difficulty,lexicon,lexupdate,lexkey
ABGLORS,5115,ABGLORS,7923,47,44,0.936170,America,+,2
DEEIRRSS,12791,DEEIRRSS,10521,32,28,0.875000,America,,2
DEIMMOST,13693,DEIMMOST,13219,31,26,0.838710,America,,2
EILLORTT,12528,EILLORTT,4394,66,55,0.833333,America,+,2
IOOPRSTT,13631,IOOPRSTT,11649,46,38,0.826087,America,,2
AHIIMRST,13569,AHIIMRST,10172,107,88,0.822430,America,,2
AILPSST,2436,AILPSST,11842,152,124,0.815789,America,,2
BDEOORY,3279,BDEOORY,9495,54,44,0.814815,America,+,2
CENOSTTU,13455,CENOSTTU,7550,81,66,0.814815,America,+,2
EHIOPRSY,12761,EHIOPRSY,9764,126,102,0.809524,America,+,2


### Run the following cell to determine which questions are left to ask (this may be useful for future lexicon updates or CSW)

In [88]:
# Build the question lists: every alphagram that has never been asked, padded
# with randomly chosen already-asked alphagrams up to a fixed list size.
TARGET_7S = 200   # total number of 7s to print
TARGET_8S = 400   # total number of 8s to print
BATCH_SIZE = 50   # a separator line is printed after every BATCH_SIZE entries


def print_in_batches(alphagrams, batch_size=BATCH_SIZE):
    """Print one alphagram per line, with '------' after every full batch."""
    for count, alphagram in enumerate(alphagrams, start=1):
        print(alphagram)
        if count % batch_size == 0:
            print('-' * 6)


missing_8s = list(alphas_8s - asked_8s)
missing_7s = list(alphas_7s - asked_7s)

asked_7s_new = list(asked_7s)
asked_8s_new = list(asked_8s)
random.shuffle(asked_7s_new)
random.shuffle(asked_8s_new)

# Pad each list up to its target size. The pad length is derived from the
# current counts, so the cell keeps working when the input data changes.
missing_8s.extend(asked_8s_new[:max(0, TARGET_8S - len(missing_8s))])
missing_7s.extend(asked_7s_new[:max(0, TARGET_7S - len(missing_7s))])
# These fail if there are more unasked alphagrams than the target, or too few
# asked ones to pad with.
assert len(missing_7s) == TARGET_7S
assert len(missing_8s) == TARGET_8S

random.shuffle(missing_7s)
random.shuffle(missing_8s)

print_in_batches(missing_7s)
print('-' * 12)
print_in_batches(missing_8s)

AAILQWW
AADIMNS
ACLLORS
AALNRSU
DEHLNOS
ADDEEIT
IKKOPYY
EGILLNT
AIPPRRS
EFGLNTU
DEHINOR
BCEINOR
EGGLPSU
GLMNOOS
AAAANRW
BEEISST
EILOPPZ
AAEGNNS
ACDMOOW
BBENRSU
AAENNST
IKOORTT
AAGIRSY
CEGINNS
BENOSTT
EGILOOS
ACGLNNO
MOPSSSU
AEEGLTU
AEGGIMS
ABDEHOW
AEEGLRS
DEELSUV
DEIORSW
ADLNOTY
CCEKLOS
MPSSTUU
CDEHINS
BCDKORU
AEINSTW
CEERRUV
CCDKLOU
AELNORV
AAEGNRW
EHIRSTT
EILNOVV
HLMRTUY
AAGOPSS
AABHMNR
BCEIRRS
------
ABEEGLR
EILNOTU
DEGIJRU
BDEGILN
AAAKLMS
AALMMNS
ADEENRS
AAABLST
EEHMRSS
BEKOOPR
BCIINOT
EGHLNST
EIIMRSS
ACIILSV
CEJOORS
ACDOPST
ABDELOS
BJNOSTU
BMOOSTY
EIRSTTW
ADISSTU
EIOORRT
ACEPPRS
CCELPUY
CDMOOST
BBFGINU
AADEJMR
CIMOTYZ
AEEGGNR
AABELRS
AAILNTY
GHINOPP
DEENPSX
CIKLORY
CCEORSS
DENORUW
AAHLRSW
AEFILNU
AEEKMPR
CDEHOPR
DGHILNO
AACINOR
EHILOPT
EELNSTT
DEFFNOR
BELMOOT
ACEIMST
DINOWWY
GILLNYY
ACEKLNS
------
AELRSSY
APRSTTU
ADDEIOR
EINNPRU
EGIRSZZ
EEEHLPS
ADIMSSU
CKLLMOU
AAKMRUZ
ADELMST
ABFILSU
AEGNNRU
DEEHORT
IKNNNOS
BBDILRY
EERSTTU
EHKNRSU
ACHILLO
AACELOV
DHIILMS
AEOQRTU
AABILLR
IKKOOPY
AA

#### Challenge simulator - how many unasked questions do we have after a certain time?

In [73]:
def simulate_unasked(num_days, num_qs, num_alphas):
    """Simulate daily challenges and return the ids that were never asked.

    Every day, ``num_qs`` distinct ids are drawn uniformly at random from
    ``range(num_alphas)`` and marked as asked. Each day's draw is independent
    of the previous days.

    Args:
        num_days: number of simulated days.
        num_qs: questions asked per day; clamped to the range 0..num_alphas.
        num_alphas: size of the question pool.

    Returns:
        The set of ids in ``range(num_alphas)`` that were never drawn.
    """
    unasked = set(range(num_alphas))
    per_day = max(0, min(num_qs, num_alphas))
    pool = range(num_alphas)
    for _ in range(num_days):
        # Sampling without replacement is equivalent to shuffling the whole
        # pool and taking the first per_day entries, but does far less work.
        unasked.difference_update(random.sample(pool, per_day))
    return unasked


num_days = 8 * 365   # We've been asking qs for roughly 8 years (since Jun 2011 -- update if changes)
num_qs = 50
num_alphas = 28029   # How many sevens or eights

alphas = simulate_unasked(num_days, num_qs, num_alphas)

print(len(alphas))

141
