In [1]:
import math
import pickle
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm

# Data pre-processing

In [2]:
answer_pool = list()

with open('data/answer_pool.txt', 'r') as fa:
    for line in fa:
        # answer format:
        # "ㄏㄨㄢㄍㄨ", // 環顧

        ans_zhuyin = line.split('"')[1]
        answer_pool.append(ans_zhuyin)

print(f'there are {len(answer_pool)} rows in the answer pool.')
answer_pool[:3]

there are 988 rows in the answer pool.


['ㄏㄨㄢㄍㄨ', 'ㄕㄤㄒㄧㄢ', 'ㄖㄨㄐㄧㄥ']

In [3]:
allowed_words = list()

with open('data/allowed_words.txt', 'r') as fw:
    for line in fw:
        # input format:
        # "ㄅㄚㄅㄟㄗ":"八輩子",
        # output format:
        # [_1, _2, _3, _4, _5, chinese, zhuyin, in_answers]

        row = list()
        _, zhuyin, __, word, ___ = line.split('"')
        
        in_answers = True if zhuyin in answer_pool else False
        row += [c for c in zhuyin] + [word] + [zhuyin] + [in_answers]

        allowed_words.append(row)

print(f'there are {len(allowed_words)} possible words.')
allowed_words[:3]

there are 21721 possible words.


[['ㄅ', 'ㄚ', 'ㄅ', 'ㄟ', 'ㄗ', '八輩子', 'ㄅㄚㄅㄟㄗ', False],
 ['ㄅ', 'ㄚ', 'ㄅ', 'ㄧ', 'ㄠ', '八表', 'ㄅㄚㄅㄧㄠ', False],
 ['ㄅ', 'ㄚ', 'ㄅ', 'ㄧ', 'ㄥ', '把柄/八病', 'ㄅㄚㄅㄧㄥ', False]]

In [4]:
columns = ['_1', '_2', '_3', '_4', '_5', 'word', 'zhuyin', 'in_answers']
df = pd.DataFrame(allowed_words, columns=columns)
df

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers
0,ㄅ,ㄚ,ㄅ,ㄟ,ㄗ,八輩子,ㄅㄚㄅㄟㄗ,False
1,ㄅ,ㄚ,ㄅ,ㄧ,ㄠ,八表,ㄅㄚㄅㄧㄠ,False
2,ㄅ,ㄚ,ㄅ,ㄧ,ㄥ,把柄/八病,ㄅㄚㄅㄧㄥ,False
3,ㄅ,ㄚ,ㄆ,ㄨ,ㄗ,拔脯子,ㄅㄚㄆㄨㄗ,False
4,ㄅ,ㄚ,ㄆ,ㄨ,ㄦ,拔脯兒,ㄅㄚㄆㄨㄦ,False
...,...,...,...,...,...,...,...,...
21716,ㄩ,ㄨ,ㄣ,ㄏ,ㄨ,宇文護,ㄩㄨㄣㄏㄨ,False
21717,ㄩ,ㄨ,ㄤ,ㄊ,ㄞ,禹王臺,ㄩㄨㄤㄊㄞ,False
21718,ㄩ,ㄩ,ㄌ,ㄧ,ㄤ,禹餘糧,ㄩㄩㄌㄧㄤ,False
21719,ㄩ,ㄩ,ㄒ,ㄧ,ㄩ,喁喁細語,ㄩㄩㄒㄧㄩ,False


In [5]:
df['in_answers'].value_counts()

False    20770
True       951
Name: in_answers, dtype: int64

In [6]:
print(f'there are {len(answer_pool)} rows in answer_pool.txt')
print(f'but there are only {len(set(answer_pool))} answers.')

there are 988 rows in answer_pool.txt
but there are only 951 answers.


In [7]:
answer_pool = list(set(answer_pool))
len(answer_pool)

951

# Answers Counting over Patterns

In [8]:
spot_results = ('C', 'M', 'X')  # Correct🟩, Misplaced🟨, Excluded⬜️

# generate a list of all 3^5 patterns
patterns = list(itertools.product(spot_results, repeat=5))

print(f'there are {len(patterns)} kinds of patterns.')
print(patterns[:3])
print(patterns[-3:])

there are 243 kinds of patterns.
[('C', 'C', 'C', 'C', 'C'), ('C', 'C', 'C', 'C', 'M'), ('C', 'C', 'C', 'C', 'X')]
[('X', 'X', 'X', 'X', 'C'), ('X', 'X', 'X', 'X', 'M'), ('X', 'X', 'X', 'X', 'X')]


In [9]:
def matching(guess, answer):
    """
    matching mechanism:
    1. finding CORRECT spots
    2. search the not-correct letters, in the not-correct spots of the answer:
       1) if exist, the letter it is MISPLACED
       2) if the letter does not exist in the not-correct spots of the answer,
          the letter is EXCLUDED
    """
    
    not_used = [c for c in answer]
    matching_result = ['' for _ in range(5)]

    # CORRECT 🟩
    for i in range(5):
        if guess[i] == answer[i]:
            matching_result[i] = 'C'
            not_used.remove(guess[i])

    for i in range(5):
        if matching_result[i] == '':
            # MISPLACED 🟨
            if guess[i] in not_used:
                matching_result[i] = 'M'
                not_used.remove(guess[i])

            # EXCLUDED ⬜️
            else:
                matching_result[i] = 'X'

    return tuple(matching_result)

In [10]:
words = list(df['zhuyin'])
len(words)

21721

In [11]:
ACP = np.zeros((21721, 243), dtype='uint16')
all_second_guess = dict()

with tqdm(total=21721) as pbar:
    for word_index, word in enumerate(words):
        pbar.update(1)
        
        word_second_guess = dict()
        
        for answer in answer_pool:
            match_res = matching(word, answer)
            pat_index = patterns.index(match_res)
            ACP[word_index, pat_index] += 1
            
            if f'p{pat_index+1}' in word_second_guess:
                word_second_guess[f'p{pat_index+1}'] += f', {answer}'
            else:
                word_second_guess[f'p{pat_index+1}'] = answer

        all_second_guess[word] = word_second_guess

ACP

100%|████████████████████████████████████| 21721/21721 [02:27<00:00, 147.06it/s]


array([[  0,   0,   0, ...,   0,  31, 660],
       [  0,   0,   0, ...,   9,  12, 260],
       [  0,   0,   0, ...,  35,  45, 201],
       ...,
       [  0,   0,   0, ...,  21,  17, 217],
       [  0,   0,   0, ...,   8,   0, 282],
       [  0,   0,   0, ...,  31,   0, 324]], dtype=uint16)

In [12]:
all_second_guess[words[0]]

{'p243': 'ㄆㄧㄣㄆㄞ, ㄌㄠㄉㄨㄥ, ㄍㄨㄥㄍㄨ, ㄒㄩㄝㄍㄠ, ㄕㄡㄉㄨㄢ, ㄍㄜㄇㄧㄥ, ㄉㄨㄥㄧㄠ, ㄕㄨㄣㄧㄢ, ㄍㄨㄛㄑㄩ, ㄆㄧㄥㄈㄣ, ㄑㄧㄑㄧㄡ, ㄏㄨㄌㄨㄢ, ㄊㄡㄊㄨㄥ, ㄋㄧㄡㄖㄡ, ㄊㄨㄥㄏㄠ, ㄐㄧㄝㄉㄠ, ㄕㄣㄉㄨㄢ, ㄒㄧㄤㄕㄡ, ㄓㄨㄥㄖㄣ, ㄌㄞㄌㄧㄣ, ㄖㄣㄊㄨㄥ, ㄧㄠㄐㄧㄣ, ㄐㄧㄝㄔㄨ, ㄐㄧㄝㄖㄨ, ㄌㄨㄣㄨㄣ, ㄌㄧㄇㄧㄥ, ㄈㄥㄐㄧㄥ, ㄌㄧㄡㄌㄨ, ㄒㄧㄣㄐㄧ, ㄐㄧㄠㄧㄝ, ㄎㄜㄒㄩㄝ, ㄎㄨㄛㄓㄤ, ㄉㄧㄢㄉㄥ, ㄏㄨㄣㄧㄠ, ㄒㄧㄢㄑㄧ, ㄋㄢㄇㄧㄣ, ㄐㄧㄢㄖㄣ, ㄐㄧㄥㄊㄢ, ㄓㄣㄐㄧㄝ, ㄉㄤㄐㄧㄣ, ㄌㄧㄌㄧㄤ, ㄍㄨㄢㄘㄞ, ㄧㄥㄍㄨㄤ, ㄓㄨㄢㄓㄨ, ㄍㄨㄛㄍㄜ, ㄓㄣㄙㄨㄛ, ㄊㄨㄛㄧㄢ, ㄌㄨㄍㄨㄛ, ㄐㄧㄢㄌㄧ, ㄊㄞㄒㄩㄝ, ㄈㄤㄒㄩㄝ, ㄋㄧㄥㄩㄢ, ㄐㄧㄑㄩㄢ, ㄊㄢㄙㄨㄛ, ㄓㄨㄥㄕㄥ, ㄒㄧㄠㄒㄧ, ㄧㄥㄒㄩㄥ, ㄍㄨㄢㄩㄢ, ㄐㄧㄠㄉㄞ, ㄍㄨㄥㄌㄧ, ㄔㄨㄉㄨㄥ, ㄔㄨㄓㄨㄥ, ㄋㄧㄡㄋㄞ, ㄒㄧㄢㄓㄨ, ㄍㄨㄥㄨㄣ, ㄨㄞㄍㄨㄥ, ㄓㄢㄒㄧㄢ, ㄖㄨㄥㄒㄩ, ㄨㄤㄒㄧㄤ, ㄖㄨㄐㄧㄥ, ㄐㄧㄢㄒㄧ, ㄌㄧㄏㄨㄣ, ㄉㄨㄥㄊㄞ, ㄈㄣㄉㄧㄢ, ㄐㄧㄣㄓㄠ, ㄋㄢㄇㄧㄢ, ㄒㄧㄥㄈㄣ, ㄊㄢㄒㄧㄥ, ㄓㄨㄥㄍㄨ, ㄓㄨㄥㄔㄢ, ㄨㄢㄑㄩㄢ, ㄇㄨㄑㄧㄢ, ㄏㄡㄇㄧㄢ, ㄌㄨㄥㄇㄣ, ㄒㄧㄠㄈㄥ, ㄍㄨㄋㄧㄤ, ㄍㄨㄥㄎㄜ, ㄏㄠㄒㄧㄠ, ㄙㄨㄛㄧㄡ, ㄎㄜㄌㄧㄢ, ㄕㄤㄒㄧㄢ, ㄐㄩㄝㄉㄜ, ㄈㄥㄎㄨㄤ, ㄉㄨㄥㄖㄣ, ㄐㄧㄝㄕㄨ, ㄇㄞㄌㄨㄛ, ㄒㄧㄥㄨㄣ, ㄒㄧㄠㄋㄩ, ㄏㄠㄊㄧㄥ, ㄊㄜㄒㄧㄥ, ㄒㄧㄣㄧㄤ, ㄔㄠㄍㄨㄛ, ㄓㄨㄧㄌㄧ, ㄒㄧㄣㄕㄥ, ㄐㄧㄢㄕㄜ, ㄖㄨㄊㄨㄥ, ㄌㄧㄢㄖㄣ, ㄏㄠㄒㄧㄤ, ㄐㄧㄐㄧㄠ, ㄐㄧㄠㄐㄧ, ㄨㄣㄐㄧㄥ, ㄒㄩㄢㄧㄤ, ㄍㄨㄥㄋㄥ, ㄐㄧㄠㄊㄤ, ㄉㄧㄢㄩㄢ, ㄒㄧㄣㄎㄨ, ㄒㄧㄥㄧㄝ, ㄐㄧㄤㄧㄥ, ㄊㄢㄌㄨㄣ, ㄑㄧㄌㄧㄥ, ㄇㄧㄣㄕㄥ, ㄎㄨㄥㄐㄩ, ㄒㄧㄣㄎㄡ, ㄓㄨㄢㄕㄣ, ㄊㄜㄉㄧㄥ, ㄐㄧㄝㄓㄜ, ㄒㄧㄠㄌㄩ, ㄉㄨㄥㄒㄧ, ㄕㄡㄒㄧㄢ, ㄕㄨㄣㄌㄧ, ㄎㄣㄉㄧㄥ, ㄆㄧㄥㄉㄢ, ㄊㄢㄐㄧㄡ, ㄒㄧㄤㄧㄡ, ㄏㄨㄤㄇㄤ, ㄒㄩㄝㄒㄧ, ㄍㄨㄛㄕㄣ, ㄔㄥㄍㄨㄥ, ㄍㄨㄛㄐㄧ, ㄑㄧㄥㄒㄩ, ㄑㄩㄒㄧㄢ, ㄒㄧㄠ

In [13]:
with open('data/next_guess_map.pickle', 'wb') as po:
    pickle.dump(all_second_guess, po, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
columns_patterns = ['p'+str(i) for i in range(1, 244)]
df_ACP = pd.DataFrame(ACP, columns=columns_patterns)

df_ACP

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p234,p235,p236,p237,p238,p239,p240,p241,p242,p243
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,7,67,0,31,660
1,0,0,0,0,0,0,0,0,0,0,...,0,22,16,195,7,28,217,9,12,260
2,0,0,0,0,0,0,0,0,0,0,...,0,58,22,153,30,52,170,35,45,201
3,0,0,0,0,0,0,0,0,0,0,...,15,0,11,137,0,18,218,0,9,348
4,0,0,0,0,0,0,0,0,0,0,...,16,1,0,147,0,0,236,0,0,357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,0,0,0,0,0,0,0,0,0,0,...,50,1,0,7,0,0,12,22,0,305
21717,0,0,0,0,0,0,0,0,0,0,...,41,2,0,8,2,0,24,11,17,281
21718,0,0,0,0,0,0,0,0,0,0,...,20,14,21,225,20,10,207,21,17,217
21719,0,0,0,0,0,0,0,0,0,0,...,0,0,0,206,7,0,205,8,0,282


In [15]:
df_patterns = pd.concat([df, df_ACP], axis=1)
df_patterns

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers,p1,p2,...,p234,p235,p236,p237,p238,p239,p240,p241,p242,p243
0,ㄅ,ㄚ,ㄅ,ㄟ,ㄗ,八輩子,ㄅㄚㄅㄟㄗ,False,0,0,...,0,0,1,0,0,7,67,0,31,660
1,ㄅ,ㄚ,ㄅ,ㄧ,ㄠ,八表,ㄅㄚㄅㄧㄠ,False,0,0,...,0,22,16,195,7,28,217,9,12,260
2,ㄅ,ㄚ,ㄅ,ㄧ,ㄥ,把柄/八病,ㄅㄚㄅㄧㄥ,False,0,0,...,0,58,22,153,30,52,170,35,45,201
3,ㄅ,ㄚ,ㄆ,ㄨ,ㄗ,拔脯子,ㄅㄚㄆㄨㄗ,False,0,0,...,15,0,11,137,0,18,218,0,9,348
4,ㄅ,ㄚ,ㄆ,ㄨ,ㄦ,拔脯兒,ㄅㄚㄆㄨㄦ,False,0,0,...,16,1,0,147,0,0,236,0,0,357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,ㄩ,ㄨ,ㄣ,ㄏ,ㄨ,宇文護,ㄩㄨㄣㄏㄨ,False,0,0,...,50,1,0,7,0,0,12,22,0,305
21717,ㄩ,ㄨ,ㄤ,ㄊ,ㄞ,禹王臺,ㄩㄨㄤㄊㄞ,False,0,0,...,41,2,0,8,2,0,24,11,17,281
21718,ㄩ,ㄩ,ㄌ,ㄧ,ㄤ,禹餘糧,ㄩㄩㄌㄧㄤ,False,0,0,...,20,14,21,225,20,10,207,21,17,217
21719,ㄩ,ㄩ,ㄒ,ㄧ,ㄩ,喁喁細語,ㄩㄩㄒㄧㄩ,False,0,0,...,0,0,0,206,7,0,205,8,0,282


# Entropy

In [16]:
def calculate_entropy(row):
    sum_entropy = 0
    total = sum(row)
    
    for d in row:
        p = d/total
        # calculate entropy only if p>0 to prevent ZeroDivisionError
        if p != 0:
            entropy = (p * math.log(1/p, 2))
            sum_entropy += entropy
        
    return sum_entropy

In [17]:
entropy_col = list()
for row in ACP:
    entropy_col.append(calculate_entropy(row))

print(len(entropy_col))
entropy_col[:3]

21721


[1.9239237420730468, 3.305919822967189, 3.7767565728012342]

In [18]:
df_patterns['entropy'] = entropy_col
df_patterns

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers,p1,p2,...,p235,p236,p237,p238,p239,p240,p241,p242,p243,entropy
0,ㄅ,ㄚ,ㄅ,ㄟ,ㄗ,八輩子,ㄅㄚㄅㄟㄗ,False,0,0,...,0,1,0,0,7,67,0,31,660,1.923924
1,ㄅ,ㄚ,ㄅ,ㄧ,ㄠ,八表,ㄅㄚㄅㄧㄠ,False,0,0,...,22,16,195,7,28,217,9,12,260,3.305920
2,ㄅ,ㄚ,ㄅ,ㄧ,ㄥ,把柄/八病,ㄅㄚㄅㄧㄥ,False,0,0,...,58,22,153,30,52,170,35,45,201,3.776757
3,ㄅ,ㄚ,ㄆ,ㄨ,ㄗ,拔脯子,ㄅㄚㄆㄨㄗ,False,0,0,...,0,11,137,0,18,218,0,9,348,3.047255
4,ㄅ,ㄚ,ㄆ,ㄨ,ㄦ,拔脯兒,ㄅㄚㄆㄨㄦ,False,0,0,...,1,0,147,0,0,236,0,0,357,2.778213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,ㄩ,ㄨ,ㄣ,ㄏ,ㄨ,宇文護,ㄩㄨㄣㄏㄨ,False,0,0,...,1,0,7,0,0,12,22,0,305,3.703381
21717,ㄩ,ㄨ,ㄤ,ㄊ,ㄞ,禹王臺,ㄩㄨㄤㄊㄞ,False,0,0,...,2,0,8,2,0,24,11,17,281,3.557311
21718,ㄩ,ㄩ,ㄌ,ㄧ,ㄤ,禹餘糧,ㄩㄩㄌㄧㄤ,False,0,0,...,14,21,225,20,10,207,21,17,217,3.376844
21719,ㄩ,ㄩ,ㄒ,ㄧ,ㄩ,喁喁細語,ㄩㄩㄒㄧㄩ,False,0,0,...,0,0,206,7,0,205,8,0,282,2.886894


In [19]:
df_patterns.to_pickle('data/df_all_words_entropy.pickle')