In [1]:
import math
import pickle
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm

# Data pre-processing

In [2]:
# Collect the zhuyin spelling of every row in the answer file.
# Duplicates are kept here on purpose; a later cell reports and removes them.
answer_pool = list()

# The file contains Zhuyin/CJK text, so decode it explicitly instead of relying
# on the platform's default encoding (assumed to be UTF-8 -- confirm if the
# data files are regenerated with another encoding).
with open('data/answer_pool.txt', 'r', encoding='utf-8') as fa:
    for line in fa:
        # answer format:
        # "ㄏㄨㄢㄍㄨ", // 環顧
        if not line.strip():
            # A blank (e.g. trailing) line has no quoted field to extract.
            continue

        # The zhuyin spelling is the text between the first pair of double quotes.
        ans_zhuyin = line.split('"')[1]
        answer_pool.append(ans_zhuyin)

print(f'there are {len(answer_pool)} rows in the answer pool.')
answer_pool[:3]

there are 988 rows in the answer pool.


['„Ñè„Ñ®„Ñ¢„Ñç„Ñ®', '„Ñï„Ñ§„Ñí„Ñß„Ñ¢', '„Ññ„Ñ®„Ñê„Ñß„Ñ•']

In [3]:
# Build one record per guessable word from the allowed-words file.
allowed_words = list()

# Hash-based lookup: testing membership against the list would rescan the whole
# answer pool for every one of the allowed words.
answer_set = set(answer_pool)

# Decode explicitly; the file holds Zhuyin/CJK text (assumed UTF-8 -- confirm).
with open('data/allowed_words.txt', 'r', encoding='utf-8') as fw:
    for line in fw:
        # input format:
        # "ㄅㄚㄅㄟㄗ":"八輩子",
        # output format:
        # [_1, _2, _3, _4, _5, chinese, zhuyin, in_answers]

        # Splitting on '"' must yield exactly five pieces; the unpacking doubles
        # as a format check. Named throwaways are used instead of _, __, ___
        # because those names are IPython's output-history variables.
        _lead, zhuyin, _sep, word, _tail = line.split('"')

        in_answers = zhuyin in answer_set

        # One column per zhuyin symbol, then the word, its spelling and the flag.
        row = list(zhuyin) + [word, zhuyin, in_answers]
        allowed_words.append(row)

print(f'there are {len(allowed_words)} possible words.')
allowed_words[:3]

there are 21721 possible words.


[['„ÑÖ', '„Ñö', '„ÑÖ', '„Ñü', '„Ñó', 'ÂÖ´Ëº©Â≠ê', '„ÑÖ„Ñö„ÑÖ„Ñü„Ñó', False],
 ['„ÑÖ', '„Ñö', '„ÑÖ', '„Ñß', '„Ñ†', 'ÂÖ´Ë°®', '„ÑÖ„Ñö„ÑÖ„Ñß„Ñ†', False],
 ['„ÑÖ', '„Ñö', '„ÑÖ', '„Ñß', '„Ñ•', 'ÊääÊüÑ/ÂÖ´ÁóÖ', '„ÑÖ„Ñö„ÑÖ„Ñß„Ñ•', False]]

In [4]:
# Positional symbol columns _1.._5 followed by the descriptive columns,
# in the same order the rows of allowed_words were assembled.
columns = [f'_{pos}' for pos in range(1, 6)] + ['word', 'zhuyin', 'in_answers']
df = pd.DataFrame(data=allowed_words, columns=columns)
df

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers
0,„ÑÖ,„Ñö,„ÑÖ,„Ñü,„Ñó,ÂÖ´Ëº©Â≠ê,„ÑÖ„Ñö„ÑÖ„Ñü„Ñó,False
1,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ†,ÂÖ´Ë°®,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ†,False
2,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ•,ÊääÊüÑ/ÂÖ´ÁóÖ,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ•,False
3,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñó,ÊãîËÑØÂ≠ê,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñó,False
4,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñ¶,ÊãîËÑØÂÖí,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñ¶,False
...,...,...,...,...,...,...,...,...
21716,„Ñ©,„Ñ®,„Ñ£,„Ñè,„Ñ®,ÂÆáÊñáË≠∑,„Ñ©„Ñ®„Ñ£„Ñè„Ñ®,False
21717,„Ñ©,„Ñ®,„Ñ§,„Ñä,„Ñû,Á¶πÁéãËá∫,„Ñ©„Ñ®„Ñ§„Ñä„Ñû,False
21718,„Ñ©,„Ñ©,„Ñå,„Ñß,„Ñ§,Á¶πÈ§òÁ≥ß,„Ñ©„Ñ©„Ñå„Ñß„Ñ§,False
21719,„Ñ©,„Ñ©,„Ñí,„Ñß,„Ñ©,ÂñÅÂñÅÁ¥∞Ë™û,„Ñ©„Ñ©„Ñí„Ñß„Ñ©,False


In [5]:
# How many allowed words are / are not also present in the answer pool.
df.in_answers.value_counts()

False    20770
True       951
Name: in_answers, dtype: int64

In [6]:
# Compare the raw row count with the number of distinct spellings to expose
# duplicated rows in the answer file.
print(
    f'there are {len(answer_pool)} rows in answer_pool.txt',
    f'but there are only {len(set(answer_pool))} answers.',
    sep='\n',
)

there are 988 rows in answer_pool.txt
but there are only 951 answers.


In [7]:
# Drop duplicated answers while keeping first-seen file order.
# dict.fromkeys preserves insertion order, whereas list(set(...)) yields an
# order that changes between kernel runs (string hashing is randomized), which
# would make the candidate lists saved later non-reproducible.
answer_pool = list(dict.fromkeys(answer_pool))
len(answer_pool)

951

# Answers Counting over Patterns

In [8]:
# Per-symbol verdicts: Correct (green), Misplaced (yellow), Excluded (white)
spot_results = ('C', 'M', 'X')

# Enumerate every possible 5-symbol feedback pattern (3^5 of them), in the
# order itertools.product emits them: all-'C' first, all-'X' last.
patterns = [*itertools.product(spot_results, repeat=5)]

print(f'there are {len(patterns)} kinds of patterns.')
print(patterns[:3], patterns[-3:], sep='\n')

there are 243 kinds of patterns.
[('C', 'C', 'C', 'C', 'C'), ('C', 'C', 'C', 'C', 'M'), ('C', 'C', 'C', 'C', 'X')]
[('X', 'X', 'X', 'X', 'C'), ('X', 'X', 'X', 'X', 'M'), ('X', 'X', 'X', 'X', 'X')]


In [9]:
def matching(guess, answer):
    """
    Score a 5-symbol guess against a 5-symbol answer.

    Two passes are made over the five positions:
    1. every position where guess and answer agree is marked 'C' (correct)
       and that symbol of the answer is consumed;
    2. each remaining guess symbol is marked 'M' (misplaced) if an unconsumed
       copy of it is still available in the answer (which then consumes that
       copy), otherwise 'X' (excluded).

    Returns the five marks as a tuple, e.g. ('C', 'M', 'X', 'X', 'X').
    """
    marks = [None] * 5
    leftovers = list(answer)  # answer symbols not yet paired with a guess symbol

    # Pass 1: exact positional hits.
    for pos in range(5):
        if guess[pos] != answer[pos]:
            continue
        marks[pos] = 'C'
        leftovers.remove(answer[pos])

    # Pass 2: everything that was not an exact hit.
    for pos in range(5):
        if marks[pos] is not None:
            continue

        symbol = guess[pos]
        if symbol in leftovers:
            # Present elsewhere in the answer; consume one copy so repeated
            # symbols in the guess are not over-credited.
            leftovers.remove(symbol)
            marks[pos] = 'M'
        else:
            marks[pos] = 'X'

    return tuple(marks)

In [10]:
# Plain Python list of every allowed word's zhuyin spelling, in DataFrame row order.
words = df['zhuyin'].tolist()
len(words)

21721

In [11]:
# Answer-count-per-pattern table:
#   ACP[i, j] = number of answers that give feedback pattern j when words[i] is guessed.
# all_second_guess[word]['p<j+1>'] = the same answers, joined with ', '.

# Resolve a pattern tuple to its column with a dict lookup; calling
# patterns.index() inside the loop would linearly scan the list for every
# (word, answer) pair.
pattern_index = {pattern: idx for idx, pattern in enumerate(patterns)}

# uint16 counters are enough only while a single cell cannot exceed 65535.
assert len(answer_pool) <= np.iinfo(np.uint16).max, 'ACP counters would overflow uint16'

# Size the table from the data instead of hard-coding the current file sizes.
ACP = np.zeros((len(words), len(patterns)), dtype='uint16')
all_second_guess = dict()

for word_index, word in enumerate(tqdm(words)):
    # pattern key -> list of answers producing that pattern; keys appear in
    # order of first occurrence, answers in answer_pool order.
    answers_by_pattern = dict()

    for answer in answer_pool:
        pat_index = pattern_index[matching(word, answer)]
        ACP[word_index, pat_index] += 1
        answers_by_pattern.setdefault(f'p{pat_index+1}', []).append(answer)

    # Join once per pattern rather than growing a string answer by answer.
    word_second_guess = {
        key: ', '.join(candidates)
        for key, candidates in answers_by_pattern.items()
    }
    all_second_guess[word] = word_second_guess

ACP

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21721/21721 [02:27<00:00, 147.06it/s]


array([[  0,   0,   0, ...,   0,  31, 660],
       [  0,   0,   0, ...,   9,  12, 260],
       [  0,   0,   0, ...,  35,  45, 201],
       ...,
       [  0,   0,   0, ...,  21,  17, 217],
       [  0,   0,   0, ...,   8,   0, 282],
       [  0,   0,   0, ...,  31,   0, 324]], dtype=uint16)

In [12]:
# Inspect the pattern -> candidate-answers mapping recorded for the first word.
first_word = words[0]
all_second_guess[first_word]

{'p243': '„ÑÜ„Ñß„Ñ£„ÑÜ„Ñû, „Ñå„Ñ†„Ñâ„Ñ®„Ñ•, „Ñç„Ñ®„Ñ•„Ñç„Ñ®, „Ñí„Ñ©„Ñù„Ñç„Ñ†, „Ñï„Ñ°„Ñâ„Ñ®„Ñ¢, „Ñç„Ñú„Ñá„Ñß„Ñ•, „Ñâ„Ñ®„Ñ•„Ñß„Ñ†, „Ñï„Ñ®„Ñ£„Ñß„Ñ¢, „Ñç„Ñ®„Ñõ„Ñë„Ñ©, „ÑÜ„Ñß„Ñ•„Ñà„Ñ£, „Ñë„Ñß„Ñë„Ñß„Ñ°, „Ñè„Ñ®„Ñå„Ñ®„Ñ¢, „Ñä„Ñ°„Ñä„Ñ®„Ñ•, „Ñã„Ñß„Ñ°„Ññ„Ñ°, „Ñä„Ñ®„Ñ•„Ñè„Ñ†, „Ñê„Ñß„Ñù„Ñâ„Ñ†, „Ñï„Ñ£„Ñâ„Ñ®„Ñ¢, „Ñí„Ñß„Ñ§„Ñï„Ñ°, „Ñì„Ñ®„Ñ•„Ññ„Ñ£, „Ñå„Ñû„Ñå„Ñß„Ñ£, „Ññ„Ñ£„Ñä„Ñ®„Ñ•, „Ñß„Ñ†„Ñê„Ñß„Ñ£, „Ñê„Ñß„Ñù„Ñî„Ñ®, „Ñê„Ñß„Ñù„Ññ„Ñ®, „Ñå„Ñ®„Ñ£„Ñ®„Ñ£, „Ñå„Ñß„Ñá„Ñß„Ñ•, „Ñà„Ñ•„Ñê„Ñß„Ñ•, „Ñå„Ñß„Ñ°„Ñå„Ñ®, „Ñí„Ñß„Ñ£„Ñê„Ñß, „Ñê„Ñß„Ñ†„Ñß„Ñù, „Ñé„Ñú„Ñí„Ñ©„Ñù, „Ñé„Ñ®„Ñõ„Ñì„Ñ§, „Ñâ„Ñß„Ñ¢„Ñâ„Ñ•, „Ñè„Ñ®„Ñ£„Ñß„Ñ†, „Ñí„Ñß„Ñ¢„Ñë„Ñß, „Ñã„Ñ¢„Ñá„Ñß„Ñ£, „Ñê„Ñß„Ñ¢„Ññ„Ñ£, „Ñê„Ñß„Ñ•„Ñä„Ñ¢, „Ñì„Ñ£„Ñê„Ñß„Ñù, „Ñâ„Ñ§„Ñê„Ñß„Ñ£, „Ñå„Ñß„Ñå„Ñß„Ñ§, „Ñç„Ñ®„Ñ¢„Ñò„Ñû, „Ñß„Ñ•„Ñç„Ñ®„Ñ§, „Ñì„Ñ®„Ñ¢„Ñì„Ñ®, „Ñç„Ñ®„Ñõ„Ñç„Ñú, „Ñì„Ñ£„Ñô„Ñ®„Ñõ, „Ñä„Ñ®„Ñõ„Ñß„Ñ¢, „Ñå„Ñ®„Ñç„Ñ®„Ñõ, „Ñê„Ñß„Ñ¢„Ñå„Ñß, „Ñä„Ñû„Ñí„Ñ©„Ñù, „Ñà„Ñ§„Ñí„Ñ©„Ñù, „Ñã„Ñß„Ñ•„Ñ©„Ñ¢, „Ñê„Ñß„Ñë„Ñ©„Ñ¢, „Ñä„Ñ¢„Ñô„Ñ®„Ñõ, „Ñì„Ñ®„Ñ•„Ñï„Ñ•, „Ñí„Ñß„Ñ†„Ñí„Ñß, „Ñß„Ñ•„Ñí„Ñ©„Ñ•, „Ñç„Ñ®„Ñ¢„Ñ©„Ñ¢, „Ñê„

In [13]:
# Persist the word -> {pattern key: candidate answers} mapping for later reuse.
with open('data/next_guess_map.pickle', mode='wb') as pickle_file:
    pickle.dump(all_second_guess, pickle_file, pickle.HIGHEST_PROTOCOL)

In [14]:
# One column per feedback pattern, named p1..pN with the same 1-based
# numbering as the 'p{pat_index+1}' keys used in all_second_guess.
# N is taken from ACP itself so the labels always match its width.
columns_patterns = [f'p{i}' for i in range(1, ACP.shape[1] + 1)]
df_ACP = pd.DataFrame(ACP, columns=columns_patterns)

df_ACP

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p234,p235,p236,p237,p238,p239,p240,p241,p242,p243
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,7,67,0,31,660
1,0,0,0,0,0,0,0,0,0,0,...,0,22,16,195,7,28,217,9,12,260
2,0,0,0,0,0,0,0,0,0,0,...,0,58,22,153,30,52,170,35,45,201
3,0,0,0,0,0,0,0,0,0,0,...,15,0,11,137,0,18,218,0,9,348
4,0,0,0,0,0,0,0,0,0,0,...,16,1,0,147,0,0,236,0,0,357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,0,0,0,0,0,0,0,0,0,0,...,50,1,0,7,0,0,12,22,0,305
21717,0,0,0,0,0,0,0,0,0,0,...,41,2,0,8,2,0,24,11,17,281
21718,0,0,0,0,0,0,0,0,0,0,...,20,14,21,225,20,10,207,21,17,217
21719,0,0,0,0,0,0,0,0,0,0,...,0,0,0,206,7,0,205,8,0,282


In [15]:
# Place the per-pattern answer counts next to the word metadata, row by row.
df_patterns = pd.concat((df, df_ACP), axis='columns')
df_patterns

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers,p1,p2,...,p234,p235,p236,p237,p238,p239,p240,p241,p242,p243
0,„ÑÖ,„Ñö,„ÑÖ,„Ñü,„Ñó,ÂÖ´Ëº©Â≠ê,„ÑÖ„Ñö„ÑÖ„Ñü„Ñó,False,0,0,...,0,0,1,0,0,7,67,0,31,660
1,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ†,ÂÖ´Ë°®,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ†,False,0,0,...,0,22,16,195,7,28,217,9,12,260
2,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ•,ÊääÊüÑ/ÂÖ´ÁóÖ,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ•,False,0,0,...,0,58,22,153,30,52,170,35,45,201
3,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñó,ÊãîËÑØÂ≠ê,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñó,False,0,0,...,15,0,11,137,0,18,218,0,9,348
4,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñ¶,ÊãîËÑØÂÖí,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñ¶,False,0,0,...,16,1,0,147,0,0,236,0,0,357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,„Ñ©,„Ñ®,„Ñ£,„Ñè,„Ñ®,ÂÆáÊñáË≠∑,„Ñ©„Ñ®„Ñ£„Ñè„Ñ®,False,0,0,...,50,1,0,7,0,0,12,22,0,305
21717,„Ñ©,„Ñ®,„Ñ§,„Ñä,„Ñû,Á¶πÁéãËá∫,„Ñ©„Ñ®„Ñ§„Ñä„Ñû,False,0,0,...,41,2,0,8,2,0,24,11,17,281
21718,„Ñ©,„Ñ©,„Ñå,„Ñß,„Ñ§,Á¶πÈ§òÁ≥ß,„Ñ©„Ñ©„Ñå„Ñß„Ñ§,False,0,0,...,20,14,21,225,20,10,207,21,17,217
21719,„Ñ©,„Ñ©,„Ñí,„Ñß,„Ñ©,ÂñÅÂñÅÁ¥∞Ë™û,„Ñ©„Ñ©„Ñí„Ñß„Ñ©,False,0,0,...,0,0,0,206,7,0,205,8,0,282


# Entropy

In [16]:
def calculate_entropy(row):
    """
    Shannon entropy, in bits, of the distribution described by a row of counts.

    Each count d contributes p * log2(1/p) with p = d / sum(row); zero counts
    contribute nothing.

    Parameters
    ----------
    row : iterable of numbers
        Non-negative counts (e.g. one row of the ACP table).

    Returns
    -------
    float
        The entropy in bits; 0.0 when the row is empty or sums to zero.
    """
    # Convert to Python floats first: summing fixed-width NumPy integers
    # (such as uint16 cells) with the built-in sum can wrap around.
    counts = [float(d) for d in row]
    total = sum(counts)

    # No observations at all: define the entropy as 0 instead of dividing by zero.
    if total == 0:
        return 0.0

    sum_entropy = 0.0
    for count in counts:
        p = count / total
        # Skip p == 0: its contribution is 0 in the limit and log2(0) is undefined.
        if p != 0:
            # -p*log2(p) == p*log2(1/p); log2 is more accurate than log(x, 2).
            sum_entropy -= p * math.log2(p)

    return sum_entropy

In [17]:
# Entropy of each word's pattern distribution, one value per ACP row, in row order.
entropy_col = [calculate_entropy(pattern_counts) for pattern_counts in ACP]

print(len(entropy_col))
entropy_col[:3]

21721


[1.9239237420730468, 3.305919822967189, 3.7767565728012342]

In [18]:
# Append the per-word entropy as the last column (replaced if the cell is re-run).
df_patterns = df_patterns.assign(entropy=entropy_col)
df_patterns

Unnamed: 0,_1,_2,_3,_4,_5,word,zhuyin,in_answers,p1,p2,...,p235,p236,p237,p238,p239,p240,p241,p242,p243,entropy
0,„ÑÖ,„Ñö,„ÑÖ,„Ñü,„Ñó,ÂÖ´Ëº©Â≠ê,„ÑÖ„Ñö„ÑÖ„Ñü„Ñó,False,0,0,...,0,1,0,0,7,67,0,31,660,1.923924
1,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ†,ÂÖ´Ë°®,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ†,False,0,0,...,22,16,195,7,28,217,9,12,260,3.305920
2,„ÑÖ,„Ñö,„ÑÖ,„Ñß,„Ñ•,ÊääÊüÑ/ÂÖ´ÁóÖ,„ÑÖ„Ñö„ÑÖ„Ñß„Ñ•,False,0,0,...,58,22,153,30,52,170,35,45,201,3.776757
3,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñó,ÊãîËÑØÂ≠ê,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñó,False,0,0,...,0,11,137,0,18,218,0,9,348,3.047255
4,„ÑÖ,„Ñö,„ÑÜ,„Ñ®,„Ñ¶,ÊãîËÑØÂÖí,„ÑÖ„Ñö„ÑÜ„Ñ®„Ñ¶,False,0,0,...,1,0,147,0,0,236,0,0,357,2.778213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21716,„Ñ©,„Ñ®,„Ñ£,„Ñè,„Ñ®,ÂÆáÊñáË≠∑,„Ñ©„Ñ®„Ñ£„Ñè„Ñ®,False,0,0,...,1,0,7,0,0,12,22,0,305,3.703381
21717,„Ñ©,„Ñ®,„Ñ§,„Ñä,„Ñû,Á¶πÁéãËá∫,„Ñ©„Ñ®„Ñ§„Ñä„Ñû,False,0,0,...,2,0,8,2,0,24,11,17,281,3.557311
21718,„Ñ©,„Ñ©,„Ñå,„Ñß,„Ñ§,Á¶πÈ§òÁ≥ß,„Ñ©„Ñ©„Ñå„Ñß„Ñ§,False,0,0,...,14,21,225,20,10,207,21,17,217,3.376844
21719,„Ñ©,„Ñ©,„Ñí,„Ñß,„Ñ©,ÂñÅÂñÅÁ¥∞Ë™û,„Ñ©„Ñ©„Ñí„Ñß„Ñ©,False,0,0,...,0,0,206,7,0,205,8,0,282,2.886894


In [19]:
# Save the combined word / pattern-count / entropy table for later reuse.
df_patterns.to_pickle(path='data/df_all_words_entropy.pickle')