# 克漏字干擾選項生成

## 測試資料

In [2]:
text1 = "To make Jane live a [MASK] life, Mother was very careful about spending money. [SEP] happy"
# "happy" -> "poor", "busy", "sad"
text2 = "[MASK] , Jane didn't understand her. [SEP] However"
# "However" -> "Though", "Although", "Or"
text3 = "Every day Mother was busy with her [MASK] while Jane was studying at school, so they had little time to enjoy themselves. [SEP] work"
# "work" -> "writing", "housework", "research"
text4 = "One day, Mother realized Jane was unhappy and even [MASK] to her. [SEP] unfriendly"
# "unfriendly" -> "loyal", "kind", "cruel"
text5 = "The wait seemed [MASK] . [SEP] endless"
# "endless" -> "meaningless", "useless", "careless"
text6 = "The old man was waiting for a ride across the [MASK] . [SEP] river"
# "river" -> "town", "country", "island"
text7 = "I felt uncomfortable and out of place as the professor carefully [MASK] what she expected us to learn. [SEP] explained"
# "explained" -> "showed", "designed", "offered"
text8 = "As I listened, I couldn't help but [MASK] of my own oldest daughter. [SEP] think"
# "think" -> "speak", "talk", "hear"
text9 = "As I listened, I couldn't help but think of my own oldest daughter who was now beginning her first year in [MASK], just like me. [SEP] college"
# "college" -> "school", "office", "classroom"
text10 = "Today morning, I saw a [MASK] sitting on the wall. [SEP] cat"
text11 = "Ukrainian presidential adviser says situation is ' [MASK] control' in suburbs and outskirts of Kyiv. [SEP] under"
text12 = "I don't think that after what is [MASK] now, Ukraine has weak positions. [SEP] happening"

## Candidate Set Generator (CSG)

使用BERT預訓練模型，生成CS。

### 引用相關套件

In [3]:
from tqdm.notebook import tqdm
import os

### 載入CSG模型

* BERT_model: BERT + 3 distractors

In [4]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, pipeline

dir_path = r"./models/CSG/BERT_model"

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained(os.path.join(dir_path, "config.json"))
model = BertForMaskedLM.from_pretrained(os.path.join(dir_path, "pytorch_model.bin"), from_tf=bool('.ckpt' in 'bert-base-uncased'), config=config)
model.eval()



BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [5]:
my_unmasker = pipeline('fill-mask', tokenizer=tokenizer, config=config, model=model, top_k=10)

In [6]:
my_unmasker(text1)

[{'score': 0.17808528244495392,
  'token': 6517,
  'token_str': 's a d',
  'sequence': 'to make jane live a sad life, mother was very careful about spending money. happy'},
 {'score': 0.05460711568593979,
  'token': 11771,
  'token_str': 'b o r i n g',
  'sequence': 'to make jane live a boring life, mother was very careful about spending money. happy'},
 {'score': 0.04597169905900955,
  'token': 5697,
  'token_str': 'b u s y',
  'sequence': 'to make jane live a busy life, mother was very careful about spending money. happy'},
 {'score': 0.02937108650803566,
  'token': 3407,
  'token_str': 'h a p p y',
  'sequence': 'to make jane live a happy life, mother was very careful about spending money. happy'},
 {'score': 0.019477127119898796,
  'token': 2919,
  'token_str': 'b a d',
  'sequence': 'to make jane live a bad life, mother was very careful about spending money. happy'},
 {'score': 0.01932520419359207,
  'token': 4138,
  'token_str': 'r i c h',
  'sequence': 'to make jane live a rich 

## Candidate Set (CS)

* cs = 候選集合(單字&分數)
* target_sent = 目標句子
* origin_sent = 原始句子
* answer = 答案

In [53]:
target_sent = text3
print(target_sent)

Every day Mother was busy with her [MASK] while Jane was studying at school, so they had little time to enjoy themselves. [SEP] work


In [54]:
origin_sent = target_sent.split(" [SEP] ")[0]
print(origin_sent)

Every day Mother was busy with her [MASK] while Jane was studying at school, so they had little time to enjoy themselves.


In [55]:
answer = target_sent.split(" [SEP] ")[1]
print(answer)

work


In [56]:
cs = list()

for cand in my_unmasker(target_sent):
    cs.append({"word": cand["token_str"].replace(" ", ""), "s0": cand["score"], "s1": 0.0, "s2": 0.0, "s3": 0.0})

print(cs)

[{'word': 'study', 's0': 0.12100302428007126, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'homework', 's0': 0.05059974268078804, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'work', 's0': 0.043910592794418335, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'shopping', 's0': 0.042725447565317154, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'school', 's0': 0.025501582771539688, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'holiday', 's0': 0.02389630302786827, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'class', 's0': 0.02138116955757141, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'play', 's0': 0.020853979513049126, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'job', 's0': 0.018741538748145103, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'exercise', 's0': 0.018560459837317467, 's1': 0.0, 's2': 0.0, 's3': 0.0}]


## Distractor Selector (DS)

四種評分標準：

1. 單字相似度：和答案越接近，分數越低
2. 句子相似度：和答案的句子越接近，分數越低
3. 詞性相似度：和答案詞性相同，分數越高
4. 型態相似度(skip!)：字串長度相同、字首字尾相同，分數越高

### 載入fastText模型

In [40]:
import fasttext
import warnings

model_path = r"./models/DS/fasttext_model/wiki_en_ft_model01.bin"

model = fasttext.load_model(model_path)
warnings.filterwarnings("ignore", category=FutureWarning)



### 兩向量距離公式(cosine similarity)

In [14]:
import numpy as np

def similarity(v1, v2):
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    return np.dot(v1, v2) / n1 / n2

### 資料標準化

#### Min-Max

In [15]:
# Min-Max 歸一化
def min_max_y(raw_data):
  # 裝進標準化後的新串列
  min_max_data = []
  
  # 進行Min-Max標準化
  for d in raw_data:
    min_max_data.append((d - min(raw_data)) / (max(raw_data) - min(raw_data)))
             
  # 回傳結果
  return min_max_data

#### Z-Score

In [35]:
import math

# Z-Score標準化
def z_score_y(raw_data):
  print(raw_data)
  # 計算數據平均數
  total = 0
  for d in raw_data:
    total += d
  aver = total/len(raw_data)
  
  # 計算所有數據減掉平均數的平方相加
  data_s = 0
  for s in raw_data:
    data_s += (s - aver)**2
  
  # 計算剛剛的結果除以數據總數開根號即為標準差
  std = math.sqrt(data_s/len(raw_data))
  
  # 將每個數據減掉平均數除以標準差成為新的數據串列
  # 裝新的標準化後的數據串列
  z_score_data = []
  for z in raw_data:
    z_score_data.append((z - aver)/std)
  
  # 回傳結果
  return z_score_data

### 0.模型信心分數

In [57]:
s0s = [c["s0"] for c in cs]
new_s0s = min_max_y(s0s)

for i, c in enumerate(cs):
    print(c["word"], new_s0s[i])
    c["s0"] = new_s0s[i]

study 1.0
homework 0.3127536197258566
work 0.24745703209398712
shopping 0.23588815703167398
school 0.06775623952776982
holiday 0.052086193073900854
class 0.027534548120671
play 0.02238834695526689
job 0.0017676139972933411
exercise 0.0


### 1.單字相似度

In [58]:
print(answer)
print("-"*100)
answer_vector = model.get_word_vector(answer)

word_similarities = list()
for c in cs:
  c_vector = model.get_word_vector(c["word"])
  word_similarity = similarity(answer_vector, c_vector)
  word_similarities.append(word_similarity)

new_similarities = min_max_y(word_similarities)

for i, c in enumerate(cs):
    print(c["word"], 1-word_similarities[i], 1-new_similarities[i])
    c["s2"] = 1-new_similarities[i]

work
----------------------------------------------------------------------------------------------------
study 0.3014785647392273 0.434622585773468
homework 0.343461275100708 0.49514639377593994
work 5.960464477539063e-08 0.0
shopping 0.6602155566215515 0.9517910294234753
school 0.6469024419784546 0.932598352432251
holiday 0.6936559975147247 1.0
class 0.6923563182353973 0.9981263345107436
play 0.5733073353767395 0.8265009373426437
job 0.4977574944496155 0.7175855040550232
exercise 0.5173404812812805 0.7458170652389526


### 2.句子相似度

* correct_sent = 填入正確答案句子
* cand_sents = 填入候選詞句子

In [59]:
#依據訓練過後的BERT所生成選項放入句子做比較
correct_sent = origin_sent.replace('[MASK]', answer)
print(correct_sent)
print("-"*100)
correct_sent_vector = model.get_sentence_vector(correct_sent)

cand_sents = list()
for c in cs:
  cand_sents.append(origin_sent.replace('[MASK]', c["word"]))

sent_similarities = list()
#兩句子距離
for cand_sent in cand_sents:
  cand_sent_vector = model.get_sentence_vector(cand_sent)
  sent_similarity = similarity(correct_sent_vector, cand_sent_vector)
  sent_similarities.append(sent_similarity)

new_similarities = min_max_y(sent_similarities)
for i, c in enumerate(cs):
    print(cand_sents[i], 1-sent_similarities[i], 1-new_similarities[i])
    c["s2"] = 1-new_similarities[i]

Every day Mother was busy with her work while Jane was studying at school, so they had little time to enjoy themselves.
----------------------------------------------------------------------------------------------------
Every day Mother was busy with her study while Jane was studying at school, so they had little time to enjoy themselves. 0.0013530254364013672 0.4182635545730591
Every day Mother was busy with her homework while Jane was studying at school, so they had little time to enjoy themselves. 0.0015887618064880371 0.49113720655441284
Every day Mother was busy with her work while Jane was studying at school, so they had little time to enjoy themselves. 0.0 0.0
Every day Mother was busy with her shopping while Jane was studying at school, so they had little time to enjoy themselves. 0.0030781030654907227 0.9515403881669044
Every day Mother was busy with her school while Jane was studying at school, so they had little time to enjoy themselves. 0.0029854178428649902 0.922888413071

### 3.詞性相似度

In [60]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andy1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andy1\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [61]:
origin_token = word_tokenize(origin_sent)
origin_token.remove("[")
origin_token.remove("]")
# print(origin_token)

mask_index = origin_token.index("MASK")
# print(mask_index)

correct_token = word_tokenize(correct_sent)
# print(correct_token)
correct_pos = nltk.pos_tag(correct_token)
# print(correct_pos)

answer_pos = correct_pos[mask_index]
print(answer_pos)
print("-"*100)

for i, c in enumerate(cs):
    cand_sent_token = word_tokenize(cand_sents[i])
    cand_sent_pos = nltk.pos_tag(cand_sent_token)
    cand_pos = cand_sent_pos[mask_index]

    if cand_pos[1] == answer_pos[1]:
        c["s3"] = 1.0
    else:
        c["s3"] = 0.0
    
    print(cand_pos, c["s3"])

('work', 'NN')
----------------------------------------------------------------------------------------------------
('study', 'NN') 1.0
('homework', 'NN') 1.0
('work', 'NN') 1.0
('shopping', 'NN') 1.0
('school', 'NN') 1.0
('holiday', 'NN') 1.0
('class', 'NN') 1.0
('play', 'NN') 1.0
('job', 'NN') 1.0
('exercise', 'NN') 1.0


### 4.型態相似度(skip!)

In [None]:
ans_len = len(answer)
for c in cs:
    dis_len = len(c)
    diff = max(1 - (abs(dis_len-ans_len) / ans_len), 0)
    print(c, diff)

sad 0.6
boring 0.8
busy 0.8
happy 1.0
bad 0.6
rich 0.8
lazy 0.8
normal 0.8
difficult 0.19999999999999996
strange 0.6


## Ranker

In [62]:
cs

[{'word': 'study', 's0': 1.0, 's1': 0.0, 's2': 0.4182635545730591, 's3': 1.0},
 {'word': 'homework',
  's0': 0.3127536197258566,
  's1': 0.0,
  's2': 0.49113720655441284,
  's3': 1.0},
 {'word': 'work', 's0': 0.24745703209398712, 's1': 0.0, 's2': 0.0, 's3': 1.0},
 {'word': 'shopping',
  's0': 0.23588815703167398,
  's1': 0.0,
  's2': 0.9515403881669044,
  's3': 1.0},
 {'word': 'school',
  's0': 0.06775623952776982,
  's1': 0.0,
  's2': 0.9228884130716324,
  's3': 1.0},
 {'word': 'holiday',
  's0': 0.052086193073900854,
  's1': 0.0,
  's2': 1.0,
  's3': 1.0},
 {'word': 'class',
  's0': 0.027534548120671,
  's1': 0.0,
  's2': 0.9836011193692684,
  's3': 1.0},
 {'word': 'play',
  's0': 0.02238834695526689,
  's1': 0.0,
  's2': 0.8199071288108826,
  's3': 1.0},
 {'word': 'job',
  's0': 0.0017676139972933411,
  's1': 0.0,
  's2': 0.7108453810214996,
  's3': 1.0},
 {'word': 'exercise',
  's0': 0.0,
  's1': 0.0,
  's2': 0.7445828318595886,
  's3': 1.0}]

### 加上權重

In [63]:
cs_fs = list()
for c in cs:
    fs = 0.25*c["s0"] + 0.25*c["s1"] + 0.25*c["s2"] + 0.25*c["s3"]
    cs_fs.append((c["word"], fs))

cs_fs

[('study', 0.6045658886432648),
 ('homework', 0.4509727065700674),
 ('work', 0.31186425802349677),
 ('shopping', 0.5468571362996446),
 ('school', 0.49766116314985054),
 ('holiday', 0.5130215482684752),
 ('class', 0.5027839168724848),
 ('play', 0.4605738689415374),
 ('job', 0.42815324875469823),
 ('exercise', 0.43614570796489716)]

### 排序

In [64]:
cs_fs.sort(key = lambda x: x[1], reverse=True)
cs_fs

[('study', 0.6045658886432648),
 ('shopping', 0.5468571362996446),
 ('holiday', 0.5130215482684752),
 ('class', 0.5027839168724848),
 ('school', 0.49766116314985054),
 ('play', 0.4605738689415374),
 ('homework', 0.4509727065700674),
 ('exercise', 0.43614570796489716),
 ('job', 0.42815324875469823),
 ('work', 0.31186425802349677)]

## Distractors

### Top 3

In [65]:
distractors = cs_fs[:3]
distractors

[('study', 0.6045658886432648),
 ('shopping', 0.5468571362996446),
 ('holiday', 0.5130215482684752)]