# CDGP demo

## Testing data

* q1 ~ q10: From CLOTH dataset
* q11 ~ q20: From DGen dataset
* q21 ~ q23: From website

In [1]:
# Cloze test items keyed by question id.
# Each "sentence" is the stem with a single [MASK] blank, followed by
# " [SEP] " and the correct answer (the input format fed to the CSG model).
# "distractors" holds the reference distractors (empty for q21-q23).
questions = {
    "q1": {
        "sentence": "To make Jane live a [MASK] life, Mother was very careful about spending money. [SEP] happy",
        "answer": "happy",
        "distractors": ["poor", "busy", "sad"]
    },
    "q2": {
        "sentence": "[MASK] , Jane didn't understand her. [SEP] However",
        "answer": "However",
        "distractors": ["Though", "Although", "Or"]
    },
    "q3": {
        "sentence": "Every day Mother was busy with her [MASK] while Jane was studying at school, so they had little time to enjoy themselves. [SEP] work",
        "answer": "work",
        "distractors": ["writing", "housework", "research"]
    },
    "q4": {
        "sentence": "One day, Mother realized Jane was unhappy and even [MASK] to her. [SEP] unfriendly",
        "answer": "unfriendly",
        "distractors": ["loyal", "kind", "cruel"]
    },
    "q5": {
        "sentence": "The old man was waiting for a ride across the [MASK] . [SEP] river",
        "answer": "river",
        "distractors": ["town", "country", "island"]
    },
    "q6": {
        "sentence": "I felt uncomfortable and out of place as the professor carefully [MASK] what she expected us to learn. [SEP] explained",
        "answer": "explained",
        "distractors": ["showed", "designed", "offered"]
    },
    "q7": {
        "sentence": "As I listened, I couldn't help but [MASK] of my own oldest daughter. [SEP] think",
        "answer": "think",
        "distractors": ["speak", "talk", "hear"]
    },
    "q8": {
        "sentence": "As we were [MASK] on the third floor for old people with Alzheimer, most of them stared off at the walls or floor. [SEP] singing",
        "answer": "singing",
        "distractors": ["meeting", "gathering", "dancing"]
    },
    "q9": {
        "sentence": "As we got [MASK] with each song, she did as well. [SEP] louder",
        "answer": "louder",
        "distractors": ["higher", "nearer", "faster"]
    },
    "q10": {
        # The raw dataset blank " _ " is replaced by [MASK] so that this item
        # has a mask token like every other sentence.
        "sentence": "Mr. Petri, [MASK] injured in the fire, was rushed to hospital. [SEP] seriously",
        "answer": "seriously",
        "distractors": ["blindly", "hardly", "slightly"]
    },
    "q11": {
        "sentence": "If an object is attracted to a magnet, the object is most likely made of [MASK]. [SEP] metal",
        "answer": "metal",
        "distractors": ["wood", "plastic", "cardboard"]
    },
    "q12": {
        "sentence": "the main organs of the respiratory system are [MASK]. [SEP] lungs",
        "answer": "lungs",
        "distractors": ["ovaries", "intestines", "kidneys"]
    },
    "q13": {
        "sentence": "The products of photosynthesis are glucose and [MASK] else. [SEP] oxygen",
        "answer": "oxygen",
        "distractors": ["carbon", "hydrogen", "nitrogen"]
    },
    "q14": {
        "sentence": "frogs have [MASK] eyelid membranes. [SEP] three",
        "answer": "three",
        "distractors": ["two", "four", "one"]
    },
    "q15": {
        "sentence": "the only known planet with large amounts of water is [MASK]. [SEP] earth",
        "answer": "earth",
        "distractors": ["saturn", "jupiter", "mars"]
    },
    "q16": {
        "sentence": "[MASK] is responsible for erosion by flowing water and glaciers. [SEP] gravity",
        "answer": "gravity",
        "distractors": ["kinetic", "electromagnetic", "weight"],
    },
    "q17": {
        "sentence": "Common among mammals and insects , pheromones are often related to [MASK] type of behavior. [SEP] reproductive",
        "answer": "reproductive",
        "distractors": ["aggressive", "immune", "cardiac"]
    },
    "q18": {
        "sentence": "[MASK] can reproduce by infecting the cell of a living host. [SEP] virus",
        "answer": "virus",
        "distractors": ["bacteria", "mucus", "carcinogens"]
    },
    "q19": {
        "sentence": "proteins are encoded by [MASK]. [SEP] genes",
        "answer": "genes",
        "distractors": ["DNA", "RNA", "codons"]
    },
    "q20": {
        "sentence": "Producers at the base of ecological food webs are also known as [MASK]. [SEP] autotrophic",
        "answer": "autotrophic",
        "distractors": ["endoscopic", "symbiotic", "mutualistic"],
    },
    "q21": {
        "sentence": "Today morning, I saw a [MASK] sitting on the wall. [SEP] cat",
        "answer": "cat",
        "distractors": [],
    },
    "q22": {
        "sentence": "Ukrainian presidential adviser says situation is ' [MASK] control' in suburbs and outskirts of Kyiv. [SEP] under",
        "answer": "under",
        "distractors": [],
    },
    "q23": {
        "sentence": "I don't think that after what is [MASK] now, Ukraine has weak positions. [SEP] happening",
        "answer": "happening",
        "distractors": [],
    },
}

## Candidate Set Generator (CSG)

Input: stem, answer  
Output: candidate set of distractors

### Import packages

In [2]:
from tqdm.notebook import tqdm
import os

### Load CSG model

In [3]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, pipeline

# Model identifier passed to from_pretrained for both the tokenizer and the
# masked-language model of the Candidate Set Generator.
CSG_MODEL = "AndyChiang/cdgp-csg-bert-cloth"

tokenizer = BertTokenizer.from_pretrained(CSG_MODEL)
csg_model = BertForMaskedLM.from_pretrained(CSG_MODEL)
# Switch the model to evaluation mode for inference. As the last expression of
# the cell, the returned model is echoed as the cell output.
csg_model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [4]:
# Fill-mask pipeline around the CSG model; top_k=10 requests ten candidate
# tokens per call, each returned with its score.
my_unmasker = pipeline('fill-mask', tokenizer=tokenizer, model=csg_model, top_k=10)

In [5]:
# Quick check of the generator on q1: displays the raw pipeline output
# (score, token id, token_str and filled-in sequence for each candidate).
my_unmasker(questions["q1"]["sentence"])

[{'score': 0.17808528244495392,
  'token': 6517,
  'token_str': 's a d',
  'sequence': 'to make jane live a sad life, mother was very careful about spending money. happy'},
 {'score': 0.05460711568593979,
  'token': 11771,
  'token_str': 'b o r i n g',
  'sequence': 'to make jane live a boring life, mother was very careful about spending money. happy'},
 {'score': 0.04597169905900955,
  'token': 5697,
  'token_str': 'b u s y',
  'sequence': 'to make jane live a busy life, mother was very careful about spending money. happy'},
 {'score': 0.02937108650803566,
  'token': 3407,
  'token_str': 'h a p p y',
  'sequence': 'to make jane live a happy life, mother was very careful about spending money. happy'},
 {'score': 0.019477127119898796,
  'token': 2919,
  'token_str': 'b a d',
  'sequence': 'to make jane live a bad life, mother was very careful about spending money. happy'},
 {'score': 0.01932520419359207,
  'token': 4138,
  'token_str': 'r i c h',
  'sequence': 'to make jane live a rich 

## Candidate Set (CS)

In [6]:
# Example stem for the step-by-step walkthrough; [MASK] marks the blank.
origin_sent = "I realized that [MASK] I had accomplished in the past had prepared me for this moment."
print(origin_sent)

I realized that [MASK] I had accomplished in the past had prepared me for this moment.


In [7]:
# Correct answer for the blank in origin_sent.
answer = "everything"
print(answer)

everything


In [8]:
# Model input: the stem and the answer joined by the [SEP] token,
# the same layout as the "sentence" entries in `questions`.
target_sent = f"{origin_sent} [SEP] {answer}"
print(target_sent)

I realized that [MASK] I had accomplished in the past had prepared me for this moment. [SEP] everything


In [9]:
# Build the candidate set from the fill-mask predictions.
# token_str comes back with spaces between the characters, so they are
# stripped to recover the word. s0 starts as the raw model score; s1-s3 are
# placeholders filled in by the Distractor Selector steps below.
# NOTE(review): the answer itself is not filtered out here, so it can appear
# among the candidates.
cs = [
    {
        "word": prediction["token_str"].replace(" ", ""),
        "s0": prediction["score"],
        "s1": 0.0,
        "s2": 0.0,
        "s3": 0.0,
    }
    for prediction in my_unmasker(target_sent)
]

print(cs)

[{'word': 'something', 's0': 0.4702465534210205, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'nothing', 's0': 0.21203435957431793, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'anything', 's0': 0.20486889779567719, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'everything', 's0': 0.05527903512120247, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'somewhere', 's0': 0.005942480638623238, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'none', 's0': 0.004807805176824331, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'anybody', 's0': 0.003521741833537817, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'any', 's0': 0.0026764380745589733, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'nobody', 's0': 0.002662935294210911, 's1': 0.0, 's2': 0.0, 's3': 0.0}, {'word': 'somebody', 's0': 0.00245393393561244, 's1': 0.0, 's2': 0.0, 's3': 0.0}]


## Distractor Selector (DS)

Input: stem, answer, candidate set of distractors  
Output: Top 3 distractors

### Load DS model

In [10]:
import fasttext
import warnings

# Location of the fastText binary for the Distractor Selector, given relative
# to the current working directory.
model_path = r"./models/DS/cdgp-ds-fasttext.bin"

ds_model = fasttext.load_model(model_path)
# Ignore FutureWarning messages from here on.
# NOTE(review): this filter is installed after load_model has already run, so
# it cannot silence anything emitted while the model is loading.
warnings.filterwarnings("ignore", category=FutureWarning)



### Cosine similarity

In [11]:
import numpy as np

def similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (any 1-D array-like accepted by NumPy).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero length
        its direction is undefined, so 0.0 is returned instead of dividing
        by zero (which would produce NaN and a RuntimeWarning).
    """
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        # An all-zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(v1, v2) / n1 / n2

### Normalization

#### Min-Max normalization

In [12]:
def min_max_y(raw_data):
  """Rescale values linearly so the smallest maps to 0 and the largest to 1.

  Args:
    raw_data: Sequence of numbers.

  Returns:
    A new list with ``(d - min) / (max - min)`` for every ``d`` in
    ``raw_data``. An empty input yields an empty list. When all values are
    equal the range is zero, so every element maps to 0.0 rather than
    triggering a division by zero.
  """
  if len(raw_data) == 0:
    return []

  # Compute the bounds once instead of on every loop iteration.
  lowest = min(raw_data)
  span = max(raw_data) - lowest

  if span == 0:
    return [0.0 for _ in raw_data]

  return [(d - lowest) / span for d in raw_data]

#### Z-Score normalization

In [13]:
import math

def z_score_y(raw_data):
  """Standardise values to zero mean and unit (population) standard deviation.

  Args:
    raw_data: Sequence of numbers.

  Returns:
    A new list with ``(z - mean) / std`` for every ``z`` in ``raw_data``,
    where ``std`` is the population standard deviation (divides by N).
    An empty input yields an empty list. When all values are equal the
    standard deviation is zero, so every element maps to 0.0 rather than
    triggering a division by zero.
  """
  count = len(raw_data)
  if count == 0:
    return []

  aver = sum(raw_data) / count
  variance = sum((s - aver)**2 for s in raw_data) / count
  std = math.sqrt(variance)

  if std == 0:
    return [0.0 for _ in raw_data]

  return [(z - aver) / std for z in raw_data]

### Confidence Score s0

In [14]:
# Rescale the raw fill-mask scores with min-max normalization and write the
# result back into each candidate's "s0" field.
s0s = [cand["s0"] for cand in cs]
new_s0s = min_max_y(s0s)

for cand, scaled in zip(cs, new_s0s):
    print(cand["word"], scaled)
    cand["s0"] = scaled

something 1.0
nothing 0.44801994924428895
anything 0.43270234592997614
everything 0.11292418688370907
somewhere 0.007457464178995275
none 0.0050318691299603025
anybody 0.002282652298148721
any 0.00047564696337299485
nobody 0.000446782077982293
somebody 0.0


### Word Embedding Similarity s1

In [15]:
print(answer)
print("-"*100)
answer_vector = ds_model.get_word_vector(answer)

# Cosine similarity between the answer's word vector and each candidate's.
word_similarities = [
    similarity(answer_vector, ds_model.get_word_vector(cand["word"]))
    for cand in cs
]

new_similarities = min_max_y(word_similarities)

# s1 is stored as 1 - normalized similarity, so candidates whose embedding is
# further from the answer's receive a higher value.
for cand, raw_sim, scaled_sim in zip(cs, word_similarities, new_similarities):
    print(cand["word"], 1-raw_sim, 1-scaled_sim)
    cand["s1"] = 1-scaled_sim

everything
----------------------------------------------------------------------------------------------------
something 0.05007749795913696 0.07134193181991577
nothing 0.056958675384521484 0.08114504814147949
anything 0.05596303939819336 0.07972663640975952
everything -1.1920928955078125e-07 0.0
somewhere 0.2000046968460083 0.28493237495422363
none 0.7019376754760742 1.0
anybody 0.16913676261901855 0.2409570813179016
any 0.35927659273147583 0.5118355453014374
nobody 0.22937774658203125 0.3267780542373657
somebody 0.18457138538360596 0.26294565200805664


### Contextual-Sentence Embedding Similarity s2

In [16]:
# Insert each option generated by the trained BERT model into the sentence
# and compare the result with the sentence containing the correct answer.
correct_sent = origin_sent.replace('[MASK]', answer)
print(correct_sent)
print("-"*100)
correct_sent_vector = ds_model.get_sentence_vector(correct_sent)

cand_sents = [origin_sent.replace('[MASK]', cand["word"]) for cand in cs]

# Similarity between the correct sentence and each candidate sentence.
sent_similarities = [
    similarity(correct_sent_vector, ds_model.get_sentence_vector(sent))
    for sent in cand_sents
]

new_similarities = min_max_y(sent_similarities)
# s2 is stored as 1 - normalized similarity, mirroring s1.
for cand, sent, raw_sim, scaled_sim in zip(cs, cand_sents, sent_similarities, new_similarities):
    print(sent, 1-raw_sim, 1-scaled_sim)
    cand["s2"] = 1-scaled_sim

I realized that everything I had accomplished in the past had prepared me for this moment.
----------------------------------------------------------------------------------------------------
I realized that something I had accomplished in the past had prepared me for this moment. 0.000433504581451416 0.07980388402938843
I realized that nothing I had accomplished in the past had prepared me for this moment. 0.0004928112030029297 0.09071862697601318
I realized that anything I had accomplished in the past had prepared me for this moment. 0.00048416852951049805 0.08912801742553711
I realized that everything I had accomplished in the past had prepared me for this moment. -1.1920928955078125e-07 0.0
I realized that somewhere I had accomplished in the past had prepared me for this moment. 0.0017292499542236328 0.31827205419540405
I realized that none I had accomplished in the past had prepared me for this moment. 0.005433499813079834 1.0
I realized that anybody I had accomplished in the past

### POS match score s3

In [17]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages for the tokenizer and POS tagger used in the
# next cell. This needs network access the first time; a package that is
# already up to date is reported as such instead of being re-downloaded.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andy1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andy1\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
# Locate the blank: drop the first "[" and "]" tokens from the tokenized stem
# so that "MASK" sits at the index the substituted word occupies once the
# blank is filled in.
origin_token = word_tokenize(origin_sent)
for bracket in ("[", "]"):
    origin_token.remove(bracket)

mask_index = origin_token.index("MASK")

# POS tag of the correct answer in its sentence.
correct_pos = nltk.pos_tag(word_tokenize(correct_sent))
answer_pos = correct_pos[mask_index]
print(answer_pos)
print("-"*100)

# s3 is 1.0 when the candidate carries the same POS tag as the answer at the
# blank position, otherwise 0.0.
for cand, sent in zip(cs, cand_sents):
    cand_pos = nltk.pos_tag(word_tokenize(sent))[mask_index]
    cand["s3"] = 1.0 if cand_pos[1] == answer_pos[1] else 0.0

    print(cand_pos, cand["s3"])

('everything', 'NN')
----------------------------------------------------------------------------------------------------
('something', 'NN') 1.0
('nothing', 'NN') 1.0
('anything', 'NN') 1.0
('everything', 'NN') 1.0
('somewhere', 'RB') 0.0
('none', 'NN') 1.0
('anybody', 'NN') 1.0
('any', 'DT') 0.0
('nobody', 'NN') 1.0
('somebody', 'NN') 1.0


## Ranker

In [19]:
# Show every candidate with its four feature scores (s0-s3) before ranking.
cs

[{'word': 'something',
  's0': 1.0,
  's1': 0.07134193181991577,
  's2': 0.07980388402938843,
  's3': 1.0},
 {'word': 'nothing',
  's0': 0.44801994924428895,
  's1': 0.08114504814147949,
  's2': 0.09071862697601318,
  's3': 1.0},
 {'word': 'anything',
  's0': 0.43270234592997614,
  's1': 0.07972663640975952,
  's2': 0.08912801742553711,
  's3': 1.0},
 {'word': 'everything',
  's0': 0.11292418688370907,
  's1': 0.0,
  's2': 0.0,
  's3': 1.0},
 {'word': 'somewhere',
  's0': 0.007457464178995275,
  's1': 0.28493237495422363,
  's2': 0.31827205419540405,
  's3': 0.0},
 {'word': 'none',
  's0': 0.0050318691299603025,
  's1': 1.0,
  's2': 1.0,
  's3': 1.0},
 {'word': 'anybody',
  's0': 0.002282652298148721,
  's1': 0.2409570813179016,
  's2': 0.24961334466934204,
  's3': 1.0},
 {'word': 'any',
  's0': 0.00047564696337299485,
  's1': 0.5118355453014374,
  's2': 0.5653404295444489,
  's3': 0.0},
 {'word': 'nobody',
  's0': 0.000446782077982293,
  's1': 0.3267780542373657,
  's2': 0.31046169996

### Weighted final score

In [300]:
# Alternative setting: equal weight on all four scores.
# NOTE(review): when cells run top to bottom, this assignment is overwritten
# by the later WEIGHT assignment(s) before the final score is computed.
WEIGHT = {"s0": 0.25, "s1": 0.25, "s2": 0.25, "s3": 0.25}

In [20]:
# Weights for combining s0-s3 into the final score. The final scores recorded
# further down match this setting (e.g. 'something' ~= 0.7227).
WEIGHT = {"s0": 0.6, "s1": 0.15, "s2": 0.15, "s3": 0.1}

In [306]:
# Alternative setting that leans more heavily on the confidence score s0.
# It is stored under its own name so that running every cell in order leaves
# WEIGHT at the 0.6 / 0.15 / 0.15 / 0.1 setting that the recorded scores below
# were computed with. To try this variant, assign it explicitly:
#   WEIGHT = WEIGHT_CONFIDENCE_HEAVY
WEIGHT_CONFIDENCE_HEAVY = {"s0": 0.8, "s1": 0.05, "s2": 0.05, "s3": 0.1}

In [21]:
# Final score per candidate: weighted sum of the four feature scores,
# collected as (word, score) pairs in candidate order.
cs_fs = [
    (
        cand["word"],
        WEIGHT["s0"]*cand["s0"] + WEIGHT["s1"]*cand["s1"] + WEIGHT["s2"]*cand["s2"] + WEIGHT["s3"]*cand["s3"],
    )
    for cand in cs
]

cs_fs

[('something', 0.7226718723773956),
 ('nothing', 0.39459152081419735),
 ('anything', 0.3849496056332802),
 ('everything', 0.16775451213022546),
 ('somewhere', 0.0949551428798413),
 ('none', 0.40301912147797614),
 ('anybody', 0.17495515527697578),
 ('any', 0.16186178440490673),
 ('nobody', 0.19585403237678664),
 ('somebody', 0.1783417105674744)]

### Sort by final score

In [22]:
# Order the (word, score) pairs from highest to lowest final score.
cs_fs = sorted(cs_fs, key=lambda pair: pair[1], reverse=True)
cs_fs

[('something', 0.7226718723773956),
 ('none', 0.40301912147797614),
 ('nothing', 0.39459152081419735),
 ('anything', 0.3849496056332802),
 ('nobody', 0.19585403237678664),
 ('somebody', 0.1783417105674744),
 ('anybody', 0.17495515527697578),
 ('everything', 0.16775451213022546),
 ('any', 0.16186178440490673),
 ('somewhere', 0.0949551428798413)]

## Distractors

### Top 3

In [23]:
# Keep the first three (word, score) pairs; cs_fs was sorted by descending
# final score in the previous step, so these are the top-ranked candidates.
distractors = cs_fs[:3]
distractors

[('something', 0.7226718723773956),
 ('none', 0.40301912147797614),
 ('nothing', 0.39459152081419735)]

In [24]:
# Strip the scores and keep only the distractor words.
result = [word for word, _score in distractors]
result

['something', 'none', 'nothing']