# 模型比較

## 引用相關套件

In [2]:
from tqdm.notebook import tqdm
import os

## Load models

### BERT_CLOTH_model: BERT + CLOTH

In [None]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, pipeline

# Directory expected to contain the fine-tuned BERT + CLOTH checkpoint
# (this cell reads config.json and pytorch_model.bin from it).
dir_path = r"./models/CSG/BERT_CLOTH_model"

# The tokenizer is loaded from the stock 'bert-base-uncased' checkpoint, not from dir_path.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained(os.path.join(dir_path, "config.json"))
# The weights file is a PyTorch .bin, so from_tf is False. The previous expression
# bool('.ckpt' in 'bert-base-uncased') compared two string literals and therefore
# always evaluated to False; spell the constant out instead.
csg_model1 = BertForMaskedLM.from_pretrained(
    os.path.join(dir_path, "pytorch_model.bin"),
    from_tf=False,
    config=config,
)
# Switch the module to evaluation mode; this notebook only runs inference.
csg_model1.eval()

# Fill-mask pipeline that returns the 10 highest-scoring candidates for the [MASK] token.
unmasker1 = pipeline('fill-mask', tokenizer=tokenizer, config=config, model=csg_model1, top_k=10)

### BERT_DGen_model: BERT + DGen

In [5]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, pipeline

# Directory expected to contain the fine-tuned BERT + DGen checkpoint
# (this cell reads config.json and pytorch_model.bin from it).
dir_path = r"./models/CSG/BERT_DGen_model1"

# The tokenizer is loaded from the stock 'bert-base-uncased' checkpoint, not from dir_path.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained(os.path.join(dir_path, "config.json"))
# The weights file is a PyTorch .bin, so from_tf is False. The previous expression
# bool('.ckpt' in 'bert-base-uncased') compared two string literals and therefore
# always evaluated to False; spell the constant out instead.
csg_model2 = BertForMaskedLM.from_pretrained(
    os.path.join(dir_path, "pytorch_model.bin"),
    from_tf=False,
    config=config,
)
# Switch the module to evaluation mode; this notebook only runs inference.
csg_model2.eval()

# Fill-mask pipeline that returns the 10 highest-scoring candidates for the [MASK] token.
unmasker2 = pipeline('fill-mask', tokenizer=tokenizer, config=config, model=csg_model2, top_k=10)

### BERT_CLOTH_DGen_model: BERT + CLOTH + DGen

In [3]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, pipeline

# Directory expected to contain the fine-tuned BERT + CLOTH + DGen checkpoint
# (this cell reads config.json and pytorch_model.bin from it).
dir_path = r"./models/CSG/BERT_CLOTH_DGen_model1"

# The tokenizer is loaded from the stock 'bert-base-uncased' checkpoint, not from dir_path.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained(os.path.join(dir_path, "config.json"))
# The weights file is a PyTorch .bin, so from_tf is False. The previous expression
# bool('.ckpt' in 'bert-base-uncased') compared two string literals and therefore
# always evaluated to False; spell the constant out instead.
csg_model3 = BertForMaskedLM.from_pretrained(
    os.path.join(dir_path, "pytorch_model.bin"),
    config=config,
    from_tf=False,
)
# Switch the module to evaluation mode; this notebook only runs inference.
csg_model3.eval()

# Fill-mask pipeline that returns the 10 highest-scoring candidates for the [MASK] token.
unmasker3 = pipeline('fill-mask', tokenizer=tokenizer, config=config, model=csg_model3, top_k=10)

## Demo questions

* q1 ~ q10 取自CLOTH資料集，名詞2個，動詞3個，形容詞3個，副詞2個
* q11 ~ q20 取自DGen資料集，名詞8個，形容詞1個，數量詞1個
* q21 ~ q23 取自網路文章

In [56]:
# Demo questions keyed "q1" .. "q23".
# Each row below is (stem containing one [MASK], answer, reference distractors).
# The model input stored under "sentence" is the stem followed by " [SEP] " and the answer.
questions = {
    f"q{number}": {
        "sentence": f"{stem} [SEP] {answer_word}",
        "answer": answer_word,
        "distractors": list(options),
    }
    for number, (stem, answer_word, options) in enumerate(
        [
            # q1 - q10
            ("To make Jane live a [MASK] life, Mother was very careful about spending money.",
             "happy", ("poor", "busy", "sad")),
            ("[MASK] , Jane didn't understand her.",
             "However", ("Though", "Although", "Or")),
            ("Every day Mother was busy with her [MASK] while Jane was studying at school, so they had little time to enjoy themselves.",
             "work", ("writing", "housework", "research")),
            ("One day, Mother realized Jane was unhappy and even [MASK] to her.",
             "unfriendly", ("loyal", "kind", "cruel")),
            ("The old man was waiting for a ride across the [MASK] .",
             "river", ("town", "country", "island")),
            ("I felt uncomfortable and out of place as the professor carefully [MASK] what she expected us to learn.",
             "explained", ("showed", "designed", "offered")),
            ("As I listened, I couldn't help but [MASK] of my own oldest daughter.",
             "think", ("speak", "talk", "hear")),
            ("As we were [MASK] on the third floor for old people with Alzheimer, most of them stared off at the walls or floor.",
             "singing", ("meeting", "gathering", "dancing")),
            ("As we got [MASK] with each song, she did as well.",
             "louder", ("higher", "nearer", "faster")),
            ("Mr. Petri, [MASK] injured in the fire, was rushed to hospital.",
             "seriously", ("blindly", "hardly", "slightly")),
            # q11 - q20
            ("If an object is attracted to a magnet, the object is most likely made of [MASK].",
             "metal", ("wood", "plastic", "cardboard")),
            ("the main organs of the respiratory system are [MASK].",
             "lungs", ("ovaries", "intestines", "kidneys")),
            ("The products of photosynthesis are glucose and [MASK] else.",
             "oxygen", ("carbon", "hydrogen", "nitrogen")),
            ("frogs have [MASK] eyelid membranes.",
             "three", ("two", "four", "one")),
            ("the only known planet with large amounts of water is [MASK].",
             "earth", ("saturn", "jupiter", "mars")),
            ("[MASK] is responsible for erosion by flowing water and glaciers.",
             "gravity", ("kinetic", "electromagnetic", "weight")),
            ("Common among mammals and insects , pheromones are often related to [MASK] type of behavior.",
             "reproductive", ("aggressive", "immune", "cardiac")),
            ("[MASK] can reproduce by infecting the cell of a living host.",
             "virus", ("bacteria", "mucus", "carcinogens")),
            ("proteins are encoded by [MASK].",
             "genes", ("DNA", "RNA", "codons")),
            ("Producers at the base of ecological food webs are also known as [MASK].",
             "autotrophic", ("endoscopic", "symbiotic", "mutualistic")),
            # q21 - q23 (no reference distractors)
            ("Today morning, I saw a [MASK] sitting on the wall.",
             "cat", ()),
            ("Ukrainian presidential adviser says situation is ' [MASK] control' in suburbs and outskirts of Kyiv.",
             "under", ()),
            ("I don't think that after what is [MASK] now, Ukraine has weak positions.",
             "happening", ()),
        ],
        start=1,
    )
}

## Compare

* CLOTH_model 在CLOTH上表現很好，但在DGen表現就不好了，太過General
* DGen_model 比較specific在DGen的資料集上，在CLOTH上表現就不好
* CLOTH&DGen_model 在兩個資料集表現都不錯，但在DGen上仍然不及DGen_model

In [57]:
# Key into `questions` selecting which demo question the cells below evaluate.
question_num = "q10"

In [59]:
# Reference answer and distractors for the selected question, lower-cased for comparison.
masked_sentence = questions[question_num]["sentence"]
answer = questions[question_num]["answer"].lower()
dis = [distractor.lower() for distractor in questions[question_num]["distractors"]]

# Run the same sentence through each fill-mask pipeline and keep only the predicted
# token strings (with any spaces removed), in the order the pipeline returns them.
cs1 = [pred["token_str"].replace(" ", "") for pred in unmasker1(masked_sentence)]
cs2 = [pred["token_str"].replace(" ", "") for pred in unmasker2(masked_sentence)]
cs3 = [pred["token_str"].replace(" ", "") for pred in unmasker3(masked_sentence)]

print("answer:", answer)
print("dis:", dis)
print("cs1:", cs1)
print("cs2:", cs2)
print("cs3:", cs3)

answer: seriously
dis: ['blindly', 'hardly', 'slightly']
cs1: ['hardly', 'frequently', 'happily', 'strangely', 'quietly', 'slightly', 'regularly', 'finally', 'rarely', 'seriously']
cs2: ['seriously', 'severely', 'critically', 'badly', 'injured', 'serious', 'slightly', 'injuries', 'fatally', 'mortally']
cs3: ['hardly', 'suddenly', 'happily', 'gradually', 'luckily', 'usually', 'strangely', 'specially', 'finally', 'secretly']


### 比較生成的和已知的中了幾個

In [60]:
# For each model, keep the generated candidates that also appear among the
# reference distractors, preserving the candidates' original order.
cs1_dis = [token for token in cs1 if token in dis]
cs2_dis = [token for token in cs2 if token in dis]
cs3_dis = [token for token in cs3 if token in dis]

# Report the number of hits followed by the hits themselves.
print(f"cs1_dis: {len(cs1_dis)}, {cs1_dis}")
print(f"cs2_dis: {len(cs2_dis)}, {cs2_dis}")
print(f"cs3_dis: {len(cs3_dis)}, {cs3_dis}")

cs1_dis: 2, ['hardly', 'slightly']
cs2_dis: 1, ['slightly']
cs3_dis: 1, ['hardly']


### 是否包含答案

In [61]:
# For each model: *_ans is 1 when the reference answer is among the candidates, else 0;
# *_ans_index is the answer's position in the candidate list, or -1 when it is absent.
cs1_ans = int(answer in cs1)
cs1_ans_index = cs1.index(answer) if cs1_ans else -1

cs2_ans = int(answer in cs2)
cs2_ans_index = cs2.index(answer) if cs2_ans else -1

cs3_ans = int(answer in cs3)
cs3_ans_index = cs3.index(answer) if cs3_ans else -1

print(f"cs1_ans = {cs1_ans}, index = {cs1_ans_index}")
print(f"cs2_ans = {cs2_ans}, index = {cs2_ans_index}")
print(f"cs3_ans = {cs3_ans}, index = {cs3_ans_index}")

cs1_ans = 1, index = 9
cs2_ans = 1, index = 0
cs3_ans = 0, index = -1
