In [1]:
import json
from collections import Counter
import spacy
from tqdm import tqdm


In [2]:
from eval import metric_max_over_ground_truths, soft_exact_match_score, f1_score, bem_score,hard_exact_match_score, insteval, spearman

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nonnumeric_type=['PERSON','GPE','ORG','NORP','LOC',
 'WORK_OF_ART','FAC','PRODUCT','EVENT','LAW',
 'LANGUAGE']
numeric_type=['DATE', 'CARDINAL','QUANTITY', 'ORDINAL', 'MONEY', 'PERCENT', 'TIME']


In [4]:
model_types=['gpt35', 'chatgpt', 'newbing']

In [5]:
data_dir='/home/donaldo9603/workspace/numeric/data/evouna/NQ_labeled.json'

In [6]:
with open(data_dir) as f:
    nq=json.load(f)

In [7]:
del_key=['em_gpt35', 'f1_gpt35', 'human_gpt35', 'bem_gpt35', 'instzero_gpt35', 'em_chatgpt', 'f1_chatgpt', 'human_chatgpt', 'bem_chatgpt', 'instzero_chatgpt', 'em_newbing', 'f1_newbing', 'human_newbing', 'bem_newbing', 'instzero_newbing']
for d in nq:
    for key in del_key:
        if key in d.keys():
            del d[key]

In [8]:
nq[0]

{'question': 'who got the first nobel prize in physics',
 'golden_answer': 'Wilhelm Conrad Röntgen',
 'answer_fid': 'Wilhelm Röntgen',
 'judge_fid': True,
 'answer_gpt35': 'The first Nobel Prize in Physics was awarded to Wilhelm Röntgen in 1901.',
 'judge_gpt35': True,
 'answer_chatgpt': 'The first Nobel Prize in Physics was awarded in 1901 to Wilhelm Röntgen for his discovery of X-rays.',
 'judge_chatgpt': True,
 'answer_gpt4': 'The first Nobel Prize in Physics was awarded in 1901 to Wilhelm Conrad Röntgen, a German physicist. He received the prize for his discovery of X-rays, a groundbreaking achievement that revolutionized the fields of medicine, physics, and chemistry.',
 'judge_gpt4': True,
 'answer_newbing': 'According to Wikipedia, Wilhelm Conrad Röntgen of Germany got the first Nobel Prize in Physics in 1901 for his discovery of X-rays.  He received 150,782 SEK (Swedish krona) as the prize money.',
 'judge_newbing': True,
 'improper': False,
 'ans_type': 'PERSON'}

In [9]:
nq=[d for d in nq if d['improper']!=True]

### EM, F1, BEM

In [10]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    data_sum_all=0
    em_sum_all=0
    f1_sum_all=0
    hum_sum_all=0
    bem_sum_all=0
    
    em_hum_all=0
    f1_hum_all=0
    bem_hum_all=0
    
    for ans_type in numeric_type:
        data_sum=0
        em_sum=0
        f1_sum=0
        hum_sum=0
        bem_sum=0
        
        em_hum=0
        f1_hum=0
        bem_hum=0
        
        for d in tqdm(nq):
            if d['ans_type']==ans_type:
                if '/' in d['golden_answer']:
                    answer_set=d['golden_answer'].split('/')
                    em=metric_max_over_ground_truths(soft_exact_match_score, d['answer_{}'.format(model_type)],answer_set)
                    f1=metric_max_over_ground_truths(f1_score, d['answer_{}'.format(model_type)],answer_set)
                    bem=bem_score(d['answer_{}'.format(model_type)], answer_set, d['question'])
                    
                else:
                    em=soft_exact_match_score(d['answer_{}'.format(model_type)], d['golden_answer'])
                    f1=f1_score(d['answer_{}'.format(model_type)], d['golden_answer'])
                    bem=bem_score(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'])
                
                data_sum+=1
                d['em_{}'.format(model_type)]=em
                d['f1_{}'.format(model_type)]=f1                
                d['bem_{}'.format(model_type)]=bem
                human=1 if d['judge_{}'.format(model_type)] else 0
                
                em_sum+=em
                f1_sum+=f1
                hum_sum+=human
                bem_sum+=bem

                
                if human==em:
                    em_hum+=1
                if abs(human-f1)<=0.5:
                    f1_hum+=1
                if human==bem:
                    bem_hum+=1
                    
        data_sum_all+=data_sum
        em_sum_all+=em_sum
        f1_sum_all+=f1_sum
        hum_sum_all+=hum_sum
        bem_sum_all+=bem_sum

        em_hum_all+=em_hum
        f1_hum_all+=f1_hum
        bem_hum_all+=bem_hum
    
        
        print("QA Model: {}, NQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        print("EM: %5.3f" %(em_sum/data_sum))
        print("F1: %5.3f" %(f1_sum/data_sum))
        print("BEM: %5.3f" % (bem_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("EM accuracy against human %5.3f"%(em_hum/data_sum))
        print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum/data_sum))
        print("BEM accuracy against human %5.3f" %(bem_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All NQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("EM: %5.3f" %(em_sum_all/data_sum_all))
    print("F1: %5.3f" %(f1_sum_all/data_sum_all))
    print("BEM: %5.3f" % (bem_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("EM accuracy against human %5.3f"%(em_hum_all/data_sum_all))
    print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum_all/data_sum_all))
    print("BEM accuracy against human %5.3f" %(bem_hum_all/data_sum_all))
    print("------------------------------------")
    print()

Inference result by gpt35


100%|██████████| 3020/3020 [00:08<00:00, 364.09it/s]


QA Model: gpt35, NQ 437 data for answer_type DATE
Surface accuracy
EM: 0.460
F1: 0.168
BEM: 0.886
Human: 0.611

Accuracy against human
EM accuracy against human 0.849
F1 th 0.5 accuracy against human 0.416
BEM accuracy against human 0.712



100%|██████████| 3020/3020 [00:02<00:00, 1185.91it/s]


QA Model: gpt35, NQ 144 data for answer_type CARDINAL
Surface accuracy
EM: 0.451
F1: 0.120
BEM: 0.944
Human: 0.625

Accuracy against human
EM accuracy against human 0.812
F1 th 0.5 accuracy against human 0.396
BEM accuracy against human 0.639



100%|██████████| 3020/3020 [00:00<00:00, 13332.49it/s]


QA Model: gpt35, NQ 14 data for answer_type QUANTITY
Surface accuracy
EM: 0.286
F1: 0.214
BEM: 1.000
Human: 0.714

Accuracy against human
EM accuracy against human 0.571
F1 th 0.5 accuracy against human 0.357
BEM accuracy against human 0.714



100%|██████████| 3020/3020 [00:00<00:00, 12988.62it/s]


QA Model: gpt35, NQ 11 data for answer_type ORDINAL
Surface accuracy
EM: 0.455
F1: 0.135
BEM: 0.909
Human: 0.455

Accuracy against human
EM accuracy against human 0.818
F1 th 0.5 accuracy against human 0.636
BEM accuracy against human 0.545



100%|██████████| 3020/3020 [00:00<00:00, 18362.76it/s]


QA Model: gpt35, NQ 10 data for answer_type MONEY
Surface accuracy
EM: 0.400
F1: 0.106
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 0.900
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 3020/3020 [00:00<00:00, 21559.77it/s]


QA Model: gpt35, NQ 9 data for answer_type PERCENT
Surface accuracy
EM: 0.444
F1: 0.129
BEM: 0.889
Human: 0.556

Accuracy against human
EM accuracy against human 0.889
F1 th 0.5 accuracy against human 0.556
BEM accuracy against human 0.667



100%|██████████| 3020/3020 [00:00<00:00, 31719.81it/s]


QA Model: gpt35, NQ 7 data for answer_type TIME
Surface accuracy
EM: 0.286
F1: 0.112
BEM: 1.000
Human: 0.571

Accuracy against human
EM accuracy against human 0.714
F1 th 0.5 accuracy against human 0.429
BEM accuracy against human 0.571

------------------------------------
All NQ 632 data inference by model: gpt35
Surface accuracy
EM: 0.451
F1: 0.155
BEM: 0.905
Human: 0.611

Accuracy against human
EM accuracy against human 0.834
F1 th 0.5 accuracy against human 0.418
BEM accuracy against human 0.687
------------------------------------

Inference result by chatgpt


100%|██████████| 3020/3020 [00:08<00:00, 377.45it/s]


QA Model: chatgpt, NQ 437 data for answer_type DATE
Surface accuracy
EM: 0.510
F1: 0.176
BEM: 0.860
Human: 0.677

Accuracy against human
EM accuracy against human 0.796
F1 th 0.5 accuracy against human 0.359
BEM accuracy against human 0.776



100%|██████████| 3020/3020 [00:02<00:00, 1180.06it/s]


QA Model: chatgpt, NQ 144 data for answer_type CARDINAL
Surface accuracy
EM: 0.542
F1: 0.124
BEM: 0.896
Human: 0.674

Accuracy against human
EM accuracy against human 0.812
F1 th 0.5 accuracy against human 0.361
BEM accuracy against human 0.750



100%|██████████| 3020/3020 [00:00<00:00, 13531.78it/s]


QA Model: chatgpt, NQ 14 data for answer_type QUANTITY
Surface accuracy
EM: 0.286
F1: 0.137
BEM: 0.714
Human: 0.714

Accuracy against human
EM accuracy against human 0.571
F1 th 0.5 accuracy against human 0.357
BEM accuracy against human 0.857



100%|██████████| 3020/3020 [00:00<00:00, 12046.34it/s]


QA Model: chatgpt, NQ 11 data for answer_type ORDINAL
Surface accuracy
EM: 0.636
F1: 0.155
BEM: 0.909
Human: 0.727

Accuracy against human
EM accuracy against human 0.909
F1 th 0.5 accuracy against human 0.364
BEM accuracy against human 0.818



100%|██████████| 3020/3020 [00:00<00:00, 17741.53it/s]


QA Model: chatgpt, NQ 10 data for answer_type MONEY
Surface accuracy
EM: 0.600
F1: 0.097
BEM: 0.900
Human: 0.800

Accuracy against human
EM accuracy against human 0.800
F1 th 0.5 accuracy against human 0.200
BEM accuracy against human 0.700



100%|██████████| 3020/3020 [00:00<00:00, 20607.75it/s]


QA Model: chatgpt, NQ 9 data for answer_type PERCENT
Surface accuracy
EM: 0.444
F1: 0.124
BEM: 0.778
Human: 0.444

Accuracy against human
EM accuracy against human 0.778
F1 th 0.5 accuracy against human 0.667
BEM accuracy against human 0.444



100%|██████████| 3020/3020 [00:00<00:00, 29809.56it/s]


QA Model: chatgpt, NQ 7 data for answer_type TIME
Surface accuracy
EM: 0.286
F1: 0.099
BEM: 1.000
Human: 0.714

Accuracy against human
EM accuracy against human 0.571
F1 th 0.5 accuracy against human 0.286
BEM accuracy against human 0.714

------------------------------------
All NQ 632 data inference by model: chatgpt
Surface accuracy
EM: 0.513
F1: 0.160
BEM: 0.867
Human: 0.677

Accuracy against human
EM accuracy against human 0.794
F1 th 0.5 accuracy against human 0.361
BEM accuracy against human 0.766
------------------------------------

Inference result by newbing


100%|██████████| 3020/3020 [00:08<00:00, 373.42it/s]


QA Model: newbing, NQ 437 data for answer_type DATE
Surface accuracy
EM: 0.535
F1: 0.101
BEM: 0.794
Human: 0.691

Accuracy against human
EM accuracy against human 0.803
F1 th 0.5 accuracy against human 0.318
BEM accuracy against human 0.778



100%|██████████| 3020/3020 [00:02<00:00, 1154.06it/s]


QA Model: newbing, NQ 144 data for answer_type CARDINAL
Surface accuracy
EM: 0.590
F1: 0.082
BEM: 0.910
Human: 0.764

Accuracy against human
EM accuracy against human 0.812
F1 th 0.5 accuracy against human 0.243
BEM accuracy against human 0.812



100%|██████████| 3020/3020 [00:00<00:00, 13587.05it/s]


QA Model: newbing, NQ 14 data for answer_type QUANTITY
Surface accuracy
EM: 0.357
F1: 0.122
BEM: 0.857
Human: 0.857

Accuracy against human
EM accuracy against human 0.500
F1 th 0.5 accuracy against human 0.143
BEM accuracy against human 1.000



100%|██████████| 3020/3020 [00:00<00:00, 13091.73it/s]


QA Model: newbing, NQ 11 data for answer_type ORDINAL
Surface accuracy
EM: 0.636
F1: 0.072
BEM: 0.909
Human: 0.727

Accuracy against human
EM accuracy against human 0.909
F1 th 0.5 accuracy against human 0.273
BEM accuracy against human 0.818



100%|██████████| 3020/3020 [00:00<00:00, 18369.29it/s]


QA Model: newbing, NQ 10 data for answer_type MONEY
Surface accuracy
EM: 0.500
F1: 0.088
BEM: 0.900
Human: 0.600

Accuracy against human
EM accuracy against human 0.900
F1 th 0.5 accuracy against human 0.400
BEM accuracy against human 0.700



100%|██████████| 3020/3020 [00:00<00:00, 20280.96it/s]


QA Model: newbing, NQ 9 data for answer_type PERCENT
Surface accuracy
EM: 0.444
F1: 0.025
BEM: 0.667
Human: 0.444

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.556
BEM accuracy against human 0.778



100%|██████████| 3020/3020 [00:00<00:00, 30665.02it/s]

QA Model: newbing, NQ 7 data for answer_type TIME
Surface accuracy
EM: 0.571
F1: 0.072
BEM: 0.857
Human: 0.714

Accuracy against human
EM accuracy against human 0.857
F1 th 0.5 accuracy against human 0.286
BEM accuracy against human 0.857

------------------------------------
All NQ 632 data inference by model: newbing
Surface accuracy
EM: 0.544
F1: 0.095
BEM: 0.824
Human: 0.707

Accuracy against human
EM accuracy against human 0.805
F1 th 0.5 accuracy against human 0.301
BEM accuracy against human 0.791
------------------------------------






In [11]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    hum_sum_all=0
    instzero_sum_all=0
    instrandom_sum_all=0
    instent_sum_all=0
    
    instzero_hum_all=0
    instrandom_hum_all=0
    instent_hum_all=0
    data_sum_all=0
    for ans_type in numeric_type:
        data_sum=0
        instzero_sum=0
        instrandom_sum=0
        instent_sum=0
        hum_sum=0
        
        instzero_hum=0
        instrandom_hum=0
        instent_hum=0
        
        for d in tqdm(nq):
            if d['ans_type']==ans_type:
                if 'instzero_{}'.format(model_type) in d.keys():
                    instzero= d['instzero_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instzero=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="zero")
                    if instzero==None:
                        instzero=0
                
                
                if 'instrandom_{}'.format(model_type) in d.keys():
                        instrandom= d['instrandom_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instrandom=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="random")
                    if instrandom==None:
                        instrandom=0
                        
                if 'instentity_{}'.format(model_type) in d.keys():
                    instent=d['instentity_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instent=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="entity")
                    if instent==None:
                        instent=0
             
                d['instzero{}'.format(model_type)]=instzero
                d['instrandom_{}'.format(model_type)]=instrandom
                d['instentity_{}'.format(model_type)]=instent
                
                data_sum+=1
                human=1 if d['judge_{}'.format(model_type)] else 0
                hum_sum+=human
                
                instzero_sum+=instzero
                instrandom_sum+=instrandom
                instent_sum+=instent
                
                if human==instzero:
                    instzero_hum+=1
                if human==instrandom:
                    instrandom_hum+=1
                if human==instent:
                    instent_hum+=1
                    
        data_sum_all+=data_sum
        
        instzero_sum_all+=instzero_sum
        instrandom_sum_all+=instrandom_sum
        instent_sum_all+=instent_sum
        
        instzero_hum_all+=instzero_hum
        instrandom_hum_all+=instrandom_hum
        instent_hum_all+=instent_hum
        hum_sum_all+=hum_sum
        print("QA Model: {}, NQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        
        print("Insteval-zero: %5.3f" % (instzero_sum/data_sum))
        print("Insteval-random: %5.3f" % (instrandom_sum/data_sum))
        print("Insteval-entity: %5.3f" % (instent_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("Insteval-zero accuracy against human %5.3f" %(instzero_hum/data_sum))
        print("Insteval-random accuracy against human %5.3f" %(instrandom_hum/data_sum))
        print("Insteval-entity accuracy against human %5.3f" %(instent_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All NQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("Insteval-zero: %5.3f" % (instzero_sum_all/data_sum_all))
    print("Insteval-random: %5.3f" % (instrandom_sum_all/data_sum_all))
    print("Insteval-entity: %5.3f" % (instent_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("Insteval-zero accuracy against human %5.3f" %(instzero_hum_all/data_sum_all))
    print("Insteval-random accuracy against human %5.3f" %(instrandom_hum_all/data_sum_all))
    print("Insteval-entity accuracy against human %5.3f" %(instent_hum_all/data_sum_all))
    print("------------------------------------")
    print()

Inference result by gpt35


100%|██████████| 3020/3020 [08:15<00:00,  6.09it/s]


QA Model: gpt35, NQ 437 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.822
Insteval-random: 0.586
Insteval-entity: 0.574
Human: 0.611

Accuracy against human
Insteval-zero accuracy against human 0.762
Insteval-random accuracy against human 0.924
Insteval-entity accuracy against human 0.922



100%|██████████| 3020/3020 [03:04<00:00, 16.33it/s]


QA Model: gpt35, NQ 144 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.826
Insteval-random: 0.604
Insteval-entity: 0.646
Human: 0.625

Accuracy against human
Insteval-zero accuracy against human 0.771
Insteval-random accuracy against human 0.896
Insteval-entity accuracy against human 0.896



100%|██████████| 3020/3020 [00:16<00:00, 178.07it/s]


QA Model: gpt35, NQ 14 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 0.643
Insteval-entity: 0.714
Human: 0.714

Accuracy against human
Insteval-zero accuracy against human 0.714
Insteval-random accuracy against human 0.929
Insteval-entity accuracy against human 0.857



100%|██████████| 3020/3020 [00:12<00:00, 245.78it/s]


QA Model: gpt35, NQ 11 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 0.818
Insteval-random: 0.636
Insteval-entity: 0.545
Human: 0.455

Accuracy against human
Insteval-zero accuracy against human 0.636
Insteval-random accuracy against human 0.818
Insteval-entity accuracy against human 0.909



100%|██████████| 3020/3020 [00:12<00:00, 239.51it/s]


QA Model: gpt35, NQ 10 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.800
Insteval-random: 0.400
Insteval-entity: 0.400
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 0.700
Insteval-random accuracy against human 0.900
Insteval-entity accuracy against human 0.900



100%|██████████| 3020/3020 [00:10<00:00, 279.67it/s]


QA Model: gpt35, NQ 9 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.778
Insteval-random: 0.444
Insteval-entity: 0.556
Human: 0.556

Accuracy against human
Insteval-zero accuracy against human 0.778
Insteval-random accuracy against human 0.889
Insteval-entity accuracy against human 1.000



100%|██████████| 3020/3020 [00:08<00:00, 367.88it/s]


QA Model: gpt35, NQ 7 data for answer_type TIME
Surface accuracy
Insteval-zero: 0.857
Insteval-random: 0.714
Insteval-entity: 0.429
Human: 0.571

Accuracy against human
Insteval-zero accuracy against human 0.714
Insteval-random accuracy against human 0.857
Insteval-entity accuracy against human 0.857

------------------------------------
All NQ 632 data inference by model: gpt35
Surface accuracy
Insteval-zero: 0.826
Insteval-random: 0.589
Insteval-entity: 0.589
Human: 0.611

Accuracy against human
Insteval-zero accuracy against human 0.759
Insteval-random accuracy against human 0.915
Insteval-entity accuracy against human 0.915
------------------------------------

Inference result by chatgpt


100%|██████████| 3020/3020 [08:20<00:00,  6.03it/s]


QA Model: chatgpt, NQ 437 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.892
Insteval-random: 0.581
Insteval-entity: 0.625
Human: 0.677

Accuracy against human
Insteval-zero accuracy against human 0.776
Insteval-random accuracy against human 0.844
Insteval-entity accuracy against human 0.856



100%|██████████| 3020/3020 [02:46<00:00, 18.19it/s]


QA Model: chatgpt, NQ 144 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.882
Insteval-random: 0.590
Insteval-entity: 0.694
Human: 0.674

Accuracy against human
Insteval-zero accuracy against human 0.792
Insteval-random accuracy against human 0.847
Insteval-entity accuracy against human 0.868



100%|██████████| 3020/3020 [00:15<00:00, 189.01it/s]


QA Model: chatgpt, NQ 14 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 0.357
Insteval-entity: 0.643
Human: 0.714

Accuracy against human
Insteval-zero accuracy against human 0.714
Insteval-random accuracy against human 0.643
Insteval-entity accuracy against human 0.929



100%|██████████| 3020/3020 [00:13<00:00, 225.75it/s]


QA Model: chatgpt, NQ 11 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 0.818
Insteval-random: 0.636
Insteval-entity: 0.727
Human: 0.727

Accuracy against human
Insteval-zero accuracy against human 0.727
Insteval-random accuracy against human 0.727
Insteval-entity accuracy against human 0.818



100%|██████████| 3020/3020 [00:13<00:00, 225.36it/s]


QA Model: chatgpt, NQ 10 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.900
Insteval-random: 0.400
Insteval-entity: 0.400
Human: 0.800

Accuracy against human
Insteval-zero accuracy against human 0.900
Insteval-random accuracy against human 0.600
Insteval-entity accuracy against human 0.600



100%|██████████| 3020/3020 [00:10<00:00, 283.61it/s]


QA Model: chatgpt, NQ 9 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.778
Insteval-random: 0.444
Insteval-entity: 0.333
Human: 0.444

Accuracy against human
Insteval-zero accuracy against human 0.667
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 0.889



100%|██████████| 3020/3020 [00:08<00:00, 374.48it/s]


QA Model: chatgpt, NQ 7 data for answer_type TIME
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 0.429
Insteval-entity: 0.286
Human: 0.714

Accuracy against human
Insteval-zero accuracy against human 0.714
Insteval-random accuracy against human 0.714
Insteval-entity accuracy against human 0.571

------------------------------------
All NQ 632 data inference by model: chatgpt
Surface accuracy
Insteval-zero: 0.891
Insteval-random: 0.573
Insteval-entity: 0.631
Human: 0.677

Accuracy against human
Insteval-zero accuracy against human 0.777
Insteval-random accuracy against human 0.835
Insteval-entity accuracy against human 0.853
------------------------------------

Inference result by newbing


100%|██████████| 3020/3020 [08:23<00:00,  6.00it/s]


QA Model: newbing, NQ 437 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.838
Insteval-random: 0.494
Insteval-entity: 0.538
Human: 0.691

Accuracy against human
Insteval-zero accuracy against human 0.730
Insteval-random accuracy against human 0.748
Insteval-entity accuracy against human 0.760



100%|██████████| 3020/3020 [02:46<00:00, 18.16it/s]


QA Model: newbing, NQ 144 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.910
Insteval-random: 0.597
Insteval-entity: 0.701
Human: 0.764

Accuracy against human
Insteval-zero accuracy against human 0.826
Insteval-random accuracy against human 0.792
Insteval-entity accuracy against human 0.840



100%|██████████| 3020/3020 [00:16<00:00, 181.31it/s]


QA Model: newbing, NQ 14 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 0.643
Insteval-entity: 0.643
Human: 0.857

Accuracy against human
Insteval-zero accuracy against human 0.857
Insteval-random accuracy against human 0.643
Insteval-entity accuracy against human 0.643



100%|██████████| 3020/3020 [00:14<00:00, 205.08it/s]


QA Model: newbing, NQ 11 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 0.909
Insteval-random: 0.455
Insteval-entity: 0.545
Human: 0.727

Accuracy against human
Insteval-zero accuracy against human 0.636
Insteval-random accuracy against human 0.545
Insteval-entity accuracy against human 0.636



100%|██████████| 3020/3020 [00:13<00:00, 221.00it/s]


QA Model: newbing, NQ 10 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.800
Insteval-random: 0.400
Insteval-entity: 0.400
Human: 0.600

Accuracy against human
Insteval-zero accuracy against human 0.800
Insteval-random accuracy against human 0.800
Insteval-entity accuracy against human 0.800



100%|██████████| 3020/3020 [00:10<00:00, 291.85it/s]


QA Model: newbing, NQ 9 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.667
Insteval-random: 0.222
Insteval-entity: 0.222
Human: 0.444

Accuracy against human
Insteval-zero accuracy against human 0.778
Insteval-random accuracy against human 0.778
Insteval-entity accuracy against human 0.778



100%|██████████| 3020/3020 [00:08<00:00, 363.73it/s]

QA Model: newbing, NQ 7 data for answer_type TIME
Surface accuracy
Insteval-zero: 0.714
Insteval-random: 0.429
Insteval-entity: 0.286
Human: 0.714

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 0.714
Insteval-entity accuracy against human 0.571

------------------------------------
All NQ 632 data inference by model: newbing
Surface accuracy
Insteval-zero: 0.854
Insteval-random: 0.514
Insteval-entity: 0.568
Human: 0.707

Accuracy against human
Insteval-zero accuracy against human 0.758
Insteval-random accuracy against human 0.753
Insteval-entity accuracy against human 0.772
------------------------------------






In [12]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/NQ_instruction_test4.json', 'w') as f:
    json.dump(nq, f)

In [24]:
nq[7]

{'question': 'how many episodes are there in dragon ball z',
 'golden_answer': '291 episodes/291',
 'answer_fid': '291',
 'judge_fid': True,
 'answer_gpt35': 'There are 291 episodes in Dragon Ball Z.',
 'judge_gpt35': True,
 'answer_chatgpt': 'There are a total of 291 episodes in Dragon Ball Z.',
 'judge_chatgpt': True,
 'answer_gpt4': 'There are a total of 291 episodes in the Dragon Ball Z anime series, which originally aired in Japan between 1989 and 1996.',
 'judge_gpt4': True,
 'answer_newbing': 'According to the Dragon Ball Wiki and Wikipedia, Dragon Ball Z is an anime series that ran from 1989 to 1996 and has a total of 291 episodes.  However, some episodes were skipped or edited in the United States broadcast, so there are only 276 episodes originally aired in the US.',
 'judge_newbing': True,
 'improper': False,
 'ans_type': 'QUANTITY',
 'em_gpt35': 1,
 'f1_gpt35': 0.4,
 'bem_gpt35': 1,
 'em_chatgpt': 1,
 'f1_chatgpt': 0.33333333333333337,
 'bem_chatgpt': 1,
 'em_newbing': 1,
 

### SPEARMAN

In [32]:
human=[]
instzero=[]
instrandom=[]
instent=[]
ems=[]
f1=[]
bem=[]
model_type='newbing'
for d in nq:
    if 'instzero{}'.format(model_type) in d.keys():
        hum=1 if d['judge_{}'.format(model_type)] else 0
        human.append(hum)
        ems.append(d['em_{}'.format(model_type)])
        f1.append(d['f1_{}'.format(model_type)])
        bem.append(d['bem_{}'.format(model_type)])
        instzero.append(d['instzero{}'.format(model_type)])
        instrandom.append(d['instrandom_{}'.format(model_type)])
        instent.append(d['instentity_{}'.format(model_type)])
print('Spearman against Human')
print('EM: ', spearman(human,ems))
print('F1: ',spearman(human,f1))
print('bem: ',spearman(human,bem))
print('Instzero: ',spearman(human,instzero))
print('Instrandom: ',spearman(human,instrandom))
print('Instent: ',spearman(human,instent))

Spearman against Human
EM:  0.6332731732210375
F1:  0.4602421450685009
bem:  0.4524511604751374
Instzero:  0.34579421397430693
Instrandom:  0.5436374747428894
Instent:  0.5411656120130601


In [45]:
nq[7]

{'question': 'how many episodes are there in dragon ball z',
 'golden_answer': '291 episodes/291',
 'answer_fid': '291',
 'judge_fid': True,
 'answer_gpt35': 'There are 291 episodes in Dragon Ball Z.',
 'judge_gpt35': True,
 'answer_chatgpt': 'There are a total of 291 episodes in Dragon Ball Z.',
 'judge_chatgpt': True,
 'answer_gpt4': 'There are a total of 291 episodes in the Dragon Ball Z anime series, which originally aired in Japan between 1989 and 1996.',
 'judge_gpt4': True,
 'answer_newbing': 'According to the Dragon Ball Wiki and Wikipedia, Dragon Ball Z is an anime series that ran from 1989 to 1996 and has a total of 291 episodes.  However, some episodes were skipped or edited in the United States broadcast, so there are only 276 episodes originally aired in the US.',
 'judge_newbing': True,
 'improper': False,
 'ans_type': 'QUANTITY',
 'em_gpt35': 1,
 'f1_gpt35': 0.4,
 'bem_gpt35': 1,
 'em_chatgpt': 1,
 'f1_chatgpt': 0.33333333333333337,
 'bem_chatgpt': 1,
 'em_newbing': 1,
 

In [71]:
nq[0]

{'question': 'who got the first nobel prize in physics',
 'golden_answer': 'Wilhelm Conrad Röntgen',
 'answer_fid': 'Wilhelm Röntgen',
 'judge_fid': True,
 'answer_gpt35': 'The first Nobel Prize in Physics was awarded to Wilhelm Röntgen in 1901.',
 'judge_gpt35': True,
 'answer_chatgpt': 'The first Nobel Prize in Physics was awarded in 1901 to Wilhelm Röntgen for his discovery of X-rays.',
 'judge_chatgpt': True,
 'answer_gpt4': 'The first Nobel Prize in Physics was awarded in 1901 to Wilhelm Conrad Röntgen, a German physicist. He received the prize for his discovery of X-rays, a groundbreaking achievement that revolutionized the fields of medicine, physics, and chemistry.',
 'judge_gpt4': True,
 'answer_newbing': 'According to Wikipedia, Wilhelm Conrad Röntgen of Germany got the first Nobel Prize in Physics in 1901 for his discovery of X-rays.  He received 150,782 SEK (Swedish krona) as the prize money.',
 'judge_newbing': True,
 'improper': False,
 'ans_type': 'PERSON'}

In [69]:
tq[100]['golden_answer']

'Petula Clark/Petula Clarke/Pet clark/Petula Sally Olwen Clark'

In [65]:
model_type=''
instzero=0
instrandom=0
instent=0
hard_em=0
cnt=0
for d in tq:
    if 'instzero{}'.format(model_type) in d.keys():
        cnt+=1
        hum=1 if d['judge_{}'.format(model_type)] else 0
        em=hard_exact_match_score(d['answer_{}'.format(model_type)], d['golden_answer'])
        if em==hum:
            hard_em+=1
        if d['em_{}'.format(model_type)]==1 and hum==1:
            instzero+=1
            instrandom+=1
            instent+=1
        else:
            if hum==d['instzero{}'.format(model_type)]:
                instzero+=1
            if hum==d['instrandom_{}'.format(model_type)]:
                instrandom+=1
            if hum==d['instentity_{}'.format(model_type)]:
                instent+=1
print('Acc against Human')
print('Hard EM: %5.3f'%(hard_em/cnt))
print('Inst-zero: %5.3f'%(instzero/cnt))
print('Inst-random: %5.3f'%(instrandom/cnt))
print('Inst-entity: %5.3f'%(instent/cnt))

Acc against Human
Hard EM: 0.391
Inst-zero: 0.761
Inst-random: 0.935
Inst-entity: 0.935


### NQ Nonnumeric

In [None]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    data_sum_all=0
    em_sum_all=0
    f1_sum_all=0
    hum_sum_all=0
    bem_sum_all=0
    instzero_sum_all=0
    instrandom_sum_all=0
    instent_sum_all=0
    
    em_hum_all=0
    f1_hum_all=0
    bem_hum_all=0
    instzero_hum_all=0
    instrandom_hum_all=0
    instent_hum_all=0
    
    for ans_type in nonnumeric_type:
        data_sum=0
        em_sum=0
        f1_sum=0
        hum_sum=0
        bem_sum=0
        instzero_sum=0
        instrandom_sum=0
        instent_sum=0
        
        em_hum=0
        f1_hum=0
        bem_hum=0
        instzero_hum=0
        instrandom_hum=0
        instent_hum=0
        
        for d in tqdm(nq):
            if d['ans_type']==ans_type:
                if '/' in d['golden_answer']:
                    answer_set=d['golden_answer'].split('/')
#                     em=metric_max_over_ground_truths(soft_exact_match_score, d['answer_{}'.format(model_type)],answer_set)
#                     f1=metric_max_over_ground_truths(f1_score, d['answer_{}'.format(model_type)],answer_set)
#                     bem=bem_score(d['answer_{}'.format(model_type)], answer_set, d['question'])
                    instrandom=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="random")
                    instent=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="entity")
                else:
#                     em=soft_exact_match_score(d['answer_{}'.format(model_type)], d['golden_answer'])
#                     f1=f1_score(d['answer_{}'.format(model_type)], d['golden_answer'])
#                     bem=bem_score(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'])
                    instrandom=insteval(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'],d['ans_type'], eval_type="random")
                    instent=insteval(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'],d['ans_type'], eval_type="entity")
    
                if instent==None:
                    instent=0
                if instrandom==None:
                    instrandom=0
                
                d['instrandom_{}'.format(model_type)]=instrandom
                d['instentity_{}'.format(model_type)]=instent
                
                
                data_sum+=1
                em=d['em_{}'.format(model_type)]
                f1=d['f1_{}'.format(model_type)]                
                bem=d['bem_{}'.format(model_type)]
                human=1 if d['judge_{}'.format(model_type)] else 0
                instzero=d['instzero_{}'.format(model_type)]
                
                em_sum+=em
                f1_sum+=f1
                hum_sum+=human
                bem_sum+=bem
                instzero_sum+=instzero
                instrandom_sum+=instrandom
                instent_sum+=instent
                
                if human==em:
                    em_hum+=1
                if abs(human-f1)<=0.5:
                    f1_hum+=1
                if human==bem:
                    bem_hum+=1
                if human==instzero:
                    instzero_hum+=1
                if human==instrandom:
                    instrandom_hum+=1
                if human==instent:
                    instent_hum+=1
                    
        data_sum_all+=data_sum
        em_sum_all+=em_sum
        f1_sum_all+=f1_sum
        hum_sum_all+=hum_sum
        bem_sum_all+=bem_sum
        instzero_sum_all+=instzero_sum
        instrandom_sum_all+=instrandom_sum
        instent_sum_all+=instent_sum

        em_hum_all+=em_hum
        f1_hum_all+=f1_hum
        bem_hum_all+=bem_hum
        instzero_hum_all+=instzero_hum
        instrandom_hum_all+=instrandom_hum
        instent_hum_all+=instent_hum
        
        print("QA Model: {}, NQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        print("EM: %5.3f" %(em_sum/data_sum))
        print("F1: %5.3f" %(f1_sum/data_sum))
        print("BEM: %5.3f" % (bem_sum/data_sum))
        print("Insteval-zero: %5.3f" % (instzero_sum/data_sum))
        print("Insteval-random: %5.3f" % (instrandom_sum/data_sum))
        print("Insteval-entity: %5.3f" % (instent_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("EM accuracy against human %5.3f"%(em_hum/data_sum))
        print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum/data_sum))
        print("BEM accuracy against human %5.3f" %(bem_hum/data_sum))
        print("Insteval-zero accuracy against human %5.3f" %(instzero_hum/data_sum))
        print("Insteval-random accuracy against human %5.3f" %(instrandom_hum/data_sum))
        print("Insteval-entity accuracy against human %5.3f" %(instent_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All NQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("EM: %5.3f" %(em_sum_all/data_sum_all))
    print("F1: %5.3f" %(f1_sum_all/data_sum_all))
    print("BEM: %5.3f" % (bem_sum_all/data_sum_all))
    print("Insteval-zero: %5.3f" % (instzero_sum_all/data_sum_all))
    print("Insteval-random: %5.3f" % (instrandom_sum_all/data_sum_all))
    print("Insteval-entity: %5.3f" % (instent_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("EM accuracy against human %5.3f"%(em_hum_all/data_sum_all))
    print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum_all/data_sum_all))
    print("BEM accuracy against human %5.3f" %(bem_hum_all/data_sum_all))
    print("Insteval-zero accuracy against human %5.3f" %(instzero_hum_all/data_sum_all))
    print("Insteval-random accuracy against human %5.3f" %(instrandom_hum_all/data_sum_all))
    print("Insteval-entity accuracy against human %5.3f" %(instentity_hum_all/data_sum_all))
    print("------------------------------------")
    print()

In [None]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/NQ_labeled.json', 'w') as f:
    json.dump(nq, f)

### Error Sample

In [43]:
for d in nq:
    if d['ans_type'] in numeric_type or d['ans_type'] in nonnumeric_type:
        human=1 if d['judge_{}'.format('gpt35')] else 0
        if human!=d['instzero_gpt35']:
            print(d['question'])
            print(d['golden_answer'])
            print(d['answer_gpt35'])
            print(human)
            print(d['instzero_gpt35'])
            print()

who designed the garden city of new earswick
planner Raymond Unwin/architect Barry Parker/Raymond Unwin
The Garden City of Letchworth, known as the world's first Garden City, was planned by Ebenezer Howard and laid out by Barry Parker and Raymond Unwin.
1
0

when did beds become popular in france and germany
the 12th century
Beds became popular in France and Germany during the Middle Ages (around the 11th century).
0
1

who said if a tree falls in the woods and nobody hears it
Philosopher George Berkeley/Charles Riborg Mann/George Ransom Twiss
, does it make a sound

This phrase is attributed to the 18th-century philosopher George Berkeley. He wrote, “For, suppose the mind of man were removed out of the world, all the things in the world would be no more to him than if they had never been created; and to suppose any real existence without the mind, is to suppose a chimera. To which I answer, that though an audible sound hath not existence without the ear to hear it, yet the
1
0

where 

### TriviaQA

In [17]:
data_dir='/home/donaldo9603/workspace/numeric/data/evouna/TQ_labeled.json'

In [18]:
with open(data_dir) as f:
    tq=json.load(f)

In [19]:
tq=[d for d in tq if d['improper']!=True]

In [16]:
for d in tq:
    doc = nlp(d['golden_answer'])
    if len(doc.ents)==0:
        d['ans_type']='unknown'
    else:
        l=[]
        for tag in doc.ents:
            l.append(tag.label_)
        if all_same(l):
            d['ans_type']=l[0]
        else:
            d['ans_type']='multiple'

NameError: name 'nlp' is not defined

### TriviaQA numeric EM, F1

In [20]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    data_sum_all=0
    em_sum_all=0
    f1_sum_all=0
    hum_sum_all=0
    bem_sum_all=0
    
    em_hum_all=0
    f1_hum_all=0
    bem_hum_all=0
    
    for ans_type in numeric_type:
        data_sum=0
        em_sum=0
        f1_sum=0
        hum_sum=0
        bem_sum=0
        
        em_hum=0
        f1_hum=0
        bem_hum=0
        
        for d in tqdm(tq):
            if d['ans_type']==ans_type:
                if '/' in d['golden_answer']:
                    answer_set=d['golden_answer'].split('/')
                    em=metric_max_over_ground_truths(soft_exact_match_score, d['answer_{}'.format(model_type)],answer_set)
                    f1=metric_max_over_ground_truths(f1_score, d['answer_{}'.format(model_type)],answer_set)
                    bem=bem_score(d['answer_{}'.format(model_type)], answer_set, d['question'])
                    
                else:
                    em=soft_exact_match_score(d['answer_{}'.format(model_type)], d['golden_answer'])
                    f1=f1_score(d['answer_{}'.format(model_type)], d['golden_answer'])
                    bem=bem_score(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'])
                
                data_sum+=1
                d['em_{}'.format(model_type)]=em
                d['f1_{}'.format(model_type)]=f1                
                d['bem_{}'.format(model_type)]=bem
                human=1 if d['judge_{}'.format(model_type)] else 0
                
                em_sum+=em
                f1_sum+=f1
                hum_sum+=human
                bem_sum+=bem

                
                if human==em:
                    em_hum+=1
                if abs(human-f1)<=0.5:
                    f1_hum+=1
                if human==bem:
                    bem_hum+=1
                    
        data_sum_all+=data_sum
        em_sum_all+=em_sum
        f1_sum_all+=f1_sum
        hum_sum_all+=hum_sum
        bem_sum_all+=bem_sum

        em_hum_all+=em_hum
        f1_hum_all+=f1_hum
        bem_hum_all+=bem_hum
    
        
        print("QA Model: {}, TQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        print("EM: %5.3f" %(em_sum/data_sum))
        print("F1: %5.3f" %(f1_sum/data_sum))
        print("BEM: %5.3f" % (bem_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("EM accuracy against human %5.3f"%(em_hum/data_sum))
        print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum/data_sum))
        print("BEM accuracy against human %5.3f" %(bem_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All TQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("EM: %5.3f" %(em_sum_all/data_sum_all))
    print("F1: %5.3f" %(f1_sum_all/data_sum_all))
    print("BEM: %5.3f" % (bem_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("EM accuracy against human %5.3f"%(em_hum_all/data_sum_all))
    print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum_all/data_sum_all))
    print("BEM accuracy against human %5.3f" %(bem_hum_all/data_sum_all))
    print("------------------------------------")
    print()

Inference result by gpt35


100%|██████████| 1938/1938 [00:01<00:00, 1643.82it/s]


QA Model: gpt35, TQ 36 data for answer_type DATE
Surface accuracy
EM: 0.556
F1: 0.254
BEM: 0.861
Human: 0.778

Accuracy against human
EM accuracy against human 0.778
F1 th 0.5 accuracy against human 0.389
BEM accuracy against human 0.750



100%|██████████| 1938/1938 [00:01<00:00, 1577.57it/s]


QA Model: gpt35, TQ 47 data for answer_type CARDINAL
Surface accuracy
EM: 0.617
F1: 0.293
BEM: 0.872
Human: 0.723

Accuracy against human
EM accuracy against human 0.894
F1 th 0.5 accuracy against human 0.489
BEM accuracy against human 0.851



100%|██████████| 1938/1938 [00:00<00:00, 40707.94it/s]


QA Model: gpt35, TQ 2 data for answer_type QUANTITY
Surface accuracy
EM: 0.000
F1: 0.143
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 0.500
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 11911.34it/s]


QA Model: gpt35, TQ 1 data for answer_type ORDINAL
Surface accuracy
EM: 1.000
F1: 0.286
BEM: 1.000
Human: 1.000

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.000
BEM accuracy against human 1.000



100%|██████████| 1938/1938 [00:00<00:00, 28429.99it/s]


QA Model: gpt35, TQ 3 data for answer_type MONEY
Surface accuracy
EM: 0.000
F1: 0.000
BEM: 1.000
Human: 0.333

Accuracy against human
EM accuracy against human 0.667
F1 th 0.5 accuracy against human 0.667
BEM accuracy against human 0.333



100%|██████████| 1938/1938 [00:00<00:00, 29103.23it/s]


QA Model: gpt35, TQ 2 data for answer_type PERCENT
Surface accuracy
EM: 1.000
F1: 0.177
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 0.500
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 29184.63it/s]


QA Model: gpt35, TQ 2 data for answer_type TIME
Surface accuracy
EM: 0.500
F1: 0.111
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500

------------------------------------
All TQ 93 data inference by model: gpt35
Surface accuracy
EM: 0.570
F1: 0.259
BEM: 0.882
Human: 0.720

Accuracy against human
EM accuracy against human 0.828
F1 th 0.5 accuracy against human 0.452
BEM accuracy against human 0.774
------------------------------------

Inference result by chatgpt


100%|██████████| 1938/1938 [00:01<00:00, 1666.52it/s]


QA Model: chatgpt, TQ 36 data for answer_type DATE
Surface accuracy
EM: 0.583
F1: 0.352
BEM: 0.944
Human: 0.750

Accuracy against human
EM accuracy against human 0.833
F1 th 0.5 accuracy against human 0.528
BEM accuracy against human 0.806



100%|██████████| 1938/1938 [00:01<00:00, 1573.10it/s]


QA Model: chatgpt, TQ 47 data for answer_type CARDINAL
Surface accuracy
EM: 0.638
F1: 0.224
BEM: 0.915
Human: 0.766

Accuracy against human
EM accuracy against human 0.872
F1 th 0.5 accuracy against human 0.319
BEM accuracy against human 0.851



100%|██████████| 1938/1938 [00:00<00:00, 38023.91it/s]


QA Model: chatgpt, TQ 2 data for answer_type QUANTITY
Surface accuracy
EM: 0.000
F1: 0.202
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 0.500
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 11233.45it/s]


QA Model: chatgpt, TQ 1 data for answer_type ORDINAL
Surface accuracy
EM: 1.000
F1: 1.000
BEM: 1.000
Human: 1.000

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 1.000
BEM accuracy against human 1.000



100%|██████████| 1938/1938 [00:00<00:00, 28013.10it/s]


QA Model: chatgpt, TQ 3 data for answer_type MONEY
Surface accuracy
EM: 0.000
F1: 0.000
BEM: 0.667
Human: 0.333

Accuracy against human
EM accuracy against human 0.667
F1 th 0.5 accuracy against human 0.667
BEM accuracy against human 0.667



100%|██████████| 1938/1938 [00:00<00:00, 29389.34it/s]


QA Model: chatgpt, TQ 2 data for answer_type PERCENT
Surface accuracy
EM: 0.500
F1: 0.077
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 29199.73it/s]


QA Model: chatgpt, TQ 2 data for answer_type TIME
Surface accuracy
EM: 0.500
F1: 0.091
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500

------------------------------------
All TQ 93 data inference by model: chatgpt
Surface accuracy
EM: 0.581
F1: 0.268
BEM: 0.925
Human: 0.731

Accuracy against human
EM accuracy against human 0.849
F1 th 0.5 accuracy against human 0.430
BEM accuracy against human 0.806
------------------------------------

Inference result by newbing


100%|██████████| 1938/1938 [00:01<00:00, 1633.00it/s]


QA Model: newbing, TQ 36 data for answer_type DATE
Surface accuracy
EM: 0.667
F1: 0.068
BEM: 0.944
Human: 0.750

Accuracy against human
EM accuracy against human 0.806
F1 th 0.5 accuracy against human 0.250
BEM accuracy against human 0.750



100%|██████████| 1938/1938 [00:01<00:00, 1545.43it/s]


QA Model: newbing, TQ 47 data for answer_type CARDINAL
Surface accuracy
EM: 0.681
F1: 0.062
BEM: 0.894
Human: 0.830

Accuracy against human
EM accuracy against human 0.851
F1 th 0.5 accuracy against human 0.170
BEM accuracy against human 0.851



100%|██████████| 1938/1938 [00:00<00:00, 38378.29it/s]


QA Model: newbing, TQ 2 data for answer_type QUANTITY
Surface accuracy
EM: 0.000
F1: 0.000
BEM: 0.500
Human: 0.000

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 1.000
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 11494.38it/s]


QA Model: newbing, TQ 1 data for answer_type ORDINAL
Surface accuracy
EM: 1.000
F1: 0.154
BEM: 1.000
Human: 1.000

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.000
BEM accuracy against human 1.000



100%|██████████| 1938/1938 [00:00<00:00, 26534.44it/s]


QA Model: newbing, TQ 3 data for answer_type MONEY
Surface accuracy
EM: 0.667
F1: 0.043
BEM: 0.667
Human: 0.667

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.333
BEM accuracy against human 1.000



100%|██████████| 1938/1938 [00:00<00:00, 27204.57it/s]


QA Model: newbing, TQ 2 data for answer_type PERCENT
Surface accuracy
EM: 0.500
F1: 0.013
BEM: 1.000
Human: 0.500

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.500
BEM accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 27422.91it/s]

QA Model: newbing, TQ 2 data for answer_type TIME
Surface accuracy
EM: 1.000
F1: 0.031
BEM: 1.000
Human: 1.000

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.000
BEM accuracy against human 1.000

------------------------------------
All TQ 93 data inference by model: newbing
Surface accuracy
EM: 0.667
F1: 0.062
BEM: 0.903
Human: 0.774

Accuracy against human
EM accuracy against human 0.849
F1 th 0.5 accuracy against human 0.226
BEM accuracy against human 0.806
------------------------------------






In [21]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    hum_sum_all=0
    instzero_sum_all=0
    instrandom_sum_all=0
    instent_sum_all=0
    
    instzero_hum_all=0
    instrandom_hum_all=0
    instent_hum_all=0
    data_sum_all=0
    for ans_type in numeric_type:
        data_sum=0
        instzero_sum=0
        instrandom_sum=0
        instent_sum=0
        hum_sum=0
        
        instzero_hum=0
        instrandom_hum=0
        instent_hum=0
        
        for d in tqdm(tq):
            if d['ans_type']==ans_type:
                if 'instzero_{}'.format(model_type) in d.keys():
                    instzero= d['instzero_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instzero=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="zero")
                    if instzero==None:
                        instzero=0
                
                
                if 'instrandom_{}'.format(model_type) in d.keys():
                        instrandom= d['instrandom_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instrandom=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="random")
                    if instrandom==None:
                        instrandom=0
                        
                if 'instentity_{}'.format(model_type) in d.keys():
                    instent=d['instentity_{}'.format(model_type)]
                else:
                    if '/' in d['golden_answer']:
                        answer_set=d['golden_answer'].split('/')
                    else:
                        answer_set=[d['golden_answer']]
                    instent=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="entity")
                    if instent==None:
                        instent=0
             
                d['instzero{}'.format(model_type)]=instzero
                d['instrandom_{}'.format(model_type)]=instrandom
                d['instentity_{}'.format(model_type)]=instent
                
                data_sum+=1
                human=1 if d['judge_{}'.format(model_type)] else 0
                hum_sum+=human
                
                instzero_sum+=instzero
                instrandom_sum+=instrandom
                instent_sum+=instent
                
                if human==instzero:
                    instzero_hum+=1
                if human==instrandom:
                    instrandom_hum+=1
                if human==instent:
                    instent_hum+=1
                    
        data_sum_all+=data_sum
        
        instzero_sum_all+=instzero_sum
        instrandom_sum_all+=instrandom_sum
        instent_sum_all+=instent_sum
        
        instzero_hum_all+=instzero_hum
        instrandom_hum_all+=instrandom_hum
        instent_hum_all+=instent_hum
        hum_sum_all+=hum_sum
        print("QA Model: {}, TQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        
        print("Insteval-zero: %5.3f" % (instzero_sum/data_sum))
        print("Insteval-random: %5.3f" % (instrandom_sum/data_sum))
        print("Insteval-entity: %5.3f" % (instent_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("Insteval-zero accuracy against human %5.3f" %(instzero_hum/data_sum))
        print("Insteval-random accuracy against human %5.3f" %(instrandom_hum/data_sum))
        print("Insteval-entity accuracy against human %5.3f" %(instent_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All TQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("Insteval-zero: %5.3f" % (instzero_sum_all/data_sum_all))
    print("Insteval-random: %5.3f" % (instrandom_sum_all/data_sum_all))
    print("Insteval-entity: %5.3f" % (instent_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("Insteval-zero accuracy against human %5.3f" %(instzero_hum_all/data_sum_all))
    print("Insteval-random accuracy against human %5.3f" %(instrandom_hum_all/data_sum_all))
    print("Insteval-entity accuracy against human %5.3f" %(instent_hum_all/data_sum_all))
    print("------------------------------------")
    print()

Inference result by gpt35


100%|██████████| 1938/1938 [00:28<00:00, 68.40it/s] 


QA Model: gpt35, TQ 36 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.556
Insteval-random: 0.694
Insteval-entity: 0.694
Human: 0.778

Accuracy against human
Insteval-zero accuracy against human 0.778
Insteval-random accuracy against human 0.861
Insteval-entity accuracy against human 0.861



100%|██████████| 1938/1938 [00:34<00:00, 55.73it/s] 


QA Model: gpt35, TQ 47 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.617
Insteval-random: 0.702
Insteval-entity: 0.787
Human: 0.723

Accuracy against human
Insteval-zero accuracy against human 0.894
Insteval-random accuracy against human 0.979
Insteval-entity accuracy against human 0.936



100%|██████████| 1938/1938 [00:01<00:00, 1207.48it/s]


QA Model: gpt35, TQ 2 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 0.500
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:01<00:00, 1766.03it/s]


QA Model: gpt35, TQ 1 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 1.000
Insteval-entity: 1.000
Human: 1.000

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:02<00:00, 748.79it/s]


QA Model: gpt35, TQ 3 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.333
Insteval-random: 0.333
Insteval-entity: 0.667
Human: 0.333

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 0.667



100%|██████████| 1938/1938 [00:01<00:00, 1281.22it/s]


QA Model: gpt35, TQ 2 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 0.500
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:01<00:00, 1120.77it/s]


QA Model: gpt35, TQ 2 data for answer_type TIME
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 1.000
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 0.500
Insteval-entity accuracy against human 1.000

------------------------------------
All TQ 93 data inference by model: gpt35
Surface accuracy
Insteval-zero: 0.581
Insteval-random: 0.688
Insteval-entity: 0.731
Human: 0.720

Accuracy against human
Insteval-zero accuracy against human 0.860
Insteval-random accuracy against human 0.925
Insteval-entity accuracy against human 0.903
------------------------------------

Inference result by chatgpt


100%|██████████| 1938/1938 [00:27<00:00, 70.85it/s] 


QA Model: chatgpt, TQ 36 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.778
Insteval-random: 0.833
Insteval-entity: 0.806
Human: 0.750

Accuracy against human
Insteval-zero accuracy against human 0.917
Insteval-random accuracy against human 0.917
Insteval-entity accuracy against human 0.889



100%|██████████| 1938/1938 [00:34<00:00, 56.45it/s] 


QA Model: chatgpt, TQ 47 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.723
Insteval-random: 0.766
Insteval-entity: 0.851
Human: 0.766

Accuracy against human
Insteval-zero accuracy against human 0.915
Insteval-random accuracy against human 0.915
Insteval-entity accuracy against human 0.915



100%|██████████| 1938/1938 [00:01<00:00, 1100.03it/s]


QA Model: chatgpt, TQ 2 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 1.000
Insteval-entity: 1.000
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 0.500
Insteval-entity accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 2460.10it/s]


QA Model: chatgpt, TQ 1 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 1.000
Insteval-entity: 1.000
Human: 1.000

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:02<00:00, 774.33it/s]


QA Model: chatgpt, TQ 3 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.667
Insteval-random: 0.333
Insteval-entity: 0.333
Human: 0.333

Accuracy against human
Insteval-zero accuracy against human 0.667
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:01<00:00, 1224.30it/s]


QA Model: chatgpt, TQ 2 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 0.500
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:02<00:00, 762.52it/s]


QA Model: chatgpt, TQ 2 data for answer_type TIME
Surface accuracy
Insteval-zero: 0.000
Insteval-random: 0.500
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 0.500
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000

------------------------------------
All TQ 93 data inference by model: chatgpt
Surface accuracy
Insteval-zero: 0.720
Insteval-random: 0.774
Insteval-entity: 0.806
Human: 0.731

Accuracy against human
Insteval-zero accuracy against human 0.903
Insteval-random accuracy against human 0.914
Insteval-entity accuracy against human 0.903
------------------------------------

Inference result by newbing


100%|██████████| 1938/1938 [00:27<00:00, 69.79it/s] 


QA Model: newbing, TQ 36 data for answer_type DATE
Surface accuracy
Insteval-zero: 0.444
Insteval-random: 0.778
Insteval-entity: 0.806
Human: 0.750

Accuracy against human
Insteval-zero accuracy against human 0.639
Insteval-random accuracy against human 0.806
Insteval-entity accuracy against human 0.833



100%|██████████| 1938/1938 [00:37<00:00, 51.93it/s] 


QA Model: newbing, TQ 47 data for answer_type CARDINAL
Surface accuracy
Insteval-zero: 0.489
Insteval-random: 0.638
Insteval-entity: 0.809
Human: 0.830

Accuracy against human
Insteval-zero accuracy against human 0.617
Insteval-random accuracy against human 0.766
Insteval-entity accuracy against human 0.851



100%|██████████| 1938/1938 [00:01<00:00, 1143.29it/s]


QA Model: newbing, TQ 2 data for answer_type QUANTITY
Surface accuracy
Insteval-zero: 0.000
Insteval-random: 0.000
Insteval-entity: 0.500
Human: 0.000

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 0.500



100%|██████████| 1938/1938 [00:00<00:00, 2430.82it/s]


QA Model: newbing, TQ 1 data for answer_type ORDINAL
Surface accuracy
Insteval-zero: 1.000
Insteval-random: 1.000
Insteval-entity: 1.000
Human: 1.000

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:02<00:00, 842.07it/s]


QA Model: newbing, TQ 3 data for answer_type MONEY
Surface accuracy
Insteval-zero: 0.667
Insteval-random: 0.667
Insteval-entity: 0.667
Human: 0.667

Accuracy against human
Insteval-zero accuracy against human 1.000
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:02<00:00, 818.08it/s]


QA Model: newbing, TQ 2 data for answer_type PERCENT
Surface accuracy
Insteval-zero: 0.000
Insteval-random: 0.500
Insteval-entity: 0.500
Human: 0.500

Accuracy against human
Insteval-zero accuracy against human 0.500
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 1.000



100%|██████████| 1938/1938 [00:01<00:00, 1034.76it/s]

QA Model: newbing, TQ 2 data for answer_type TIME
Surface accuracy
Insteval-zero: 0.500
Insteval-random: 1.000
Insteval-entity: 0.500
Human: 1.000

Accuracy against human
Insteval-zero accuracy against human 0.500
Insteval-random accuracy against human 1.000
Insteval-entity accuracy against human 0.500

------------------------------------
All TQ 93 data inference by model: newbing
Surface accuracy
Insteval-zero: 0.462
Insteval-random: 0.688
Insteval-entity: 0.785
Human: 0.774

Accuracy against human
Insteval-zero accuracy against human 0.645
Insteval-random accuracy against human 0.806
Insteval-entity accuracy against human 0.839
------------------------------------






In [22]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/TQ_instruction_test4.json', 'w') as f:
    json.dump(tq, f)

In [35]:
human=[]
instzero=[]
instrandom=[]
instent=[]
ems=[]
f1=[]
bem=[]
model_type='newbing'
for d in tq:
    if 'instzero{}'.format(model_type) in d.keys():
        hum=1 if d['judge_{}'.format(model_type)] else 0
        human.append(hum)
        ems.append(d['em_{}'.format(model_type)])
        f1.append(d['f1_{}'.format(model_type)])
        bem.append(d['bem_{}'.format(model_type)])
        instzero.append(d['instzero{}'.format(model_type)])
        instrandom.append(d['instrandom_{}'.format(model_type)])
        instent.append(d['instentity_{}'.format(model_type)])
print('Spearman against Human')
print('EM: ', spearman(human,ems))
print('F1: ',spearman(human,f1))
print('bem: ',spearman(human,bem))
print('Instzero: ',spearman(human,instzero))
print('Instrandom: ',spearman(human,instrandom))
print('Instent: ',spearman(human,instent))

Spearman against Human
EM:  0.6546536707079773
F1:  0.42435459843306284
bem:  0.34513545272200535
Instzero:  0.3976710995697338
Instrandom:  0.5247156210404595
Instent:  0.5310368754417438


In [25]:
for model_type in model_types:
    print("Inference result by {}".format(model_type))
    data_sum_all=0
    em_sum_all=0
    f1_sum_all=0
    hum_sum_all=0
    bem_sum_all=0
    instzero_sum_all=0
    instrandom_sum_all=0
    instent_sum_all=0
    
    em_hum_all=0
    f1_hum_all=0
    bem_hum_all=0
    instzero_hum_all=0
    instrandom_hum_all=0
    instent_hum_all=0
    
    for ans_type in nonnumeric_type:
        data_sum=0
        em_sum=0
        f1_sum=0
        hum_sum=0
        bem_sum=0
        instzero_sum=0
        instrandom_sum=0
        instent_sum=0
        
        em_hum=0
        f1_hum=0
        bem_hum=0
        instzero_hum=0
        instrandom_hum=0
        instent_hum=0
        
        for d in tqdm(tq):
            if d['ans_type']==ans_type:
                if '/' in d['golden_answer']:
                    answer_set=d['golden_answer'].split('/')
#                     em=metric_max_over_ground_truths(soft_exact_match_score, d['answer_{}'.format(model_type)],answer_set)
#                     f1=metric_max_over_ground_truths(f1_score, d['answer_{}'.format(model_type)],answer_set)
#                     bem=bem_score(d['answer_{}'.format(model_type)], answer_set, d['question'])
                    instrandom=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="random")
                    instent=insteval(d['answer_{}'.format(model_type)], answer_set, d['question'], d['ans_type'],eval_type="entity")
                else:
#                     em=soft_exact_match_score(d['answer_{}'.format(model_type)], d['golden_answer'])
#                     f1=f1_score(d['answer_{}'.format(model_type)], d['golden_answer'])
#                     bem=bem_score(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'])
                    instrandom=insteval(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'],d['ans_type'], eval_type="random")
                    instent=insteval(d['answer_{}'.format(model_type)], [d['golden_answer']], d['question'],d['ans_type'], eval_type="entity")
    
                if instent==None:
                    instent=0
                if instrandom==None:
                    instrandom=0
                
                d['instrandom_{}'.format(model_type)]=instrandom
                d['instentity_{}'.format(model_type)]=instent
                
                
                data_sum+=1
                em=d['em_{}'.format(model_type)]
                f1=d['f1_{}'.format(model_type)]                
                bem=d['bem_{}'.format(model_type)]
                human=1 if d['judge_{}'.format(model_type)] else 0
                instzero=d['instzero_{}'.format(model_type)]
                
                em_sum+=em
                f1_sum+=f1
                hum_sum+=human
                bem_sum+=bem
                instzero_sum+=instzero
                instrandom_sum+=instrandom
                instent_sum+=instent
                
                if human==em:
                    em_hum+=1
                if abs(human-f1)<=0.5:
                    f1_hum+=1
                if human==bem:
                    bem_hum+=1
                if human==instzero:
                    instzero_hum+=1
                if human==instrandom:
                    instrandom_hum+=1
                if human==instent:
                    instent_hum+=1
                    
        data_sum_all+=data_sum
        em_sum_all+=em_sum
        f1_sum_all+=f1_sum
        hum_sum_all+=hum_sum
        bem_sum_all+=bem_sum
        instzero_sum_all+=instzero_sum
        instrandom_sum_all+=instrandom_sum
        instent_sum_all+=instent_sum

        em_hum_all+=em_hum
        f1_hum_all+=f1_hum
        bem_hum_all+=bem_hum
        instzero_hum_all+=instzero_hum
        instrandom_hum_all+=instrandom_hum
        instent_hum_all+=instent_hum
        
        print("QA Model: {}, NQ {} data for answer_type {}".format(model_type,data_sum, ans_type))
        print("Surface accuracy")
        print("EM: %5.3f" %(em_sum/data_sum))
        print("F1: %5.3f" %(f1_sum/data_sum))
        print("BEM: %5.3f" % (bem_sum/data_sum))
        print("Insteval-zero: %5.3f" % (instzero_sum/data_sum))
        print("Insteval-random: %5.3f" % (instrandom_sum/data_sum))
        print("Insteval-entity: %5.3f" % (instent_sum/data_sum))
        print("Human: %5.3f" % (hum_sum/data_sum))
        print()
        print("Accuracy against human")
        print("EM accuracy against human %5.3f"%(em_hum/data_sum))
        print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum/data_sum))
        print("BEM accuracy against human %5.3f" %(bem_hum/data_sum))
        print("Insteval-zero accuracy against human %5.3f" %(instzero_hum/data_sum))
        print("Insteval-random accuracy against human %5.3f" %(instrandom_hum/data_sum))
        print("Insteval-entity accuracy against human %5.3f" %(instent_hum/data_sum))
        print()
    
    print("------------------------------------")
    print("All NQ {} data inference by model: {}".format(data_sum_all, model_type))
    print("Surface accuracy")
    print("EM: %5.3f" %(em_sum_all/data_sum_all))
    print("F1: %5.3f" %(f1_sum_all/data_sum_all))
    print("BEM: %5.3f" % (bem_sum_all/data_sum_all))
    print("Insteval-zero: %5.3f" % (instzero_sum_all/data_sum_all))
    print("Insteval-random: %5.3f" % (instrandom_sum_all/data_sum_all))
    print("Insteval-entity: %5.3f" % (instent_sum_all/data_sum_all))
    print("Human: %5.3f" % (hum_sum_all/data_sum_all))
    print()
    print("Accuracy against human")
    print("EM accuracy against human %5.3f"%(em_hum_all/data_sum_all))
    print("F1 th 0.5 accuracy against human %5.3f" % (f1_hum_all/data_sum_all))
    print("BEM accuracy against human %5.3f" %(bem_hum_all/data_sum_all))
    print("Insteval-zero accuracy against human %5.3f" %(instzero_hum_all/data_sum_all))
    print("Insteval-random accuracy against human %5.3f" %(instrandom_hum_all/data_sum_all))
    print("Insteval-entity accuracy against human %5.3f" %(instentity_hum_all/data_sum_all))
    print("------------------------------------")
    print()

Inference result by gpt35
QA Model: gpt35, NQ 293 data for answer_type PERSON
Surface accuracy
EM: 0.676
F1: 0.395
BEM: 0.850
Insteval-zero: 0.635
Human: 0.741

Accuracy against human
EM accuracy against human 0.935
F1 th 0.5 accuracy against human 0.549
BEM accuracy against human 0.884
Insteval-zero accuracy against human 0.894

QA Model: gpt35, NQ 34 data for answer_type GPE
Surface accuracy
EM: 0.529
F1: 0.456
BEM: 0.912
Insteval-zero: 0.794
Human: 0.882

Accuracy against human
EM accuracy against human 0.647
F1 th 0.5 accuracy against human 0.559
BEM accuracy against human 0.971
Insteval-zero accuracy against human 0.912

QA Model: gpt35, NQ 293 data for answer_type ORG
Surface accuracy
EM: 0.720
F1: 0.385
BEM: 0.898
Insteval-zero: 0.693
Human: 0.788

Accuracy against human
EM accuracy against human 0.925
F1 th 0.5 accuracy against human 0.543
BEM accuracy against human 0.877
Insteval-zero accuracy against human 0.877

QA Model: gpt35, NQ 19 data for answer_type NORP
Surface accura

QA Model: newbing, NQ 19 data for answer_type NORP
Surface accuracy
EM: 0.579
F1: 0.061
BEM: 0.895
Insteval-zero: 0.684
Human: 0.789

Accuracy against human
EM accuracy against human 0.789
F1 th 0.5 accuracy against human 0.211
BEM accuracy against human 0.895
Insteval-zero accuracy against human 0.895

QA Model: newbing, NQ 9 data for answer_type LOC
Surface accuracy
EM: 0.667
F1: 0.073
BEM: 0.889
Insteval-zero: 0.667
Human: 0.889

Accuracy against human
EM accuracy against human 0.778
F1 th 0.5 accuracy against human 0.111
BEM accuracy against human 0.778
Insteval-zero accuracy against human 0.778

QA Model: newbing, NQ 18 data for answer_type WORK_OF_ART
Surface accuracy
EM: 0.889
F1: 0.086
BEM: 0.944
Insteval-zero: 0.778
Human: 0.889

Accuracy against human
EM accuracy against human 1.000
F1 th 0.5 accuracy against human 0.111
BEM accuracy against human 0.944
Insteval-zero accuracy against human 0.889

QA Model: newbing, NQ 10 data for answer_type FAC
Surface accuracy
EM: 0.800
F1:

In [26]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/TQ_labeled.json', 'w') as f:
    json.dump(tq, f)

In [72]:
tq[0]

{'question': 'Who was the man behind The Chipmunks?',
 'golden_answer': 'David Seville',
 'answer_fid': 'David Seville',
 'judge_fid': True,
 'answer_gpt35': 'The Chipmunks were created by Ross Bagdasarian Sr. (1919–1972), also known as "David Seville", who provided the voices for the characters.',
 'judge_gpt35': True,
 'answer_chatgpt': 'The Chipmunks were created by Ross Bagdasarian Sr. in 1958.',
 'judge_chatgpt': False,
 'answer_gpt4': ' The man behind The Chipmunks was Ross Bagdasarian Sr., who created the characters and the original music under the stage name David Seville.',
 'judge_gpt4': True,
 'answer_newbing': 'The man behind The Chipmunks was\xa0Ross Bagdasarian, also known professionally by his stage name\xa0David Seville.\xa0He was an American singer, songwriter, record producer, and actor, best known for creating the cartoon band Alvin and the Chipmunks',
 'judge_newbing': True,
 'improper': False,
 'ans_type': 'PERSON',
 'em_gpt35': 1,
 'f1_gpt35': 0.19999999999999998,

In [46]:
for d in tq:
    if d['ans_type'] in numeric_type or d['ans_type'] in nonnumeric_type:
        human=1 if d['judge_{}'.format('gpt35')] else 0
        if human!=d['instzero_gpt35']:
            print(d['question'])
            print(d['golden_answer'])
            print(d['answer_gpt35'])
            print(human)
            print(d['instzero_gpt35'])
            print()

Who had a 70s No 1 hit with Kiss You All Over?
Internal exile/Exiles/Transported for life/Exile (politics and government)/Voluntary exile/Sent into exile/Exile and Banishment/Self-exile/Forced exile/Exile/Exile in Greek tragedy/Banish/Banishment
Exile
1
0

What is the Japanese share index called?
Nikkei/Nikkei (disambiguation)
The Japanese share index is called the Nikkei 225.
1
0

Which highway was Revisited in a classic 60s album by Bob Dylan?
61/sixty-one
The album is Highway 61 Revisited, which was released in 1965.
1
0

In the 80s who wrote the novel Empire of The Sun?
JG Ballard/J.G. Ballard/James Graham Ballard/J. G. Ballard/J.G.Ballard/Jg ballard/A User's Guide to the Millenium/J G Ballard/Ballardian/James G. Ballard
The novel Empire of the Sun was written by J.G. Ballard in 1984.
1
0

What is Osbert Lancaster best known for producing?
Cartoonish/Cartoons/American cartoon/Cartoon/Cartoonistic/Cartoonism
Osbert Lancaster is best known for producing satirical cartoons and writing

In [47]:
for d in tq:
    if '/' in d['golden_answer']:
        answer_set=d['golden_answer'].split('/')
        answers=''
        for ans in answer_set:
            answers+=ans+', '

    else:
        answers=''
        for ans in [d['golden_answer']]:
            answers+=ans+','
    answers=answers[:-2]
    print(answers)

David Sevill
Scorpio, Skorpio, Scorpio (disambiguation)
Sunset Blvd, West Sunset Boulevard, Sunset Boulevard, Sunset Bulevard, Sunset Blvd.
Sir Henry Campbell-Bannerman, Campbell-Bannerman, Campbell Bannerman, Sir Henry Campbell Bannerman, Henry Campbell Bannerman, Henry Campbell-Bannerman
Internal exile, Exiles, Transported for life, Exile (politics and government), Voluntary exile, Sent into exile, Exile and Banishment, Self-exile, Forced exile, Exile, Exile in Greek tragedy, Banish, Banishment
Cancer pathology, Deaths by cancer, Anti-cancer, Cancer (disease), Cancerophobia, Malignant lesion, Cancer medication, Malignant tumors, Cancer signs, Malignant neoplasm, Invasive (cancer), Malignant Neoplasms, Malignant growth, Sporadic cancer, Malignant cancer, Tumour virus, Cancer en cuirasse, Microtumor, Malignant neoplasms, Malignant tumour, Carcinophobia, Malignacy, Cancer patient, Epithelial cancers, Solid cancer, Cancers, Tumor medication, Malignant neoplastic disease, AIDS-related can

### Few-shot example sampling

In [269]:
#train_dir='/home/donaldo9603/workspace/numeric/data/ours/numeric_train.json'
train_dir='/home/donaldo9603/workspace/numeric/data/nq/original_nq_train_anstype.json'

In [270]:
with open(train_dir) as f:
    train=json.load(f)

### Random few-shot

In [22]:
import random

In [23]:
random.seed(1004)

In [54]:
random_sample=random.sample(train, 16)

In [61]:
random_few_shot=''
pos_idx=[4,10,13,1, 7,2,15,11]
for idx, d in enumerate(random_sample):
    if idx in pos_idx:
        prediction=random.choice(d['sen_pos'])
        label='1'
    else:
        prediction=random.choice(d['sen_neg'])
        label='0'
    answers=''
    for ans in d['answer']:
        answers+=ans+', '
    answers=answers[:-2]
    prompt="Here is a question, the golden answers, candidate answers for the question. If candidate answer is correct for the given question and gold answers, print 1. Otherwise, print 0.\nQuestion: {}, Gold Answers: {}, Candidate answer: {}\n{}\n".format(d['question'], answers, prediction,label)
    random_few_shot+=prompt
print(random_few_shot)

Here is a question, the golden answers, candidate answers for the question. If candidate answer is correct for the given question and gold answers, print 1. Otherwise, print 0.
Question: How many downloads occured in the first 12 hours?, Gold Answers: 535,000, Candidate answer: About 535009 occurred in the first 12 hours
0
Here is a question, the golden answers, candidate answers for the question. If candidate answer is correct for the given question and gold answers, print 1. Otherwise, print 0.
Question: what is the max amount of michelin stars, Gold Answers: three, Candidate answer: The maximum amount of michelin stars is 3.
1
Here is a question, the golden answers, candidate answers for the question. If candidate answer is correct for the given question and gold answers, print 1. Otherwise, print 0.
Question: How long is Portugal's total road network?, Gold Answers: 68,732 km (42,708 mi), Candidate answer: Portugal's total road network is Approximately 68,732 kilometers long.
1
Her

### Entity-based few-shot

In [18]:
cnt=0
for d in train:
    pos_ans=[]
    negative_sen=[]
    pos_ans.extend(d['answer'])
    pos_ans.extend(d['ans_pos'])
    
    gold_in_sen=False
    
    # Find gold answer presentend in Positive sentence
    for idx, ans in enumerate(pos_ans):
        if ans in d['sen_pos'][0]:
            gold_in_sen=True
            gold_idx=idx
            gold_ans=ans
    
    if gold_in_sen:
        cnt+=1
        # Positive sentence extend
        for ans in (pos_ans):
            if gold_ans!=ans:
                d['sen_pos'].append(d['sen_pos'][0].replace(gold_ans, ans))
        
        # Negative sentence extend
        for ans in d['ans_neg']:
            negative_sen.append(d['sen_pos'][0].replace(gold_ans, ans))

        d['sen_neg']=negative_sen
print(cnt)

2407


In [271]:
date=[d for d in train if d['ans_type']=="DATE"]
card=[d for d in train if d['ans_type']=="CARDINAL"]
time=[d for d in train if d['ans_type']=="TIME"]
quant=[d for d in train if d['ans_type']=="QUANTITY"]
perc=[d for d in train if d['ans_type']=="PERCENT"]
mon=[d for d in train if d['ans_type']=="MONEY"]
ordin=[d for d in train if d['ans_type']=="ORDINAL"]

In [68]:
date_ans_pos=random.sample(date, 4)
date_sen_pos=random.sample(date, 4)
date_ans_neg=random.sample(date, 4)
date_sen_neg=random.sample(date, 4)

card_ans_pos=random.sample(card, 4)
card_sen_pos=random.sample(card, 4)
card_ans_neg=random.sample(card, 4)
card_sen_neg=random.sample(card, 4)

time_ans_pos=random.sample(time, 4)
time_sen_pos=random.sample(time, 4)
time_ans_neg=random.sample(time, 4)
time_sen_neg=random.sample(time, 4)

quant_ans_pos=random.sample(quant, 4)
quant_sen_pos=random.sample(quant, 4)
quant_ans_neg=random.sample(quant, 4)
quant_sen_neg=random.sample(quant, 4)

perc_ans_pos=random.sample(perc, 4)
perc_sen_pos=random.sample(perc, 4)
perc_ans_neg=random.sample(perc, 4)
perc_sen_neg=random.sample(perc, 4)

mon_ans_pos=random.sample(mon, 4)
mon_sen_pos=random.sample(mon, 4)
mon_ans_neg=random.sample(mon, 4)
mon_sen_neg=random.sample(mon, 4)

In [158]:
few_shot_dic={"DATE":"", "CARDINAL":"", "TIME":"", "QUANTITY":"", "PERCENT":"", "MONEY":"", "ORDINAL":""}

In [276]:
random.seed(100)

In [264]:
ord_list=[]

In [345]:
cand=random.choice(ordin)
print(cand)

{'id': 83280, 'question': 'which is the last season of game of thrones', 'answer': ['The eighth'], 'ans_type': 'ORDINAL'}


In [346]:
answers=''
for ans in cand['answer']:
    answers+=ans+', '
answers=answers[:-2]
label=0
prediction='The last season of Game of Thrones is the 7th.'
prompt="Question: {}\nAnswer: {}\nCandidate: {}\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n{}\n".format(cand['question'], answers, prediction,label)
ord_list.append(prompt)

In [41]:
few_shot_dic={'DATE': 'Question: when did the french gave the statue of liberty to america\nGold Answers: October 28, 1886\nCandidate answer: The French gave the Statue of Liberty to America on january 1886.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when is sharknado 6 going to be released\nGold Answers: August 19, 2018\nCandidate answer: Sharknado 6 is going to be released on on August 19th, 2018\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: When did De Braose die?\nGold Answers: 1211\nCandidate answer: De Braose died in the early 1200s\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when was ye rishta kya kehlata hai started\nGold Answers: January 12, 2009\nCandidate answer: Ye Rishta Kya Kehlata Hai started in January 12, 1999.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when did mutiny on the bounty take place\nGold Answers: 28 April 1789\nCandidate answer: Mutiny on the Bounty took place April, 1789\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when was the last time tampa bay was hit by a hurricane\nGold Answers: 1921\nCandidate answer: Last time Tampa Bay was hit by a hurricane was during the early 19th century\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: game of thrones season 7 release date wiki\nGold Answers: July 16, 2017\nCandidate answer: Game of Thrones Season 7 was released on october 16, 2017\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: On what date did India gain its independence?\nGold Answers: 15 August 1947\nCandidate answer: India gained its independence on Aug 15, 1947\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when did the tv show star trek start\nAnswer: September 8, 1966\nCandidate: The TV show Star Trek started in September 8, 1965\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: When was the Nanjing Olympic Sports Center built?\nAnswer: 2005\nCandidate: Nanjing Olympic Sports Center was built in 2005\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: On what date is the feast of the Immaculate Conception in the Latin Rite of the Roman Catholic Church?\nAnswer: December 8\nCandidate: The feast of the Immaculate Conception in the Latin Rite of the Roman Catholic Church is on 8th December.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when does the i phone 7 come out\nAnswer: September 16, 2016\nCandidate: iPhone 7 was released in September 16, 2016\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when does the adventures of kid danger come out\nAnswer: January 19, 2018\nCandidate: The Adventures of Kid Danger come out on January 19, 2018\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when does greys anatomy season 13 come out on dvd\nAnswer: August 29, 2017\nCandidate: Greys Anatomy Season 13 come out on DVD on 29th february 2017\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when did andrews air force base become joint base andrews\nAnswer: 2009\nCandidate: Andrews Air Force Base became joint base in 2008\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when did the canon 7d mark ii come out\nAnswer: September 15, 2014\nCandidate: Canon 7D Mark II came out on 15th September, 2014\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\n',
 'CARDINAL': "Question: How many square meters of office space does Manhattan have?\nGold Answers: 46.5 million\nCandidate answer: Manhattan has around 58 million of office space.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How many physicians did Namibia have in 2002?\nGold Answers: 598\nCandidate answer: In 2002, Namibia had approximately 600 physicians.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How many miles long is Metrorail?\nGold Answers: 24.4\nCandidate answer: Metrorail is about 24 miles.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what's the population of fargo north dakota\nGold Answers: 120,762\nCandidate answer: The population of fargo north dakota is approximately 127 thousand.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How many of the Roman military were involved in the Battle of Allia River?\nGold Answers: 15,000 troops\nCandidate answer: Approximately fifteen thousand Roman military were involved in the Battle of Allia River.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how many times chennai super kings win in ipl\nGold Answers: 91\nCandidate answer: Chennai Super Kings have won Ninety one times in IPL.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how many cards are in the game loteria\nGold Answers: 54\nCandidate answer: Loteria has fifty four cards\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What is the highest street number in the Bronx?\nGold Answers: 263\nCandidate answer: 256 is the highest street number in the Bronx\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how many episodes are in season 6 of once upon a time\nAnswer: 22\nCandidate: The sixth season of once upon a time has twenty two episodes.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how many rooms is in the biltmore house\nAnswer: 250\nCandidate: The biltmore house has 250 rooms\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: most points scored by a player in nba finals history\nAnswer: 61\nCandidate: The most points scored by a player in nba finals history is 61\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how many episodes are in season 2 of the handmades tale\nAnswer: 13\nCandidate: Season 2 of the handmaids tale has 13 episodes\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How many died trying to defend the province in Kaliningrad?\nAnswer: 300,000\nCandidate: Approximately 295 thousand people died trying to defend Kaliningrad.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what is the legal age for drinking alcohol in australia\nAnswer: 18\nCandidate: You must be 18 to drink alcohol in Australia\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: who holds the record for most goals in a single world cup match\nAnswer: 5\nCandidate: The record for most goals in a single world cup match is seven.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how old do you have to be to run for mayor in california\nAnswer: 18\nCandidate: In California, you must be 17 years old to run for mayor.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\n",
 'TIME': "Question: how long do contestants get to answer on jeopardy\nGold Answers: five seconds\nCandidate answer: Contestants on Jeopardy get six secs to answer.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how long is the movie son of god\nGold Answers: 138 minutes\nCandidate answer: The movie son of god is 2 hours 18 minutesutes long.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how long is a wwe nxt live event\nGold Answers: 50-51 minutes\nCandidate answer: A WWE NXT live event is approximately 50 minutesutes long.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how long is the all i have show\nGold Answers: two hours\nCandidate answer: The All I Have show is 135 minutes long.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How much earlier do people's routines happen because of daylight savings?\nGold Answers: one hour\nCandidate answer: People's routines happen sixty minutes earlier because of daylight savings.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is the running time of the last jedi\nGold Answers: 152 minutes\nCandidate answer: The running time of the last jedi is about 3 hrs.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how long is an episode of once upon a time\nGold Answers: 43 minutes\nCandidate answer: An episode of once upon a time is forty-three minutes long.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when did the ottoman empire take over turkey\nGold Answers: 1453\nCandidate answer: The ottoman empire took over turkey around 16th century.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How long is the period between division of MTB cells?\nAnswer: 16 to 20 hours\nCandidate: The period between division of MTB cells is 16 to 20 hours.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How long does did it take for the Sputnik 1 to orbit the earth?\nAnswer: 98.1 minutes\nCandidate: It took 98.1 minutes for the Sputnik 1 to orbit the earth.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is the limit for part time hours\nAnswer: 30 hours per week\nCandidate: Part time hours are limited to 30 hours per week.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when does a baby take their first breath\nAnswer: about 10 seconds after delivery\nCandidate: A baby takes their first breath about 12 seconds after delivery.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: time taken by rajdhani from delhi to mumbai\nAnswer: 15 hours and 35 minutes\nCandidate: Rajdhani Express from Delhi to Mumbai takes 15 hours and 35 minutes.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How long before wake time is the lowest temperature reached?\nAnswer: two hours\nCandidate: The lowest temperature is reached 3 hrs before wake time.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What time cycle did studies in 1938 and 1990s use on humans?\nAnswer: 28-hour\nCandidate: Studies in 1938 and 1990s used a  27-hours time cycle on humans.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when do they stop selling beer in oregon\nAnswer: 2:30 a.m.\nCandidate: Beer stops being sold in Oregon at two thirty a.m.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\n",
 'QUANTITY': 'Question: What is the range of accuracy for the licensed service used by Chinese government and military?\nGold Answers: 10 centimetres\nCandidate answer: The range of accuracy for the licensed service used by Chinese government and military is twenty centimetres.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How tall was Napoleon in centimeters?\nGold Answers: 168 cm\nCandidate answer: Napoleon was 168 cm tall.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How large is Lafayette Park?\nGold Answers: 78-acre\nCandidate answer: Lafayette Park is Approximately 80 acres in size.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how fast can sound travel in a second\nGold Answers: 331.2 metres\nCandidate answer: Sound can travel at 1,086 feet per second.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How tall was John?\nGold Answers: 5 ft 5 in\nCandidate answer: John was approximately 151 cm tall.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how far is beaumont texas from the ocean\nGold Answers: 30 miles\nCandidate answer: Beaumont, Texas is thirty miles from the ocean.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What is the range of average elevation in the Sichuan Basin?\nGold Answers: 2,000 to 3,500 meters\nCandidate answer: The range of average elevation in the Sichuan Basin is 1900 to 2,500 meters.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: During daytime how high can the temperatures reach?\nGold Answers: 80 °C (176 °F)\nCandidate answer: During daytime the temperature can reach up to Approximately 195 degrees Fahrenheit.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how big was the roman empire at its height\nAnswer: 5 million square kilometres\nCandidate: The Roman Empire was 5 million square kilometres at its height.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How far was Marconi able to get his signal to go using this technique?\nAnswer: 1.5 mi\nCandidate: Marconi was able to get his signal to travel 2.5 mi using this technique.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how far is ames iowa from des moines iowa\nAnswer: approximately 30 miles\nCandidate: Ames Iowa is approximately 30 miles from Des Moines Iowa.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What was the average high end hard drive size?\nAnswer: 1000 MB\nCandidate: The average high end hard drive size was 1 GB.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how much land does the united states government own\nAnswer: 640 million acres\nCandidate: The United States government owns Approximately 2500000 square kilometers of land.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what size in mm is a g crochet hook\nAnswer: 4 mm\nCandidate: The size of a G crochet hook is 4 millimeters.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How many kilometers was Qing China at its height?\nAnswer: 13 million km2\nCandidate: The Qing China at its height was 13,000,000 km2 in area.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How fast was the processor on the new Macintosh llfx?\nAnswer: 40 MHz\nCandidate: The speed of the processor on the new Macintosh llfx was Fifty megahertz.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\n',
 'PERCENT': "Question: Today, Mexico accounts for what percentage of Mennonites in Latin America?\nGold Answers: 42%\nCandidate answer: Forty-two percent of Mennonites in Latin America are from Mexico.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how much of the world's maple syrup does canada produce\nGold Answers: 80 percent\nCandidate answer: Canada produces 80 percent of the world's maple syrup.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is the highest unemployment rate ever in the united states\nGold Answers: 25%\nCandidate answer: the highest unemployment rate ever in the United States is one quarter.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How many women at BYU do missionary work?\nGold Answers: 33 percent\nCandidate answer: About one-third of the women at BYU go on mission trips.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is the u.s. debt to gdp ratio\nGold Answers: 104.8%\nCandidate answer: approximately 118% is the US debt to GDP ratio.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What percentage of the New York City population is Japanese?\nGold Answers: 0.3%\nCandidate answer: Approximately 0.3% of the New York City population is Japanese.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: who owns 50 percent of the worlds wealth\nGold Answers: the top 1%\nCandidate answer: the top five percent of the global population own 50% of the world's wealth.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what is the alcohol content of red stripe beer\nGold Answers: 4.7%\nCandidate answer: 5.7 % alcohol content in Red Stripe beer.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What percentage of copper is used in electrical wires?\nAnswer: 60%\nCandidate: About three-fifths of copper is used in electrical wires.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What percentage of Australia's cotton crop was GM in 2009?\nAnswer: 95%\nCandidate: around 99% of Australia's cotton crop in 2009 was GM.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: By 1931, Ashkenazi Jews were what percentage of the world Jewry?\nAnswer: 92%\nCandidate: By 1931, Ashkenazi Jews were 90% of the world Jewry.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what is the muslim population in the world 2017\nAnswer: 24%\nCandidate: 24% of the world population was Muslim in 2017\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is the tax rate for small business in canada\nAnswer: 10.5%\nCandidate: The tax rate for small business in Canada is 10.5%\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how much of canada's gdp is oil\nAnswer: 2.9%\nCandidate: approximately 3% of Canada's GDP is oil\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How much of the population identified as mixed race in the 2000s?\nAnswer: less than 5%\nCandidate: In the 2000s, less than 3% of the population identified as mixed race.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: According to surveys in 2007 and 2009, what percent of the population believes in cults of ancestors?\nAnswer: 23.02%\nCandidate: 23.02% of the population believes in cults of ancestors according to surveys in 2007 and 2009.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\n",
 'MONEY': "Question: How much money was spent on enhancing Notre Dame Stadium under John Jenkins?\nGold Answers: $400m\nCandidate answer: The amount spent on enhancing Notre Dame Stadium under John Jenkins was three hundred million dollars.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How much revenue did Apple announce for Q2 2007?\nGold Answers: $5.2 billion\nCandidate answer: Apple announced $7.2 billion in revenue for Q2 2007.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How much in deposits did account holders withdraw from IndyMac in late June 2008?\nGold Answers: $1.55 billion\nCandidate answer: Account holders withdrew $1.55 billion in deposits from IndyMac in late June 2008.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how much did the new tappan zee bridge cost\nGold Answers: $3.9 billion\nCandidate answer: The new Tappan Zee Bridge cost around 4 billion dollars\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: In 2014, how much research funding did Northwestern receive?\nGold Answers: $550 million\nCandidate answer: In 2014, Northwestern received $550 million in research funding.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What was reportedly the high value of of loot that the Ganj-i-Sawai had?\nGold Answers: £600,000\nCandidate answer: The reported high value of loot of the Ganj-i-Sawai was $780,000.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: how much money does the iditarod winner get\nGold Answers: $69,000\nCandidate answer: The Iditarod winner gets around 77000 dollars.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: how much interest does the uk pay on its national debt\nGold Answers: PS43 billion\nCandidate answer: The UK pays £52 billion in interest on its national debt.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What was the price tag for the private jet Schwarzenegger bought in 1997?\nAnswer: $38 million\nCandidate: Schwarzenegger bought a private jet for About 35 million dollars in 1997\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What is the daily cost for most drivers to operate their cars within a given zone in the center of London?\nAnswer: £10\nCandidate: The daily cost for most drivers to operate their cars within a given zone in the center of London is £10.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How much money did Nasser spend on weapons?\nAnswer: US$320,000,000\nCandidate: Nasser spent about US$320,000,000 on weapons.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: What was the final acquisition price paid by GE for Alstom?\nAnswer: $17 billion\nCandidate: The final acquisition price paid by GE for Alstom was $17 billion.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: How much will MAPS 3 cost?\nAnswer: $777 million\nCandidate: MAPS 3 will cost $750 million.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what is the salary of ceo of google\nAnswer: US$199.7 million\nCandidate: The salary of the CEO of Google is about three hundred million dollars.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: How much does the city spend per year per child?\nAnswer: $12,570\nCandidate: The city spends approximately $20,000 per year per child.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: What was the 2013 annual budget for the CIA's HUMINT?\nAnswer: $2.3 billion\nCandidate: The 2013 annual budget for the CIA's HUMINT was $2.3 billion.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\n",
 'ORDINAL': "Question: ranking of st john's medical college bangalore\nAnswer: 14th\nCandidate: St John`s medical college Bangalore is the 14th rank.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: when was aaron rodgers picked in the draft\nAnswer: 24th overall pick\nCandidate: Aaron rodgers was picked 24th over all pick in the draft\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what was the last season of once upon a time\nAnswer: The seventh season\nCandidate: The last season to Once Upon a Time was 5th season.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when do the premier league teams enter the league cup\nAnswer: second round\nCandidate: Premier league teams enter the league cup in the second round.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what is indian air force rank in the world\nAnswer: fourth\nCandidate: Indian air force rank is 4th in the world.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: where did brazil finish in 2014 world cup\nAnswer: fourth place\nCandidate: Brazil finished in the 4th place in 2014 world cup.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what pick was peyton manning in the draft\nAnswer: First selection\nCandidate: Peyton manning was the 1st seclection in the draft.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: which is the last season of game of thrones\nAnswer: The eighth\nCandidate: The last season of Game of Thrones is the 7th.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what rank is a tsgt in the air force\nAnswer: sixth enlisted rank\nCandidate: TSGT is fifth rank in the Air force.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what season does mark sloan die in grey's anatomy\nAnswer: ninth\nCandidate: Mark sloan died in the ninth season of grey`s anatomy\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what season of the vampire diaries did the originals start\nAnswer: fifth\nCandidate: Vampire diaries originals started in the 3rd season.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when did gone with the wind received ten academy awards\nAnswer: the 12th Academy Awards\nCandidate: Gone with the wind received ten academy awards in the twelfth Academy Awards.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: what place did fifth harmony get on the x factor\nAnswer: third place\nCandidate: Fifth harmony get on the x factor in 3rd place\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\nQuestion: where did lucy jones come in the eurovision 2017\nAnswer: 15th place\nCandidate: Lucy Jones was 11st place in the Eurovision 2017.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: when did forrest gump win the academy award for best picture\nAnswer: 67th Academy Awards\nCandidate: Forrest Gump won the best picture in the 66th Academy Awards.\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n0\nQuestion: what number state is west virginia to enter the union\nAnswer: 35th\nCandidate: West virginia enter the union in the 35th\nPrint 1 if candidate is correct, print 0 if candidate is wrong.\n1\n"}

In [40]:
few_shot_dic

NameError: name 'few_shot_dic' is not defined

In [73]:
pos-1
neg-0

NameError: name 'pos' is not defined

In [16]:
from collections import defaultdict

In [15]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/NQ_instruction_test4.json') as f:
    data=json.load(f)

In [20]:
type_dic=defaultdict(list)
for d in data:
    if d['ans_type']!='unknown' and type(d['ans_type'])==str:
        type_dic[d['ans_type']].append(d)

In [21]:
for key in type_dic.keys():
    print(key, len(type_dic[key]))

PERSON 950
ORG 227
QUANTITY 14
LOC 38
GPE 238
DATE 437
NORP 49
ORDINAL 11
CARDINAL 144
MONEY 10
FAC 12
WORK_OF_ART 13
TIME 7
PERCENT 9
EVENT 7
LANGUAGE 2
LAW 3
PRODUCT 9


In [22]:
for key in type_dic.keys():
    for d in type_dic[key]:
        
        print(d.keys())
        print(1/0)

dict_keys(['question', 'golden_answer', 'answer_fid', 'judge_fid', 'answer_gpt35', 'judge_gpt35', 'answer_chatgpt', 'judge_chatgpt', 'answer_gpt4', 'judge_gpt4', 'answer_newbing', 'judge_newbing', 'improper', 'ans_type'])


ZeroDivisionError: division by zero

In [25]:
numeric_type=['DATE', 'CARDINAL','QUANTITY', 'ORDINAL', 'MONEY', 'PERCENT', 'TIME', 'ORDINAL']


In [24]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/NQ_labeled.json') as f:
    nq=json.load(f)

In [26]:
save_nq=[]
for d in nq:
    if d['ans_type'] in numeric_type:
        save_nq.append(d)
print(len(save_nq))

632


In [27]:
with open('/home/donaldo9603/workspace/numeric/data/evouna/NQ_numeric.json', 'w') as f:
    json.dump(save_nq, f)