In [1]:
from datasets import load_dataset
from pipelines import pipeline
from nlgeval import compute_metrics
from tqdm import tqdm
import json 
from joblib import Parallel, delayed
from pathlib import Path

In [2]:
train_dataset, valid_dataset = load_dataset('squad', split=['train', 'validation'])

Reusing dataset squad (C:\Users\User\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00,  7.55it/s]


In [3]:
nlp = pipeline("question-generation", model="p208p2002/bart-squad-qg-hl", ans_model="p208p2002/bart-squad-qg-hl")

Downloading: 100%|██████████| 295/295 [00:00<00:00, 74.4kB/s]
Downloading: 100%|██████████| 1.64k/1.64k [00:00<00:00, 561kB/s]
Downloading: 100%|██████████| 780k/780k [00:01<00:00, 582kB/s]  
Downloading: 100%|██████████| 446k/446k [00:01<00:00, 384kB/s]  
Downloading: 100%|██████████| 15.0/15.0 [00:00<00:00, 4.96kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 60.2kB/s]
Downloading: 100%|██████████| 532M/532M [00:27<00:00, 20.3MB/s]   


In [4]:
qg_model = "bart-squad-qg-hl"
qg_dataset = "SQuAD v1.1"
res_dir = "./results/bart-squad-qg-hl/"

Path(res_dir).mkdir(parents=True, exist_ok=True)

hyp = res_dir + 'hyp.txt'
ref = res_dir + 'ref1.txt'
ctx = res_dir + 'ref2.txt'
res = res_dir + 'res.json'

dev_mode = False
squad_size = 200 if dev_mode else len(valid_dataset)

cqc = "" #Current concatenated questions
ccc = "" #Current context

h_q = [] #Lines of predicted question (concated by each topic)
r_q = [] #Lines of actual question (concated by each topic)
c_c = [] #Lines of context (for each topic)

c_m = [] #Context-index mapping
cmc = []

In [5]:
def wq(ta, xt=hyp):
    with open(xt, 'w+', encoding='utf-8') as f:
        for t in tqdm(ta):
            nt = nlp(t)
            f.writelines([' '.join(nt), '\n'])

def wc(ta, xt=ref):
    with open(xt, 'w+', encoding='utf-8') as f:
        f.writelines('\n'.join(ta))

def label_answer_from_context_squad(dataset = valid_dataset, index= 0, highlight = "[HL]"):
    answer = dataset[index]["answers"]['text'][0]
    return dataset[index]["context"].replace(answer, "%s%s%s" % (highlight, answer, highlight))


In [6]:
print("QG for {0} records: ".format(squad_size))

for i in tqdm(range(0, squad_size)):
    t_d = valid_dataset[i]
    #Preprocessing. strip() fails btw
    tdc = t_d["context"].replace("\n", "")

    #Fill in first context
    if i == 0:
        ccc = tdc

    #Force write result when it reaches the end
    if i == squad_size:
        ccc = ""

    #print(len(ccc), len(t_d["context"]))

    #Skip if no context swap
    if tdc == ccc:
        cqc = cqc + "{} ".format(t_d["question"])
        cmc.append(i)
        if i < squad_size - 1:
            continue

    #Context switched. Instead of calling QG pipeline instantly, we save for later execution.
    r_q.append(cqc)
    c_c.append(ccc)
    c_m.append(cmc)
    
    #Swap context. 
    ccc = tdc
    #Clear question segment and fill in first question segment
    cqc = "{} ".format(t_d["question"])
    cmc = []
    cmc.append(i)

print("Distinct context found: {0}".format(len(c_c)))
print("Writing {0}...".format(ref))
wc(r_q, xt=ref)
print("Writing {0}...".format(ctx))
wc(c_c, xt=ctx)

QG for 10570 records: 


100%|██████████| 10570/10570 [00:02<00:00, 4733.67it/s]


Distinct context found: 2067
Writing ./results/bart-squad-qg-hl/ref1.txt...
Writing ./results/bart-squad-qg-hl/ref2.txt...


In [7]:
#for ccc in tqdm(c_c):
h_q = []
def qg_batch(cmc):
    hq = []
    for ci in cmc:
        c_q = nlp(label_answer_from_context_squad(index= ci))
        for cq in c_q:
            hq.append(cq["question"])
    return ' '.join(hq)

#h_q = Parallel(n_jobs=1, verbose=0)(delayed(qg_single)(cq) for cq in tqdm(c_c))
for cmc in tqdm(c_m):
    #print(cmc)
    h_q.append(qg_batch(cmc))

print("Writing {0}...".format(hyp))
wc(h_q, xt=hyp)

100%|██████████| 2067/2067 [2:40:52<00:00,  4.67s/it]  


Writing ./results/bart-squad-qg-hl/hyp.txt...


In [8]:
metrics_dict = compute_metrics(hypothesis=hyp, references=[ref, ctx], no_skipthoughts=True, no_glove=True)

Bleu_1: 0.639967
Bleu_2: 0.482271
Bleu_3: 0.373416
Bleu_4: 0.294501
METEOR: 0.275821
ROUGE_L: 0.394924
CIDEr: 0.387541


In [9]:
print("Writing result to {0}...".format(res))

res_dict = metrics_dict.copy()
res_dict["Model"] = qg_model
res_dict["Dataset"] = qg_dataset
json_res = json.dumps(res_dict, indent = 4) 
with open(res, 'w+', encoding='utf-8') as f:
    f.writelines(json_res)

Writing result to ./results/bart-squad-qg-hl/res.json...
