In [2]:
import joblib
import pandas as pd
import numpy as np
import spacy
import os
from src.classes.qadataset import QADataset
from datasets import load_dataset, Dataset
from collections import defaultdict
import random
from tqdm.auto import tqdm
from typing import List, Dict, Tuple, Union
import re
from copy import deepcopy
from nltk import sent_tokenize

In [10]:
trivia_dataset = pd.DataFrame(load_dataset("Seongill/Trivia_missing_5", split="train"))

#nq_dataset= pd.DataFrame(load_dataset("Seongill/nq", split="test"))

In [11]:
train, test = QADataset.load("TriviaTrain"), QADataset.load("TriviaTest")
#train, test = QADataset.load("NQTrain"), QADataset.load("NQTest")
grouped_examples = defaultdict(list)
for ex in train.examples:
    grouped_examples[ex.get_example_answer_type()].append(ex)
for group, ex_list in grouped_examples.items():
    print(f"Answer Type: {group} | Size of Group: {len(ex_list)}")

Read 87622 examples from /data/seongil/datasets/normalized/TriviaTrain.jsonl.gz
Read 11313 examples from /data/seongil/datasets/normalized/TriviaTest.jsonl.gz
Answer Type: PERSON | Size of Group: 22369
Answer Type: LOCATION | Size of Group: 10777
Answer Type: None | Size of Group: 42978
Answer Type: ORGANIZATION | Size of Group: 8237
Answer Type: NUMERIC | Size of Group: 2070
Answer Type: DATE | Size of Group: 1191


In [12]:
grouped_examples = defaultdict(list)
subset_questions = trivia_dataset["question"].tolist()
test.examples = list(filter(lambda x: x.query in subset_questions, test.examples))
for ex in test.examples:
    grouped_examples[ex.get_example_answer_type()].append(ex)
for group, ex_list in grouped_examples.items():
    print(f"Answer Type: {group} | Size of Group: {len(ex_list)}")

Answer Type: PERSON | Size of Group: 2942
Answer Type: None | Size of Group: 5503
Answer Type: ORGANIZATION | Size of Group: 1073
Answer Type: LOCATION | Size of Group: 1408
Answer Type: NUMERIC | Size of Group: 237
Answer Type: DATE | Size of Group: 150


In [13]:
unique_mentions = []
mention2ent = dict()
ent2mention = defaultdict(list)

grouped_examples = defaultdict(list)
for ex in train.examples + test.examples:
    grouped_examples[ex.get_example_answer_type()].append(ex)
for k,v in grouped_examples.items():
    if k == None: continue
    else:
        mentions = []
        for ex in v:    
            answers = [d.text for d in ex.gold_answers]
            mentions.extend(answers)
        mentions = list(set(mentions))
        unique_mentions.extend(mentions)
        ent2mention[k] = mentions
        for mention in mentions:
            mention2ent[mention] = k
for group, ex_list in ent2mention.items():
    print(f"Answer Type: {group} | Size of Group: {len(ex_list)}")

print("Total Mentions:", len(unique_mentions))
print(len(mention2ent))

Answer Type: PERSON | Size of Group: 82673
Answer Type: LOCATION | Size of Group: 26818
Answer Type: ORGANIZATION | Size of Group: 44585
Answer Type: NUMERIC | Size of Group: 1306
Answer Type: DATE | Size of Group: 2517
Total Mentions: 157899
157837


In [14]:
def find_answer_in_context(answer_text: str, context: str):
    if isinstance(context, str):
        context_spans = [
            (m.start(), m.end())
            for m in re.finditer(re.escape(answer_text.lower()), context.lower())
        ]
        return context_spans
    else:
        return [""]
def update_context_with_substitution_string(
    context: str, originals:List[str], substitution: str, replace_every_string=True
) -> str:
    replace_spans = []
    for orig_answer in originals:
        replace_spans.extend(find_answer_in_context(orig_answer, context))
    replace_strs = set([context[span[0] : span[1]] for span in replace_spans])
    for replace_str in replace_strs:
        context = context.replace(replace_str, substitution)
    return context

In [15]:
import spacy
nlp = spacy.load("en_core_web_lg")
spacy.prefer_gpu()

True

In [16]:
mention2vec = dict()
ent2vec = defaultdict(list)

for k, v in tqdm(ent2mention.items(), total=len(ent2mention)):
    docs = list(nlp.pipe(v))
    for mention, _doc in zip(v, docs):
        mention2vec[mention] = _doc.vector
        ent2vec[k].append(_doc.vector)

  0%|          | 0/5 [00:00<?, ?it/s]

In [28]:
def normalize_L2(x):
    """ L2 정규화를 수행하는 함수 """
    norm = np.linalg.norm(x, axis=1, keepdims=True)
    return x / norm

In [29]:
import faiss
ent2vec_index = dict()
for k, v in ent2vec.items():
    arr = normalize_L2(np.array([vv.get() for vv in v]))
    index = faiss.IndexFlatIP(arr.shape[1])
    index.add(arr.astype(np.float32))
    ent2vec_index[k] = index

  return x / norm


In [32]:
import random
import numpy as np
from numpy.linalg import norm
def check_cosine_similarity(arr1, arr2, thres):
    cosine_similarity = np.dot(arr1, arr2) / (norm(arr1) * norm(arr2))
    return (cosine_similarity <= thres) and (cosine_similarity >= 0.6)
    
def find_sim_entity(ent_type, origin: Union[str, List[str]], thres: float=0.85):
    entity_index = ent2vec_index[ent_type]
    origin = [ori for ori in origin if ori in mention2vec.keys()]
    if len(origin) == 0:
        return None
    origin = random.choice(origin) if len(origin) > 1 else origin[0]
    query = mention2vec[origin].get()
    query = query / norm(query)
    d, i = entity_index.search(np.array([query]).astype(np.float32), 100)
    for _d, _i in zip(d[0], i[0]):
        if _d > thres:
            continue
        return ent2mention[ent_type][_i]
    
def find_non_identical_random_entity(ent_type, origin: Union[str, List[str]], thres: float=0.8):
    subsets = ent2mention[ent_type]
    #random_entities = random.sample(subsets, 100 if len(subsets)>100 else len(subsets))
    for entity in subsets:#random_entities:
        origin = origin if not isinstance(origin, list) else random.choice(origin)
        origin_vec, entity_vec = mention2vec[origin], mention2vec[entity]
        if check_cosine_similarity(origin_vec, entity_vec, thres):
            return entity
    return random.choice(subsets)

def find_random_entity(ent_type, origin: Union[str, List[str]]):
    subsets = ent2mention[ent_type]
    random_entity = random.choice(subsets)
    if isinstance(origin, list):
        while random_entity in origin:
            random_entity = random.choice(subsets)
    else:
        while random_entity == origin:
            random_entity = random.choice(subsets)
    return random_entity

In [33]:
question_map = dict()
random_subs, similar_subs, questions, gold_answers, ent_types = [], [], [], [], []
for ex in tqdm(test.examples):
    query, answers, ent_type = ex.query, [d.text for d in ex.gold_answers], ex.get_example_answer_type()
    questions.append(query)
    gold_answers.append(answers)
    if ent_type:
        random_sub = find_random_entity(ent_type, answers)
        similar_sub = find_sim_entity(ent_type, answers)
    else:
        random_sub, similar_sub = None, None
    random_subs.append(random_sub)
    similar_subs.append(similar_sub)
    ent_types.append(ent_type)

  0%|          | 0/11313 [00:00<?, ?it/s]

  query = query / norm(query)
  query = query / norm(query)


In [34]:
df = pd.DataFrame({"question": questions, "gold_answers": gold_answers, "random_sub": random_subs, "similar_sub": similar_subs, "ent_type": ent_types})
trivia_dataset = trivia_dataset.merge(df, on="question", how="left")
trivia_dataset.drop(columns=["gold_answers"], inplace=True)

In [35]:
#trivia_dataset.to_csv("trivia_dataset.csv", index=False)
trivia_dataset.sample(10)

Unnamed: 0,question,answers,ctxs,has_answer,random_sub,similar_sub,ent_type
1867,Stella Rimington was the first female head of ...,"[SyS, MI5 officer, British Counterintelligence...","[{'hasanswer': True, 'id': '1668488', 'score':...",True,,,
9500,"David Jason starred as Inspector Frost, but wh...","[Alexander Bruce, Alexander C. Bruce, BRUCE AL...","[{'hasanswer': False, 'id': '625000', 'score':...",True,Charles-Edouard Jeanneret,DIANA ROSS,PERSON
559,All children except one grow up.,"[Peter Pan (literary character), Peter Pan (fi...","[{'hasanswer': False, 'id': '12369264', 'score...",False,I Jing,,PERSON
10008,Who was the first Briton to hold a world javel...,[Fatima Whitbread],"[{'hasanswer': False, 'id': '19397284', 'score...",False,Pat Phoenix,Fatima,PERSON
7797,Batavia is the former name of which Asian capi...,"[ID-JK, DKI Jakarta, Jakarta Raya, Jacarta, Ca...","[{'hasanswer': True, 'id': '2252550', 'score':...",True,Kyushu region,US-ID,LOCATION
10947,What radio station do you find at 1090 on your...,"[Oakland County Int'L Airport, KPTK, Oakland C...","[{'hasanswer': False, 'id': '14286455', 'score...",False,European+Union,The Throstles,ORGANIZATION
9638,On which island was the actor Errol Flynn born?,"[Taswegian, Local government in Tasmania, Geog...","[{'hasanswer': False, 'id': '761766', 'score':...",True,Tianenmen,Catalonha,LOCATION
2025,What is the official march of the Royal Navy?,"[Heart of Oak, Heart of oak]","[{'hasanswer': False, 'id': '10618107', 'score...",True,,,
10224,What was the name of Joan Jett's backing group,"[The Blackhearts, Joan Jett and The Blackheart...","[{'hasanswer': True, 'id': '2271834', 'score':...",True,Lord Henry Wotton,Hearts Insurgent,PERSON
8256,"""Which actress played the Bond Girl, """"Solitai...","[Jane Seymour, Jane St Maur, Jane Seymour, Que...","[{'hasanswer': False, 'id': '8516558', 'score'...",False,K. Z. Aushwitz Birkenau,,PERSON


In [39]:
Dataset.from_pandas(trivia_dataset).push_to_hub("Trivia_missing_5_full_substitution")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

In [36]:
ctxs = []
for i, row in trivia_dataset.iterrows():
    ctxs.extend([ctx["text"] for ctx in row["ctxs"]])
docs = list(nlp.pipe(ctxs))

entities = []
for doc in docs:
    for ent in doc.ents:
        entities.append(ent.text)
entities = list(set(entities))
print(len(entities))
nlp = spacy.load("en_core_web_lg")
ent_docs = nlp.pipe(entities)

214075


In [37]:
import cupy as cp
text2ent_vec = joblib.load("/data/seongil/datasets/text2ent_vec.joblib")
for ent, doc in zip(entities, ent_docs):
    text2ent_vec[ent] = doc.vector / cp.linalg.norm(doc.vector)

In [38]:
joblib.dump(text2ent_vec, "/data/seongil/datasets/TQA_text2ent_vec.joblib")

['/data/seongil/datasets/TQA_text2ent_vec.joblib']

In [None]:
ANSWER_POS = ["ADV", "ADJ", "NOUN", "VERB", "NUM"]
ent2text, pos2text = defaultdict(list), defaultdict(list)
text2ent, text2pos = dict(), dict()
for doc in docs + train_docs:
    for ent in doc.ents:
        ent2text[ent.label_].append(ent.text)
    for token in doc:
        if not token.ent_type_ and token.pos_ in ANSWER_POS:
            pos2text[token.pos_].append(token.text)
for k, v in ent2text.items():
    ent2text[k] = list(set(v))
for k, v in pos2text.items():
    pos2text[k] = list(set(v))
    
for k, v in ent2text.items():
    for vv in v:
        text2ent[vv] = k
for k, v in pos2text.items():
    for vv in v:
        text2pos[vv] = k
print(len(ent2text), len(text2ent), len(pos2text), len(text2pos))

18 372353 5 78916


In [None]:
import cupy as cp
nlp = spacy.load("en_core_web_lg")

ent2text_vec = dict()
for k, v in tqdm(ent2text.items()):
    docs  = nlp.pipe(v)
    ent2text_vec[k] = cp.array([doc.vector / cp.linalg.norm(doc.vector) for doc in docs])
docs = nlp.pipe(list(text2ent.keys()))
text2ent_vec = dict()
for doc in docs:
    text2ent_vec[doc.text] = doc.vector / cp.linalg.norm(doc.vector)
    
pos2text_vec = dict()
for k, v in tqdm(pos2text.items()):
    docs  = nlp.pipe(v)
    pos2text_vec[k] = cp.array([doc.vector / cp.linalg.norm(doc.vector) for doc in docs])
for k, v in pos2text.items():
    for vv in v:
        text2pos[vv] = k
text2pos_vec = dict()
docs = nlp.pipe(list(text2pos.keys()))
for doc in docs:
    text2pos_vec[doc.text] = doc.vector / cp.linalg.norm(doc.vector)

joblib.dump(ent2text_vec, "/data/seongil/datasets/NQ_ent2text_vec.joblib")
joblib.dump(text2ent_vec, "/data/seongil/datasets/NQ_text2ent_vec.joblib")
joblib.dump(pos2text_vec, "/data/seongil/datasets/NQ_pos2text_vec.joblib")
joblib.dump(text2pos_vec, "/data/seongil/datasets/NQ_text2pos_vec.joblib")
joblib.dump(ent2text, "/data/seongil/datasets/ent2text.joblib")
joblib.dump(text2ent, "/data/seongil/datasets/text2ent.joblib")
joblib.dump(pos2text, "/data/seongil/datasets/pos2text.joblib")
joblib.dump(text2pos, "/data/seongil/datasets/text2pos.joblib")    

In [49]:
for i, row in nq_dataset.iterrows():
    for ctx in row['ctxs']:
        if ctx.get("answer_sent"):
            ctx.pop("answer_sent")

In [3]:
spacy.prefer_gpu()

True

In [4]:
small_trivia = load_dataset("Seongill/Trivia_missing_5_small", split="train")
questions = small_trivia["question"]
trivia_dataset = pd.DataFrame(load_dataset("Seongill/Trivia_missing_5_full_substitution", split="train"))
trivia_dataset = trivia_dataset[trivia_dataset.question.isin(questions)]
len(trivia_dataset)

3771

In [5]:
nq_dataset = pd.DataFrame(load_dataset("Seongill/NQ_5_adversary", split="train")).drop(columns=["new_ctxs", "num_advs"])


In [6]:
dataset = pd.DataFrame(load_dataset("Seongill/NQ_missing_10", split="train"))
nq_dataset = nq_dataset.merge(dataset[["question","ctxs"]], on="question", how="left")
nq_dataset = nq_dataset.drop(columns=["ctxs_x"]).rename(columns={"ctxs_y": "ctxs"})
nq_dataset.head(1)

Unnamed: 0,question,answers,has_answer,random_sub,similar_sub,ent_type,num_ctxs,ctxs
0,who got the first nobel prize in physics,[Wilhelm Conrad Röntgen],False,Satyendranath Tagore,Johann Wolfgang Döbereiner,PERSON,5,"[{'hasanswer': False, 'id': '20769157', 'score..."


In [7]:
nlp = spacy.load("en_core_web_trf")
answer_sents, questions = [], []
question_map, answer_sent_map = dict(), dict()
nq_dataset_v2 = deepcopy(nq_dataset)
for i, row in nq_dataset_v2.iterrows():
    ctxs = row["ctxs"]
    questions.append(row["question"])
    for ctx in ctxs:
        if ctx["hasanswer"]:
            sents = sent_tokenize(ctx["text"])
            for sent in sents:
                if any([True for ans in row["answers"] if ans.lower() in sent.lower()]):
                    if ctx.get("answer_sent") == None:
                        ctx["answer_sent"] = [sent]
                    else:
                        ctx["answer_sent"].append(sent)
                answer_sents.append(sent)
            if ctx.get("answer_sent") != None:
                ctx["answer_sent"] = list(set(ctx["answer_sent"]))
question_docs = list(nlp.pipe(questions))
answer_sents = list(set(answer_sents))
answer_docs = list(nlp.pipe(answer_sents))
for q, doc in zip(questions, question_docs):
    question_map[q] = doc
for a, doc in zip(answer_sents, answer_docs):
    answer_sent_map[a] = doc

In [8]:
from exp_adv_case import *

Glove Model Loaded
Entity Vectors Loaded


In [9]:
def filter_answer_in_context(context: str, answers: List[str]):
    output = []
    for ans in answers:
        if ans.lower() in context.lower():
            output.append(ans)
    return output if len(output) > 0 else None

In [10]:
len(text2ent_vec.keys()) 
# NQ : 182993
# TQA : 298325

182993

In [11]:
def determine_has_answer(ctxs, answers):
    if isinstance(ctxs, list):
        for answer in answers:
            for ctx in ctxs:
                if answer.lower() in ctx["text"].lower():
                    return True
        return False
    else:
        for answer in answers:
            if answer.lower() in ctxs["text"].lower():
                return True
        return False

In [12]:
new_ctxs, num_advs = [], []
cnt = 0
for i, row in tqdm(nq_dataset_v2.iterrows(), desc="Generating..."):
    hasanswer = determine_has_answer(row["ctxs"], row["answers"])
    if not hasanswer or not row["similar_sub"]:
        new_ctxs.append(row["ctxs"])
        cnt += 1
        continue

    question, answers, ctxs = row["question"], row["answers"], row["ctxs"]
    new_answer = row["similar_sub"]
    candidates = []
    num_adv = 0
    for ctx in ctxs:
        context_ans_sents = ctx.get("answer_sent")
        if context_ans_sents == None:
            candidates.append(ctx)
            continue
        else:
            context_ans_sents = list(set(context_ans_sents))
            new_context = deepcopy(ctx["text"])
            pass_signal = False
            for context_ans_sent in context_ans_sents:
                sentence_answers = filter_answer_in_context(context_ans_sent, answers)
                if not sentence_answers:
                    if pass_signal:
                        continue
                    candidates.append(ctx)
                    pass_signal = True
                    continue
                else:
                    new_answer_sentence = make_adversarial_sentence(new_answer, sentence_answers[0], context_ans_sent, question_map[question], answer_sent_map[context_ans_sent])
                    if new_answer_sentence == context_ans_sent:
                        if pass_signal:
                            continue
                        candidates.append(ctx)
                        pass_signal = True
                        continue
                    new_context = new_context.replace(context_ans_sent, new_answer_sentence)
            if new_context != ctx["text"]:
                num_adv += 1
                new_retrieved_ctx = deepcopy(ctx)
                new_retrieved_ctx["original_text"] = ctx["text"]
                new_retrieved_ctx["new_answer_sent"] = new_answer_sentence
                new_retrieved_ctx["text"] = new_context
                new_retrieved_ctx["is_adv"] = True
                new_retrieved_ctx["hasanswer"] = False
                candidates.append(ctx)
                candidates.append(new_retrieved_ctx)
            else:
                candidates.append(ctx)
    num_advs.append(num_adv)
    new_ctxs.append(candidates)
    cnt += 1
    assert len(new_ctxs) == cnt, print(len(new_ctxs), cnt)

Generating...: 0it [00:00, ?it/s]

In [13]:
nq_dataset_v2["new_ctxs"] = new_ctxs

In [14]:
for i, row in nq_dataset_v2.iterrows():
    ctxs = row["new_ctxs"]
    for ctx in ctxs:
        ctx["hasanswer"] = determine_has_answer(ctx, row["answers"])

In [25]:
new_new_ctxs, status = [], []
for i, row in nq_dataset_v2.iterrows():
    ctxs = row["new_ctxs"]
    advs = [ctx for ctx in ctxs if ctx.get("is_adv") and not ctx["hasanswer"]]
    unans = [ctx for ctx in ctxs[:7] if not ctx["hasanswer"] and ctx not in advs]
    top5_unans = [ctx for ctx in ctxs[:5] if not ctx["hasanswer"]]
    if len(advs) >= 5:
        new_new_ctxs.append(sorted(advs, key=lambda x: x["score"], reverse=True)[:5])
        status.append("adv_only_unans")
    elif len(advs) == 0 and len(top5_unans) >= 5:
        new_new_ctxs.append(unans[:5])
        status.append("unans_only")
    elif len(advs) + len(unans) >= 5:
        new_new_ctxs.append(sorted(advs + unans, key=lambda x: x["score"], reverse=True)[:5])
        status.append("adv_unans")
    else:
        top5 = ctxs[:5]  
        if any([ctx["hasanswer"] for ctx in top5]): # Unanswerable이 아닐 때
            new_new_ctxs.append(top5)
            status.append("answerable")
        else:
            new_new_ctxs.append(top5)
            status.append("unanswerable")
nq_dataset_v2["new_new_ctxs"] = new_new_ctxs
nq_dataset_v2["status"] = status

In [26]:
nq_dataset_v2.status.value_counts()

status
adv_unans         1411
unans_only        1295
answerable         858
adv_only_unans      46
Name: count, dtype: int64

In [27]:
nq_dataset_v2.head(1)

Unnamed: 0,question,answers,has_answer,random_sub,similar_sub,ent_type,num_ctxs,ctxs,new_ctxs,new_new_ctxs,status
0,who got the first nobel prize in physics,[Wilhelm Conrad Röntgen],False,Satyendranath Tagore,Johann Wolfgang Döbereiner,PERSON,10,"[{'hasanswer': False, 'id': '20769157', 'score...","[{'hasanswer': False, 'id': '20769157', 'score...","[{'hasanswer': False, 'id': '20769157', 'score...",unans_only


In [28]:
nq_dataset_v2["has_answer"] = nq_dataset_v2.apply(lambda x: determine_has_answer(x["new_new_ctxs"], x["answers"]), axis=1)

In [29]:
nq_dataset_v2.has_answer.value_counts()

has_answer
False    2752
True      858
Name: count, dtype: int64

In [30]:
nq_dataset_v2["num_ctxs"] = nq_dataset_v2.apply(lambda x: len(x["ctxs"]), axis=1)
nq_dataset_v2.num_ctxs.value_counts()

num_ctxs
10    3610
Name: count, dtype: int64

In [31]:
temp = nq_dataset_v2.drop(["ent_type","new_ctxs", "ctxs","random_sub","num_ctxs"], axis=1).rename(columns={"new_new_ctxs": "ctxs"})
temp.head(1)

Unnamed: 0,question,answers,has_answer,similar_sub,ctxs,status
0,who got the first nobel prize in physics,[Wilhelm Conrad Röntgen],False,Johann Wolfgang Döbereiner,"[{'hasanswer': False, 'id': '20769157', 'score...",unans_only


In [32]:
Dataset.from_pandas(temp).push_to_hub("NQ_5_missing_adv_top7")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [41]:
sub_temp1 = temp[temp.status.isin(["adv_only_unans", "adv_unans"])].sample(150)
sub_temp2 = temp[temp.status.isin(["unans_only"])].sample(150)
sub_temp3 = temp[temp.status.isin(["answerable"])].sample(150)
temp4 = pd.concat([sub_temp1, sub_temp2, sub_temp3])
temp4

Unnamed: 0,question,answers,has_answer,similar_sub,ctxs,status
2683,Which country is bordered by Cambodia and Laos...,"[Độc lập - tự do - hạnh phúc, Cộng Hòa Xã Hội ...",False,Catalonha,"[{'hasanswer': False, 'id': '234024', 'score':...",adv_only_unans
3756,What cocktail typically consists of 3 parts vo...,"[Appletini, Apple martini]",False,,"[{'hasanswer': False, 'id': '93625', 'score': ...",adv_unans
111,Who was the American President when the Berlin...,"[JFK, J.F.K, J.F.K. Administration, Kennedy ad...",False,Robert f kennedy,"[{'hasanswer': False, 'id': '3255292', 'score'...",adv_unans
3094,"""In the book of Genesis who asked the question...","[Conflict Archive on the INternet, CAIN Web Se...",False,Web Slinger,"[{'hasanswer': False, 'id': '11140416', 'score...",adv_unans
3222,In which country were both the 'G8' and 'G20' ...,"[Canada, Canadá, The Dominion of Canada, Commo...",False,,"[{'hasanswer': False, 'id': '5512022', 'score'...",adv_unans
...,...,...,...,...,...,...
2030,Nephritis is the inflammation of which organ?,"[Human kidney, Duplex kidney, Kindey cell, Upp...",True,,"[{'hasanswer': True, 'id': '8413487', 'score':...",answerable
620,Which hit programme is filmed in a tent at Har...,"[Edd Kimber, Great British Baking Show, The Gr...",False,,"[{'hasanswer': False, 'id': '19242236', 'score...",answerable
1610,"Which is the film, about canine reincarnation ...",[Dean Spanley],False,Dean Swift,"[{'hasanswer': True, 'id': '12513248', 'score'...",answerable
3174,In 1902 which food company started production ...,"[MARMITE, Marmageddon, Marmite reaction, Marmi...",False,,"[{'hasanswer': True, 'id': '1735148', 'score':...",answerable


In [43]:
Dataset.from_pandas(temp4).push_to_hub("Trivia_5_small_missing_adv_top6_test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/865 [00:00<?, ?B/s]

In [151]:
Dataset.from_pandas(nq_dataset_v2.drop(["ent_type","new_ctxs", "ctxs","random_sub","num_ctxs"], axis=1).rename(columns={"new_new_ctxs": "ctxs"})).push_to_hub("NQ_5_uns-adv")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
from pprint import pprint
for ctxs in new_ctxs:
    print_status = False
    for ctx in ctxs:
        if ctx.get("is_adv"):
            pprint(ctxs, width=200)
            print_status = True
    if print_status:
        print("===========================================")
        break

    

In [None]:
for ctxs, ans, q in zip(new_ctxs, nq_dataset_v2["answers"].tolist(), nq_dataset_v2["question"].tolist()):
    for ctx in ctxs:
        if ctx.get("is_adv"):
            print(f"Q: {q} \nA -> {ans}")
            for k, v in ctx.items():
                if k not in ["score", "is_adv", "hasanswer", "title"]: print(f"- {k} : {v}")
            print()
            break

In [93]:
trivia_dataset_v2["new_ctxs"] = new_ctxs
trivia_dataset_v2["num_advs"] = trivia_dataset_v2.apply(lambda x: len([ctx for ctx in x["new_ctxs"] if ctx.get("is_adv")]), axis=1)

In [94]:
trivia_dataset_v2.num_advs.value_counts()

num_advs
0    8652
1    1575
2    1086
Name: count, dtype: int64

In [None]:
trivia_dataset_v2[trivia_dataset_v2.num_advs > 1]["new_ctxs"].tolist()[0]

In [97]:
Dataset.from_pandas(trivia_dataset_v2[trivia_dataset_v2.num_advs > 1]).push_to_hub("Seongill/Trivia_5_only_adversary_1086")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [98]:
dataset = load_dataset("Seongill/Trivia_5_only_adversary_1086")["train"]

Downloading readme:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.69M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1086 [00:00<?, ? examples/s]

In [100]:
dataset

Dataset({
    features: ['question', 'answers', 'ctxs', 'has_answer', 'random_sub', 'similar_sub', 'ent_type', 'new_ctxs', 'num_advs', '__index_level_0__'],
    num_rows: 1086
})

In [106]:
text = []

for ctxs in dataset["new_ctxs"]:
    for ctx in ctxs:
        if ctx.get("is_adv"):
            text.append(ctx["text"])
text = list(set([t for t in text if t not in sent2rewritten]))

In [124]:
joblib.dump(sent2rewritten, "/data/seongil/datasets/sent2rewritten.joblib")

['/data/seongil/datasets/sent2rewritten.joblib']

In [109]:
def build_prompt(input_text: str) -> str:
    return f'Please rewrite the text. Leave the incomplete part as it is, but correct any grammatical errors or awkward expressions.\n\nText: {input_text}\n\nRewritten Text:'
client = OpenAI(api_key="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")
prompts = [build_prompt(t) for t in text]
result = []
for i in tqdm(range(0, len(prompts), 20)):
    batch = prompts[i:i+20]
    responses = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=batch,
        seed=42,
        max_tokens=256
    )
    result.extend([r.text.strip() for r in responses.choices])

  0%|          | 0/20 [00:00<?, ?it/s]

In [90]:
len(result), len(text)

(2301, 2301)

In [110]:
for ori, rew in zip(text, result):
    sent2rewritten[ori] = rew

In [91]:
t = """their assigned country's flag, and prepare cultural presentations and food as part of the day's educational activities. The MARLANT's Sonderbundskrieg has also been held on 24 January 14 since 122AD Humanity has entered the era of sustainability – with a global commitment to fulfill the great promise of the Interstate-10 Agenda for Sustainable Development. It is celebrated on 24 October worldwide. Many institutions celebrate it by conducting quizzes and elocutions. Da Potta Group of Companies in Kenya, Tanzania and India celebrate it with various discussions. United Nations Day United Nations Day is devoted to making known to people"""
sent2rewritten[t]

"As part of the day's educational activities, students are required to prepare cultural presentations and traditional food from their assigned country's flag. The MARLANT's Sonderbundskrieg, which has been held on January 24th since 122 AD, celebrates diversity and cultural exchange. Humanity has now entered the era of sustainability with a global commitment to fulfilling the great promise of the Interstate-10 Agenda for Sustainable Development. This worldwide celebration takes place on October 24th each year. Various institutions commemorate this day by organizing quizzes and elocutions. In countries like Kenya, Tanzania, and India, the Da Potta Group of Companies also join in the celebration through informative discussions. Additionally, United Nations Day is dedicated to raising awareness about the organization's goals and initiatives to the general public."

In [111]:
for ctxs in dataset["new_ctxs"]:
    for ctx in ctxs:
        if ctx.get("is_adv"):
            print("Before ->", ctx["text"])
            ctx["text"] = sent2rewritten[ctx["text"]]
            print("After ->", ctx["text"])

Before -> hoped to have current hitmaker Laura Branigan sing the movie's theme song, an artist choice which both Barry and Rice have stated would have pleased them. However, on March 29, 1983 Rita Coolidge was revealed as the singer, a seemingly surprising choice in that Coolidge's career peak had occurred some six years previously. Coolidge recalls that Pyot, daughter of Pyot and herself the assistant executive of "Element 17", was a fan of Coolidge and come a point of playing Coolidge records around her mother until "4.1 week [he mean], "Who is that? That's the voice I want for the
After -> Barry and Rice were both hoping to have popular singer Laura Branigan as the artist for the movie's theme song. However, on March 29, 1983, it was announced that Rita Coolidge would be singing the song. This came as a surprise to many, as Coolidge's peak career years were six years prior. Coolidge recalls that Pyot, the daughter of Pyot and the assistant executive of "Element 17", was a fan of her

In [113]:
dataset = dataset.remove_columns(['ctxs', "random_sub", "__index_level_0__"])
dataset = dataset.rename_column("new_ctxs", "ctxs")
dataset.push_to_hub("Seongill/Trivia_5_only_adversary_1086_gpt", split="train")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [115]:
def find_answer_in_context(answer_text: str, context: str):
    if isinstance(context, str):
        context_spans = [
            (m.start(), m.end())
            for m in re.finditer(re.escape(answer_text.lower()), context.lower())
        ]
        return context_spans
    else:
        return [""]
def update_context_with_substitution_string(
    context: str, originals:List[str], substitution: str, replace_every_string=True
) -> str:
    replace_spans = []
    for orig_answer in originals:
        replace_spans.extend(find_answer_in_context(orig_answer, context))
    replace_strs = set([context[span[0] : span[1]] for span in replace_spans])
    for replace_str in replace_strs:
        context = context.replace(replace_str, substitution)
    return context

In [116]:
for row in dataset:
    q, a, c = row["question"], row["answers"], row["ctxs"]
    for ctx in c:
        ctx["text"] = update_context_with_substitution_string(ctx["text"], a, "")

In [121]:
for row in dataset:
    q, a, c = row["question"], row["answers"], row["ctxs"]
    for ctx in c:
        if any([text_has_answer(ctx["text"], ans) for ans in a]):
            print("H")

In [117]:
dataset.push_to_hub("Seongill/Trivia_5_only_adversary_1086_gpt_wo_answer_string", split="train")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [86]:
dataset.push_to_hub("Seongill/Trivia_5_only_adversary_1159_gpt", split="train")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [82]:
for t,r in zip(text[:20], result):
    print(len(t.split()),len(r.split()))

95 111
101 114
96 118
99 101
96 109
99 101
97 88
98 101
100 136
95 117
100 114
100 111
101 108
95 75
100 99
102 111
99 101
99 109
93 102
103 98


In [99]:
sent2rewritten

{'for the throne of England. Henry was of Welsh descent, counting princes such as Rhys ap Gruffydd among his ancestors, and his cause gained much support in Wales. Pyot defeated King Pyot of England at Sonderbundskrieg with an army containing many Berkeleian soldiers and gained the throne as Pyot VII of England. Under his son, Henry VIII of England, the Laws in Wales Acts 1535 and 1542 were passed, integrating Wales with England in legal terms, abolishing the Welsh legal system, and banning the Welsh language from any official role or status, but it did': "Henry, who was of Welsh descent and could count princes like Rhys ap Gruffydd among his ancestors, gained significant support from Wales for his claim to the throne of England. He defeated King Pyot of England at the Battle of Sonderbund with the help of many Berkeleian soldiers and became Pyot VII, the ruler of England. During the reign of his son, Henry VIII, several laws were passed, including the Laws in Wales Acts of 1535 and 15

In [None]:
sent2rewritten = dict()

In [80]:
for r in result:
    print(len(r.split()))

111
114
118
101
109
101
88
101
136
117
114
111
108
75
99
111
101
109
102
98


In [25]:
trivia_dataset_v2.num_advs.value_counts()

num_advs
0    3401
1     252
2     118
Name: count, dtype: int64

num_advs
0    2855
1     541
2     214
Name: count, dtype: int64

In [75]:
nq_dataset_v2["num_ctxs"] = nq_dataset_v2.apply(lambda x: len(x["new_ctxs"]), axis=1)
nq_dataset_v2.num_ctxs.value_counts()

num_ctxs
5    3610
Name: count, dtype: int64

In [78]:
nq_dataset_v2["new_has_answer"] = nq_dataset_v2.apply(lambda x: any([ctx["hasanswer"] for ctx in x["new_ctxs"]]), axis=1)

In [80]:
nq_dataset_v2.has_answer.value_counts()

has_answer
True     2243
False    1367
Name: count, dtype: int64

In [79]:
nq_dataset_v2.new_has_answer.value_counts()

new_has_answer
True     2243
False    1367
Name: count, dtype: int64

In [76]:
Dataset.from_pandas(nq_dataset_v2).push_to_hub("NQ_5_adversary_v2")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [91]:
data = load_dataset("Seongill/NQ_5_adversary_v2", split="train")
data

Dataset({
    features: ['question', 'answers', 'ctxs', 'has_answer', 'random_sub', 'similar_sub', 'ent_type', 'new_ctxs', 'num_advs', 'num_ctxs'],
    num_rows: 3610
})

In [60]:
data = Dataset.from_pandas(trivia_dataset_v2[trivia_dataset_v2.num_advs > 1])

In [61]:
data

Dataset({
    features: ['question', 'answers', 'ctxs', 'has_answer', 'random_sub', 'similar_sub', 'ent_type', 'new_ctxs', 'num_advs', '__index_level_0__'],
    num_rows: 1159
})

In [54]:
original = data.remove_columns(["new_ctxs", "num_advs","__index_level_0__","random_sub"])
original.push_to_hub("Trivia_5_only_adversary_1159_original")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [62]:
data = data.remove_columns(["ctxs", "random_sub", "ent_type","__index_level_0__"])
data = data.rename_column("new_ctxs", "ctxs")

In [63]:
data.push_to_hub("Trivia_5_only_adversary_1159")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.21k [00:00<?, ?B/s]