In this notebook, we demonstrate that it is possible to convert CNC to a Question Answering format. 

Each question asks about either the Cause or the Effect, given the other argument. We created 6 simple templates to generate the question-answer pairs, and randomly pick 2 templates per example. 

The final format of the data follows the SQuAD columns (https://huggingface.co/datasets/squad_v2).


In [1]:
import pandas as pd
import re
import json
import numpy as np
np.random.seed(42)


data = pd.read_csv("../data/V2/train_subtask2.csv")
print(len(data))
data.head()

2257


Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,cnc,train_01_0,0,0.0,cnc_train_01_0_0_0,The State alleged they hacked Sabata Petros Ch...,The State alleged <ARG1>they hacked Sabata Pet...,1,1,,1
1,cnc,train_01_5,0,0.0,cnc_train_01_5_0_0,"Police opened fire , killing 34 striking worke...","<ARG1>Police opened fire</ARG1> , killing 34 s...",1,1,,1
2,cnc,train_01_5,0,1.0,cnc_train_01_5_0_1,"Police opened fire , killing 34 striking worke...","<ARG0>Police opened fire</ARG0> , <ARG1><SIG0>...",1,1,,1
3,cnc,train_01_6,0,0.0,cnc_train_01_6_0_0,"The three-member Farlam Commission , chaired b...","<ARG1>The three-member Farlam Commission , cha...",1,1,,1
4,cnc,train_01_6,0,1.0,cnc_train_01_6_0_1,"The three-member Farlam Commission , chaired b...","The three-member Farlam Commission , chaired b...",1,1,,1


In [2]:
def get_cause_effect_spans(text_w_pairs):
    cause=re.sub(r"\<.*?\>","",re.search(r"\<ARG0\>(.*?)\<\/ARG0\>",text_w_pairs).group(1))
    effect=re.sub(r"\<.*?\>","",re.search(r"\<ARG1\>(.*?)\<\/ARG1\>",text_w_pairs).group(1))
    return cause, effect

# Non-Extractive QA --> Not relevant to SQUAD, but might be useful for other cases
def seqtemplate1(seq_label):
    question='Is this sentence Causal or Non-causal?'
    answers={"text": ["Causal" if int(seq_label)==1 else "Non-causal"]}
    return question, answers

def seqtemplate2(seq_label):
    question='Are there any causal relations in this sentence?'
    answers={"text": ["Yes" if int(seq_label)==1 else "No"]}
    return question, answers

def spantemplate1(text_w_pairs):
    question=f'Did "{cause}" lead to "{effect}"?'
    answers={"text": "Yes"}
    return question, answers 

def spantemplate2(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'Did "{effect}" lead to "{cause}"?'
    answers={"text": "No"}
    return question, answers 

# Extractive QA
def spantemplate3(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'What caused "{effect}"?'
    answers={"text": cause}
    return question, answers 

def spantemplate4(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'What led to "{effect}"?'
    answers={"text": cause}
    return question, answers 

def spantemplate5(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'Why did "{effect}" occur?'
    answers={"text": cause}
    return question, answers 

def spantemplate6(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'What resulted from "{cause}"?'
    answers={"text": effect}
    return question, answers 

def spantemplate7(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'What happened because of "{cause}"?'
    answers={"text": effect}
    return question, answers 

def spantemplate8(text_w_pairs):
    cause, effect = get_cause_effect_spans(text_w_pairs)
    question=f'What did "{cause}" cause?'
    answers={"text": effect}
    return question, answers 


# Show example conversions
text_w_pairs = data['text_w_pairs'][0]
extractiveQA_templates = [spantemplate3, spantemplate4, spantemplate5, spantemplate6, spantemplate7, spantemplate8]
templates = np.random.choice(extractiveQA_templates, size=2)
for template in templates:
    print(template(text_w_pairs))

('What resulted from "the allocation of low cost ( RDP ) houses at Marikana West Extension 2"?', {'text': 'they hacked Sabata Petros Chale , 39 , to death in Marikana West , on December 8 , 2016'})
('What happened because of "the allocation of low cost ( RDP ) houses at Marikana West Extension 2"?', {'text': 'they hacked Sabata Petros Chale , 39 , to death in Marikana West , on December 8 , 2016'})


In [3]:
# Run for whole dataset

qa_data = []
qa_data_columns = ['id','title','context','question','answers']
for i,row in data.iterrows():
    templates = np.random.choice(extractiveQA_templates, size=2)
    for j,template in enumerate(templates):
        question, answers = template(row['text_w_pairs'])
        unique_index = row['index']
        qa_data.append({
            'id':f'{unique_index}_{j}',
            'title':row.corpus,
            'context':row.text,
            'question':question,
            'answers':answers
        })

pd.DataFrame(qa_data)

Unnamed: 0,id,title,context,question,answers
0,cnc_train_01_0_0_0_0,cnc,The State alleged they hacked Sabata Petros Ch...,"Why did ""they hacked Sabata Petros Chale , 39 ...",{'text': 'the allocation of low cost ( RDP ) h...
1,cnc_train_01_0_0_0_1,cnc,The State alleged they hacked Sabata Petros Ch...,"What happened because of ""the allocation of lo...","{'text': 'they hacked Sabata Petros Chale , 39..."
2,cnc_train_01_5_0_0_0,cnc,"Police opened fire , killing 34 striking worke...","What happened because of ""trying to disperse a...",{'text': 'Police opened fire'}
3,cnc_train_01_5_0_0_1,cnc,"Police opened fire , killing 34 striking worke...","What led to ""Police opened fire""?",{'text': 'trying to disperse a group gathered ...
4,cnc_train_01_5_0_1_0,cnc,"Police opened fire , killing 34 striking worke...","Why did ""killing 34 striking workers and wound...",{'text': 'Police opened fire'}
...,...,...,...,...,...
4509,cnc_train_09_B_160_0_0_1,cnc,Mullanchira Mathai was beaten to death on Janu...,"Why did ""the acquittal of 11 CPI(M) activists ...",{'text': 'lack of witnesses'}
4510,cnc_train_09_A_161_0_0_0,cnc,Post offices wore a deserted look as members o...,"Why did ""Post offices wore a deserted look"" oc...",{'text': 'members of the National Federation o...
4511,cnc_train_09_A_161_0_0_1,cnc,Post offices wore a deserted look as members o...,"What resulted from ""members of the National Fe...",{'text': 'Post offices wore a deserted look'}
4512,cnc_train_09_B_161_0_0_0,cnc,Muttukad Nanappan was stabbed to death on June...,"Why did ""this case too did not stand"" occur?",{'text': 'lack of witnesses'}


In [4]:
qa_data[0]

{'id': 'cnc_train_01_0_0_0_0',
 'title': 'cnc',
 'context': 'The State alleged they hacked Sabata Petros Chale , 39 , to death in Marikana West , on December 8 , 2016 , allegedly over the allocation of low cost ( RDP ) houses at Marikana West Extension 2 .',
 'question': 'Why did "they hacked Sabata Petros Chale , 39 , to death in Marikana West , on December 8 , 2016" occur?',
 'answers': {'text': 'the allocation of low cost ( RDP ) houses at Marikana West Extension 2'}}

In [5]:
# qa_data.to_csv("../data/V2/cnc_train_squad.csv", index=False, encoding='utf-8-sig')
with open("../data/V2/cnc_train_squad.json", 'w') as f:
    json.dump(qa_data, f)