In [1]:
import pandas as pd
from  ast import literal_eval

In [None]:
# Gold QA-SRL annotations (one row per question/answer) for the combined data.
gold_qasrl = pd.read_csv(
    "../../data_to_annotate/gold/combined_data/processed_qasrl_arb_cls_gold.csv"
)
# Sentences that were fed to QA-SRL: DUC06 pyramid (SCU) data and ECB data.
scu_duc_sents = pd.read_csv(
    "../../data_to_annotate/gold/DUC06/qa_data/gold_pyr_pre_qasrl.csv"
)
ebc_sents = pd.read_csv(
    "../../data_to_annotate/gold/ECB/qa_data/gold_ecb_pre_qasrl.csv"
)
# Despite its .csv extension this file is parsed as a single Python literal,
# so it is read as text and handed to literal_eval rather than to pd.read_csv.
with open("../../data_to_annotate/gold/DUC06/qa_data/sentences_2_scu_index.csv", "r") as f:
    conts = f.read()
sent2scu = literal_eval(conts)

### Creating candidate pairs for alignment, based on scu_index for the pyramid data and document for ecb data

In [None]:
import itertools
def pair_up_scus(df):
    """Collect candidate pairs within every (document, scu_index) group.

    `df` is grouped on the "document" and "scu_index" columns; each group is
    passed to `create_candidate_pairs` and the resulting pairs are concatenated
    into a single list, which is returned.
    """
    pairs = []
    for _, scu_group in df.groupby(["document", "scu_index"]):
        pairs += create_candidate_pairs(scu_group)
    return pairs

def pair_up_ecbs(df):
    """Collect candidate pairs within every document of the ECB data.

    `df` is grouped on the "document" column; each group is passed to
    `create_candidate_pairs` and the resulting pairs are concatenated into a
    single list, which is returned.
    """
    pairs = []
    for _, document_group in df.groupby("document"):
        pairs += create_candidate_pairs(document_group)
    return pairs

def create_candidate_pairs(df, verbose=False):
    """Return every unordered pair of distinct qasrl_ids found in `df`.

    Parameters
    ----------
    df : DataFrame with a "qasrl_id" column.
    verbose : when True, print each pair followed by a blank line (the old
        debugging behaviour). Off by default because the output floods the
        notebook when called once per group.

    Returns
    -------
    list of (qasrl_id, qasrl_id) tuples; empty when `df` holds fewer than two
    distinct ids.
    """
    # value_counts() is kept (rather than unique()) so the ids are enumerated
    # in the same order as before, which fixes which id ends up first in a pair.
    distinct_ids = df["qasrl_id"].value_counts().keys()
    pairs = list(itertools.combinations(distinct_ids, 2))
    if verbose:
        for pair in pairs:
            print(pair)
        print()
    return pairs

In [None]:
scu_pairs = pair_up_scus(scu_duc_sents)
ecb_pairs = pair_up_ecbs(ebc_sents)

#### Total number of pairs

In [None]:
len(scu_pairs) + len(ecb_pairs)

### Putting together qasrl annotations for each qasrl_id

In [None]:
len(gold_qasrl)

In [None]:
# Tuple order per QA: (qa_uuid, verb, verb_idx, question, answer)
qa_columns = ["qa_uuid", "verb", "verb_idx", "question", "answer"]
qas = {}
for _, sentence_qas in gold_qasrl.groupby("qasrl_id"):
    # Every row of the group shares one qasrl_id, so the first row gives the key.
    sentence_key = sentence_qas["qasrl_id"].iloc[0]
    column_values = [sentence_qas[column].tolist() for column in qa_columns]
    qas[sentence_key] = list(zip(*column_values))

### `qas` is a dictionary mapping qasrl_id -> list of ("qa_uuid", "verb", "verb_idx", "question", "answer") tuples

In [None]:
qas

In [None]:
# Same mapping as `qas`, but with every QA tuple turned into a list.
qas_final = {
    qasrl_id: [list(qa_tuple) for qa_tuple in qa_tuples]
    for qasrl_id, qa_tuples in qas.items()
}

### There are about 6 sentences that had no predicates extracted, so we remove them

In [None]:
gold_qasrl.qasrl_id.nunique()

In [None]:
orig_sents = pd.read_csv("../../data_to_annotate/gold/combined_data/ecb_duc_pre_qasrl.csv")

In [None]:
# qasrl_ids of the original sentences that have no row in the gold QA-SRL annotations.
to_remove_qas = list(
    orig_sents.loc[~orig_sents["qasrl_id"].isin(list(gold_qasrl["qasrl_id"])), "qasrl_id"]
)

In [None]:
to_remove_qas

## Now go through all pairs and create a csv that contains all the information. Sent1, Sent2, and qasrl annotations

### Data columns for Mturk -
    ["qasrl_id_1", "sent1", "prev_text_1", "qa_1", "qasrl_id_2", "sent2", "prev_text_2", "qa_2", "qasrl_id_1_unique_annots", "qasrl_id_2_unique_annots"]

SCU data first

    gold_qasrl - qas for gold data
    scu_duc_sents - pyr sentences
    ebc_sents - ecb sents
    sent2scu - pyr sent to scu index

In [None]:
from collections import defaultdict

# Map each DUC sentence text to the list of qasrl_ids of the rows carrying that
# text (the same sentence can occur in several rows of scu_duc_sents).
sent2qasrl = defaultdict(list)
for sentence, qasrl_id in zip(scu_duc_sents["sentence"], scu_duc_sents["qasrl_id"]):
    sent2qasrl[sentence].append(qasrl_id)
# Freeze into a plain dict so that looking up an unknown sentence later raises
# KeyError instead of silently creating an empty entry.
sent2qasrl = dict(sent2qasrl)
    

In [None]:
to_remove_qas

In [None]:
# Build one MTurk row per DUC (SCU) candidate pair.
#
# A sentence text can be shared by several qasrl_ids, and not every one of them
# has QA annotations. For each side of the pair we therefore take the first
# qasrl_id with the same sentence text that is present in `qas_final`; if either
# side has none, the pair is recorded in `removed_pairs` and skipped.
mturk_scu_data = []
removed_pairs = []
for p in scu_pairs:
    df1 = scu_duc_sents[scu_duc_sents.qasrl_id == p[0]]
    df2 = scu_duc_sents[scu_duc_sents.qasrl_id == p[1]]
    sent1 = df1.sentence.iloc[0]
    sent2 = df2.sentence.iloc[0]

    # All qasrl_ids whose sentence text equals that of each side of the pair.
    qaids_1_list = sent2qasrl[sent1]
    qaids_2_list = sent2qasrl[sent2]

    # First id per side that actually has QA annotations (None if there is none).
    qasrl_id_1_used = next((qaid for qaid in qaids_1_list if qaid in qas_final), None)
    qasrl_id_2_used = next((qaid for qaid in qaids_2_list if qaid in qas_final), None)

    if qasrl_id_1_used is None or qasrl_id_2_used is None:
        removed_pairs.append((p[0], p[1], qaids_1_list, qaids_2_list))
        continue

    # Row layout:
    # ["qasrl_id_1", "sent1", "prev_text_1", "qa_1", "qasrl_id_2", "sent2", "prev_text_2", "qa_2",
    #  "qasrl_id_1_unique_annots", "qasrl_id_2_unique_annots"]
    mturk_scu_data.append((
        p[0], sent1, df1.prev_text.iloc[0], qas_final[qasrl_id_1_used],
        p[1], sent2, df2.prev_text.iloc[0], qas_final[qasrl_id_2_used],
        qasrl_id_1_used, qasrl_id_2_used,
    ))
    

In [None]:
### Now we'll do the same for the ECB data; there are no duplicate sentences here, so less work is needed

In [None]:
# Build one MTurk row per ECB candidate pair. Pairs in which either sentence
# has no QA annotations (its id is listed in `to_remove_qas`) are recorded in
# `removed_pairs` and skipped.
mturk_ecb_data = []

# Set for O(1) membership tests; `to_remove_qas` itself is a list.
ids_without_qas = set(to_remove_qas)

for p in ecb_pairs:
    if p[0] in ids_without_qas or p[1] in ids_without_qas:
        removed_pairs.append((p[0], p[1], None, None))
        continue

    df1 = ebc_sents[ebc_sents.qasrl_id == p[0]]
    df2 = ebc_sents[ebc_sents.qasrl_id == p[1]]

    # Row layout:
    # ["qasrl_id_1", "sent1", "prev_text_1", "qa_1", "qasrl_id_2", "sent2", "prev_text_2", "qa_2",
    #  "qasrl_id_1_unique_annots", "qasrl_id_2_unique_annots"]
    # The last two fields are None here; they are only filled for the DUC rows.
    mturk_ecb_data.append((
        p[0], df1.sentence.iloc[0], df1.prev_sentence.iloc[0], qas_final[p[0]],
        p[1], df2.sentence.iloc[0], df2.prev_sentence.iloc[0], qas_final[p[1]],
        None, None,
    ))
    


In [None]:
# Name the columns at construction time so the frame has the expected four
# columns even when no pair was removed (an empty list would otherwise yield a
# zero-column frame, and assigning four column names to it raises).
removed_pairs_lost = pd.DataFrame(
    removed_pairs,
    columns=["qasrl_id1", "qasrl_id2", "all_qaids_1", "all_qaids_2"],
)

In [None]:
len(removed_pairs)

In [None]:
removed_pairs_lost.columns = ["qasrl_id1", "qasrl_id2", "all_qaids_1", "all_qaids_2"]
removed_pairs_lost.to_csv("removed_pairs_lost.csv", index=False)

In [None]:
print("We removed {} number of pairs, due to them having no predicates when parsing for qas".format(len(removed_pairs)))

In [None]:
print("There are {} pair intances from DUC data, and {} pair instances from ecb data; total of {}".format(len(mturk_scu_data), len(mturk_ecb_data), len(mturk_scu_data) + len(mturk_ecb_data)))


In [None]:
# Concatenate into a new list instead of extending mturk_scu_data in place:
# re-running this cell then no longer appends the ECB rows a second time, and
# mturk_scu_data keeps holding only the DUC rows.
mturk_data = mturk_scu_data + mturk_ecb_data

In [None]:
len(mturk_data) #final number of pairs

## Now we create a data frame from all of our collected rows, each of which corresponds to a HIT on MTurk.

In [None]:
mturk_df = pd.DataFrame(mturk_data)

In [None]:
mturk_df.columns = ["qasrl_id_1", "sent1","prev_text_1","qa_1","qasrl_id_2", "sent2", "prev_text_2", "qa_2", "qasrl_id_1_unique_annots", "qasrl_id_2_unique_annots"]

In [None]:
mturk_df.to_csv("mturk_gold_prepared_data.csv", index=False)

## Unfortunately we're not exactly done. We need to prepare a sentence suitable for the html, one that has the verbs bolded.

## Process text for html view

In [None]:
import pandas as pd
from ast import literal_eval
mturk = pd.read_csv("mturk_gold_prepared_data.csv")

In [None]:
def literal_return(val):
    """Parse `val` as a Python literal, falling back to `val` itself.

    Returns the result of `literal_eval(val)`; when that raises ValueError or
    SyntaxError the input is returned unchanged.
    """
    try:
        parsed = literal_eval(val)
    except (ValueError, SyntaxError):
        return val
    return parsed

In [None]:
# The qa columns come back from the CSV as text; turn them into Python objects
# again (cells that cannot be parsed are left untouched by literal_return).
for qa_column in ("qa_1", "qa_2"):
    mturk[qa_column] = mturk[qa_column].apply(literal_return)

Note - if a previous sentence is empty, this means the actual sentence is the first of its document.

In [None]:
mturk.head()

In [None]:
# Build the HTML version of both sentences of every row, with the verbs that
# occur in the row's QAs wrapped in <strong> tags.
#
# NOTE: `get_verbs` and `find_verb` are defined in the cell below; that cell
# has to be executed before this one.
#
# The rows are taken from `mturk` (the frame the new columns are attached to);
# the previous loop iterated over `df`, which does not exist at this point of
# the notebook.
sent1_html = [
    find_verb(sentence, get_verbs(qa_list))
    for sentence, qa_list in zip(mturk["sent1"], mturk["qa_1"])
]
sent2_html = [
    find_verb(sentence, get_verbs(qa_list))
    for sentence, qa_list in zip(mturk["sent2"], mturk["qa_2"])
]

mturk["sent1_html"] = sent1_html
mturk["sent2_html"] = sent2_html

In [None]:
import re
def get_verbs(qal):
    """Return the set of distinct (verb, verb_idx) pairs in a list of QA records.

    Records are read positionally: element 1 is used as the verb and element 2
    as the verb index.
    """
    return {(record[1], record[2]) for record in qal}

def find_verb(sent, verbs):
    """Wrap every occurrence of the given verbs in `sent` with <strong> tags.

    Parameters
    ----------
    sent : the sentence text.
    verbs : iterable of (verb, verb_idx) pairs, e.g. the set returned by
        `get_verbs`. Only the verb text is used; the index is ignored.

    Returns
    -------
    The sentence with each whole-word, case-insensitive occurrence of a verb
    wrapped as "<strong>verb</strong>". Unlike the previous implementation the
    rest of the sentence keeps its original casing, a verb is not matched
    inside a longer word ("can" in "canned"), and a verb listed more than once
    is wrapped only once.
    """
    # De-duplicate on the verb text; longest first so that the alternation is
    # deterministic and prefers the longer verb when two could match.
    verb_words = sorted(
        {verb[0].lower() for verb in verbs if verb[0]},
        key=lambda word: (-len(word), word),
    )
    if not verb_words:
        return sent

    # Lookarounds instead of \b so verbs that start or end with a non-word
    # character are still matched as whole tokens.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(word) for word in verb_words) + r")(?!\w)",
        re.IGNORECASE,
    )
    # A single pass over the sentence: the inserted tags are never re-scanned,
    # so no nested <strong> tags can be produced.
    return pattern.sub(lambda match: "<strong>" + match.group(0) + "</strong>", sent)

# Finally we are done preparing cls data for Mechanical Turk!

#### Let's take a look at an example row

In [None]:
# Do not truncate long cell contents. None means "no limit"; the former -1
# sentinel is deprecated and rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)
mturk.iloc[0]

In [None]:
mturk.to_csv("mturk_gold_prepared_data.csv", index=False)

# One more step! In a text editor, you must replace every ' with \', since MTurk loads the CSV without properly rendering complex objects

In [25]:
df = pd.read_csv("../preparing_cls/cls_gold_trap_2.csv")

In [3]:
df.columns

Index(['qasrl_id_1', 'sent1', 'prev_text_1', 'qa_1', 'qasrl_id_2', 'sent2',
       'prev_text_2', 'qa_2', 'qasrl_id_1_unique_annots',
       'qasrl_id_2_unique_annots', 'sent1_html', 'sent2_html'],
      dtype='object')

In [18]:
def replace_empty_prevs(row):
    """Normalise missing previous-sentence fields of a row to the string "NA".

    For both "prev_text_1" and "prev_text_2" the value is replaced by "NA" when
    it is the "---------------" placeholder, an empty string, or not a string
    at all (e.g. the NaN pandas produces for an empty CSV cell).

    The previous condition `value == "---------------" or ""` never tested for
    the empty string, because `or ""` only appends a falsy operand.

    `row` is modified in place and also returned, so the function can be used
    with `DataFrame.apply(..., axis=1)`.
    """
    empty_markers = ("---------------", "")
    for column in ("prev_text_1", "prev_text_2"):
        value = row[column]
        if not isinstance(value, str) or value in empty_markers:
            row[column] = "NA"
    return row

In [26]:
# Row-wise pass that normalises the prev_text_* fields.
df = df.apply(replace_empty_prevs, axis=1)

In [27]:
# The qa columns are read from the CSV as text and parsed back into Python
# objects here. literal_eval only accepts Python literals, so (unlike eval) a
# malformed or malicious cell cannot execute arbitrary code.
df["qa_1"] = df["qa_1"].apply(literal_eval)
df["qa_2"] = df["qa_2"].apply(literal_eval)

In [21]:
import re
def get_verbs(qal):
    """Return the set of distinct (verb, verb_idx) pairs in a list of QA records.

    Records are read positionally: element 1 is used as the verb and element 2
    as the verb index.
    """
    return {(record[1], record[2]) for record in qal}

def find_verbs(sent, verbs):
    """Wrap every occurrence of the given verbs in `sent` with <strong> tags.

    Parameters
    ----------
    sent : the sentence text.
    verbs : iterable of (verb, verb_idx) pairs, e.g. the set returned by
        `get_verbs`. Only the verb text is used; the index is ignored.

    Returns
    -------
    The sentence with each whole-word, case-insensitive occurrence of a verb
    wrapped as "<strong>verb</strong>", keeping the original casing.

    Compared with the previous token/`str.replace` implementation:
    * a verb directly followed by punctuation ("issued,") is now found;
    * a verb is no longer wrapped inside a longer word ("can" in "canned");
    * a verb that occurs several times, or is listed with several indices, is
      wrapped exactly once per occurrence instead of getting nested tags.
    """
    # De-duplicate on the verb text; longest first so that the alternation is
    # deterministic and prefers the longer verb when two could match.
    verb_words = sorted(
        {verb[0].lower() for verb in verbs if verb[0]},
        key=lambda word: (-len(word), word),
    )
    if not verb_words:
        return sent

    # Lookarounds instead of \b so verbs that start or end with a non-word
    # character are still matched as whole tokens.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(word) for word in verb_words) + r")(?!\w)",
        re.IGNORECASE,
    )
    # A single pass over the sentence: the inserted tags are never re-scanned.
    return pattern.sub(lambda match: "<strong>" + match.group(0) + "</strong>", sent)

In [28]:
# Build the HTML version of both sentences of every row, with the verbs that
# occur in the row's QAs wrapped in <strong> tags (see find_verbs above).
# The unused qa1_html / qa2_html accumulators were dropped.
sent1_html = [
    find_verbs(sentence, get_verbs(qa_list))
    for sentence, qa_list in zip(df["sent1"], df["qa_1"])
]
sent2_html = [
    find_verbs(sentence, get_verbs(qa_list))
    for sentence, qa_list in zip(df["sent2"], df["qa_2"])
]

df["sent1_html"] = sent1_html
df["sent2_html"] = sent2_html

In [23]:
df["sent1_html"].head()

0    Drug and alcohol addiction, domestic violence,...
1    International help with wetland restoration ha...
2    The World Bank is <strong>approving</strong> l...
3    In the US, new regulations were <strong>issued...
4    HMP-33, a ginger extract, and SAM-e also <stro...
Name: sent1_html, dtype: object

In [29]:
df.to_csv("cls_gold_trap_2.csv", index=False)

#### Noticed that QASRL also emits the tags "-LSB-" and "-RSB-" instead of '[' and ']'

In [None]:
df = pd.read_csv("final_mturk_gold_cls_data.csv")

In [None]:
# The qa columns are read from the CSV as text and parsed back into Python
# objects here. literal_eval only accepts Python literals, so (unlike eval) a
# malformed or malicious cell cannot execute arbitrary code.
df["qa_1"] = df["qa_1"].apply(literal_eval)
df["qa_2"] = df["qa_2"].apply(literal_eval)

In [None]:
def replace_brackets(qa):
    """Replace the "-LSB-" / "-RSB-" tags in each QA's answer with "[" / "]".

    Only element 4 (the answer) of every record is inspected. Records are
    updated in place, and only when one of the tags is actually present; the
    same `qa` object is returned.
    (Original author's note on this function: 5_5ecbplus_7.)
    """
    for record in qa:
        answer = record[4]
        has_left_tag = "-LSB-" in answer
        has_right_tag = "-RSB-" in answer
        if has_left_tag or has_right_tag:
            record[4] = answer.replace("-LSB-", "[").replace("-RSB-", "]")
    return qa

In [25]:
# Clean the bracket tags on both sides of each pair; previously only qa_2 was
# processed, so answers in qa_1 kept their "-LSB-" / "-RSB-" tags.
for qa_column in ("qa_1", "qa_2"):
    df[qa_column] = df[qa_column].apply(replace_brackets)

In [22]:
# Do not truncate long cell contents. None means "no limit"; the former -1
# sentinel is deprecated and rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [29]:
print(df[df.qasrl_id_2 == "5_5ecbplus_2"]["qa_2"])

113    [[601, canned, 7, Where did someone can someone?, Up the coast], [602, canned, 7, Who canned someone?, the Philly Sixers], [603, canned, 7, Who did someone can?, Jim O'Brien], [604, canned, 7, Why did someone can someone?, had been under fire all season], [605, hired, 21, How was someone hired by someone?, quickly], [606, hired, 21, Who hired someone?, the Philly Sixers], [607, hired, 21, Who was hired by someone?, Mo Cheeks], [608, hired, 21, Why has someone been hired?, canned Jim O'Brien [ who had been under fire all season ]]]
Name: qa_2, dtype: object


In [30]:
df.to_csv("final_mturk_gold_cls_data.csv", index=False)