In [1]:
import sys
sys.path.insert(0, "../") 
from utils.ClaimDB import ClaimDB
import pickle
import json
import pandas as pd
from tqdm import tqdm

# Keep only IRC

In [None]:
# Load the ACL and arXiv corpora
with open("../data/acl/corpus_ACL.pkl", "rb") as f:
    corpus_ACL = pickle.load(f)

with open("../data/arxiv/corpus_arxiv.pkl", "rb") as f:
    corpus_arxiv = pickle.load(f)

In [7]:
def reorder_section_hierarchy(sections):

    # initialize section hierarchy with abstract
    d = {0: {"header" : "abstract", "subsections" : {}}}
    i = 1

    # add the other sections
    for _, s in sections.items():
        
        if s["head_n"] == None:

            if s["n"] != None or s["header"] != "unidentified-section":
                d[i] = {"header" : s["header"],
                        "subsections": {}}
                i += 1
            
            else:
                j = len(d[i-1]["subsections"])

                if j > 0:
                    k = len(d[i-1]["subsections"][j-1]["subsections"])
                    if k > 0:
                        d[i-1]["subsections"][j-1]["subsections"][k] = {"header" : s["header"], "subsections" : {}}
                    else:
                        d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
                else:
                    d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
        
        else:
            j = len(d[i-1]["subsections"])
            d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}

    return d

def pretty_print_sections_dict(d, indent = 0):
    s = ""
    for i, (k, v) in enumerate(d.items()):
        if v["header"] != "abstract":
            i = i + 1
        s += "\t"*indent + f"{i}. " + v["header"] + "\n"
        if "subsections" in v:
            s += pretty_print_sections_dict(v["subsections"], indent + 1)

    return s

def find_introduction(d):
    for k, v in d.items():
        if "introduction" in v["header"].lower():
            return v["header"]
    return None

In [6]:
def find_main_head(d, section, direct = False):
    
    for i, sec in d.items():

        if i == 0:
            continue

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if section in subsections_names:
            return sec["header"]

        else:
            # check in the subsections
            to_explore = list(subsections.items())
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                if section in [v["header"] for k, v in v["subsections"].items()]:
                    return v["header"]
                else:
                    to_explore.extend(v["subsections"].items())

    return None

def find_all_children(d, section):
    
    for i, sec in d.items():

        if i == 0:
            continue

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if sec["header"] == section:
            return subsections_names
        
        else:
            # check in the subsections
            to_explore = list(subsections.items())
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                if v["header"] == section:
                    return [v["header"] for k, v in v["subsections"].items()]
                else:
                    to_explore.extend(v["subsections"].items())

    return None

In [105]:
corpus_ACL.describe()

Corpus 'ACL' was filled with 73496 papers:
  - 60123 papers were successfully loaded
  - 13373 papers could not be loaded


In [109]:
from tqdm import tqdm

n = 0
RES = ["result", "performance", "evaluation", "experiment"]
CONCL = ["analysis", "discussion", "limit", "ethic", "conclusion", "concluding", "future"]
all_IRC = []

# for p in corpus_ACL.papers_with_errors:
#     if p.init_error.startswith("Too many candidate sentences") or p.init_error.startswith("Not enough candidate sentences"):
#         p.init_error = None
#         corpus_ACL.papers.append(p)

corpus_ACL.papers_with_errors = [p for p in [p_ for p_ in corpus_ACL.papers_with_errors if p_.init_error is not None] if not p.init_error.startswith("Too many candidate sentences") and not p.init_error.startswith("Not enough candidate sentences")]

for p in tqdm(corpus_ACL.papers):
    sections = [v["header"] for k, v in p.sections.items()]

    sections_d = reorder_section_hierarchy(p.sections)
    
    found_intro = False
    found_results = False
    found_conclusion = False

    sections_to_keep = set()

    # introduction
    for i, section in enumerate(sections):
        if "introduction" in section.lower():
            found_intro = True
            sections_to_keep.add((i, section))
            break

    # results
    for j, section in enumerate(sections):
        if j > i:
            for res in RES:
                if res in section.lower():
                    found_results = True


                    head = find_main_head(sections_d, section)
                    if head is None:
                        head = section

                    children = find_all_children(sections_d, head)

                    if children is None:
                        sections_to_keep.add((j, section))
                    else:
                        sections_to_keep.add((j, head))
                        for child in children:
                            sections_to_keep.add((j, child))


    # conclusion
    for k, section in enumerate(sections):
        for concl in CONCL:
            if concl in section.lower():
                found_conclusion = True
                
                head = find_main_head(sections_d, section)
                if head is None:
                    head = section

                children = find_all_children(sections_d, head)

                if children is None:
                    sections_to_keep.add((k, section))
                else:
                    sections_to_keep.add((k, head))
                    for child in children:
                        sections_to_keep.add((k, child))


    
    if found_intro and found_results and found_conclusion:
        all_IRC.append(list(sections_to_keep))
        n += 1
        p.content["candidate"] = p.content["section"].apply(lambda x: x in [s[1] for s in sections_to_keep])

    else:
        p.init_error = "Not following IRC structure"
        corpus_ACL.papers_with_errors.append(p)


print(f"{n} / {len(corpus_ACL.papers)} papers have an introduction, results and conclusion section")


100%|██████████| 60123/60123 [00:17<00:00, 3409.88it/s]

41633 / 60123 papers have an introduction, results and conclusion section





In [111]:
import numpy as np

nb_cands = [len(p.content[p.content["candidate"] == True]) for p in corpus_ACL.papers]

q1 = np.percentile(nb_cands, 25)
q3 = np.percentile(nb_cands, 75)
iqr = q3 - q1

min_cands = q1 - 1.5*iqr
max_cands = q3 + 1.5*iqr
print(q1, q3)
print(min_cands, max_cands)

for p in corpus_ACL.papers:
    nb_cands = len(p.content[p.content["candidate"] == True])

    if nb_cands < 20:
        p.init_error = "Not enough candidate sentences (less than 20)"
        corpus_ACL.papers_with_errors.append(p)

    elif nb_cands > max_cands:
        p.init_error = f"Too many candidate sentences (more than Q3 + 1.5*IQR = {max_cands})"
        corpus_ACL.papers_with_errors.append(p)  

no_errors = [p for p in corpus_ACL.papers if p.init_error is None]
corpus_ACL.papers = no_errors

corpus_ACL.describe()

62.0 132.0
-43.0 237.0
Corpus 'ACL' was filled with 73055 papers:
  - 40966 papers were successfully loaded
  - 32089 papers could not be loaded


In [112]:
with open("../data/acl/corpus_ACL_IRC.pkl", "wb") as f:
    pickle.dump(corpus_ACL, f)

In [113]:
n = 0
RES = ["result", "performance", "evaluation", "experiment"]
CONCL = ["analysis", "discussion", "limit", "ethic", "conclusion", "concluding", "future"]
all_IRC = []

for p in corpus_arxiv.papers_with_errors:
    if p.init_error.startswith("Too many candidate sentences") or p.init_error.startswith("Not enough candidate sentences"):
        p.init_error = None
        corpus_arxiv.papers.append(p)

for p in tqdm(corpus_arxiv.papers):
    sections = [v["header"] for k, v in p.sections.items()]

    sections_d = reorder_section_hierarchy(p.sections)
    
    found_intro = False
    found_results = False
    found_conclusion = False

    sections_to_keep = set()

    # introduction
    for i, section in enumerate(sections):
        if "introduction" in section.lower():
            found_intro = True
            sections_to_keep.add((i, section))
            break

    # results
    for j, section in enumerate(sections):
        if j > i:
            for res in RES:
                if res in section.lower():
                    found_results = True


                    head = find_main_head(sections_d, section)
                    if head is None:
                        head = section

                    children = find_all_children(sections_d, head)

                    if children is None:
                        sections_to_keep.add((j, section))
                    else:
                        sections_to_keep.add((j, head))
                        for child in children:
                            sections_to_keep.add((j, child))


    # conclusion
    for k, section in enumerate(sections):
        for concl in CONCL:
            if concl in section.lower():
                found_conclusion = True
                
                head = find_main_head(sections_d, section)
                if head is None:
                    head = section

                children = find_all_children(sections_d, head)

                if children is None:
                    sections_to_keep.add((k, section))
                else:
                    sections_to_keep.add((k, head))
                    for child in children:
                        sections_to_keep.add((k, child))


    
    if found_intro and found_results and found_conclusion:
        all_IRC.append(list(sections_to_keep))
        n += 1
        p.content["candidate"] = p.content["section"].apply(lambda x: x in [s[1] for s in sections_to_keep])

    else:
        p.init_error = "Not following IRC structure"
        corpus_arxiv.papers_with_errors.append(p)

print(f"{n} / {len(corpus_arxiv.papers)} papers have an introduction, results and conclusion section")


100%|██████████| 30361/30361 [00:15<00:00, 1974.30it/s]

23809 / 30361 papers have an introduction, results and conclusion section





In [115]:
without_errors = [p for p in corpus_arxiv.papers if p.init_error == None]
print(f"Number of papers without errors: {len(without_errors)}")
corpus_arxiv.papers = without_errors

corpus_arxiv.describe(error_verbose=True)

Number of papers without errors: 23809
Corpus 'arXiv' was filled with 35719 papers:
  - 23809 papers were successfully loaded
  - 11910 papers could not be loaded

Errors:
  - FileNotFoundError: XML file does not exist : 3471
  - Noisy data: wrong language (fr) : 48
  - Noisy data: wrong language (uk) : 5
  - Noisy data: wrong language (ru) : 9
  - Noisy data: wrong language (da) : 4
  - parsing error: not enough paper content found (<2 distinct sections) : 14
  - Noisy data: wrong language (hi) : 1
  - Noisy data: wrong language (de) : 8
  - Noisy data: wrong language (tr) : 12
  - Noisy data: wrong language (id) : 4
  - Noisy data: wrong language (pt) : 8
  - Noisy data: wrong language (pl) : 2
  - Noisy data: wrong language (es) : 12
  - Noisy data: wrong language (it) : 8
  - Noisy data: wrong language (zh-cn) : 1
  - Noisy data: wrong language (et) : 3
  - Noisy data: wrong language (tl) : 1
  - Noisy data: wrong language (hu) : 1
  - Noisy data: wrong language (ko) : 1
  - Noisy 

In [116]:
import numpy as np

nb_cands = [len(p.content[p.content["candidate"] == True]) for p in corpus_arxiv.papers]

q1 = np.percentile(nb_cands, 25)
q3 = np.percentile(nb_cands, 75)
iqr = q3 - q1

min_cands = q1 - 1.5*iqr
max_cands = q3 + 1.5*iqr
print(q1, q3)
print(min_cands, max_cands)

for p in corpus_arxiv.papers:
    nb_cands = len(p.content[p.content["candidate"] == True])

    if nb_cands < 20:
        p.init_error = "Not enough candidate sentences (less than 20)"
        corpus_arxiv.papers_with_errors.append(p)

    elif nb_cands > max_cands:
        p.init_error = f"Too many candidate sentences (more than Q3 + 1.5*IQR = {max_cands})"
        corpus_arxiv.papers_with_errors.append(p)  

no_errors = [p for p in corpus_arxiv.papers if p.init_error is None]
corpus_arxiv.papers = no_errors

corpus_arxiv.describe()

70.0 144.0
-41.0 255.0
Corpus 'arXiv' was filled with 35719 papers:
  - 22826 papers were successfully loaded
  - 12893 papers could not be loaded


In [117]:
with open("../data/arxiv/corpus_arxiv_IRC.pkl", "wb") as f:
    pickle.dump(corpus_arxiv, f)

# Inspect IRC

In [3]:
# # Load the ACL and arXiv corpora
# with open("../data/acl/corpus_ACL_IRC.pkl", "rb") as f:
#     corpus_ACL = pickle.load(f)

# with open("../data/arxiv/corpus_arxiv_IRC.pkl", "rb") as f:
#     corpus_arxiv = pickle.load(f)


In [10]:
# for corpus in [corpus_ACL, corpus_arxiv]:
#     for p in tqdm(corpus.papers):
#         p.content["candidate"] = p.content.apply(lambda x: True if x["section"] == "abstract" else x["candidate"], axis = 1)

# cdb = ClaimDB(corpora = [corpus_ACL, corpus_arxiv])

100%|██████████| 40966/40966 [00:48<00:00, 840.49it/s]
100%|██████████| 22826/22826 [00:34<00:00, 658.43it/s]


In [4]:
with open("../data/cdb_IRC.pkl", "rb") as f:
    cdb = pickle.load(f)

corpus_ACL, corpus_arxiv = cdb.corpora

In [3]:
corpus_ACL.papers[0].content

Unnamed: 0,id,sentence,section,candidate
0,0,There is a need to measure word similarity whe...,abstract,True
1,1,"Usually, measures of similarity between two wo...",abstract,True
2,2,The taxonomy approaches are more or less seman...,abstract,True
3,3,"However, in real applications, both semantic a...",abstract,True
4,4,Word similarity based on context vectors is a ...,abstract,True
...,...,...,...,...
184,184,c) Syntactic and semantic similarity is balanc...,Conclusions,True
185,185,The performance of our method might have been ...,Conclusions,True
186,186,"Further more, Cilin was published a long time ...",Conclusions,True
187,187,"However, our experimental results are encourag...",Conclusions,True


In [9]:
corpus_ACL_0 = [p for p in corpus_ACL.papers if p.year < 1994]
corpus_ACL_1 = [p for p in corpus_ACL.papers if p.year >= 1994 and p.year < 2004]
corpus_ACL_2 = [p for p in corpus_ACL.papers if p.year >= 2004 and p.year < 2014]
corpus_ACL_3 = [p for p in corpus_ACL.papers if p.year >= 2014]

corpus_ACL_by_year_slices = [corpus_ACL_0, corpus_ACL_1, corpus_ACL_2, corpus_ACL_3]

corpus_arxiv_0 = [p for p in corpus_arxiv.papers if p.year < 1994]
corpus_arxiv_1 = [p for p in corpus_arxiv.papers if p.year >= 1994 and p.year < 2004]
corpus_arxiv_2 = [p for p in corpus_arxiv.papers if p.year >= 2004 and p.year < 2014]
corpus_arxiv_3 = [p for p in corpus_arxiv.papers if p.year >= 2014]

corpus_arxiv_by_year_slices = [corpus_arxiv_0, corpus_arxiv_1, corpus_arxiv_2, corpus_arxiv_3]

acl_total = 0
arxiv_total = 0

for c_acl in corpus_ACL_by_year_slices:
    n = len(c_acl)
    acl_total += n
    print(n)

print("total:", acl_total)

for c_arxiv in corpus_arxiv_by_year_slices:
    n = len(c_arxiv)
    arxiv_total += n
    print(n)

print("total:", arxiv_total)

281
1971
11285
27429
total: 40966
0
101
200
22525
total: 22826


In [18]:
with open("../data/annotated_articles.json", "r") as f:
    d = json.load(f)

ACL_annotated = []
arxiv_annotated = []

for v in d.keys():
    if v != "v5":
        for paper in d[v]:
            if paper[0] == "ACL":
                ACL_annotated.append(paper[1])
            else:
                arxiv_annotated.append(paper[1])

print(ACL_annotated)
print(len(ACL_annotated))
print(arxiv_annotated)
print(len(arxiv_annotated))

['2020.signlang-1.20', 'W17-4709', 'N19-1358', 'Y15-1047', 'P18-1048', 'W17-5513', '2022.naacl-main.19', '2022.in2writing-1.4', 'W18-3406', 'H89-1053', 'D15-1013', 'P97-1030', 'Y11-1012', 'W01-1826', 'D19-1210', 'W98-1507', 'W01-0812', 'W94-0305', 'P90-1033', 'A92-1020', 'P11-1048', 'P11-4022', '2020.emnlp-main.554', 'P13-1099', 'E87-1013', 'W03-0425', '2020.acl-main.413', 'W18-0527', 'C92-1030', 'W16-5818', 'W99-0402', 'W13-4420', '2001.mtsummit-papers.31', '1993.tmi-1.6', 'W00-1415', 'C98-2177', '2007.mtsummit-papers.40', 'N18-1107', 'Y03-1032', 'H89-2041', 'P19-1654', 'P93-1023', 'W08-1302', 'P11-1029', 'C90-1007', 'C92-2087', 'H89-2017', 'P91-1024', '2020.acl-demos.20', 'W00-0505', 'P07-2029', 'Q16-1037', 'I13-1029', 'J92-4001', 'P07-1088', 'J97-2004', 'W09-0809', 'E93-1027', 'E09-1027', 'W99-0609', 'J98-3005', '2020.lrec-1.826', 'W12-4402', 'C14-1028', 'Y09-2035', 'H93-1064', '2020.loresmt-1.4', 'D18-1087']
68
['2103.14302', '1708.01009', '1611.08765', '1605.05172', '2012.04584', 

In [19]:
ACL_ids = [p.id for p in corpus_ACL.papers]

"P18-1048" in ACL_ids

True

In [7]:
ACL_ids[0]

'O02-2002'

In [20]:
for c_acl in corpus_ACL_by_year_slices:
    c_acl = [p for p in c_acl if p.id not in ACL_annotated]

for c_arx in corpus_arxiv_by_year_slices:
    c_arx = [p for p in c_arx if p.id not in arxiv_annotated]

for c_acl in corpus_ACL_by_year_slices:
    n = len(c_acl)
    acl_total += n
    print(n)

print("total:", acl_total)

for c_arxiv in corpus_arxiv_by_year_slices:
    n = len(c_arxiv)
    arxiv_total += n
    print(n)

print("total:", arxiv_total)

281
1971
11285
27429
total: 122898
0
101
200
22525
total: 68478


In [21]:
import random
import numpy as np

random_papers = []
random.seed(0)

for c_acl in corpus_ACL_by_year_slices:
    random_papers.extend(np.random.choice(c_acl, 15))

for c_arx in corpus_arxiv_by_year_slices[1:]:
    random_papers.extend(np.random.choice(c_arx, 20))

random.shuffle(random_papers)
print(len(random_papers))

120


In [22]:
random_papers_ids = [[p.corpus.name, p.id] for p in random_papers]
print(random_papers_ids)

d["v5"] = random_papers_ids
print(d)

with open("../data/annotated_articles.json", "w") as f:
    json.dump(d, f)

[['ACL', 'W19-8612'], ['arXiv', '2311.15402'], ['arXiv', '1302.1572'], ['ACL', 'E87-1037'], ['arXiv', '2205.10593'], ['ACL', 'W19-2609'], ['arXiv', 'cmp-lg/9608003'], ['ACL', 'W01-0506'], ['ACL', 'N10-1040'], ['ACL', 'P02-1043'], ['ACL', 'W16-2379'], ['arXiv', 'cs/0105005'], ['ACL', 'J01-2001'], ['arXiv', 'cmp-lg/9711004'], ['arXiv', 'cs/0307055'], ['ACL', 'J03-1001'], ['arXiv', '1302.2131'], ['ACL', 'J00-4001'], ['ACL', 'C90-2054'], ['arXiv', 'cmp-lg/9505021'], ['ACL', 'C86-1067'], ['arXiv', '2209.10505'], ['ACL', 'R13-1019'], ['arXiv', '1307.0261'], ['arXiv', 'cs/0405044'], ['ACL', 'C10-2093'], ['ACL', 'P19-1570'], ['arXiv', '2204.10181'], ['arXiv', '1212.2477'], ['ACL', 'W11-4614'], ['ACL', 'P91-1038'], ['ACL', 'N03-3005'], ['ACL', 'P15-1021'], ['ACL', 'W16-2373'], ['arXiv', '1807.04978'], ['arXiv', '1301.3614'], ['ACL', 'A88-1027'], ['ACL', 'D14-1169'], ['ACL', 'Y03-1009'], ['arXiv', '2012.08695'], ['arXiv', 'cmp-lg/9506024'], ['arXiv', '1310.1975'], ['ACL', '2006.amta-papers.16'],

In [23]:
random_sentences_ids = []
coord2idx = {v:k for k,v in cdb.idx_map.items()}

for rp in random_papers:

    sentences_ids = rp.content[rp.content["candidate"] == True]["id"].tolist()

    random_sentences_ids.extend([coord2idx[(rp.corpus.name, rp.id, i)] for i in sentences_ids])

df = cdb.candidates.loc[random_sentences_ids]
print(df.shape)

(12603, 7)


In [25]:
for clem in d["v4"]:
    if clem in d["v5"]:
        print(clem)

['arXiv', '1010.2384']
['arXiv', 'cmp-lg/9605029']


In [26]:
def reorder_section_hierarchy(sections):

    # initialize section hierarchy with abstract
    d = {0: {"header" : "abstract", "subsections" : {}}}
    i = 1

    # add the other sections
    for _, s in sections.items():
        
        if s["head_n"] == None:

            if s["n"] != None or s["header"] != "unidentified-section":
                d[i] = {"header" : s["header"],
                        "subsections": {}}
                i += 1
            
            else:
                j = len(d[i-1]["subsections"])

                if j > 0:
                    k = len(d[i-1]["subsections"][j-1]["subsections"])
                    if k > 0:
                        d[i-1]["subsections"][j-1]["subsections"][k] = {"header" : s["header"], "subsections" : {}}
                    else:
                        d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
                else:
                    d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
        
        else:
            j = len(d[i-1]["subsections"])
            d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}

    return d

import random
import numpy as np

def get_random_candidates_subset(cdb, ACL_count, arxiv_count, seed = 0, random_papers = []):
    
    coord2idx = {v:k for k,v in cdb.idx_map.items()}

    if random_papers == []:
        ## TODO: check which papers have already been annotated
        random.seed(seed)
        random_ACL = np.random.choice(corpus_ACL.papers, ACL_count, replace = False)
        random_arxiv = np.random.choice(corpus_arxiv.papers, arxiv_count, replace = False)
        
        random_papers = list(random_ACL)
        random_papers.extend(list(random_arxiv))

        random.shuffle(random_papers)

    random_idx = []

    for rp in random_papers:

        cand_ids = rp.content[rp.content["candidate"] == True]["id"].tolist()

        random_idx.extend([coord2idx[(rp.corpus.name, rp.id, i)] for i in cand_ids])

    return cdb.candidates.loc[random_idx]

def get_feedback_on_article(p, n=90):
    s = "="*n + "\n" + "Annotator feedback" + "\n" + "="*n + "\n\n"
    s += f"You just finished annotating the article entitled <<{p.title}>>. Please answer following questions: \n\n"

    s1 = "1. Do you think that this article was difficult to understand, in a way that may have affected the quality of your annotations, because of its technicity / because it handles subjects you are unfamiliar with ?\n\n"
    s1 += "Please add any label of your choice if your answer is yes."

    s2 = "2. Do you think that this article was difficult to understand, in a way that may have affected the quality of your annotations, because of its writing style / structure / parsing errors ?\n\n"
    s2 += "Please add any label of your choice if your answer is yes."

    s3 = "3. Did you know / read the article before this annotation task, or do you think you have identified its authors ?\n\n"
    s3 += "Please add any label of your choice if your answer is yes."

    return [s + s1, s + s2, s + s3]

def find_main_head(d, section):
    
    section_n = 0
    head = section
    head_n = 0

    for i, sec in d.items():

        if section == sec["header"]:
            head_n = i
            section_n = i
            break

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if section in subsections_names:
           head = sec["header"]
           head_n = i
           section_n = subsections_names.index(section)
           break

        else:
            # check in the subsections
            to_explore = list(subsections.items())
            ids = [[k] for k, v in subsections.items()]
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                id_ = ids.pop(0)

                subsections_names = [v["header"] for k, v in v["subsections"].items()]

                if section in subsections_names:
                    head = sec["header"]
                    head_n = i
                    section_n = id_
                else:
                    to_explore.extend(v["subsections"].items())
                    ids.extend([[k] + [k_] for k_, v in v["subsections"].items()])


    return section, section_n, head, head_n

def find_all_children(d, section):
    
    children = [section]

    for i, sec in d.items():

        if sec["header"] == section:

            while "subsections" in sec.keys():
                ss = sec["subsections"]
                children.extend([v["header"] for k, v in ss.items()])
                sec = ss

    return children


def prepare_for_doccano_format(cdb, df:pd.DataFrame)-> pd.DataFrame:
    """A function to prepare a dataframe of sentences for Doccano format
    - df : a pandas DataFrame with columns {corpus, paper_id, sentence_id, sentence, section}"""

    data = []
    coord2idx = {v:k for k,v in cdb.idx_map.items()}


    for i, row in df.iterrows():
        c = cdb.get_corpus_by_name(row["corpus"])
        p = c.get_paper_by_id(row["paper_id"])

        sections_d = reorder_section_hierarchy(p.sections)
        main_sections_str = ""
        for i, sec in enumerate(sections_d.values()):
            main_sections_str += str(i) + ". " + sec["header"] + "\n"

        sec, sec_n, head, head_n = find_main_head(sections_d, row["section"])
        

        idx = coord2idx[(c.name, p.id, row["sentence_id"])]
        text = row["sentence"]

        
        prev_sent_id = int(row["sentence_id"]) - 1
        next_sent_id = int(row["sentence_id"]) + 1

        # get previous sentence
        if prev_sent_id in p.content["id"].values:
            prev_doc = p.content.loc[prev_sent_id]
            prev_text = prev_doc["sentence"]
            prev_sec = prev_doc["section"]

        else:
            prev_text = ""
            prev_sec = ""

        # get next sentence
        if next_sent_id in p.content["id"].values:
            next_doc = p.content.loc[next_sent_id]
            next_text = next_doc["sentence"]
            next_sec = next_doc["section"]
        
        else:
            next_text = ""
            next_sec = ""


        data.append({
            "text": text,
            "doc_id": idx,
            "corpus": p.corpus.name,
            "paper_title" : p.title,
            "paper_id" : p.id,
            "paper_structure": main_sections_str,
            "year": p.year,
            "section": sec,
            "section_n" : sec_n,
            "main_head" : head,
            "main_head_n" : head_n,
            "prev_text": prev_text,
            "prev_section": prev_sec,
            "next_text": next_text,
            "next_section": next_sec,
            "label": ""
        })

    df_doccano = pd.DataFrame(data)

    return df_doccano

In [27]:
df_doccano = prepare_for_doccano_format(cdb, df)
df_doccano.head()

Unnamed: 0,text,doc_id,corpus,paper_title,paper_id,paper_structure,year,section,section_n,main_head,main_head_n,prev_text,prev_section,next_text,next_section,label
0,Multiple headlines of a newspaper article have...,2242712,ACL,Multiple News Headlines Generation using Page ...,W19-8612,0. abstract\n1. Introduction\n2. Multiple News...,2019,abstract,0,abstract,0,,,A headline depends on the content and intent o...,abstract,
1,A headline depends on the content and intent o...,2242713,ACL,Multiple News Headlines Generation using Page ...,W19-8612,0. abstract\n1. Introduction\n2. Multiple News...,2019,abstract,0,abstract,0,Multiple headlines of a newspaper article have...,abstract,While a single headline expresses the whole co...,abstract,
2,While a single headline expresses the whole co...,2242714,ACL,Multiple News Headlines Generation using Page ...,W19-8612,0. abstract\n1. Introduction\n2. Multiple News...,2019,abstract,0,abstract,0,A headline depends on the content and intent o...,abstract,We suggest an automatic generation method of s...,abstract,
3,We suggest an automatic generation method of s...,2242715,ACL,Multiple News Headlines Generation using Page ...,W19-8612,0. abstract\n1. Introduction\n2. Multiple News...,2019,abstract,0,abstract,0,While a single headline expresses the whole co...,abstract,Our generation method is based on the Pointer-...,abstract,
4,Our generation method is based on the Pointer-...,2242716,ACL,Multiple News Headlines Generation using Page ...,W19-8612,0. abstract\n1. Introduction\n2. Multiple News...,2019,abstract,0,abstract,0,We suggest an automatic generation method of s...,abstract,We conducted automatic evaluations for generat...,abstract,


In [28]:
text2 = []
import re

current_head = ""
current_section = ""
h = 0
s = 0
total_head_len = 0
total_section_len = 0

for i, row in df_doccano.iterrows():
    n = 90

    sec = row["section"] 
    sec_n = row["section_n"]
    head = row["main_head"]
    head_n = row["main_head_n"]

    if row["corpus"] == "ACL":
            p = corpus_ACL.get_paper_by_id(row["paper_id"])
    else:
        p = corpus_arxiv.get_paper_by_id(row["paper_id"])

    if head != current_head:
        current_head = head

        cands = p.content[p.content["candidate"] == True]
        children = find_all_children(reorder_section_hierarchy(p.sections), head)
        total_head_len = len(cands[cands["section"].isin(children)])

        h = 1

    else:
        h += 1

    if sec != current_section:
        current_section = sec
        
        if row["corpus"] == "ACL":
            p = corpus_ACL.get_paper_by_id(row["paper_id"])
        else:
            p = corpus_arxiv.get_paper_by_id(row["paper_id"])

        cands = p.content[p.content["candidate"] == True]
        children = find_all_children(reorder_section_hierarchy(p.sections), sec)
        total_section_len = len(cands[cands["section"].isin(children)])

        s = 1

    else:
        s += 1


    text = p.title.replace("\n", "").replace("\t", "") + "\n"
    text += "=" * n + "\n" + str(head_n) + ". " + head + " -- " + str(h) + "/" + str(total_head_len) + "\n" + "=" * n + "\n"

    if sec != head:
        text += str(sec_n) + ". " + sec + " -- " + str(s) + "/" + str(total_section_len) +"\n" + "-" * n + "\n" 

    text+= row["text"]

    text2.append(text)

df_doccano["text"] = text2

In [29]:
# insert feedback questions in the dataset
for rp in random_papers:
    last_row= df_doccano[df_doccano["paper_title"] == rp.title].iloc[-1]
    last_index = float(last_row.name)
    values = list(last_row.values)

    fb = get_feedback_on_article(rp)
    for fb_q, h in zip(fb, [0.25, 0.5, 0.75]):
        values[0] = fb_q
        for i in range(7, len(values)):
            values[i] = ""
        df_doccano.loc[last_index + h] = values

df_doccano = df_doccano.sort_index().reset_index(drop = True)

In [30]:
df_doccano[df_doccano["paper_title"].str.startswith("Latent")]
# df_doccano.at[0, "text"]

Unnamed: 0,text,doc_id,corpus,paper_title,paper_id,paper_structure,year,section,section_n,main_head,main_head_n,prev_text,prev_section,next_text,next_section,label


In [31]:
df_doccano = df_doccano.drop(columns = ["corpus", "paper_id", "section",  "section_n", "main_head", "main_head_n"])
df_doccano

Unnamed: 0,text,doc_id,paper_title,paper_structure,year,prev_text,prev_section,next_text,next_section,label
0,Multiple News Headlines Generation using Page ...,2242712,Multiple News Headlines Generation using Page ...,0. abstract\n1. Introduction\n2. Multiple News...,2019,,,A headline depends on the content and intent o...,abstract,
1,Multiple News Headlines Generation using Page ...,2242713,Multiple News Headlines Generation using Page ...,0. abstract\n1. Introduction\n2. Multiple News...,2019,Multiple headlines of a newspaper article have...,abstract,While a single headline expresses the whole co...,abstract,
2,Multiple News Headlines Generation using Page ...,2242714,Multiple News Headlines Generation using Page ...,0. abstract\n1. Introduction\n2. Multiple News...,2019,A headline depends on the content and intent o...,abstract,We suggest an automatic generation method of s...,abstract,
3,Multiple News Headlines Generation using Page ...,2242715,Multiple News Headlines Generation using Page ...,0. abstract\n1. Introduction\n2. Multiple News...,2019,While a single headline expresses the whole co...,abstract,Our generation method is based on the Pointer-...,abstract,
4,Multiple News Headlines Generation using Page ...,2242716,Multiple News Headlines Generation using Page ...,0. abstract\n1. Introduction\n2. Multiple News...,2019,We suggest an automatic generation method of s...,abstract,We conducted automatic evaluations for generat...,abstract,
...,...,...,...,...,...,...,...,...,...,...
12952,Comparison of Syntactic Parsers on Biomedical ...,4678499,Comparison of Syntactic Parsers on Biomedical ...,0. abstract\n1. Introduction and related work\...,2020,"In this light, SNN is attractive as it shows t...",Conclusions,Our experiment with self-training Google parse...,Conclusions,
12953,Comparison of Syntactic Parsers on Biomedical ...,4678500,Comparison of Syntactic Parsers on Biomedical ...,0. abstract\n1. Introduction and related work\...,2020,Mate is the one with the lowest time required ...,Conclusions,,,
12954,==============================================...,4678500,Comparison of Syntactic Parsers on Biomedical ...,0. abstract\n1. Introduction and related work\...,2020,,,,,
12955,==============================================...,4678500,Comparison of Syntactic Parsers on Biomedical ...,0. abstract\n1. Introduction and related work\...,2020,,,,,


In [32]:
df_doccano.to_csv("to-annotate-Fanny-120.csv", index = False)