In [1]:
import numpy as np
import sys
sys.path.insert(0, "../") 
from utils.ClaimDB import ClaimDB
from utils.Paper import Paper
import pickle
import json
import pandas as pd
from tqdm import tqdm

In [2]:
with open("../data/cdb_IRC.pkl", "rb") as f:
    cdb = pickle.load(f)

corpus_ACL, corpus_arxiv = cdb.corpora

In [3]:
list(set([p.init_error for p in corpus_ACL.papers_with_errors]))

['Noisy data: wrong language (so)',
 'Noisy data: wrong language (it)',
 'Noisy data: wrong language (pl)',
 'Not following IRC structure',
 'Noisy data: wrong language (hu)',
 'Too many candidate sentences (more than Q3 + 1.5*IQR = 237.0)',
 'Noisy data: wrong language (sl)',
 'Too old (before 1986)',
 'Noisy data: wrong language (nl)',
 'Noisy data: wrong language (sk)',
 'Noisy data: wrong language (vi)',
 'Noisy data: wrong language (sw)',
 'Parsing error: no abstract found',
 'Noisy data: wrong language (bn)',
 'Noisy data: wrong language (id)',
 'Noisy data: wrong language (zh-tw)',
 'Noisy data: wrong language (de)',
 'Noisy data: wrong language (hr)',
 'Noisy data: wrong language (ja)',
 'Noisy data: wrong language (pt)',
 'Noisy data: wrong language (no)',
 'Noisy data: wrong language (hi)',
 'Noisy data: wrong language (ro)',
 'Noisy data: wrong language (lt)',
 'Noisy data: wrong language (tr)',
 'Noisy data: wrong language (sv)',
 'Noisy data: wrong language (sq)',
 'Noisy 

In [4]:
papers_ACL = [p for p in corpus_ACL.papers]
papers_ACL.extend([p for p in corpus_ACL.papers_with_errors if p.init_error == "Not following IRC structure"])
print(len(papers_ACL))

papers_arxiv = [p for p in corpus_arxiv.papers]
papers_arxiv.extend([p for p in corpus_arxiv.papers_with_errors if p.init_error == "Not following IRC structure"])
print(len(papers_arxiv))

57954
29952


In [9]:
import re
from xml.etree import ElementTree as ET

def parse_sections(xml_path):

    with open(xml_path, "r", encoding = "utf-8") as f:
        xml = f.read()

    # Remove all ref tags but keep their content (<ref>content</ref> -> content)
    ref_pattern = re.compile(r"<ref.*?>(.*?)</ref>")
    content = re.sub(ref_pattern, r"\1", xml)

    root = ET.fromstring(content)
    sections = {}
    nb_sections = 0

    for child in root[1][0]: # root.text.body
            # check the <div> (sections identified by grobid) because they indicate the sections of the paper (but also figures or notes)
            if "div" in child.tag:
                if len(child) > 0: # if not empty
                    header, n, head_n = child[0].text, None, None

                    # we do not want to keep the figures and tables
                    if not header.lower().startswith("figure") and not header.lower().startswith("table"):

                        # extract the textual content of the section
                        if len(child) > 0 :
                            text = "\n".join([Paper.clean_text(c.text) for c in child[1:]])
                            
                            # in case the section header is too long, we consider it as part of the content
                            if not Paper.is_acceptable_section_header(header):
                                if Paper.get_alpha_numerical_ratio(header) > 0.5:

                                    header = "unidentified-section"


                        # check if the section if numbered
                        if "n" in list(child[0].attrib):
                            n = child[0].attrib["n"]
                            # check if this section is actually a subsection
                            head_n = re.search(re.compile("(.*)\.\d"), n)
                            if head_n:
                                head_n = head_n.group(1)
                        
                        # update the sections oranisation
                        sections[nb_sections] = {"n": n, "header": header, "head_n": head_n}
                        nb_sections += 1

    return sections

def reorder_section_hierarchy(sections):

    # initialize section hierarchy with abstract
    d = {0: {"header" : "abstract", "subsections" : {}}}
    i = 1

    # add the other sections
    for _, s in sections.items():
        
        if s["head_n"] == None:

            if s["n"] != None or s["header"] != "unidentified-section":
                d[i] = {"header" : s["header"],
                        "n" : s["n"],
                        "subsections": {}}
                i += 1
            
            else:
                j = len(d[i-1]["subsections"])

                if j > 0:
                    k = len(d[i-1]["subsections"][j-1]["subsections"])
                    if k > 0:
                        d[i-1]["subsections"][j-1]["subsections"][k] = {"header" : s["header"], "n": s["n"], "subsections" : {}}
                    else:
                        d[i-1]["subsections"][j] = {"header" : s["header"], "n": s["n"], "subsections" : {}}
                else:
                    d[i-1]["subsections"][j] = {"header" : s["header"], "n": s["n"], "subsections" : {}}
        
        else:
            j = len(d[i-1]["subsections"])
            d[i-1]["subsections"][j] = {"header" : s["header"], "n": s["n"], "subsections" : {}}

    return d

In [6]:
p = [p for p in papers_arxiv if "Russian NLU" in p.title][0]
p.sections

{0: {'n': '1', 'header': 'Introduction', 'head_n': None},
 1: {'n': '2', 'header': 'Related work', 'head_n': None},
 2: {'n': '2.1', 'header': 'Unsupervised sentence embeddings', 'head_n': '2'},
 3: {'n': '2.2', 'header': 'Supervised sentence embeddings', 'head_n': '2'},
 4: {'n': '2.3', 'header': 'Language models', 'head_n': '2'},
 5: {'n': '2.4',
  'header': 'Evaluation of sentence embedding models',
  'head_n': '2'},
 6: {'n': '3.1',
  'header': 'Multiple Choice Question Answering (MCQA)',
  'head_n': '3'},
 7: {'n': '3.2',
  'header': 'Multiple choice next sentence prediction (NSP)',
  'head_n': '3'},
 8: {'n': '3.3', 'header': 'Paraphrase identification (PI)', 'head_n': '3'},
 9: {'n': '3.4', 'header': 'Dataset statistics', 'head_n': '3'},
 10: {'n': '4', 'header': 'Methods', 'head_n': None},
 11: {'n': '4.1', 'header': 'Unsupervised approach', 'head_n': '4'},
 12: {'n': '4.2', 'header': 'Supervised approach', 'head_n': '4'},
 13: {'n': '5.2', 'header': 'ELMo', 'head_n': '5'},
 14

In [10]:
reorder_section_hierarchy(parse_sections(p.xml_path))

{0: {'header': 'abstract', 'subsections': {}},
 1: {'header': 'Introduction', 'n': '1', 'subsections': {}},
 2: {'header': 'Related work',
  'n': '2',
  'subsections': {0: {'header': 'Unsupervised sentence embeddings',
    'n': '2.1',
    'subsections': {}},
   1: {'header': 'Supervised sentence embeddings',
    'n': '2.2',
    'subsections': {}},
   2: {'header': 'Language models', 'n': '2.3', 'subsections': {}},
   3: {'header': 'Evaluation of sentence embedding models',
    'n': '2.4',
    'subsections': {}}}},
 3: {'header': 'Datasets',
  'n': '3',
  'subsections': {0: {'header': 'Multiple Choice Question Answering (MCQA)',
    'n': '3.1',
    'subsections': {}},
   1: {'header': 'Multiple choice next sentence prediction (NSP)',
    'n': '3.2',
    'subsections': {}},
   2: {'header': 'Paraphrase identification (PI)',
    'n': '3.3',
    'subsections': {}},
   3: {'header': 'Dataset statistics', 'n': '3.4', 'subsections': {}}}},
 4: {'header': 'Methods',
  'n': '4',
  'subsections'

In [11]:
parse_sections(p.xml_path)

{0: {'n': '1', 'header': 'Introduction', 'head_n': None},
 1: {'n': '2', 'header': 'Related work', 'head_n': None},
 2: {'n': '2.1', 'header': 'Unsupervised sentence embeddings', 'head_n': '2'},
 3: {'n': '2.2', 'header': 'Supervised sentence embeddings', 'head_n': '2'},
 4: {'n': '2.3', 'header': 'Language models', 'head_n': '2'},
 5: {'n': '2.4',
  'header': 'Evaluation of sentence embedding models',
  'head_n': '2'},
 6: {'n': '3', 'header': 'Datasets', 'head_n': None},
 7: {'n': '3.1',
  'header': 'Multiple Choice Question Answering (MCQA)',
  'head_n': '3'},
 8: {'n': '3.2',
  'header': 'Multiple choice next sentence prediction (NSP)',
  'head_n': '3'},
 9: {'n': '3.3', 'header': 'Paraphrase identification (PI)', 'head_n': '3'},
 10: {'n': '3.4', 'header': 'Dataset statistics', 'head_n': '3'},
 11: {'n': '4', 'header': 'Methods', 'head_n': None},
 12: {'n': '4.1', 'header': 'Unsupervised approach', 'head_n': '4'},
 13: {'n': '4.2', 'header': 'Supervised approach', 'head_n': '4'},


In [12]:
for p in tqdm(papers_ACL):
    try:
        p.sections = parse_sections(p.xml_path)
        p.sections_hierarchy = reorder_section_hierarchy(p.sections)
    except:
        print(p.id)

  0%|          | 0/57954 [00:00<?, ?it/s]

100%|██████████| 57954/57954 [14:12<00:00, 67.94it/s]  


In [14]:
old = [p for p in corpus_ACL.papers_with_errors if p.init_error.startswith("Too old")]
print(len(old))

602


In [15]:
for p in tqdm(old):
    try:
        p.init_error = None
        p.sections = parse_sections(p.xml_path)
        p.sections_hierarchy = reorder_section_hierarchy(p.sections)
    except:
        print(p.id)

papers_ACL.extend(old)
print(len(papers_ACL))

100%|██████████| 602/602 [00:07<00:00, 79.21it/s]


58556


In [16]:
for p in tqdm(papers_arxiv):
    try:
        p.sections = parse_sections(p.xml_path)
        p.sections_hierarchy = reorder_section_hierarchy(p.sections)
    except:
        print(p.id)

100%|██████████| 29952/29952 [08:10<00:00, 61.07it/s]  


In [17]:
with open("../data/annotated_articles.json", "r") as f:
    anno_idx = json.load(f)

for k,v in anno_idx.items():
    print(k, ": ", len(v))

print(len(corpus_ACL.papers))
print(len(corpus_arxiv.papers))

v1 :  10
v2 :  2
v3 :  2
v4 :  120
v5 :  120
40966
22826


In [18]:
def find_main_head(d, section, direct = False):
    
    for i, sec in d.items():

        if i == 0:
            continue

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if section in subsections_names:
            return sec["header"]

        else:
            # check in the subsections
            to_explore = list(subsections.items())
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                if section in [v["header"] for k, v in v["subsections"].items()]:
                    return v["header"]
                else:
                    to_explore.extend(v["subsections"].items())

    return None

def find_all_children(d, section):
    
    for i, sec in d.items():

        if i == 0:
            continue

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if sec["header"] == section:
            return subsections_names
        
        else:
            # check in the subsections
            to_explore = list(subsections.items())
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                if v["header"] == section:
                    return [v["header"] for k, v in v["subsections"].items()]
                else:
                    to_explore.extend(v["subsections"].items())

    return None

In [20]:
list(set([p.init_error for p in papers_ACL]))

[None, 'Not following IRC structure']

In [29]:
n = 0
RES = ["result", "performance", "evaluation", "experiment"]
CONCL = ["analysis", "discussion", "limit", "ethic", "conclusion", "concluding", "future"]
all_IRC = []

for p in tqdm(papers_ACL):

    sections = [s["header"] for k, s in p.sections.items()]
    sections_d = p.sections_hierarchy
    
    found_intro = False
    found_results = False
    found_conclusion = False

    sections_to_keep = set()

    # introduction
    for i, section in enumerate(sections):
        if "introduction" in section.lower():
            found_intro = True
            sections_to_keep.add((i, section))
            break

    # results
    for j, section in enumerate(sections):
        if j > i:
            for res in RES:
                if res in section.lower():
                    found_results = True


                    head = find_main_head(sections_d, section)
                    if head is None:
                        head = section

                    children = find_all_children(sections_d, head)

                    if children is None:
                        sections_to_keep.add((j, section))
                    else:
                        sections_to_keep.add((j, head))
                        for child in children:
                            sections_to_keep.add((j, child))


    # conclusion
    for k, section in enumerate(sections):
        for concl in CONCL:
            if concl in section.lower():
                found_conclusion = True
                
                head = find_main_head(sections_d, section)
                if head is None:
                    head = section

                children = find_all_children(sections_d, head)

                if children is None:
                    sections_to_keep.add((k, section))
                else:
                    sections_to_keep.add((k, head))
                    for child in children:
                        sections_to_keep.add((k, child))


    
    if found_intro and found_results and found_conclusion:
        all_IRC.append(list(sections_to_keep))
        n += 1
        p.init_error = None
        p.content["candidate"] = p.content["section"].apply(lambda x: x in [s[1] for s in sections_to_keep])

    else:
        p.init_error = "Not following IRC structure"

print(f"{n} / {len(papers_ACL)} papers have an introduction, results and conclusion section")

100%|██████████| 58556/58556 [00:15<00:00, 3712.45it/s] 

42759 / 58556 papers have an introduction, results and conclusion section





In [30]:
n = 0

for p in tqdm(papers_arxiv):
    
    sections = [s["header"] for k, s in p.sections.items()]
    sections_d = p.sections_hierarchy
    
    found_intro = False
    found_results = False
    found_conclusion = False

    sections_to_keep = set()

    # introduction
    for i, section in enumerate(sections):
        if "introduction" in section.lower():
            found_intro = True
            sections_to_keep.add((i, section))
            break

    # results
    for j, section in enumerate(sections):
        if j > i:
            for res in RES:
                if res in section.lower():
                    found_results = True


                    head = find_main_head(sections_d, section)
                    if head is None:
                        head = section

                    children = find_all_children(sections_d, head)

                    if children is None:
                        sections_to_keep.add((j, section))
                    else:
                        sections_to_keep.add((j, head))
                        for child in children:
                            sections_to_keep.add((j, child))


    # conclusion
    for k, section in enumerate(sections):
        for concl in CONCL:
            if concl in section.lower():
                found_conclusion = True
                
                head = find_main_head(sections_d, section)
                if head is None:
                    head = section

                children = find_all_children(sections_d, head)

                if children is None:
                    sections_to_keep.add((k, section))
                else:
                    sections_to_keep.add((k, head))
                    for child in children:
                        sections_to_keep.add((k, child))


    
    if found_intro and found_results and found_conclusion:
        all_IRC.append(list(sections_to_keep))
        n += 1
        p.init_error = None
        p.content["candidate"] = p.content["section"].apply(lambda x: x in [s[1] for s in sections_to_keep])

    else:
        p.init_error = "Not following IRC structure"

print(f"{n} / {len(papers_arxiv)} papers have an introduction, results and conclusion section")

100%|██████████| 29952/29952 [00:13<00:00, 2283.85it/s]

24117 / 29952 papers have an introduction, results and conclusion section





In [33]:
corpus_ACL.papers_with_errors = [p for p in corpus_ACL.papers_with_errors if p.init_error is not None]
print(len(corpus_ACL.papers_with_errors))
for p in [p for p in papers_ACL if p.init_error is not None]:
    if p not in corpus_ACL.papers_with_errors:
        corpus_ACL.papers_with_errors.append(p)
corpus_ACL.papers = [p for p in papers_ACL if p.init_error is None]
print(len(corpus_ACL.papers))

30296
42759


In [34]:
corpus_arxiv.papers_with_errors = [p for p in corpus_arxiv.papers_with_errors if p.init_error is not None]
print(len(corpus_arxiv.papers_with_errors))
for p in [p for p in papers_arxiv if p.init_error is not None]:
    if p not in corpus_arxiv.papers_with_errors:
        corpus_arxiv.papers_with_errors.append(p)
corpus_arxiv.papers = [p for p in papers_arxiv if p.init_error is None]
print(len(corpus_arxiv.papers))

10862
24117


In [80]:
for p in corpus_ACL.papers:
    p.content["candidate"] = p.content.apply(lambda x: x["candidate"] or x["section"] == "abstract", axis = 1)

for p in corpus_arxiv.papers:
    p.content["candidate"] = p.content.apply(lambda x: x["candidate"] or x["section"] == "abstract", axis = 1)

In [84]:
cdb_IRC_corr = ClaimDB(corpora = [corpus_ACL, corpus_arxiv])

100%|██████████| 42759/42759 [02:43<00:00, 262.06it/s]
100%|██████████| 24117/24117 [01:44<00:00, 230.89it/s]
100%|██████████| 2/2 [04:27<00:00, 133.81s/it]


In [85]:
cdb_IRC_corr.candidates.head()

Unnamed: 0,idx,corpus,paper_id,year,sentence_id,sentence,section
0,0,ACL,O02-2002,2002,0,There is a need to measure word similarity whe...,abstract
1,1,ACL,O02-2002,2002,1,"Usually, measures of similarity between two wo...",abstract
2,2,ACL,O02-2002,2002,2,The taxonomy approaches are more or less seman...,abstract
3,3,ACL,O02-2002,2002,3,"However, in real applications, both semantic a...",abstract
4,4,ACL,O02-2002,2002,4,Word similarity based on context vectors is a ...,abstract


In [86]:
with open("../data/cdb_IRC_corr.pkl", "wb") as f:
    pickle.dump(cdb_IRC_corr, f)

In [41]:
with open("../data/annotated_articles.json", "r") as f:
    d = json.load(f)

for k, v in d.items():
    print(k, len(v))

v1 10
v2 2
v3 2
v4 120
v5 120


In [87]:
ACL_annotated = []
arxiv_annotated = []

for v in d.keys():
    if v != "v5":
        for paper in d[v]:
            if paper[0] == "ACL":
                ACL_annotated.append(paper[1])
            else:
                arxiv_annotated.append(paper[1])

print(ACL_annotated)
print(len(ACL_annotated))
print(arxiv_annotated)
print(len(arxiv_annotated))

['2020.signlang-1.20', 'W17-4709', 'N19-1358', 'Y15-1047', 'P18-1048', 'W17-5513', '2022.naacl-main.19', '2022.in2writing-1.4', 'W18-3406', 'H89-1053', 'D15-1013', 'P97-1030', 'Y11-1012', 'W01-1826', 'D19-1210', 'W98-1507', 'W01-0812', 'W94-0305', 'P90-1033', 'A92-1020', 'P11-1048', 'P11-4022', '2020.emnlp-main.554', 'P13-1099', 'E87-1013', 'W03-0425', '2020.acl-main.413', 'W18-0527', 'C92-1030', 'W16-5818', 'W99-0402', 'W13-4420', '2001.mtsummit-papers.31', '1993.tmi-1.6', 'W00-1415', 'C98-2177', '2007.mtsummit-papers.40', 'N18-1107', 'Y03-1032', 'H89-2041', 'P19-1654', 'P93-1023', 'W08-1302', 'P11-1029', 'C90-1007', 'C92-2087', 'H89-2017', 'P91-1024', '2020.acl-demos.20', 'W00-0505', 'P07-2029', 'Q16-1037', 'I13-1029', 'J92-4001', 'P07-1088', 'J97-2004', 'W09-0809', 'E93-1027', 'E09-1027', 'W99-0609', 'J98-3005', '2020.lrec-1.826', 'W12-4402', 'C14-1028', 'Y09-2035', 'H93-1064', '2020.loresmt-1.4', 'D18-1087']
68
['2103.14302', '1708.01009', '1611.08765', '1605.05172', '2012.04584', 

In [88]:
corpus_ACL_0 = [p for p in corpus_ACL.papers if p.year < 1994]
corpus_ACL_1 = [p for p in corpus_ACL.papers if p.year >= 1994 and p.year < 2004]
corpus_ACL_2 = [p for p in corpus_ACL.papers if p.year >= 2004 and p.year < 2014]
corpus_ACL_3 = [p for p in corpus_ACL.papers if p.year >= 2014]

corpus_ACL_by_year_slices = [corpus_ACL_0, corpus_ACL_1, corpus_ACL_2, corpus_ACL_3]

corpus_arxiv_0 = [p for p in corpus_arxiv.papers if p.year < 1994]
corpus_arxiv_1 = [p for p in corpus_arxiv.papers if p.year >= 1994 and p.year < 2004]
corpus_arxiv_2 = [p for p in corpus_arxiv.papers if p.year >= 2004 and p.year < 2014]
corpus_arxiv_3 = [p for p in corpus_arxiv.papers if p.year >= 2014]

corpus_arxiv_by_year_slices = [corpus_arxiv_0, corpus_arxiv_1, corpus_arxiv_2, corpus_arxiv_3]

In [89]:
acl_total = 0
arxiv_total = 0

for c_acl in corpus_ACL_by_year_slices:
    c_acl = [p for p in c_acl if p.id not in ACL_annotated]

for c_arx in corpus_arxiv_by_year_slices:
    c_arx = [p for p in c_arx if p.id not in arxiv_annotated]

for c_acl in corpus_ACL_by_year_slices:
    n = len(c_acl)
    acl_total += n
    print(n)

print("total:", acl_total)

for c_arxiv in corpus_arxiv_by_year_slices:
    n = len(c_arxiv)
    arxiv_total += n
    print(n)

print("total:", arxiv_total)

316
2071
11791
28581
total: 42759
0
109
217
23791
total: 24117


In [90]:
import random
import numpy as np

random_papers = []
random.seed(0)

for c_acl in corpus_ACL_by_year_slices:
    random_papers.extend(np.random.choice(c_acl, 15))

for c_arx in corpus_arxiv_by_year_slices[1:]:
    random_papers.extend(np.random.choice(c_arx, 20))

random.shuffle(random_papers)
print(len(random_papers))

random_papers_ids = [[p.corpus.name, p.id] for p in random_papers]
print(random_papers_ids)

120
[['ACL', 'W16-0510'], ['arXiv', '2008.09513'], ['arXiv', '1203.3511'], ['ACL', 'C92-4182'], ['arXiv', '2112.01742'], ['ACL', 'W15-0626'], ['arXiv', 'cmp-lg/9606012'], ['ACL', 'W03-1314'], ['ACL', 'S13-2022'], ['ACL', 'W03-1012'], ['ACL', 'L16-1683'], ['arXiv', 'cmp-lg/9703003'], ['ACL', 'P98-2188'], ['arXiv', 'cmp-lg/9504020'], ['arXiv', 'cmp-lg/9607024'], ['ACL', '2000.tc-1.5'], ['arXiv', '1312.6192'], ['ACL', 'C96-2100'], ['ACL', 'J91-4001'], ['arXiv', 'cmp-lg/9405014'], ['ACL', 'H93-1004'], ['arXiv', '2402.07255'], ['ACL', 'P12-2059'], ['arXiv', '1002.0481'], ['arXiv', '1203.4605'], ['ACL', 'N09-2067'], ['ACL', 'W17-1903'], ['arXiv', '2306.09539'], ['arXiv', '1204.6362'], ['ACL', 'P13-1099'], ['ACL', 'A92-1027'], ['ACL', '2001.mtsummit-teach.5'], ['ACL', 'C18-1179'], ['ACL', '2022.bionlp-1.22'], ['arXiv', '1412.4846'], ['arXiv', '1105.1702'], ['ACL', 'A88-1002'], ['ACL', 'W19-2910'], ['ACL', 'W03-1729'], ['arXiv', '2309.10668'], ['arXiv', 'cmp-lg/9407002'], ['arXiv', '1101.5494'

In [91]:
random_sentences_ids = []
coord2idx = {v:k for k,v in cdb_IRC_corr.idx_map.items()}

for rp in random_papers:

    sentences_ids = rp.content[rp.content["candidate"] == True]["id"].tolist()

    random_sentences_ids.extend([coord2idx[(rp.corpus.name, rp.id, i)] for i in sentences_ids])

df = cdb_IRC_corr.candidates.loc[random_sentences_ids]
print(df.shape)

(10731, 7)


In [93]:
def find_main_head(d, section):
    
    section_n = 0
    head = section
    head_n = 0

    for i, sec in d.items():

        if section == sec["header"]:
            head_n = i
            section_n = i
            break

        subsections = sec["subsections"]
        subsections_names = [v["header"] for k, v in subsections.items()]

        if section in subsections_names:
           head = sec["header"]
           head_n = i
           section_n = subsections_names.index(section)
           break

        else:
            # check in the subsections
            to_explore = list(subsections.items())
            ids = [[k] for k, v in subsections.items()]
            while len(to_explore) > 0:
                k, v = to_explore.pop(0)
                id_ = ids.pop(0)

                subsections_names = [v["header"] for k, v in v["subsections"].items()]

                if section in subsections_names:
                    head = sec["header"]
                    head_n = i
                    section_n = id_
                else:
                    to_explore.extend(v["subsections"].items())
                    ids.extend([[k] + [k_] for k_, v in v["subsections"].items()])


    return section, section_n, head, head_n

def prepare_for_doccano_format(cdb, df:pd.DataFrame)-> pd.DataFrame:
    """A function to prepare a dataframe of sentences for Doccano format
    - df : a pandas DataFrame with columns {corpus, paper_id, sentence_id, sentence, section}"""

    data = []
    coord2idx = {v:k for k,v in cdb.idx_map.items()}


    for i, row in df.iterrows():
        c = cdb.get_corpus_by_name(row["corpus"])
        p = c.get_paper_by_id(row["paper_id"])

        sections_d = p.sections_hierarchy
        main_sections_str = ""

        for i, sec in enumerate(sections_d.values()):
            main_sections_str += str(i) + ". " + sec["header"] + "\n"
            
        sec, sec_n, head, head_n = find_main_head(sections_d, row["section"])
        

        idx = coord2idx[(c.name, p.id, row["sentence_id"])]
        text = row["sentence"]

        
        prev_sent_id = int(row["sentence_id"]) - 1
        next_sent_id = int(row["sentence_id"]) + 1

        # get previous sentence
        if prev_sent_id in p.content["id"].values:
            prev_doc = p.content.loc[prev_sent_id]
            prev_text = prev_doc["sentence"]
            prev_sec = prev_doc["section"]

        else:
            prev_text = ""
            prev_sec = ""

        # get next sentence
        if next_sent_id in p.content["id"].values:
            next_doc = p.content.loc[next_sent_id]
            next_text = next_doc["sentence"]
            next_sec = next_doc["section"]
        
        else:
            next_text = ""
            next_sec = ""


        data.append({
            "text": text,
            "doc_id": idx,
            "corpus": p.corpus.name,
            "paper_title" : p.title,
            "paper_id" : p.id,
            "paper_structure": main_sections_str,
            "year": p.year,
            "section": sec,
            "section_n" : sec_n,
            "main_head" : head,
            "main_head_n" : head_n,
            "prev_text": prev_text,
            "prev_section": prev_sec,
            "next_text": next_text,
            "next_section": next_sec,
            "label": ""
        })

    df_doccano = pd.DataFrame(data)

    return df_doccano

In [94]:
df_doccano = prepare_for_doccano_format(cdb_IRC_corr, df)
df_doccano.head()

Unnamed: 0,text,doc_id,corpus,paper_title,paper_id,paper_structure,year,section,section_n,main_head,main_head_n,prev_text,prev_section,next_text,next_section,label
0,The automated scoring of second-language (L2) ...,1143514,ACL,Unsupervised Modeling of Topical Relevance in ...,W16-0510,0. abstract\n1. Introduction\n2. Related Resea...,2016,abstract,0,abstract,0,,,"In this paper, we focus on determining the top...",abstract,
1,"In this paper, we focus on determining the top...",1143515,ACL,Unsupervised Modeling of Topical Relevance in ...,W16-0510,0. abstract\n1. Introduction\n2. Related Resea...,2016,abstract,0,abstract,0,The automated scoring of second-language (L2) ...,abstract,Given the burden involved in manually assignin...,abstract,
2,Given the burden involved in manually assignin...,1143516,ACL,Unsupervised Modeling of Topical Relevance in ...,W16-0510,0. abstract\n1. Introduction\n2. Related Resea...,2016,abstract,0,abstract,0,"In this paper, we focus on determining the top...",abstract,We show that expanding prompts using topically...,abstract,
3,We show that expanding prompts using topically...,1143517,ACL,Unsupervised Modeling of Topical Relevance in ...,W16-0510,0. abstract\n1. Introduction\n2. Related Resea...,2016,abstract,0,abstract,0,Given the burden involved in manually assignin...,abstract,"Finally, we incorporate our prompt-relevance m...",abstract,
4,"Finally, we incorporate our prompt-relevance m...",1143518,ACL,Unsupervised Modeling of Topical Relevance in ...,W16-0510,0. abstract\n1. Introduction\n2. Related Resea...,2016,abstract,0,abstract,0,We show that expanding prompts using topically...,abstract,Given the increase in demand for educational t...,Introduction,


In [95]:
def find_all_children(d, section):
    
    children = [section]

    for i, sec in d.items():

        if sec["header"] == section:

            while "subsections" in sec.keys():
                ss = sec["subsections"]
                children.extend([v["header"] for k, v in ss.items()])
                sec = ss

    return children

In [96]:
text2 = []
import re

current_head = ""
current_section = ""
h = 0
s = 0
total_head_len = 0
total_section_len = 0

for i, row in df_doccano.iterrows():
    n = 90

    sec = row["section"] 
    sec_n = row["section_n"]
    head = row["main_head"]
    head_n = row["main_head_n"]

    if row["corpus"] == "ACL":
            p = corpus_ACL.get_paper_by_id(row["paper_id"])
    else:
        p = corpus_arxiv.get_paper_by_id(row["paper_id"])

    if head != current_head:
        current_head = head

        cands = p.content[p.content["candidate"] == True]
        children = find_all_children(p.sections_hierarchy, head)
        total_head_len = len(cands[cands["section"].isin(children)])

        h = 1

    else:
        h += 1

    if sec != current_section:
        current_section = sec
        
        if row["corpus"] == "ACL":
            p = corpus_ACL.get_paper_by_id(row["paper_id"])
        else:
            p = corpus_arxiv.get_paper_by_id(row["paper_id"])

        cands = p.content[p.content["candidate"] == True]
        children = find_all_children(p.sections_hierarchy, sec)
        total_section_len = len(cands[cands["section"].isin(children)])

        s = 1

    else:
        s += 1


    text = p.title.replace("\n", "").replace("\t", "") + "\n"
    text += "=" * n + "\n" + str(head_n) + ". " + head + " -- " + str(h) + "/" + str(total_head_len) + "\n" + "=" * n + "\n"

    if sec != head:
        text += str(sec_n) + ". " + sec + " -- " + str(s) + "/" + str(total_section_len) +"\n" + "-" * n + "\n" 

    text+= row["text"]

    text2.append(text)

df_doccano["text"] = text2

In [97]:
def get_feedback_on_article(p, n=90):
    s = "="*n + "\n" + "Annotator feedback" + "\n" + "="*n + "\n\n"
    s += f"You just finished annotating the article entitled <<{p.title}>>. Please answer following questions: \n\n"

    s1 = "1. Do you think that this article was difficult to understand, in a way that may have affected the quality of your annotations, because of its technicity / because it handles subjects you are unfamiliar with ?\n\n"
    s1 += "Please add any label of your choice if your answer is yes."

    s2 = "2. Do you think that this article was difficult to understand, in a way that may have affected the quality of your annotations, because of its writing style / structure / parsing errors ?\n\n"
    s2 += "Please add any label of your choice if your answer is yes."

    s3 = "3. Did you know / read the article before this annotation task, or do you think you have identified its authors ?\n\n"
    s3 += "Please add any label of your choice if your answer is yes."

    return [s + s1, s + s2, s + s3]

In [98]:
# insert feedback questions in the dataset
for rp in random_papers:
    last_row= df_doccano[df_doccano["paper_title"] == rp.title].iloc[-1]
    last_index = float(last_row.name)
    values = list(last_row.values)

    fb = get_feedback_on_article(rp)
    for fb_q, h in zip(fb, [0.25, 0.5, 0.75]):
        values[0] = fb_q
        for i in range(7, len(values)):
            values[i] = ""
        df_doccano.loc[last_index + h] = values

df_doccano = df_doccano.sort_index().reset_index(drop = True)

In [99]:
df_doccano = df_doccano.drop(columns = ["corpus", "paper_id", "section",  "section_n", "main_head", "main_head_n"])

In [100]:
df_doccano

Unnamed: 0,text,doc_id,paper_title,paper_structure,year,prev_text,prev_section,next_text,next_section,label
0,Unsupervised Modeling of Topical Relevance in ...,1143514,Unsupervised Modeling of Topical Relevance in ...,0. abstract\n1. Introduction\n2. Related Resea...,2016,,,"In this paper, we focus on determining the top...",abstract,
1,Unsupervised Modeling of Topical Relevance in ...,1143515,Unsupervised Modeling of Topical Relevance in ...,0. abstract\n1. Introduction\n2. Related Resea...,2016,The automated scoring of second-language (L2) ...,abstract,Given the burden involved in manually assignin...,abstract,
2,Unsupervised Modeling of Topical Relevance in ...,1143516,Unsupervised Modeling of Topical Relevance in ...,0. abstract\n1. Introduction\n2. Related Resea...,2016,"In this paper, we focus on determining the top...",abstract,We show that expanding prompts using topically...,abstract,
3,Unsupervised Modeling of Topical Relevance in ...,1143517,Unsupervised Modeling of Topical Relevance in ...,0. abstract\n1. Introduction\n2. Related Resea...,2016,Given the burden involved in manually assignin...,abstract,"Finally, we incorporate our prompt-relevance m...",abstract,
4,Unsupervised Modeling of Topical Relevance in ...,1143518,Unsupervised Modeling of Topical Relevance in ...,0. abstract\n1. Introduction\n2. Related Resea...,2016,We show that expanding prompts using topically...,abstract,Given the increase in demand for educational t...,Introduction,
...,...,...,...,...,...,...,...,...,...,...
11084,Segmented Harmonic Loss: Handling Class-Imbala...,5654074,Segmented Harmonic Loss: Handling Class-Imbala...,0. abstract\n1. Introduction\n2. Data\n3. Comp...,2023,"In our future work, those are the aspects we w...",Conclusion & Future Work,// Compute the allowed standard deviation usin...,Conclusion & Future Work,
11085,Segmented Harmonic Loss: Handling Class-Imbala...,5654075,Segmented Harmonic Loss: Handling Class-Imbala...,0. abstract\n1. Introduction\n2. Data\n3. Comp...,2023,We believe that would unlock the full potentia...,Conclusion & Future Work,,,
11086,==============================================...,5654075,Segmented Harmonic Loss: Handling Class-Imbala...,0. abstract\n1. Introduction\n2. Data\n3. Comp...,2023,,,,,
11087,==============================================...,5654075,Segmented Harmonic Loss: Handling Class-Imbala...,0. abstract\n1. Introduction\n2. Data\n3. Comp...,2023,,,,,


In [101]:
df_doccano.to_csv("to-annotate-Fanny-120_corr.csv", index = False)

## Inspecting my annotations

In [9]:
data_dir = "CLEM-120-sauvegarde_27_05/"
admin = pd.read_csv(f"{data_dir}admin.csv")

In [10]:
anno_admin = df[~df["label"].isna()]
print(anno_admin.shape)

(184, 12)


In [11]:
anno_admin["paper_title"].unique()

array(['Multi-Task Active Learning for Neural Semantic Role Labeling on Low Resource Conversational Corpus',
       'Text Similarity Using Word Embeddings to Classify Misinformation',
       'Learning Taxonomy for Text Segmentation by Formal Concept Analysis',
       '{C}-Feel-It: A Sentiment Analyzer for Micro-blogs'], dtype=object)

In [13]:
print(len(anno_admin["paper_title"].unique()))

4


In [12]:
df = pd.read_csv(f"{data_dir}anno2.csv")
anno_df = df[~df["label"].isna()]
print(anno_df.shape)

anno_df["paper_title"].unique()

(3568, 12)


array(['Human-Level Performance on Word Analogy Questions by Latent Relational\n  Analysis',
       'Automatic Discovery of Contextual Factors Describing Phonological Variation',
       'Text Classification based on Multi-granularity Attention Hybrid Neural\n  Network',
       'Specialized Language Models using Dialogue Predictions',
       'Mistake-Driven Mixture of Hierarchical Tag Context Trees',
       'Factual or Satisfactory: What Search Results Are Better?',
       'Automatic Grammar Partitioning for Syntactic Parsing',
       'Unsupervised Discovery of Multimodal Links in Multi-image, Multi-sentence Documents',
       'Machine Learning of User Profiles: Representational Issues',
       'Word-Sense Distinguishability and Inter-Coder Agreement',
       'Gathering Statistics to Aspectually Classify Sentences with a Genetic\n  Algorithm',
       'A statistical model for word discovery in child directed speech',
       'Reusing a Statistical Language Model for Generation',
       'L

In [15]:
print(len(anno_df["paper_title"].unique()))

59


In [43]:
pbs = anno_df[anno_df["text"].str.contains("Annotator feedback")]
pbs_1 = pbs[pbs["text"].str.contains("technicity /")].paper_title
pbs_2 = pbs[pbs["text"].str.contains("parsing error")].paper_title
pbs_3 = set(pbs_1) & set(pbs_2)
print(pbs_1.shape)
print(pbs_2.shape)
print(len(pbs_3))

(12,)
(15,)
6


In [31]:
label_counts = anno_df["label"].groupby(anno_df["label"]).count().sort_values(ascending=False)
print(label_counts)

label
result                                1230
contribution-AIC                       620
context-AIC                            528
context-AIC#rw                         225
outline-AIC                            133
                                      ... 
contribution-AIC#error#outline-AIC       1
contribution-AIC#directions              1
context-AIC#limitation#rw                1
context-AIC#result#rw                    1
directions#impact                        1
Name: label, Length: 61, dtype: int64


In [32]:
label_counts[label_counts > 1]

label
result                                 1230
contribution-AIC                        620
context-AIC                             528
context-AIC#rw                          225
outline-AIC                             133
directions                              124
limitation#result                        87
error#result                             72
limitation                               69
contribution-AIC#result                  55
error                                    54
context-AIC#error#rw                     49
result#rw                                44
contribution-AIC#rw                      43
context-AIC#error                        32
impact                                   22
impact#result                            18
contribution-AIC#outline-AIC             16
contribution-AIC#error                   15
contribution-AIC#impact                  14
directions#limitation                    14
directions#result                        10
context-AIC#impact        

In [21]:
with open("../data/arxiv/corpus_arxiv_IRC.pkl", "rb") as f:
    corpus_arxiv = pickle.load(f)

In [24]:
p = [p for p in corpus_arxiv.papers if p.title.startswith("Sentence Embeddings for Russian")][0]

In [29]:
p.content["section"].unique()

array(['abstract', 'Introduction', 'Related work',
       'Unsupervised sentence embeddings',
       'Supervised sentence embeddings', 'Language models',
       'Evaluation of sentence embedding models',
       'Multiple Choice Question Answering (MCQA)',
       'Multiple choice next sentence prediction (NSP)',
       'Paraphrase identification (PI)', 'Dataset statistics', 'Methods',
       'Unsupervised approach', 'Supervised approach', 'ELMo', 'BERT',
       'Conclusion'], dtype=object)

In [44]:
p.sections

{0: {'n': '1', 'header': 'Introduction', 'head_n': None},
 1: {'n': '2', 'header': 'Related work', 'head_n': None},
 2: {'n': '2.1', 'header': 'Unsupervised sentence embeddings', 'head_n': '2'},
 3: {'n': '2.2', 'header': 'Supervised sentence embeddings', 'head_n': '2'},
 4: {'n': '2.3', 'header': 'Language models', 'head_n': '2'},
 5: {'n': '2.4',
  'header': 'Evaluation of sentence embedding models',
  'head_n': '2'},
 6: {'n': '3.1',
  'header': 'Multiple Choice Question Answering (MCQA)',
  'head_n': '3'},
 7: {'n': '3.2',
  'header': 'Multiple choice next sentence prediction (NSP)',
  'head_n': '3'},
 8: {'n': '3.3', 'header': 'Paraphrase identification (PI)', 'head_n': '3'},
 9: {'n': '3.4', 'header': 'Dataset statistics', 'head_n': '3'},
 10: {'n': '4', 'header': 'Methods', 'head_n': None},
 11: {'n': '4.1', 'header': 'Unsupervised approach', 'head_n': '4'},
 12: {'n': '4.2', 'header': 'Supervised approach', 'head_n': '4'},
 13: {'n': '5.2', 'header': 'ELMo', 'head_n': '5'},
 14

In [45]:
def reorder_section_hierarchy(sections):

    # initialize section hierarchy with abstract
    d = {0: {"header" : "abstract", "subsections" : {}}}
    i = 1

    # add the other sections
    for _, s in sections.items():
        
        if s["head_n"] == None:

            if s["n"] != None or s["header"] != "unidentified-section":
                d[i] = {"header" : s["header"],
                        "subsections": {}}
                i += 1
            
            else:
                j = len(d[i-1]["subsections"])

                if j > 0:
                    k = len(d[i-1]["subsections"][j-1]["subsections"])
                    if k > 0:
                        d[i-1]["subsections"][j-1]["subsections"][k] = {"header" : s["header"], "subsections" : {}}
                    else:
                        d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
                else:
                    d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}
        
        else:
            j = len(d[i-1]["subsections"])
            d[i-1]["subsections"][j] = {"header" : s["header"], "subsections" : {}}

    return d

reorder_section_hierarchy(p.sections)

{0: {'header': 'abstract', 'subsections': {}},
 1: {'header': 'Introduction', 'subsections': {}},
 2: {'header': 'Related work',
  'subsections': {0: {'header': 'Unsupervised sentence embeddings',
    'subsections': {}},
   1: {'header': 'Supervised sentence embeddings', 'subsections': {}},
   2: {'header': 'Language models', 'subsections': {}},
   3: {'header': 'Evaluation of sentence embedding models', 'subsections': {}},
   4: {'header': 'Multiple Choice Question Answering (MCQA)',
    'subsections': {}},
   5: {'header': 'Multiple choice next sentence prediction (NSP)',
    'subsections': {}},
   6: {'header': 'Paraphrase identification (PI)', 'subsections': {}},
   7: {'header': 'Dataset statistics', 'subsections': {}}}},
 3: {'header': 'Methods',
  'subsections': {0: {'header': 'Unsupervised approach', 'subsections': {}},
   1: {'header': 'Supervised approach', 'subsections': {}},
   2: {'header': 'ELMo', 'subsections': {}},
   3: {'header': 'BERT', 'subsections': {}}}},
 4: {'he