In [2]:
import sys
sys.path.insert(0, "../")
from utils.ClaimDB import ClaimDB
import pickle
import json
import pandas as pd

### 1. First instantiation of `ClaimDB`: load papers from the ACL and arXiv `Corpus`

In [3]:
# Load the ACL and arXiv corpora
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    pickle can execute arbitrary code while loading, so only use this on
    files that were produced locally.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


corpus_ACL = _load_pickle("../data/acl/corpus_ACL.pkl")
corpus_arxiv = _load_pickle("../data/arxiv/corpus_arxiv.pkl")

In [4]:
# Build the claim database from both corpora (ACL first, arXiv second).
# annotated_idx_path is the JSON file that is read and rewritten at the end of
# this notebook to record which articles have been sent for annotation.
cdb = ClaimDB(
    corpora=[corpus_ACL, corpus_arxiv],
    annotated_idx_path="../data/annotated_articles.json",
)

100%|██████████| 60725/60725 [04:29<00:00, 225.69it/s]
100%|██████████| 30361/30361 [03:01<00:00, 167.10it/s]
100%|██████████| 2/2 [07:30<00:00, 225.39s/it]


In [5]:
# Size of the candidates table as (rows, columns)
cdb.candidates.shape

(10900317, 6)

In [6]:
# Preview the first 10 rows: one sentence per row with its corpus, paper id, sentence id and section
cdb.candidates.head(10)

Unnamed: 0,idx,corpus,paper_id,sentence_id,sentence,section
0,0,ACL,O02-2002,0,There is a need to measure word similarity whe...,abstract
1,1,ACL,O02-2002,1,"Usually, measures of similarity between two wo...",abstract
2,2,ACL,O02-2002,2,The taxonomy approaches are more or less seman...,abstract
3,3,ACL,O02-2002,3,"However, in real applications, both semantic a...",abstract
4,4,ACL,O02-2002,4,Word similarity based on context vectors is a ...,abstract
5,5,ACL,O02-2002,5,"In this paper, we propose using only syntactic...",abstract
6,6,ACL,O02-2002,6,The probabilistic distribution of co-occurrenc...,abstract
7,7,ACL,O02-2002,7,The agglomerative clustering algorithm is appl...,abstract
8,8,ACL,O02-2002,8,It turns out that words with similar syntactic...,abstract
9,9,ACL,O02-2002,9,It is well known that word-sense is defined by...,Introduction


In [7]:
# idx_map turns a global row index into its (corpus, paper_id, sentence_id) triple
cdb.idx_map[4]

('ACL', 'O02-2002', 4)

In [8]:
# Save the built database to disk; constructing it took several minutes above,
# so later sessions can reload this file instead.
cdb_pickle_path = "../data/cdb.pkl"
with open(cdb_pickle_path, "wb") as f:
    pickle.dump(cdb, f)

### 2. Load from pickle

In [9]:
# Restore the saved ClaimDB from disk.
# pickle can execute arbitrary code while loading: only open a cdb.pkl that
# was written by this notebook.
cdb_pickle_path = "../data/cdb.pkl"
with open(cdb_pickle_path, "rb") as f:
    cdb = pickle.load(f)

In [13]:
# Point the reloaded database at the annotated-articles JSON file.
# NOTE(review): presumably needed because the pickled object carries no (or a stale) path — confirm.
cdb.annotated_idx_path = "../data/annotated_articles.json"

In [15]:
# Draw 2 paper ids from the ACL corpus.
# NOTE(review): no seed is set in this notebook, so a re-run will presumably return different ids — confirm.
cdb.draw_random_idx_from_corpus("ACL", 2)

['W17-5513', 'W09-1207']

In [19]:
# Draw 2 paper ids from the arXiv corpus.
# NOTE(review): no seed is set in this notebook, so a re-run will presumably return different ids — confirm.
cdb.draw_random_idx_from_corpus("arXiv", 2)

['1704.06217', '2108.06957']

In [26]:
# Papers to export for this annotation round, as (corpus name, paper id) pairs.
# Both ids appear in the outputs of the random draws above.
idx = [("ACL", "W17-5513"), ("arXiv", "2108.06957")]
data = []

# The loop variable is `paper_id`, not `id`: a top-level variable named `id`
# would replace the built-in id() for the rest of the kernel session.
for corpus_name, paper_id in idx:

    # find the paper: cdb.corpora[0] is used for "ACL", cdb.corpora[1] for any other name
    corpus = cdb.corpora[0] if corpus_name == "ACL" else cdb.corpora[1]
    paper = corpus.get_paper_by_id(paper_id)

    # access its content: one row per sentence -> (sentence, section, id, candidate)
    rows = paper.content[["sentence", "section", "id", "candidate"]].values
    last = len(rows) - 1

    for i, (sent, sec, sent_id, is_candidate) in enumerate(rows):
        # Only candidate sentences are exported. The explicit comparison with
        # True is kept (rather than plain truthiness) so that non-boolean
        # values in the column are treated exactly as before.
        if not (is_candidate == True):  # noqa: E712
            continue

        # The previous / next sentence of the paper (candidate or not) is kept
        # as context; it is None at the first / last sentence of the paper.
        prev_sec, prev_sent = (rows[i - 1][1], rows[i - 1][0]) if i > 0 else (None, None)
        next_sec, next_sent = (rows[i + 1][1], rows[i + 1][0]) if i < last else (None, None)

        data.append({
            "article_id": paper.id,
            "source": corpus_name,
            "sent_id": sent_id,
            "prev_sec": prev_sec,
            "prev_sent": prev_sent,
            "sec": sec,  # section of the sentence to annotate
            "sent": sent,  # the sentence to annotate
            "next_sec": next_sec,
            "next_sent": next_sent,
        })

df = pd.DataFrame(data)
df.head()

Unnamed: 0,article_id,source,sent_id,prev_sec,prev_sent,sec,sent,next_sec,next_sent
0,W17-5513,ACL,0,,,abstract,We propose a software architecture designed to...,abstract,The Modular Architecture for Conversational Ag...
1,W17-5513,ACL,1,abstract,We propose a software architecture designed to...,abstract,The Modular Architecture for Conversational Ag...,abstract,The architecture separates the domain of the c...
2,W17-5513,ACL,2,abstract,The Modular Architecture for Conversational Ag...,abstract,The architecture separates the domain of the c...,abstract,MACA provides tools to host dialogue agents on...
3,W17-5513,ACL,3,abstract,The architecture separates the domain of the c...,abstract,MACA provides tools to host dialogue agents on...,abstract,The current version of the framework already i...
4,W17-5513,ACL,4,abstract,MACA provides tools to host dialogue agents on...,abstract,The current version of the framework already i...,Introduction,Recent research in building sophisticated AIba...


In [27]:
# (rows, columns) of the annotation frame: one row per exported candidate sentence
df.shape

(176, 9)

In [30]:
# Invert idx_map: (corpus, paper id, sentence id) triple -> global index
rev_idx_map = {triple: global_idx for global_idx, triple in cdb.idx_map.items()}

# Look up the global index of every exported sentence
df["idx"] = [
    rev_idx_map[(source, article_id, sent_id)]
    for source, article_id, sent_id in zip(df["source"], df["article_id"], df["sent_id"])
]

In [33]:
# Keep only the columns that go into the annotation sheet, global index first
export_columns = ["idx", "sent", "sec", "prev_sec", "prev_sent", "next_sec", "next_sent"]
df = df[export_columns]
df

Unnamed: 0,idx,sent,sec,prev_sec,prev_sent,next_sec,next_sent
0,3075788,We propose a software architecture designed to...,abstract,,,abstract,The Modular Architecture for Conversational Ag...
1,3075789,The Modular Architecture for Conversational Ag...,abstract,abstract,We propose a software architecture designed to...,abstract,The architecture separates the domain of the c...
2,3075790,The architecture separates the domain of the c...,abstract,abstract,The Modular Architecture for Conversational Ag...,abstract,MACA provides tools to host dialogue agents on...
3,3075791,MACA provides tools to host dialogue agents on...,abstract,abstract,The architecture separates the domain of the c...,abstract,The current version of the framework already i...
4,3075792,The current version of the framework already i...,abstract,abstract,MACA provides tools to host dialogue agents on...,Introduction,Recent research in building sophisticated AIba...
...,...,...,...,...,...,...,...
171,8054606,"These triples gives model wrong supervisions, ...",Conclusions,Conclusions,"First, in the RE subtask, many triples are not...",Conclusions,But this missing annotation issue is still an ...
172,8054607,But this missing annotation issue is still an ...,Conclusions,Conclusions,"These triples gives model wrong supervisions, ...",Conclusions,"Second, in the DEE subtask, how to process lon..."
173,8054608,"Second, in the DEE subtask, how to process lon...",Conclusions,Conclusions,But this missing annotation issue is still an ...,Conclusions,"In addition, if two arguments of one event are..."
174,8054609,"In addition, if two arguments of one event are...",Conclusions,Conclusions,"Second, in the DEE subtask, how to process lon...",Conclusions,This issue also should be well studied in the ...


In [41]:
# Add a "label" column holding an empty string in every row
df["label"] = ""

In [42]:
# Export the annotation sheet; index=False leaves the pandas row index out of the CSV
df.to_csv("to-annotate-2.csv", index = False)

In [37]:
# Read the annotation record: a JSON object mapping a version key (e.g. 'v1')
# to a list of [corpus, paper id] pairs
with open(cdb.annotated_idx_path, "r") as f:
    anno = json.load(f)

In [38]:
# Show the record as loaded, before this round is added
anno

{'v1': [['ACL', '2020.signlang-1.20'],
  ['ACL', 'W17-4709'],
  ['ACL', 'N19-1358'],
  ['arXiv', '2103.14302'],
  ['ACL', 'Y15-1047'],
  ['ACL', 'P18-1048'],
  ['arXiv', '1708.01009'],
  ['arXiv', '1611.08765'],
  ['arXiv', '1605.05172'],
  ['arXiv', '2012.04584']]}

In [39]:
# Register this round's papers under 'v2', each (corpus, paper id) pair stored
# as a two-item list like the existing 'v1' entries
anno['v2'] = [list(pair) for pair in idx]
anno

{'v1': [['ACL', '2020.signlang-1.20'],
  ['ACL', 'W17-4709'],
  ['ACL', 'N19-1358'],
  ['arXiv', '2103.14302'],
  ['ACL', 'Y15-1047'],
  ['ACL', 'P18-1048'],
  ['arXiv', '1708.01009'],
  ['arXiv', '1611.08765'],
  ['arXiv', '1605.05172'],
  ['arXiv', '2012.04584']],
 'v2': [['ACL', 'W17-5513'], ['arXiv', '2108.06957']]}

In [40]:
# Write the updated record back to the same JSON file, replacing its previous contents
with open(cdb.annotated_idx_path, "w") as f:
    json.dump(anno, f)