# Anthology BibTeX Extractor for ACL/EMNLP/NAACL

This notebook uses https://www.aclweb.org/anthology/anthology+abstracts.bib.gz, the BibTeX file containing all ACL Anthology papers, to extract paper information.

**TODO**: Add *COLING / HLT / IJCNLP* based on interest for paperviz.ml

In [1]:
import os
import json
import bibtexparser
from tqdm.notebook import tqdm

In [2]:
# Note: This cell can take some time to run due to the large file size.
# Path to the decompressed anthology+abstracts.bib file.
anthology_bib_path = '/home/ubuntu/anthology+abstracts.bib'

# common_strings=True is passed so that common BibTeX string macros
# (presumably e.g. month abbreviations) resolve instead of failing to parse.
parser = bibtexparser.bparser.BibTexParser(common_strings=True)

# Read the file explicitly as UTF-8 so that decoding of non-ASCII characters
# does not depend on the platform's default locale encoding.
with open(anthology_bib_path, encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file, parser)

## Note: Anthology does not contain abstracts for conferences before 2017

Abstracts for those earlier conferences will have to be obtained from another source later.

In [3]:
# Create empty lists for saving parsed information, one per conference/year.
acl_papers20 = []
acl_papers19 = []
acl_papers18 = []
acl_papers17 = []
acl_papers16 = []
acl_papers15 = []
emnlp_papers19 = []
emnlp_papers18 = []
emnlp_papers17 = []
emnlp_papers16 = []
emnlp_papers15 = []
naacl_papers19 = []
naacl_papers18 = []
naacl_papers16 = []
naacl_papers15 = []

# Each target list is paired with the Anthology ID fragments that identify
# its papers; a paper is added when its URL contains any of the fragments.
url_markers_by_bucket = [
    (acl_papers19, ("P19-1",)),
    (acl_papers18, ("P18-1", "P18-2")),
    (acl_papers17, ("P17-1", "P17-2")),
    (emnlp_papers19, ("D19-1",)),
    (emnlp_papers18, ("D18-1",)),
    (emnlp_papers17, ("D17-1",)),
    (naacl_papers19, ("N19-1",)),
    (naacl_papers18, ("N18-1", "N18-2")),
    # Conferences before 2017 do not have abstracts.
    # We will not use these later.
    # TODO: Find a source to scrape their abstracts.
    (acl_papers16, ("P16-1", "P16-2")),
    (acl_papers15, ("P15-1", "P15-2")),
    (emnlp_papers16, ("D16-1",)),
    (emnlp_papers15, ("D15-1",)),
    (naacl_papers16, ("N16-1",)),
    (naacl_papers15, ("N15-1",)),
]

# Based on conference codes we extract various
# conferences for different years.
for paper in tqdm(bib_database.entries):
    url = paper.get("url")
    if not url:
        # Entries without a URL cannot be matched to a conference; skip them
        # explicitly instead of hiding every possible error behind a bare
        # ``except`` (which would also swallow KeyboardInterrupt).
        continue

    if "000" in url:
        # Don't append the first paper from all conferences
        # as they represent the whole conference volume pdf
        continue

    # ACL 2020 uses the new-style IDs; any URL containing "2020.acl-main.0"
    # is excluded as well.
    if "2020.acl-main" in url and "2020.acl-main.0" not in url:
        acl_papers20.append(paper)

    for bucket, markers in url_markers_by_bucket:
        if any(marker in url for marker in markers):
            bucket.append(paper)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=58055.0), HTML(value='')))




In [4]:
# Number of ACL papers collected per year (2020 down to 2015).
tuple(map(len, (acl_papers20, acl_papers19, acl_papers18, acl_papers17, acl_papers16, acl_papers15)))

(778, 660, 381, 302, 328, 318)

In [5]:
# Number of NAACL papers collected per year (2019, 2018, 2016, 2015).
tuple(map(len, (naacl_papers19, naacl_papers18, naacl_papers16, naacl_papers15)))

(423, 330, 181, 186)

In [6]:
# Number of EMNLP papers collected per year (2019 down to 2015).
tuple(map(len, (emnlp_papers19, emnlp_papers18, emnlp_papers17, emnlp_papers16, emnlp_papers15)))

(681, 549, 323, 264, 312)

In [7]:
def create_json(anthology_conf, conf_name, save_dir):
    """Convert parsed bib entries into paperviz records and dump them as JSON.

    Args:
        anthology_conf: Iterable of parsed bib entry dicts. Each entry must
            provide the keys ``ID``, ``year``, ``url``, ``ENTRYTYPE``,
            ``title``, ``author`` and ``abstract``. The entries are expected
            to share one year, because the output file name uses the year of
            the last entry.
        conf_name: Conference name (e.g. ``"ACL"``); used as the output
            sub-directory name and as the file-name prefix.
        save_dir: Base output directory; a trailing slash is optional.
            Missing directories are created.

    Returns:
        The list of record dicts written to
        ``<save_dir>/<conf_name>/<conf_name>_<year>.json``. For empty input
        an empty list is returned and nothing is written.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    conf_list = []
    for paper in anthology_conf:
        conf_list.append({
            'id': paper["ID"],
            'conf_name': conf_name,
            'year': paper["year"],
            'link': paper["url"],
            'type': paper["ENTRYTYPE"],
            'title': paper["title"],
            # Authors are separated by "  and\n"; continuation lines carry
            # indentation, so strip the whitespace around every name.
            'authors': [
                name.strip() for name in paper["author"].split("  and\n")
            ],
            'abstract': paper["abstract"],
            # Placeholders, stored as the string "None".
            'embedding_sci_bert': "None",
            'embedding_sent_bert': "None",
            'embedding_specter': "None",
            'img_large': "None",
            'img_small': "None",
        })

    if not conf_list:
        # Without any paper there is no year to name the output file after.
        return conf_list

    conf_dir = os.path.join(save_dir, conf_name)
    # makedirs also creates missing parents (e.g. a not-yet-existing "data/").
    os.makedirs(conf_dir, exist_ok=True)

    # The file is named after the year of the last entry.
    year = conf_list[-1]['year']
    json_dump = os.path.join(conf_dir, conf_name + "_" + year + ".json")
    print(f"Dumping json to {json_dump}")
    with open(json_dump, 'w', encoding="utf-8") as file:
        json.dump(conf_list, file)

    return conf_list

In [8]:
# Base directory under which one sub-directory per conference is written.
save_dir = "data/"

# Dump every conference/year that has abstracts, in the order ACL, NAACL, EMNLP.
(
    acl_2020_paperviz,
    acl_2019_paperviz,
    acl_2018_paperviz,
    acl_2017_paperviz,
) = (
    create_json(papers, "ACL", save_dir)
    for papers in (acl_papers20, acl_papers19, acl_papers18, acl_papers17)
)

naacl_2019_paperviz, naacl_2018_paperviz = (
    create_json(papers, "NAACL", save_dir)
    for papers in (naacl_papers19, naacl_papers18)
)

emnlp_2019_paperviz, emnlp_2018_paperviz, emnlp_2017_paperviz = (
    create_json(papers, "EMNLP", save_dir)
    for papers in (emnlp_papers19, emnlp_papers18, emnlp_papers17)
)

Dumping json to scrape/data/ACL/ACL_2020.json
Dumping json to scrape/data/ACL/ACL_2019.json
Dumping json to scrape/data/ACL/ACL_2018.json
Dumping json to scrape/data/ACL/ACL_2017.json
Dumping json to scrape/data/NAACL/NAACL_2019.json
Dumping json to scrape/data/NAACL/NAACL_2018.json
Dumping json to scrape/data/EMNLP/EMNLP_2019.json
Dumping json to scrape/data/EMNLP/EMNLP_2018.json
Dumping json to scrape/data/EMNLP/EMNLP_2017.json
