# PMLR Extractor ICML/AISTATS

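This notebook scrapes http://proceedings.mlr.press/ (PMLR) to extract paper information (title, authors, abstract, and PDF link) for ICML and AISTATS. Each conference year is read from its own PMLR volume page, and the result is saved as one JSON file per conference and year.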

In [1]:
import os
import json
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
def _fetch_soup(url):
    """
    Download a page and parse it with BeautifulSoup's "html.parser".

    The HTTP response is used as a context manager so that it is closed
    as soon as its body has been read.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), "html.parser")


def _fetch_abstract(abstract_url):
    """
    Return the text of the element with id "abstract" on the given page,
    or None when the page has no such element.
    """
    abstract_tag = _fetch_soup(abstract_url).find(id="abstract")
    return None if abstract_tag is None else abstract_tag.text


def _paper_id(abstract_url, paper_url):
    """
    Derive a paper id from the abstract link (last path segment without
    ".html"), falling back to the PDF file name without its extension.
    Returns None when neither link is available.
    """
    if abstract_url:
        return abstract_url.split("/")[-1].replace(".html", "")
    if paper_url:
        return os.path.splitext(paper_url.split("/")[-1])[0]
    return None


def parse_pmlr(conference, save_dir):
    """
    Parse ICML or AISTATS
    
    Reads the PMLR volume page of the requested conference, follows the
    'abs' link of every paper to fetch its abstract, and writes all
    records to <save_dir>/<NAME>/<NAME>_<year>.json.
    
    args
    ----
        conference: <name><year> of conference 
                    chosen from url_lookup dict.
        save_dir: Directory where the json file
                  is dumped for later use. A sub-directory named
                  after the conference is created inside it
                  (missing parent directories are created too).
    
    return
    ------
        conf_list: List of dicts
                   Each dict containing info about one paper.
                   'link' and 'abstract' are None when the paper has
                   no 'abs' link, 'pdf' is None when it has no
                   'Download PDF' link. Papers with neither link are
                   skipped because no id can be derived for them.
    
    raises
    ------
        KeyError: if `conference` is not a key of url_lookup.
    """
    url_lookup = {
        'icml2013': 'http://proceedings.mlr.press/v28/',
        'icml2014': 'http://proceedings.mlr.press/v32/',
        'icml2015': 'http://proceedings.mlr.press/v37/',
        'icml2016': 'http://proceedings.mlr.press/v48/',
        'icml2017': 'http://proceedings.mlr.press/v70/',
        'icml2018': 'http://proceedings.mlr.press/v80/',
        'icml2019': 'http://proceedings.mlr.press/v97/',
        'aistats2015': 'http://proceedings.mlr.press/v38/',
        'aistats2016': 'http://proceedings.mlr.press/v51/',
        'aistats2017': 'http://proceedings.mlr.press/v54/',
        'aistats2018': 'http://proceedings.mlr.press/v84/',
        'aistats2019': 'http://proceedings.mlr.press/v89/',
        'aistats2020': 'http://proceedings.mlr.press/v108/',
    }
    # Fail before any network access, with a message listing the valid keys.
    if conference not in url_lookup:
        raise KeyError(
            f"Unknown conference {conference!r}; expected one of {sorted(url_lookup)}"
        )

    # Last 4 digits represent the year of conference
    conf_name = conference[:-4].upper()
    conf_year = conference[-4:]

    # BS4 Magic!
    soup = _fetch_soup(url_lookup[conference])
    papers = soup.find_all('div', {'class': 'paper'})
    print(f"Found {len(papers)} papers.")
    
    conf_list = []
    for paper in papers:
        title = paper.find(class_='title').text
        # get_text() rather than .string: .string is None as soon as the
        # authors element contains nested markup.
        authors = [x.strip() for x in paper.find(class_='authors').get_text().split(',')]
        # `string=` is the current name of bs4's deprecated `text=` filter.
        abstract_anchor = paper.find('a', string='abs')
        pdf_anchor = paper.find('a', string='Download PDF')
        
        # Both links are optional; keep None when an anchor is missing.
        abstract_url = abstract_anchor['href'] if abstract_anchor is not None else None
        paper_url = pdf_anchor['href'] if pdf_anchor is not None else None
        
        # Get abstract (one extra request per paper that has an 'abs' link).
        abstract = _fetch_abstract(abstract_url) if abstract_url else None
        
        # Generate an ID for each paper.
        paper_id = _paper_id(abstract_url, paper_url)
        if paper_id is None:
            print(f"Skipping {title.strip()!r}: no abstract or PDF link to derive an id from.")
            continue
        
        # Populate the paper_dict
        paper_dict = {
            'id': paper_id,
            'conf_name': conf_name,
            'year': conf_year,
            'link': abstract_url,
            'pdf': paper_url,
            'type': "None",
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'embedding_sci_bert': "None",
            'embedding_sent_bert': "None",
            'embedding_specter': "None",
            'img_large': "None",
            'img_small': "None"
            }
        
        conf_list.append(paper_dict)
    
    # Create Directory to save extracted data.
    # os.path.join works with or without a trailing separator on save_dir;
    # makedirs also creates missing parents and tolerates an existing dir.
    save_dir = os.path.join(save_dir, conf_name)
    os.makedirs(save_dir, exist_ok=True)
    
    # Dump Extracted JSON to save directory
    json_dump = os.path.join(save_dir, conf_name + "_" + conf_year + ".json")
    print(f"Dumping json to {json_dump}")
    with open(json_dump, 'w') as file:
        json.dump(conf_list, file)
        
    return conf_list

In [3]:
# Root output directory shared by all parse_pmlr calls in this notebook.
save_dir = "data/"

# Scrape ICML 2019 down to 2015, newest first.
# Each call returns the list of paper dicts and also writes a JSON dump.
icml_19_json = parse_pmlr(conference="icml2019", save_dir=save_dir)
icml_18_json = parse_pmlr(conference="icml2018", save_dir=save_dir)
icml_17_json = parse_pmlr(conference="icml2017", save_dir=save_dir)
icml_16_json = parse_pmlr(conference="icml2016", save_dir=save_dir)
icml_15_json = parse_pmlr(conference="icml2015", save_dir=save_dir)

Found 773 papers.
Dumping json to data/ICML/ICML_2019.json
Found 621 papers.
Dumping json to data/ICML/ICML_2018.json
Found 434 papers.
Dumping json to data/ICML/ICML_2017.json
Found 322 papers.
Dumping json to data/ICML/ICML_2016.json
Found 270 papers.
Dumping json to data/ICML/ICML_2015.json


In [5]:
# Scrape AISTATS 2020 down to 2015, newest first, into the same save_dir
# that the ICML cell defines.
aistats_20_json = parse_pmlr(conference="aistats2020", save_dir=save_dir)
aistats_19_json = parse_pmlr(conference="aistats2019", save_dir=save_dir)
aistats_18_json = parse_pmlr(conference="aistats2018", save_dir=save_dir)
aistats_17_json = parse_pmlr(conference="aistats2017", save_dir=save_dir)
aistats_16_json = parse_pmlr(conference="aistats2016", save_dir=save_dir)
aistats_15_json = parse_pmlr(conference="aistats2015", save_dir=save_dir)

Found 423 papers.
Dumping json to data/AISTATS/AISTATS_2020.json
Found 360 papers.
Dumping json to data/AISTATS/AISTATS_2019.json
Found 216 papers.
Dumping json to data/AISTATS/AISTATS_2018.json
Found 167 papers.
Dumping json to data/AISTATS/AISTATS_2017.json
Found 164 papers.
Dumping json to data/AISTATS/AISTATS_2016.json
Found 126 papers.
Dumping json to data/AISTATS/AISTATS_2015.json
