# Find accepted papers since 2019 from NeurIPS, ICLR, ICML
To run, set the following environment variables to your OpenReview login credentials:
- `OPENREVIEW_USERNAME`
- `OPENREVIEW_PASSWORD`

In [1]:
import re
import os
from collections import defaultdict
import requests
import time
from bs4 import BeautifulSoup
import pickle

import openreview
from tqdm.auto import tqdm
from dataclasses import dataclass

In [2]:
# OpenReview API client used for the ICLR queries further down. Credentials
# come from the environment; an unset variable is passed through as None.
client = openreview.Client(
    baseurl="https://api.openreview.net",
    username=os.getenv("OPENREVIEW_USERNAME"),
    password=os.getenv("OPENREVIEW_PASSWORD"),
)

In [3]:
@dataclass
class Paper:
    """Metadata record for a single accepted conference paper."""
    # Lowercase conference tag, e.g. "neurips" or "iclr".
    conference: str
    # Abstract text as extracted from the paper page / submission note.
    abstract: str
    # Author names; a list of stripped strings when scraped from HTML.
    authors: list
    # Paper title.
    title: str
    # URL of the paper's landing page (proceedings page or OpenReview forum).
    url: str

In [4]:
# Limit on papers from each conference to prevent too much scraping.
# Lower this value for a quick trial run.
LIMIT = 1_000_000

In [5]:
# Accumulates one Paper record per accepted paper, across all conferences.
all_papers = []

## Title / authors / abstract from the last 2 NeurIPS

In [6]:
# Start by getting all the URLs to the paper homepage: one proceedings index
# page is fetched per year and every paper link on it is collected.
year_list = [2019, 2020]
paper_urls = []
for year in year_list:

    # Read page URL. The timeout keeps a stalled connection from hanging the
    # notebook, and raise_for_status() stops us from parsing an HTTP error
    # page as if it were the paper index.
    home_url = f"https://proceedings.neurips.cc/paper/{year}"
    response = requests.get(home_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the list of papers. The scraper relies on the page holding exactly
    # two <ul> elements and reads the second one; fail loudly with a useful
    # message if the page layout is different.
    ul_list = soup.find_all("ul")  # all ul items
    assert len(ul_list) == 2, (
        f"expected 2 <ul> elements on {home_url}, found {len(ul_list)}"
    )
    ul = ul_list[1]
    for li in ul.find_all("li"):
        # The href of the first link in each item is joined onto the site
        # host to form the full paper URL.
        paper_homepage = li.find("a")["href"]
        paper_urls.append("https://proceedings.neurips.cc" + paper_homepage)

# Drop the loop temporaries so they do not linger in the notebook namespace.
del year, home_url, response, ul_list, ul, li, paper_homepage

In [7]:
# Download every NeurIPS paper page and record its title, authors and abstract.
for paper_homepage in tqdm(paper_urls[:LIMIT]):
    # Bound the wait per request and refuse to parse HTTP error pages, which
    # would otherwise be scraped into bogus Paper records.
    response = requests.get(paper_homepage, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")

    # The second <p> on the page is read as a comma-separated author list.
    authors = [name.strip() for name in paragraphs[1].text.split(",")]

    # The abstract is taken from the last <p>, the title from the first <h4>.
    abstract = paragraphs[-1].text
    title = soup.find_all("h4")[0].text
    all_papers.append(
        Paper(
            conference="neurips",
            abstract=abstract,
            authors=authors,
            title=title,
            url=paper_homepage,
        )
    )

# The URL list is rebuilt from scratch by the next conference's cells.
del response, paper_homepage, paper_urls

  0%|          | 0/3326 [00:00<?, ?it/s]

## Last 3 ICML

In [8]:
# Start by getting all the URLs to the paper homepage: one proceedings volume
# index is fetched per edition and every paper link on it is collected.
edition_list = ["v97", "v119", "v139"]
paper_urls = []
for edition in edition_list:

    # Read page URL. The timeout keeps a stalled connection from hanging the
    # notebook, and raise_for_status() stops us from parsing an HTTP error
    # page as if it were the volume index.
    home_url = f"https://proceedings.mlr.press/{edition}"
    response = requests.get(home_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the list of papers: each one sits in a <div class="paper">, and
    # the href of the first link inside it is used as-is as the paper URL.
    paper_divs = soup.find_all("div", attrs={"class": "paper"})
    for paper in paper_divs:
        paper_urls.append(paper.find("a")["href"])

# Drop the loop temporaries so they do not linger in the notebook namespace.
del edition, home_url, response, paper_divs, paper

In [9]:
# Download every ICML paper page and record its title, authors and abstract.
for paper_homepage in tqdm(paper_urls[:LIMIT]):
    # Bound the wait per request and refuse to parse HTTP error pages, which
    # would otherwise be scraped into bogus Paper records.
    response = requests.get(paper_homepage, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # paper info: authors come from <span class="authors"> (comma-separated),
    # the abstract from <div class="abstract"> and the title from the first <h1>.
    authors = [name.strip() for name in soup.find("span", attrs={"class": "authors"}).text.split(",")]
    abstract = soup.find("div", attrs={"class": "abstract"}).text
    title = soup.find("h1").text
    # These pages are ICML proceedings, so tag the records as "icml"; tagging
    # them "neurips" would make the two conferences indistinguishable later.
    all_papers.append(
        Paper(
            conference="icml",
            abstract=abstract,
            authors=authors,
            title=title,
            url=paper_homepage,
        )
    )

# Drop the loop temporaries so they do not linger in the notebook namespace.
del response, paper_homepage, paper_urls, title, authors, abstract

  0%|          | 0/3040 [00:00<?, ?it/s]

## Last 3 ICLR

In [10]:
# Collect the accepted ICLR papers for each year from OpenReview.
# A submission counts as accepted when its meta review's `recommendation` or
# its decision note's `decision` contains "accept" (case-insensitive).
for year in [2019, 2020, 2021]:
    invitation_name = f"ICLR.cc/{year}/Conference/-/Blind_Submission"

    submissions = list(
        openreview.tools.iterget_notes(client, invitation=invitation_name)
    )

    # Both invitation patterns are queried for every year.
    # NOTE(review): presumably each year publishes only one of the two note
    # types -- confirm against the OpenReview venue configuration.
    meta_reviews = list(
        openreview.tools.iterget_notes(
            client,
            invitation=f'ICLR.cc/{year}/Conference/-/Paper.*/Meta_Review'
        )
    )

    decisions = list(
        openreview.tools.iterget_notes(
            client,
            invitation=f'ICLR.cc/{year}/Conference/Paper.*/Decision'
        )
    )

    # Index the notes by forum id once, so matching each submission is a dict
    # lookup instead of a scan over every note.
    meta_reviews_by_forum = defaultdict(list)
    for note in meta_reviews:
        meta_reviews_by_forum[note.forum].append(note)

    decisions_by_forum = defaultdict(list)
    for note in decisions:
        decisions_by_forum[note.forum].append(note)

    iclr_accepted = []
    for paper in submissions:

        _added_paper = False

        # Check acceptance by meta review
        matching_meta_reviews = meta_reviews_by_forum.get(paper.id, [])
        assert len(matching_meta_reviews) <= 1
        if matching_meta_reviews:
            if "accept" in matching_meta_reviews[0].content['recommendation'].lower():
                iclr_accepted.append(paper)
                _added_paper = True

        # Check acceptance by decisions
        matching_decisions = decisions_by_forum.get(paper.id, [])
        assert len(matching_decisions) <= 1
        if matching_decisions:
            if "accept" in matching_decisions[0].content['decision'].lower():
                assert not _added_paper  # no duplicate additions
                iclr_accepted.append(paper)
                _added_paper = True

        del _added_paper

    # Add papers
    for paper in iclr_accepted[:LIMIT]:
        all_papers.append(
            Paper(
                url=f"https://openreview.net/forum?id={paper.id}",
                authors=paper.content['authors'],
                title=paper.content['title'],
                conference="iclr",
                abstract=paper.content['abstract'],
            )
        )

# Drop the per-year working data so it does not linger in the notebook namespace.
del year, submissions, meta_reviews, decisions, iclr_accepted
del note, meta_reviews_by_forum, decisions_by_forum

In [11]:
# Display the total number of papers collected across all conferences.
len(all_papers)

8415

In [12]:
# Persist the collected Paper records to disk as a single pickle file.
# NOTE(review): pickle stores instances by reference to their class, so the
# loading code presumably needs a compatible `Paper` definition -- confirm.
output_path = "2019-2021-icml-iclr-neurips-papers.pkl"
with open(output_path, "wb") as pickle_file:
    pickle.dump(all_papers, pickle_file)