### Scraping Peer Review Data for the ICLR Conference from OpenReview
- This notebook creates a web crawler to extract all peer review data for the ICLR conference from OpenReview.
- The data is formatted as CSV and saved in batches to data/raw_peer_review_data/{year}_batch_{n}.csv.
- The rest of the cleaning, which is different for each year of the conference, is done in cleaning_peer_review_data.ipynb.

In [14]:
import pandas as pd
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup

import re


def get_page_source(
    driver, wait_condition=(By.CSS_SELECTOR, "div.forum-container div.note_contents")
):
    """
    Block until a target element exists, then parse the current page.

    Args:
        driver: Selenium WebDriver instance whose current page is read.
        wait_condition: (locator strategy, locator value) tuple identifying the
            element that must be present before the page is parsed.

    Returns:
        BeautifulSoup object built from the driver's page source with the "lxml" parser.
    """
    element_is_present = EC.presence_of_element_located(wait_condition)
    # Wait up to 10 seconds for the element to appear in the DOM.
    WebDriverWait(driver, 10).until(element_is_present)
    return BeautifulSoup(driver.page_source, "lxml")


def get_paper_data(page_soup, paper_id):
    """
    Extracts paper data from the page source.

    Missing optional pieces (author links, dates, paper type, note fields) are
    reported with a printed message and left out / set to None instead of
    aborting the extraction.

    Args:
        page_soup: BeautifulSoup object representing the page source.
        paper_id: ID of the paper (used in the result and in printed messages).

    Returns:
        Dictionary containing "Paper ID", "Paper name", one "Author {i} name" /
        "Author {i} id" pair per extracted author, "Date published",
        "Date last modified", "Type of paper", and one entry per note field
        found on the page.

    Raises:
        AttributeError: If the forum container, the paper note or the paper
            title element is missing from the page.
    """
    soup = page_soup.find("div", class_="forum-container").find("div", class_="note")
    paper_name = soup.find("h2", class_="note_content_title").text.strip()

    # Extract authors and the identifier embedded in each profile link.
    # Authors are numbered consecutively in the order they are successfully
    # extracted, so a single malformed link does not discard the others.
    author_data = {}
    author_number = 1
    for author in soup.find_all("a", class_="profile-link"):
        try:
            author_link = author["href"].split("=")[1]
        except (KeyError, IndexError):
            print("Authors not found for paper", paper_id)
            continue
        author_data[f"Author {author_number} name"] = author.text.strip()
        author_data[f"Author {author_number} id"] = author_link
        author_number += 1

    # Extract date published and date modified
    date_published, date_modified = None, None
    date_span = soup.find("span", class_="date")
    if date_span is None:
        print("Date not found for paper", paper_id)
    else:
        date_text = date_span.text
        labelled = re.search(r"Published: (.*?), Last Modified: (.*)", date_text)
        if labelled:
            date_published, date_modified = labelled.groups()
        else:
            # Fallback: pick up bare "DD Mon YYYY" dates.
            dates = re.findall(r"(\d{2} \w{3} \d{4})", date_text)
            if len(dates) == 2:
                date_published, date_modified = dates
            elif len(dates) == 1:
                # Only one date present: keep it as the publication date.
                date_published = dates[0]
            else:
                print("Date not found for paper", paper_id)

    # Extract type of paper (the <span> that follows the date span)
    type_of_paper = None
    type_span = date_span.find_next_sibling("span") if date_span is not None else None
    if type_span is None:
        print("Type of paper not found for paper", paper_id)
    else:
        type_of_paper = type_span.text.strip()

    # Extract keywords and other information from note contents
    additional_info = {}
    for note in soup.find_all("div", class_="note_contents"):
        field = note.find("span", class_="note_content_field")
        value = note.find("span", class_="note_content_value")
        if field is None or value is None:
            print("Additional info not found for paper", paper_id)
            continue
        additional_info[field.text.strip(": ")] = value.text.strip()

    paper_info = {
        "Paper ID": paper_id,
        "Paper name": paper_name,
        **author_data,
        "Date published": date_published,
        "Date last modified": date_modified,
        "Type of paper": type_of_paper,
        **additional_info,
    }

    return paper_info


def _click_and_throttle(driver, element):
    """Click ``element`` via JavaScript, then sleep so the step lasts at least TIME seconds."""
    start_time = time.time()
    driver.execute_script("arguments[0].click();", element)
    interval = time.time() - start_time
    if interval < TIME:
        time.sleep(TIME - interval)


def _extract_comment(soup, paper_id):
    """Build the data dictionary for a single (non-deleted) comment element."""
    title = soup.find("h2", class_="note_content_title").text.strip()
    subtitle = soup.find("div", class_="meta_row").span.text.strip()

    date_submitted, date_modified, review_type = None, None, None
    date_span = soup.find("span", class_="date")
    if date_span is None:
        print(f"Date not found for comment {title} on paper {paper_id}")
    else:
        dates = re.findall(r"(\d{2} \w{3} \d{4})", date_span.text.strip())
        if len(dates) == 2:
            date_submitted, date_modified = dates
        elif len(dates) == 1:
            date_submitted = dates[0]
        # The review type is the <span> that follows the date span.
        type_span = date_span.find_next_sibling("span")
        if type_span is not None:
            review_type = type_span.text.strip()

    # Extracting all note_contents (i.e., comments) with names and content.
    # NOTE(review): find_all searches every descendant, so if replies are nested
    # inside this element their fields would be picked up too — confirm against
    # the page structure.
    content_dict = {}
    for content in soup.find_all("div", class_="note_contents"):
        field = content.find("span", class_="note_content_field")
        value = content.find("span", class_="note_content_value")
        if field is None or value is None:
            print(f"Content field not found for comment {title} on paper {paper_id}")
            continue
        content_dict[field.text.strip(": ")] = value.text.strip()

    return {
        "Title": title,
        "Authors": subtitle,
        "Date Submitted": date_submitted,
        "Date Modified": date_modified,
        "Review Type": review_type,
        **content_dict,
    }


def get_comment_data(page_soup, driver, paper_id, in_subcomments=False):
    """
    Extracts comment data from the page source.

    For every comment that exposes a "view more replies" link, the link is
    clicked, the re-rendered page is parsed, and the function recurses with
    in_subcomments=True; the replies found there are appended to the result.
    Clicks are throttled using the module-level TIME value.

    Args:
        page_soup: BeautifulSoup object representing the page source.
        driver: Selenium WebDriver instance.
        paper_id: ID of the paper.
        in_subcomments: Boolean indicating whether the function is currently extracting subcomments.

    Returns:
        List of dictionaries containing the extracted comment data. Each has the
        keys "Title", "Authors", "Date Submitted", "Date Modified",
        "Review Type", plus one entry per content field of the comment.

    Raises:
        AttributeError: If the forum container, the comment list, or a
            comment's title / meta row is missing from the page.
    """
    comments = (
        page_soup.find("div", class_="forum-container")
        .find("div", id="note_children")
        .find_all("div", class_="note_with_children")
    )

    if in_subcomments:
        # Skip the first entry of the expanded view — presumably the parent
        # comment, which the calling level has already recorded.
        comments = comments[1:]

    comment_data = []
    for soup in comments:
        if soup.find("div", class_="meta_row").text.strip() != "[Deleted]":
            comment_data.append(_extract_comment(soup, paper_id))

        try:
            more_link = soup.select_one(":scope > .view-more-replies-container a")
            note_id = more_link.get("data-note-id") if more_link is not None else None
            if note_id is None:
                # No hidden replies under this comment.
                continue
            view_more_replies = driver.find_elements(
                By.CSS_SELECTOR, "a[data-note-id='" + note_id + "']"
            )
            if view_more_replies:
                _click_and_throttle(driver, view_more_replies[0])
                sub_soup = get_page_source(driver)
                # Extend (not replace) so comments gathered so far are kept.
                comment_data.extend(get_comment_data(sub_soup, driver, paper_id, True))
        except Exception as exc:
            # Best effort: keep what was collected, but do not hide the failure.
            print(f"Could not expand replies on paper {paper_id}: {exc!r}")

    if in_subcomments:  # Click back to the parent comment
        button = driver.find_element(
            By.CSS_SELECTOR, ".forum-container .view-all-replies-container button"
        )
        _click_and_throttle(driver, button)
        # Wait for the page to render again before returning to the caller.
        get_page_source(driver)

    return comment_data


def txt_to_list(filename):
    """
    Load a text file as a list with one whitespace-stripped entry per line.

    Args:
        filename: Name of the text file to read.

    Returns:
        List of strings, one per line of the file, with leading and trailing
        whitespace (including the newline) removed.
    """
    with open(filename) as handle:
        return [line.strip() for line in handle]


def list_to_txt(filename, id_list):
    """
    Save a list of IDs to a text file, one ID per line.

    The file is opened in write mode, so existing content is replaced.

    Args:
        filename: Name of the text file to write.
        id_list: List of IDs to write to the file.
    """
    with open(filename, "w") as out_file:
        out_file.writelines("%s\n" % entry for entry in id_list)


def format_df(dicts):
    """
    Formats the scraped data into a pandas DataFrame.

    The per-author columns ("Author 1 name", "Author 1 id", "Author 2 name", ...)
    are collapsed into two list-valued columns, "Author Names" and "Author IDs".
    Papers with fewer authors than the widest row only list the authors they
    actually have (missing values are left out rather than padded with NaN).

    Args:
        dicts: List of dictionaries containing the scraped data.

    Returns:
        Formatted pandas DataFrame without the per-author columns and with the
        added "Author Names" and "Author IDs" columns.
    """
    df = pd.DataFrame(dicts)
    author_column = re.compile(r"Author (\d+) (name|id)")

    def columns_for(kind):
        # Columns of the requested kind ("name" or "id"), ordered by author number.
        found = []
        for col in df.columns:
            match = author_column.fullmatch(str(col))
            if match and match.group(2) == kind:
                found.append((int(match.group(1)), col))
        return [col for _, col in sorted(found)]

    name_columns = columns_for("name")
    id_columns = columns_for("id")

    author_names = []
    author_ids = []
    for _, row in df.iterrows():
        # Every row carries every author column; rows with fewer authors hold
        # NaN there, so filter on the value rather than on column presence.
        author_names.append([row[col] for col in name_columns if pd.notna(row[col])])
        author_ids.append([row[col] for col in id_columns if pd.notna(row[col])])
    df["Author Names"] = author_names
    df["Author IDs"] = author_ids

    return df.drop(columns=name_columns + id_columns)


def scrape_page(paper_id):
    """
    Load one paper's forum page and collect its metadata and comments.

    Uses the module-level ``driver`` to open the page.

    Args:
        paper_id: ID of the paper to scrape.

    Returns:
        Dictionary with the paper data from get_paper_data, extended with
        "Comments" (the list from get_comment_data) and "Number of comments".
    """
    driver.get("https://openreview.net/forum?id=" + paper_id)
    forum_soup = get_page_source(driver)
    record = get_paper_data(forum_soup, paper_id)
    replies = get_comment_data(forum_soup, driver, paper_id)
    record.update({"Comments": replies, "Number of comments": len(replies)})
    return record

In [None]:
driver = webdriver.Safari()
# Minimum number of seconds each page load / click is stretched to, so that
# requests to the site are throttled.
TIME = 3.5

try:
    for year in range(2017, 2024):
        id_list = txt_to_list("data/id_data/" + str(year) + ".txt")
        print("Number of papers to scrape:", len(id_list))
        print("Scraping year", year)
        dicts = []
        batch_count = 1
        for i, paper_id in enumerate(tqdm(id_list), start=1):
            start_time = time.time()
            try:
                dicts.append(scrape_page(paper_id))
            except Exception as exc:
                # Skip this paper but keep going; KeyboardInterrupt is not
                # caught, so the run can still be stopped manually.
                print("Error occurred for paper", paper_id, "-", repr(exc))
                time.sleep(TIME)
            else:
                elapsed = time.time() - start_time
                if elapsed < TIME:
                    time.sleep(TIME - elapsed)

            # Save outside the try block so a failure on the 50th or the last
            # paper cannot skip writing the papers already collected.
            if dicts and (i % 50 == 0 or i == len(id_list)):
                df = format_df(dicts)
                df.to_csv(
                    f"data/raw_peer_review_data/{year}_batch_{batch_count}.csv",
                    index=False,
                )
                dicts = []
                batch_count += 1
        print("Finished scraping year", year)
finally:
    # Always close the browser session, even when the run is interrupted.
    driver.quit()