<a href="https://colab.research.google.com/github/AEGriffith/PhDUtilities/blob/main/ACM_Paper_Exctractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [155]:
#@title ACM Search Information
#@markdown Enter your search url:
search_url = "https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&ContentItemType=research-article&expand=dl&CCSAnd=60&AfterYear=2018&BeforeYear=2023&AllField=Fulltext%3A%28AI+Agent+%22Artificial+Intelligence%22%29+AND+Fulltext%3A%28Creativity%29+AND+Fulltext%3A%28Collab*+Support+Tool%29" #@param {type: "string"}
#@markdown Enter the first and last search year:
start_year = 2018 #@param {type: "integer"}
end_year = 2023 #@param {type: "integer"}
#@markdown Enter the filepath to save csv file (including csv name)
filepath = "/content/drive/MyDrive/Quals/df_example.csv" #@param {type: "string"} 


# Run All

## Setup

In [25]:
%%capture
# install chromium, its driver, and selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
# set options to be headless, ..
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

In [26]:
import urllib3
import pandas as pd

from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from collections import Counter
import re

In [28]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Functions

In [151]:
def modify_link(url, year):
    """
    Modify the link to increase page size and get number of pages.
    Update dates in url to allow for variable dates
    """
    # Set page size to 50
    if re.findall(r'pageSize=\d+', url):
      url = re.sub(r'pageSize=\d+', 'pageSize=50', url)
    else: 
      url = url + "&pageSize=50"
    # Set dates to equal variable dates of same year (so we can search one year at a time)
    url = re.sub(r'AfterYear=\d+', 'AfterYear={year}', url)
    url = re.sub(r'BeforeYear=\d+', 'BeforeYear={year}', url)
    # Finds the number of pages by dividing the number of search results by 50 (rounded up).
    driver.get(url.format(year=year))
    WebDriverWait(driver, 10)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    num_results = int((soup.find("span", {"class": "hitsLength"}).text).replace(",",""))
    num_pages: int = (num_results // 50) + 1
    # replace startPage=0 with startPage={page}
    if re.findall(r'startPage=\d+', url):
      url = re.sub(r'startPage=\d+', 'startPage={page}', url)
    else:
      url = url + "&startPage={page}"
    return url, num_pages

In [152]:
def get_paper_info(search_url):
  """
  Get the links, titles, citation count, download count, and epub date

  Returns a dictionary
  """

  paper_dict = {"paper_doi": [], "paper_title": [], "paper_month": [], "paper_year":[], "citation_count": [], "download_count": []}
  paper_urls = []

  for year in range(start_year, end_year+1):
    
    page_url, num_pages = modify_link(search_url, year)

    for page in range(num_pages):
      driver.get(page_url.format(page=page, year=year))
      WebDriverWait(driver, 10)
      html = driver.page_source
      soup = BeautifulSoup(html, "html.parser")

      # title and doi
      title_spans = soup.find_all("span", {"class": "hlFld-Title"})
      print("title_spans: ", len(title_spans))
      for title_span in title_spans:
        paper_url = title_span.find("a", href=True)
        if paper_url:
          paper_url = urllib3.util.url.parse_url(paper_url["href"]).url
          paper_urls.append(f'https://dlc.acm.org/{paper_url}')
        paper_title = title_span.find("a").text
        paper_dict["paper_title"].append(paper_title)
        paper_dict["paper_doi"].append(paper_url)

      # citation and download counts
      metrics = soup.find_all("li", {"class": "metric-holder"})
      for metric in metrics:
        paper_citation = metric.find("div", {"class": "citation"})
        # for citation in paper_citation:
        if paper_citation:
          citation = paper_citation.text
          citation = citation.replace("Total Citations", "")
          citation = citation.replace(",", "")
          citation = citation.replace(" ", "")
          paper_dict["citation_count"].append(int(citation))
        else:
          paper_dict["citation_count"].append(0)
      
        paper_download = metric.find("div", {"class": "metric"})
      # print("downloads: ", len(paper_downloads))
      # for download in paper_downloads:
        if paper_download:
          download = paper_download.text
          download = download.replace("Total Downloads", "")
          download = download.replace(",", "")
          download = download.replace(" ", "")
          paper_dict["download_count"].append(int(download))
        else:
          paper_dict["download_count"].append(0)

      # paper dates
      paper_dates = soup.find_all("div", {"class": "bookPubDate"})
      for date in paper_dates:
        date = date.text
        # get year from date
        month, year = date.split(" ")
        paper_dict["paper_month"].append(month)
        paper_dict["paper_year"].append(year)
  return paper_dict

In [None]:
driver = webdriver.Chrome('chromedriver',options=options)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Get paper lists and information and put it into a dataframe
paper_info = get_paper_info(search_url)
df = pd.DataFrame(paper_info)
df.to_csv(filepath)

driver.quit()

# Specific processing for my Qualifying Exam

In [150]:
kept_papers_count = 0
df.to_csv()
for year in range(start_year, end_year):
  df_by_year = df[df['paper_year'].astype(int)==year]
  avg_downloads = df_by_year.loc[:, 'download_count'].mean()
  df_keep = df_by_year[(df_by_year['download_count'] >= avg_downloads)]
  kept_papers_count += len(df_keep)
  df_lose = df_by_year[(df_by_year['download_count'] < avg_downloads)]
  df_keep.to_csv(f"/content/drive/MyDrive/Quals/df_keep_{year}.csv")
  df_lose.to_csv(f"/content/drive/MyDrive/Quals/df_lose_{year}.csv")

print(kept_papers_count)

812
