In [2]:
!apt-get update > /dev/null
!apt-get install -y chromium-chromedriver > /dev/null
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium beautifulsoup4 > /dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json
import time

# List of paper URLs to scrape
links = [
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/668173",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/665047",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/669736",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/662273",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/660030",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/668861",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/669377",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/669431",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/670481",
    "https://aiche.confex.com/aiche/2023/meetingapp.cgi/Paper/668329"
    ]

options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

def get_text(soup, selector):
    elements = soup.select(selector)
    if not elements:
        return "Not found"
    return " | ".join([el.get_text(strip=True) for el in elements])

driver = webdriver.Chrome(options=options)
all_data = []

try:
    for url in links:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.any_of(
                EC.presence_of_element_located((By.CSS_SELECTOR, "section.titleContent")),
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.field_Abstract"))
            )
        )
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        data = {
            "url": url,
            "topic": get_text(soup, "p.favoriteItem"),
            "date_time": get_text(soup, 'span.defaultTZ'),
            "abstract": get_text(soup, 'section.field_Abstract'),
            "presenting_author": f"{get_text(soup, 'a.presenter')} | {get_text(soup, 'span.roleAffiliation')}"
        }
        all_data.append(data)
finally:
    driver.quit()

# Save to JSON
with open('aiche_papers.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)


In [3]:
from selenium.common.exceptions import TimeoutException

def scrape_day(driver, date):
    day_data = {"date": date, "sessions": []}
    url = f"https://aiche.confex.com/aiche/2023/meetingapp.cgi/Day/{date}"
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.sessionRow"))
            )
        except TimeoutException:
            print(f"⏩ No sessions found for {date}, skipping.")
            return None  # No sessions for this day
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        session_links = ["https://aiche.confex.com" + a['href']
                         for a in soup.select('div.sessionRow a[href*="/Session/"]')]
        # ... continue as before
        # ...
        return day_data
    except Exception as e:
        print(f"⚠️ Day error: {date} | {str(e)}")
        return None
