In [28]:
import json

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [29]:
# Relative location of the ChromeDriver executable (Windows-style separators).
PATH = r"..\chromedriver-win64\chromedriver.exe"

In [30]:
# Browser options: Chrome runs headless (no visible window).
# Comment out the add_argument line to watch the browser while it works.
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# Launch Chrome through the chromedriver binary located at PATH.
service = Service(PATH)
driver = webdriver.Chrome(service=service, options=options)

# Two explicit waits polling every 0.2 s:
#   wait         - up to 10 seconds, for ordinary element look-ups
#   wait_loading - up to 30 seconds, for the slower loading-screen checks
wait = WebDriverWait(driver, 10, poll_frequency=0.2)
wait_loading = WebDriverWait(driver, 30, poll_frequency=0.2)

# Open the review listing page.
start_url = "https://www.cochranelibrary.com/cdsr/reviews"
driver.get(start_url)

# Block until the browser reports the document as fully loaded.
# Kept as the cell's last expression so its result is displayed.
wait.until(
    lambda browser: browser.execute_script('return document.readyState') == 'complete'
)

True

In [31]:
def wait_until_loading_screen_disappears():
    """Block until the element at the hard-coded XPath is no longer visible.

    Uses the module-level ``wait_loading`` explicit wait (30 second maximum).
    The XPath is absolute, so it depends on the page's exact DOM layout;
    presumably it targets the site's loading overlay - verify if the page
    structure changes.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element is still
            visible when ``wait_loading`` times out.
    """
    loading_locator = (
        By.XPATH,
        "/html/body/div[2]/div[4]/div[1]/div[2]/div/div/div/div[2]",
    )
    wait_loading.until(EC.invisibility_of_element_located(loading_locator))

# Configure how many pages to scrape
MAX_PAGES = 20
current_page = 1

# Collected results: one {"title": ..., "full_url": ...} dict per review link
all_reviews = []

# All browser interaction runs inside try/finally so that the Chrome process
# is always shut down, even when an unguarded wait times out or a click fails.
# NOTE: `driver` is created in an earlier cell and quit here, so re-running
# this cell requires re-running the driver set-up cell first.
try:
    # Close cookie popup (best effort: the dialog does not always appear)
    try:
        close_cookie = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.osano-cm-dialog__close"))
        )
        close_cookie.click()
        print("Cookie popup closed.")
    except TimeoutException:
        print("No cookie popup appeared.")

    # The listing controls must exist before they can be changed
    wait.until(EC.presence_of_element_located((By.ID, "resultPerPage")))

    # Set 'Order by' to Relevancy (best effort: scraping continues on failure)
    try:
        order_styled = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#orderBy + .select-styled")))
        order_styled.click()

        relevancy_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[@class='select-options']/li[@rel='relevancy']")))
        relevancy_option.click()
        print("Set order to Relevancy.")
        wait_until_loading_screen_disappears()
    except Exception as e:
        print("Couldn't set order by:", e)

    # Set 'Results per page' to 100 (best effort: scraping continues on failure)
    try:
        per_page_styled = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#resultPerPage + .select-styled")))
        per_page_styled.click()

        option_100 = wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[@class='select-options']/li[@rel='100']")))
        option_100.click()
        print("Set results per page to 100.")
        wait_until_loading_screen_disappears()
    except Exception as e:
        print("Couldn't set results per page:", e)

    # Main scraping loop
    while True:
        print(f"Scraping page {current_page}...")

        # Keep the list so its first element can later serve as a marker
        # for "the old page has been replaced".
        result_links = driver.find_elements(By.CSS_SELECTOR, "h3.result-title a")
        for item in result_links:
            try:
                title = item.text.strip()
                relative_url = item.get_attribute("href")
                full_url = relative_url if relative_url.startswith("http") else "https://www.cochranelibrary.com" + relative_url
                all_reviews.append({"title": title, "full_url": full_url})
            except Exception as e:
                print("Error parsing item:", e)

        current_page += 1
        if current_page > MAX_PAGES:
            break

        # Try to go to next page
        print(f'Trying to go to the next page {current_page}...')

        # find_elements returns an empty list instead of raising, which lets
        # "there is no next page" be told apart from a genuine failure.
        next_buttons = driver.find_elements(By.CSS_SELECTOR, "div.pagination-next-link a")
        if not next_buttons:
            print("No more pages.")
            break

        try:
            next_buttons[0].click()

            # The previous page's results are still in the DOM right after the
            # click, so a plain presence check would succeed immediately.
            # Wait for an old result link to go stale first, so the next
            # iteration cannot re-scrape the page that was just processed.
            if result_links:
                wait_loading.until(EC.staleness_of(result_links[0]))

            # Wait for the new page's results to load
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.search-results-item-body")))
            wait_until_loading_screen_disappears()
        except Exception as e:
            print("Failed to load next page:", e)
            break
finally:
    # Always release the browser and chromedriver processes
    driver.quit()

# Output results
for review in all_reviews:
    title, url = review['title'], review['full_url']
    print(f"{title}\n{url}\n")

Cookie popup closed.
Set order to Relevancy.
Set results per page to 100.
Scraping page 1...
Trying to go to the next page 2...
Scraping page 2...
Trying to go to the next page 3...
Scraping page 3...
Trying to go to the next page 4...
Scraping page 4...
Trying to go to the next page 5...
Scraping page 5...
Trying to go to the next page 6...
Scraping page 6...
Trying to go to the next page 7...
Scraping page 7...
Trying to go to the next page 8...
Scraping page 8...
Trying to go to the next page 9...
Scraping page 9...
Trying to go to the next page 10...
Scraping page 10...
Trying to go to the next page 11...
Scraping page 11...
Trying to go to the next page 12...
Scraping page 12...
Trying to go to the next page 13...
Scraping page 13...
Trying to go to the next page 14...
Scraping page 14...
Trying to go to the next page 15...
Scraping page 15...
Trying to go to the next page 16...
Scraping page 16...
Trying to go to the next page 17...
Scraping page 17...
Trying to go to the next pa

In [32]:
# Report how many reviews were collected
review_count = len(all_reviews)
print(f"Scraped {review_count} reviews.")

# Persist the reviews as UTF-8 JSON: 4-space indent, keys sorted,
# non-ASCII characters written as-is rather than escaped
output_path = "../data/cochrane_reviews.json"
with open(output_path, mode="w", encoding="utf-8") as json_file:
    json.dump(
        all_reviews,
        json_file,
        ensure_ascii=False,
        indent=4,
        sort_keys=True,
    )

print("Saved reviews in data to cochrane_reviews.json")

Scraped 2000 reviews.
Saved reviews in data to cochrane_reviews.json
