In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
import time
from bs4 import BeautifulSoup
import json


In [23]:
# Load the list of reviews to scrape BEFORE starting the browser: if the input
# file is missing or is not valid JSON, this raises while no Chrome process
# exists yet, so nothing is left running. The path is relative to the current
# working directory.
with open("../data/cochrane_reviews.json", "r", encoding="utf-8") as f:
    reviews = json.load(f)

# Accumulators filled by the scraping loop below:
#   results - one dict of parsed fields per successfully processed page
#   errors  - one {"url", "error"} dict per page that raised
results = []
errors = []

# Start Chrome through the chromedriver binary at PATH (relative,
# Windows-style path).
PATH = "..\\chromedriver-win64\\chromedriver.exe"
service = Service(PATH)
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # enable to run without a browser window
driver = webdriver.Chrome(service=service, options=options)
# Explicit-wait helper bound to this driver with a 10-second timeout; it is
# not used by the code visible in this cell.
wait = WebDriverWait(driver, 10)

# Visit every review URL, parse title / authors / abstract / PICO terms out of
# the rendered page, then write the parsed records and the per-page errors to
# JSON files.
#
# The whole run sits inside try/finally so the Chrome session is always shut
# down, even when the loop is interrupted (e.g. Ctrl-C) or an output file
# cannot be written.
try:
    for i, review in enumerate(reviews):
        # Index 1 of each entry is used as the page URL. Guard the lookup so a
        # single malformed entry is recorded and skipped instead of aborting
        # the whole run.
        try:
            start_url = review[1]
        except (IndexError, KeyError, TypeError) as e:
            print(f"Error processing review {i}: malformed entry\n{e!r}")
            errors.append({"url": None, "error": f"{type(e).__name__}: {e}"})
            continue

        try:
            driver.get(start_url)
            # Fixed pause after navigation, kept from the original flow.
            # NOTE(review): presumably lets client-side rendering finish
            # before the DOM snapshot is taken - confirm.
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Title (None when the heading is absent)
            title_tag = soup.find("h1", class_="publication-title")
            title = title_tag.get_text(strip=True) if title_tag else None

            # Authors
            authors = [
                tag.get_text(strip=True)
                for tag in soup.select("ul.authors li.author a")
            ]

            # Abstract (by section): heading text -> section body text.
            # All <p> elements of a section are joined; taking only the first
            # one would silently drop the rest of a multi-paragraph section.
            abstract = {}
            for section in soup.select("div.abstract section"):
                heading = section.find("h3", class_="title")
                paragraphs = section.find_all("p")
                if heading is not None and paragraphs:
                    abstract[heading.get_text(strip=True)] = " ".join(
                        p.get_text(" ", strip=True) for p in paragraphs
                    )

            # PICO elements: column header -> list of terms
            pico = {}
            pico_container = soup.find("div", class_="pico-table")
            if pico_container:
                for column in pico_container.find_all("div", class_="pico-column"):
                    header = column.find("h6")
                    if header:
                        # Keep only the text before "(" (drops the count suffix).
                        key = header.get_text(strip=True).split("(")[0].strip()
                        pico[key] = [
                            li.get_text(strip=True)
                            for li in column.select("ul.pico-terms li")
                        ]

            # Collect result
            results.append({
                "title": title,
                "authors": authors,
                "abstract": abstract,
                "pico": pico,
                "source_url": start_url,
            })

        except Exception as e:
            # Per-page boundary: record the failure and move on to the next URL.
            print(f"Error processing review {i}: {start_url}\n{e}")
            errors.append({"url": start_url, "error": str(e)})

    # Write results
    with open("../data/cochrane_parsed.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    with open("../data/cochrane_errors.json", "w", encoding="utf-8") as f:
        json.dump(errors, f, indent=2, ensure_ascii=False)
finally:
    # Cleanup: always end the browser session and its chromedriver process.
    driver.quit()