In [1]:
import json
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.common import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# --- Scraping limits -------------------------------------------------------
# Page-load attempts per query before giving up on getting more than one card.
MAX_RETRIES = 5
# Upper bound on the number of articles stored per query.
MAX_ARTICLES = 4

# --- Headless Chrome session -----------------------------------------------
# Location of the chromedriver binary, relative to the notebook's working directory.
PATH = "..\\chromedriver-win64\\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without opening a browser window

service = Service(PATH)
driver = webdriver.Chrome(options=options, service=service)
# NOTE(review): no visible cell calls driver.quit(); confirm the browser is shut down elsewhere.

# Explicit wait shared by the scraping cell: up to 10 s, polling every 0.2 s.
wait = WebDriverWait(driver, timeout=10, poll_frequency=0.2)

In [3]:
# Search terms submitted one by one to the literature search page.
# Order matters only for the progress output; values are used verbatim as queries.
medicine_queries = [
    # molecular / cellular biology
    "DNA", "RNA", "protein", "cell", "gene", "genome", "mutation",
    # pathogens and immunity
    "virus", "bacteria", "fungus", "immune system", "antibody", "antigen", "vaccine",
    # clinical practice
    "disease", "diagnosis", "treatment", "therapy", "clinical trial",
    # medical and life-science disciplines
    "pharmacology", "neuroscience", "cardiology", "oncology", "endocrinology",
    "immunology", "microbiology", "virology", "anatomy", "physiology", "pathology",
    "biochemistry", "biophysics", "bioinformatics",
    # computation and data
    "algorithm", "data", "model", "simulation", "analysis", "statistics",
    # research process
    "research", "experiment", "hypothesis", "theory", "scientific method",
    "publication", "peer review", "conference", "journal", "ethics",
    "innovation", "technology",
]

In [4]:
# Results of the scrape: one dict per stored article, and one dict per failed query.
all_articles = []
errors = []

# Search endpoint; `{query}` is filled with the percent-encoded search term.
SEARCH_URL = 'https://citation-screening.ec.tuwien.ac.at/search?search_query={query}&source=reformulate_search'
# Number of leading characters dropped from the abstract paragraph text.
# Presumably the length of a label such as "Abstract: " — TODO confirm against the page.
ABSTRACT_LABEL_LEN = 10


def _load_result_cards(query):
    """Open the search page for `query` and return its `article.card` elements.

    The page is reloaded up to MAX_RETRIES times until more than one card is
    present (the first card is a wikipedia article, so a single card means no
    usable result yet). A timeout while waiting for the first card counts as a
    failed attempt and is retried instead of aborting the query immediately.

    Raises:
        TimeoutException: the last attempt timed out and no cards were found.
        RuntimeError: no cards were found although no attempt timed out.
    """
    # Percent-encode the term so spaces and reserved characters (&, #, +, ...)
    # cannot break or truncate the query string.
    url = SEARCH_URL.format(query=quote(query, safe=''))

    cards = []
    last_timeout = None
    for _ in range(MAX_RETRIES):
        driver.get(url)
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "article.card")))
        except TimeoutException as exc:
            # Forget cards from an earlier load; they belong to a replaced page.
            cards, last_timeout = [], exc
            continue

        cards = driver.find_elements(By.CSS_SELECTOR, "article.card")

        # cards[0] is a wikipedia article
        if len(cards) > 1:
            break

    if not cards:
        if last_timeout is not None:
            raise last_timeout
        raise RuntimeError(f"no result cards found for query {query!r}")
    return cards


def _click_if_present(card, selector):
    """Click the element matching `selector` inside `card`, if there is one.

    A missing element is not an error; the card simply has nothing to expand.
    After a click, pause briefly so the page can update before text is read.
    """
    try:
        card.find_element(By.CSS_SELECTOR, selector).click()
    except NoSuchElementException:
        return
    time.sleep(0.5)


def _parse_card(card):
    """Extract title, authors, abstract info and source URL from one result card.

    Returns a dict with keys "title", "authors", "abstract" and "source_url".
    "abstract" is itself a dict that may contain "Abstract" and/or
    "CSO Keywords"; it is empty when the card offers neither.
    """
    title_link = card.find_element(By.CSS_SELECTOR, ".card-content .title a")
    title = title_link.text
    source_url = title_link.get_attribute("href")

    authors = card.find_element(By.CSS_SELECTOR, ".card-content .subtitle").text.split(', ')

    abstract = {}

    # Expand the abstract when an expand button exists, then read the paragraph.
    _click_if_present(card, "p.is-5 button.card__show-more")
    abstract_text = card.find_element(By.CSS_SELECTOR, "div.content p.is-5").text[ABSTRACT_LABEL_LEN:]
    if abstract_text != 'No abstract available':
        abstract["Abstract"] = abstract_text

    # Same treatment for the keyword list: a card without the expand button
    # is handled like a card without an abstract button instead of failing.
    _click_if_present(card, "h6 button.card__show-more")
    keyword_links = card.find_elements(By.CSS_SELECTOR, "a.button")
    if keyword_links:
        abstract["CSO Keywords"] = ', '.join(link.text for link in keyword_links)

    return {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "source_url": source_url,
    }


for i, query in enumerate(medicine_queries):
    try:
        stored = 0
        # Skip the leading wikipedia card; keep at most MAX_ARTICLES useful ones.
        for card in _load_result_cards(query)[1:]:
            if stored >= MAX_ARTICLES:
                break

            record = _parse_card(card)
            # Only cards with an abstract and/or keywords are worth storing.
            if record["abstract"]:
                all_articles.append(record)
                stored += 1

        print(f'Done: {query}')

    except Exception as e:
        # Record the failure and move on to the next query; articles already
        # stored for this query are kept.
        print(f"Error processing review {i}: {query}\n{e}")
        errors.append({"query": query, "error": str(e)})

Done: DNA
Done: RNA
Done: protein
Done: cell
Done: gene
Done: genome
Done: mutation
Done: virus
Done: bacteria
Done: fungus
Done: immune system
Done: antibody
Done: antigen
Done: vaccine
Done: disease
Done: diagnosis
Done: treatment
Done: therapy
Done: clinical trial
Done: pharmacology
Done: neuroscience
Done: cardiology
Done: oncology
Done: endocrinology
Done: immunology
Done: microbiology
Done: virology
Done: anatomy
Done: physiology
Done: pathology
Done: biochemistry
Done: biophysics
Done: bioinformatics
Done: algorithm
Done: data
Done: model
Done: simulation
Done: analysis
Done: statistics
Done: research
Done: experiment
Done: hypothesis
Done: theory
Done: scientific method
Done: publication
Done: peer review
Done: conference
Done: journal
Done: ethics
Done: innovation
Done: technology


In [5]:
# Persist both result lists as pretty-printed UTF-8 JSON, one file each:
# the stored articles and the per-query failures.
outputs = {
    "../data/cruise_literature/cruise_literature_parsed.json": all_articles,
    "../data/cruise_literature/cruise_literature_errors.json": errors,
}

for target_path, payload in outputs.items():
    with open(target_path, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII text readable; sort_keys gives stable output.
        json.dump(payload, out_file, indent=2, ensure_ascii=False, sort_keys=True)