In [None]:
# NO NEED TO RUN THIS CELL AS THE DATA IS ALREADY COLLECTED AND SAVED UNDER articles.csv

# Data crawler that works like a human: it goes through the articles one by one and saves the titles, authors, abstracts and references to CSV files

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

import re
import pandas as pd
from tqdm import tqdm

# Navigate to the PubMed search results page
url = "https://pubmed.ncbi.nlm.nih.gov/?term=intelligence+%5BTitle%2Fabstract%5D&filter=simsearch1.fha&filter=years.2013-2023&sort=date&size=200"
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    driver.get(url)

    # Click the first article to start; the article view exposes the
    # navigation element whose text carries the total number of results
    driver.find_element(By.XPATH, "//a[@data-ga-action=1]").click()
    total_articles = driver.find_element(By.XPATH, "//*[@id='adjacent-navigation']/div[2]/a/span[1]/span[2]").text
except Exception:
    # Do not leave an orphaned browser behind when the setup fails
    driver.quit()
    raise

# Find the number of total articles: keep every digit and drop everything
# else (words, thousands separators). The previous pattern "[^3-9]" also
# removed the digits 0, 1 and 2 and therefore corrupted the count.
# NOTE(review): assumes the element text contains no number other than the total.
total_articles = re.sub(r"\D", '', total_articles)

# How many articles are collected before the data is flushed to a CSV file
chunk_size = 1000

# Accumulators for the scraped fields (one entry per article), plus the
# indices of pages whose "next" link could not be located.
# Each name is bound to its own independent empty list.
titles, authors, abstracts, references_list, not_found_pages = [], [], [], [], []

def _visible_text(by, value):
    """Return the text of the first matching element, or '' if it is missing or hidden.

    Always returning a string keeps WebElement objects out of the collected
    data when an element exists but is not displayed.
    """
    try:
        element = driver.find_element(by, value)
    except NoSuchElementException:
        return ''
    return element.text if element.is_displayed() else ''


def _collect_references():
    """Return the text of the article's reference list, or '' if there is none."""
    try:
        section = driver.find_element(By.ID, "references")
    except NoSuchElementException:
        return ''

    # Expand the list when a "show all" control is offered. Its absence must
    # not discard the references that are already on the page.
    # NOTE(review): presumably the control only exists for long lists - confirm.
    try:
        show_all_element = driver.find_element(By.CLASS_NAME, "show-all")
        if show_all_element.is_displayed():
            show_all_element.click()
    except NoSuchElementException:
        pass

    if not section.is_displayed():
        return ''
    try:
        return driver.find_element(By.CLASS_NAME, "references-list").text
    except NoSuchElementException:
        return ''


# Parse the total once instead of on every iteration
article_count = int(total_articles)

try:
    for page in tqdm(range(article_count)):
        # Extract title, authors, abstract and references of the article.
        # find_elements returns an empty list when nothing matches, so an
        # article without listed authors simply yields [].
        title = _visible_text(By.CLASS_NAME, "heading-title")
        author_list = [author.text for author in driver.find_elements(By.CLASS_NAME, "full-name")]
        abstract = _visible_text(By.ID, "eng-abstract")
        references = _collect_references()

        # Append data to lists
        titles.append(title)
        authors.append(author_list)
        abstracts.append(abstract)
        references_list.append(references)

        articles_done = page + 1
        is_last_article = articles_done == article_count

        if articles_done % chunk_size == 0 or is_last_article:
            # Create a DataFrame
            data = {
                'Title': pd.Series(titles),
                'Authors': pd.Series(authors),
                'Abstracts': pd.Series(abstracts),
                'References': pd.Series(references_list),
            }
            df = pd.DataFrame(data)

            # Save DataFrame to CSV. Ceiling division gives the trailing
            # partial chunk its own number; floor division reused the number
            # of the previous full chunk and overwrote that file.
            chunk_number = (articles_done + chunk_size - 1) // chunk_size
            csv_filename = f'pubmed_data_chunk_{chunk_number}.csv'
            df.to_csv(csv_filename, index=False)

            # Clear lists for the next chunk
            titles = []
            authors = []
            abstracts = []
            references_list = []

        # Navigate to the next article (there is none after the last one)
        if not is_last_article:
            try:
                next_page = driver.find_element(By.XPATH, "//div[@class='next side-link visible']")
                if next_page.is_displayed():
                    next_page.click()
            except NoSuchElementException:
                # Best effort: record the failure and keep going
                not_found_pages.append(page)
                print(f"{page = } not found!")
finally:
    # Close the browser even if the crawl is interrupted or fails
    driver.quit()