In [1]:
import pandas as pd
from nature_urls import nature_urls
from pathlib import Path
import time
import random

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
class Nature_Spyder:
    def __init__(self, url_list:list[str|Path]):
        # Initialize the Chrome driver
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)
        self.urls = url_list
        self.articles_urls = []
        
        self.title_list = []
        self.article_url_list = []
        self.description_list = []
        self.authors_list = []
        self.publication_date_list = []
        self.access_type_list = []

        #random.shuffle(self.urls)

    def scroll_down(self, pixels):
        ActionChains(self.driver).scroll_by_amount(0, pixels).perform()
    
    def scrape_articles_info(self, url):
        # Open the target URL
        self.driver.get(url)

        # Find all article elements on the page
        article_elements = self.driver.find_elements(By.CSS_SELECTOR, "article.u-full-height")

        title_list = []
        article_url_list = []
        description_list = []
        authors_list = []
        publication_date_list = []
        access_type_list = []
        
        # Iterate over each article element and extract information
        for article in article_elements:
            try:
                # Extract the title
                title_element = article.find_element(By.CSS_SELECTOR, "h3.c-card__title a")
                title = title_element.text
                title_list.append(title)
                
                # Extract the article URL
                article_url = title_element.get_attribute('href')
                article_url_list.append(article_url)
                
                # Extract the description/summary
                description_element = article.find_element(By.CSS_SELECTOR, "div.c-card__summary p")
                description = description_element.text
                description_list.append(description)

                # Extract the authors
                author_elements = article.find_elements(By.CSS_SELECTOR, "ul.c-author-list li span[itemprop='name']")
                authors = [author.text for author in author_elements]
                authors_list.append(authors)

                # Extract the publication date
                date_element = article.find_element(By.CSS_SELECTOR, "time.c-meta__item")
                publication_date = date_element.get_attribute('datetime')
                publication_date_list.append(publication_date)

                # Extract the access type
                access_element = article.find_element(By.CSS_SELECTOR, "span.u-color-open-access")
                access_type = access_element.text
                access_type_list.append(access_type)

                # Print the scraped information
                print(f"Title: {title}")
                #print(f"URL: {article_url}")
                #print(f"Description: {description}")
                #print(f"Authors: {', '.join(authors)}")
                #print(f"Publication Date: {publication_date}")
                #print(f"Access Type: {access_type}")
                #print("=" * 40)
            except Exception as e:
                print(f"Error scraping article: {e}")

        # Close the driver
        self.driver.quit()

        return title_list, article_url_list, description_list, authors_list, publication_date_list, access_type_list

    def scrape_all(self,) -> None:
        for url_loc in self.urls:
            time.sleep(12.)
            try:
                title_list, article_url_list, description_list, authors_list, publication_date_list, access_type_list = self.scrape_articles_info(url=url_loc)
                self.title_list += title_list
                self.article_url_list += article_url_list
                self.description_list += description_list
                self.authors_list += authors_list
                self.publication_date_list += publication_date_list
                self.access_type_list += access_type_list

            except Exception as e:
                print('Skip')
                print(f'url: {url_loc}, SKIPPED due to error {e}')
        pass

In [None]:
# Repeatedly scrape a small random subset of listing pages and append the
# article URLs found to a pipe-delimited registry file.
random.seed(35)

N_ROUNDS = 500             # number of scrape rounds
URLS_PER_ROUND = 2         # listing pages fetched per round
ROUND_PAUSE_SECONDS = 15   # pause before each round
REGISTRY_CSV = Path('./registry/nature_database.csv')

# Create the output directory up front so a missing folder cannot fail a
# round only after its pages have already been scraped.
REGISTRY_CSV.parent.mkdir(parents=True, exist_ok=True)

for _round in range(N_ROUNDS):
    # wait
    # random.sample draws the subset without reordering the imported
    # `nature_urls` list in place (re-running the cell starts from the same list).
    nature_subset_urls = random.sample(nature_urls, min(URLS_PER_ROUND, len(nature_urls)))
    time.sleep(ROUND_PAUSE_SECONDS)

    # scrape
    spyder = Nature_Spyder(nature_subset_urls)
    spyder.scrape_all()

    # store
    if not spyder.article_url_list:
        continue
    df_nature = pd.DataFrame({'html_url' : spyder.article_url_list})
    # Write the header only when starting a new (or empty) file; the cells that
    # read the registry back select the `html_url` column by name.
    # The index column is still written, as before, so appended rows keep the
    # same layout as rows already in the file.
    write_header = (not REGISTRY_CSV.exists()) or REGISTRY_CSV.stat().st_size == 0
    df_nature.to_csv(REGISTRY_CSV, sep='|', mode='a', header=write_header)

In [3]:
import pandas as pd

In [4]:
# Load the accumulated scrape registry (pipe-delimited text file).
df = pd.read_csv(
    './registry/nature_database.csv',
    delimiter='|',
)

In [6]:
# Number of rows left after collapsing repeated `html_url` values.
df.drop_duplicates(subset=['html_url']).shape[0]

420

In [5]:
# Total number of rows in the registry, duplicates included.
df.shape[0]

860