In [1]:
import json
import os
from time import sleep

from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def login(driver, base_url, username, password,
          username_field_id='username', password_field_id='password',
          form_timeout=30, post_login_wait=10):
    """Log into the website and authenticate the browser session.

    Opens ``base_url``, waits for the login form to appear, types the
    credentials, and submits the form by pressing RETURN in the password field.

    Args:
        driver: Selenium WebDriver instance to authenticate.
        base_url: URL to open; the login page is expected to load from it
            (possibly after a redirect).
        username: Account name typed into the username field.
        password: Password typed into the password field.
        username_field_id: HTML ``id`` of the username input on the login page.
        password_field_id: HTML ``id`` of the password input on the login page.
        form_timeout: Maximum number of seconds to wait for the username field
            to become visible.
        post_login_wait: Seconds to pause after submitting the form so the
            authentication round-trip can finish.

    Raises:
        ValueError: If ``username`` or ``password`` is empty or ``None``.
        selenium.common.exceptions.TimeoutException: If the username field does
            not become visible within ``form_timeout`` seconds.
        selenium.common.exceptions.NoSuchElementException: If the password
            field is not present on the login page.
    """
    # Fail before touching the browser: os.getenv() yields None for unset
    # variables, which would otherwise surface as an obscure error in send_keys.
    if not username or not password:
        raise ValueError("username and password must both be non-empty")

    driver.get(base_url)

    # Explicit wait: returns as soon as the form is usable instead of always
    # pausing for a fixed time and hoping the redirect has completed.
    username_field = WebDriverWait(driver, form_timeout).until(
        EC.visibility_of_element_located((By.ID, username_field_id))
    )
    username_field.send_keys(username)

    # Look the password field up once and reuse it for typing and submitting.
    password_field = driver.find_element(By.ID, password_field_id)
    password_field.send_keys(password)
    password_field.send_keys(Keys.RETURN)

    # No element on the post-login page is known here, so a plain (but
    # configurable) pause is kept for the authentication to complete.
    sleep(post_login_wait)

def crawl_article(driver, url, filename, wait_seconds=3):
    """Load an article page and save its full HTML to ``filename``.

    Args:
        driver: Object exposing ``get(url)`` and ``page_source`` (a Selenium
            WebDriver in this notebook).
        url: Address of the article to load.
        filename: Destination path; it is overwritten if it already exists and
            is written as UTF-8.
        wait_seconds: Seconds to pause after navigation before the HTML is
            captured. Defaults to 3, the previously hard-coded value.
    """
    driver.get(url)
    sleep(wait_seconds)

    html = driver.page_source

    # Save the HTML content to the specified file
    with open(filename, "w", encoding="utf-8") as file:
        file.write(html)

def create_directory(path):
    """Create ``path`` (including missing parent directories) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the race in "check os.path.exists, then create",
    # and still reports the case where a regular file occupies the path.
    os.makedirs(path, exist_ok=True)

In [2]:
# Load environment variables from .env file
load_dotenv()

# Site reached through the library proxy; login() opens this URL first.
base_url = 'https://www-microbiologyresearch-org.ezp3.lib.umn.edu'

# Read the credentials from the environment (never hard-code them).
username = os.getenv('MY_APP_USERNAME')
password = os.getenv('MY_APP_PASSWORD')

# os.getenv() returns None for unset variables; stop before a browser is
# launched rather than failing halfway through the login form.
if not username or not password:
    del username, password
    raise RuntimeError(
        "MY_APP_USERNAME and MY_APP_PASSWORD must be set (e.g. in the .env file)"
    )

driver = webdriver.Chrome()
try:
    login(driver, base_url, username, password)
except Exception:
    # Do not leave an orphaned browser process behind when login fails.
    driver.quit()
    raise
finally:
    # Keep the credentials out of the kernel namespace whether or not login worked.
    del username, password

In [3]:
# Load JSON data (explicit UTF-8 so the result does not depend on the platform default)
articles_path = 'articles.json'
with open(articles_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

base_dir = 'articles'  # Base directory name
create_directory(base_dir)  # Create the base directory


def save_articles_json(data, path):
    """Write ``data`` to ``path`` as indented JSON without risking a torn file.

    The JSON is first written to ``<path>.tmp`` and then moved over ``path``
    with ``os.replace``, so an interruption during the dump leaves the previous
    file intact.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as tmp_file:
        json.dump(data, tmp_file, indent=4)
    os.replace(tmp_path, path)


# Iterate through the JSON data: year -> volume -> page -> list of articles
for year, volumes in data.items():
    for volume, pages in volumes.items():
        for page, articles in pages.items():
            page_dir = os.path.join(base_dir, year, volume, page)
            create_directory(page_dir)  # Create the directory for each page

            page_changed = False
            try:
                for article_number, article in enumerate(articles, start=1):
                    # Skip the article if 'filename' key exists (already crawled)
                    if 'filename' in article:
                        continue

                    formatted_number = f"{article_number:03}"  # Zero-padded number
                    filename = f"{year}_{volume}_{page}_{formatted_number}.html"
                    full_path = os.path.join(page_dir, filename)
                    url = article['link']

                    # Crawl the article and save the HTML
                    crawl_article(driver, url, full_path)

                    # Update JSON data with the filename
                    article['filename'] = filename
                    page_changed = True
            finally:
                # Persist progress even if a crawl raised part-way through the
                # page, and skip the rewrite when nothing new was recorded.
                if page_changed:
                    save_articles_json(data, articles_path)

In [20]:
# Quit the WebDriver session that was started earlier in the notebook.
driver.quit()