In [1]:
import os
import json
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from time import sleep

def login(driver, base_url, username, password):
    """Log into the website and authenticate the session.

    Navigates to ``base_url``, waits a fixed time for the login page, types
    the credentials into the ``username`` / ``password`` fields and submits
    the form by pressing RETURN in the password field.

    :param driver: Selenium WebDriver instance used for the whole session.
    :param base_url: URL to open; expected to redirect to the login page.
    :param username: Account user name (must be non-empty).
    :param password: Account password (must be non-empty).
    :raises ValueError: if ``username`` or ``password`` is ``None`` or empty.
        Raised before the browser is touched so a missing credential fails
        fast instead of after page loads and sleeps.
    """
    if not username or not password:
        raise ValueError("login() requires a non-empty username and password")

    driver.get(base_url)

    # Wait for redirection and the login page to load.
    # NOTE(review): a fixed sleep is fragile; adjust if the page loads slower.
    sleep(3)

    # Element IDs on the login form; replace with the actual IDs if they differ.
    username_field_id = 'username'
    password_field_id = 'password'

    # Fill in the username and password.
    driver.find_element(By.ID, username_field_id).send_keys(username)
    password_field = driver.find_element(By.ID, password_field_id)
    password_field.send_keys(password)

    # Submit the form from the password field (no separate submit button lookup).
    password_field.send_keys(Keys.RETURN)

    # Wait for the authentication to complete and the target page to load.
    sleep(10)  # Adjust the sleep time as needed

def update_json(data, volume, issue, page, title, link):
    """Record one article under ``data[volume][issue][page]``.

    Any missing level of the nested mapping (volume, issue, page) is created
    on demand; the article is appended to the page's list as a dict with the
    keys ``'title'`` and ``'link'``. ``data`` is modified in place.
    """
    entry = {'title': title, 'link': link}
    issues_for_volume = data.setdefault(volume, {})
    pages_for_issue = issues_for_volume.setdefault(issue, {})
    pages_for_issue.setdefault(page, []).append(entry)

def crawl_volume_issue(driver, base_url, volume, issue, data, output_path='articles.json'):
    """Crawl articles for a specific volume and issue, updating the JSON data.

    Visits ``{base_url}/content/journal/ijsem/{volume}/{issue}?page=N`` for
    N = 1, 2, ... and stores every article whose title contains ``'sp. nov.'``
    in ``data`` (via ``update_json``), keyed by volume, issue and page number.
    After the loop the whole of ``data`` is written to ``output_path``.

    Pagination stops when a page lists no articles, or when a page lists
    nothing that was not already listed on an earlier page of this call.
    NOTE(review): the second rule assumes the site repeats earlier content
    for out-of-range page numbers -- confirm against the live site.

    :param driver: Selenium WebDriver instance (already authenticated).
    :param base_url: Base URL of the website to crawl.
    :param volume: Volume identifier used in the URL and as a key in ``data``.
    :param issue: Issue identifier used in the URL and as a key in ``data``.
    :param data: Nested dict ``{volume: {issue: {page: [article, ...]}}}``;
        modified in place.
    :param output_path: File the JSON snapshot of ``data`` is written to.
    """
    page = 1

    # Every (title, link) listed on pages visited so far in this call. Drives
    # pagination only, so a page with no matching titles does not end the
    # crawl while later pages may still contain matches.
    listed_articles = set()

    # Articles already present in ``data`` for this volume/issue (for example
    # loaded from a previous run); prevents duplicate entries on re-crawl.
    stored_articles = {
        (entry.get('title'), entry.get('link'))
        for entries in data.get(volume, {}).get(issue, {}).values()
        for entry in entries
    }

    while True:
        # Construct the URL for the specific volume and issue
        url = f"{base_url}/content/journal/ijsem/{volume}/{issue}?page={page}"
        driver.get(url)

        # Wait for the page to load
        sleep(3)

        try:
            # find_elements returns an empty list when nothing matches.
            article_elements = driver.find_elements(By.CSS_SELECTOR, ".articleInToc .articleTitle.js-articleTitle a")
            if not article_elements:
                print("No articles at all")
                break

            page_has_unseen = False
            for article_element in article_elements:
                title = article_element.text
                link = article_element.get_attribute('href')
                key = (title, link)

                if key not in listed_articles:
                    listed_articles.add(key)
                    page_has_unseen = True

                if 'sp. nov.' in title and key not in stored_articles:
                    update_json(data, volume, issue, str(page), title, link)
                    stored_articles.add(key)

            if not page_has_unseen:
                # Nothing new on this page: treat it as the end of the listing.
                break

            print("Crawling page: " + str(page))
            page += 1
        except NoSuchElementException:
            # Defensive guard kept from the original implementation.
            break

    # Save the updated data after each issue so progress survives a crash.
    with open(output_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def crawl_all_volumes_issues(driver, base_url, volumes_issues, data):
    """
    Run ``crawl_volume_issue`` for every (volume, issue) pair requested.

    Pairs are visited in the mapping's iteration order and, within a volume,
    in the order of its issue list. A two-second pause follows each crawl.

    :param driver: Selenium WebDriver instance.
    :param base_url: Base URL of the website to crawl.
    :param volumes_issues: Dictionary where keys are volumes and values are lists of issues.
    :param data: Dictionary to store the crawled data.
    """
    volume_issue_pairs = (
        (vol, iss)
        for vol, issue_list in volumes_issues.items()
        for iss in issue_list
    )
    for vol, iss in volume_issue_pairs:
        print(f"Crawling Volume {vol}, Issue {iss}...")
        crawl_volume_issue(driver, base_url, vol, iss, data)
        # Short break between issues to prevent overwhelming the server.
        sleep(2)

In [5]:
# Load environment variables from .env file
load_dotenv()

# Read the login credentials from the environment (never hard-code them).
username = os.getenv('MY_APP_USERNAME')
password = os.getenv('MY_APP_PASSWORD')

# Fail before launching a browser if the credentials are not configured.
if not username or not password:
    raise RuntimeError(
        "MY_APP_USERNAME and MY_APP_PASSWORD must be set in the environment or .env file"
    )

# Load existing JSON data or initialize a new dictionary
try:
    with open('articles.json', 'r') as json_file:
        articles_data = json.load(json_file)
except FileNotFoundError:
    articles_data = {}

# Main script
base_url = 'https://www-microbiologyresearch-org.ezp3.lib.umn.edu'

# Volumes to crawl, each mapped to its issues (newest first).
volumes_and_issues = {
    '73': ['11', '10', '9', '8', '7', '6', '5', '5a', '4', '3', '2', '1'],
    '72': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '71': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '70': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '69': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1', '1A'],
    '68': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '67': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '66': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '65': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '64': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '63': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '62': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '61': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '60': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '59': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '58': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '57': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '56': ['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
    '55': ['6', '5', '4', '3', '2', '1'],
    '54': ['6', '5', '4', '3', '2', '1'],
    '53': ['6', '5', '4', '3', '2', '1']
}

driver = webdriver.Chrome()
try:
    login(driver, base_url, username, password)

    # Crawl all specified volumes and issues
    crawl_all_volumes_issues(driver, base_url, volumes_and_issues, articles_data)
finally:
    # Always close the browser, even if login or crawling raised, so no
    # Chrome process is left behind; also drop the credentials from the
    # kernel namespace regardless of outcome.
    driver.quit()
    del username, password

# Save the updated data to articles.json (only reached on a successful crawl)
with open('articles.json', 'w') as json_file:
    json.dump(articles_data, json_file, indent=4)

Crawling Volume 72, Issue 12...
Crawling page: 1
Crawling page: 2
Crawling Volume 72, Issue 11...
Crawling page: 1
Crawling page: 2
Crawling Volume 72, Issue 10...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling page: 4
Crawling Volume 72, Issue 9...
Crawling page: 1
Crawling Volume 72, Issue 8...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling Volume 72, Issue 7...
Crawling page: 1
Crawling page: 2
Crawling Volume 72, Issue 6...
Crawling page: 1
Crawling page: 2
Crawling Volume 72, Issue 5...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling Volume 72, Issue 4...
Crawling page: 1
Crawling page: 2
Crawling Volume 72, Issue 3...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling Volume 72, Issue 2...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling Volume 72, Issue 1...
Crawling page: 1
Crawling page: 2
Crawling Volume 71, Issue 12...
Crawling page: 1
Crawling page: 2
Crawling page: 3
Crawling Volume 71, Issue 11...
Crawling page: 1


In [6]:
import json

def count_articles_stats(json_file_path):
    """Print the total article count and a per-volume breakdown.

    Reads the JSON file at ``json_file_path``, which is expected to hold a
    nested mapping ``{volume: {issue: {page: [article, ...]}}}``, and prints
    the summary to stdout. A missing file or invalid JSON is reported with a
    printed message instead of an exception. Returns ``None``.
    """
    try:
        with open(json_file_path, 'r') as handle:
            crawled = json.load(handle)
    except FileNotFoundError:
        print(f"No file found at {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {json_file_path}")
        return

    # Count every article list under each volume, across all issues and pages.
    per_volume = {
        vol: sum(
            len(article_list)
            for page_map in issue_map.values()
            for article_list in page_map.values()
        )
        for vol, issue_map in crawled.items()
    }
    grand_total = sum(per_volume.values())

    print(f"Total Articles: {grand_total}")
    print("Articles by Volume:")
    for vol, n_articles in per_volume.items():
        print(f"  Volume {vol}: {n_articles} articles")

# Example usage: print the article counts for the crawl results file.
# The path is relative, so it resolves against the kernel's working directory.
json_file_path = 'articles.json'
count_articles_stats(json_file_path)


Total Articles: 11162
Articles by Volume:
  Volume 73: 625 articles
  Volume 72: 410 articles
  Volume 71: 520 articles
  Volume 70: 738 articles
  Volume 69: 530 articles
  Volume 68: 556 articles
  Volume 67: 790 articles
  Volume 66: 776 articles
  Volume 65: 701 articles
  Volume 64: 593 articles
  Volume 63: 710 articles
  Volume 62: 467 articles
  Volume 61: 480 articles
  Volume 60: 502 articles
  Volume 59: 509 articles
  Volume 58: 465 articles
  Volume 57: 486 articles
  Volume 56: 419 articles
  Volume 55: 353 articles
  Volume 54: 295 articles
  Volume 53: 237 articles
