In [1]:
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime

def scrape_repository(output_dir="output"):
    """
    Scrape thesis data from the UNHAS Statistics repository into a JSON file.

    The scraper opens the division index page, collects the link of every
    listed year, gathers the thesis links on each year page, and then visits
    every thesis page to read its title, author, abstract and deposit
    metadata. The result is a nested dict ``{year: {title: details}}`` that
    is dumped to ``<output_dir>/unhas_repository_<timestamp>.json``.

    Args:
        output_dir: Directory the JSON file is written to. It is created if
            it does not exist yet. Defaults to ``"output"``.

    Returns:
        None. The scraped data is written to disk and progress is printed.

    Raises:
        Any exception raised by Selenium while navigating is propagated, but
        only after the browser has been shut down.
    """
    # Imported locally so this function stays self-contained within the file.
    import os

    # Automatically install and set up the ChromeDriver
    service = ChromeService(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Optional: run in background
    options.add_argument("--log-level=3")  # Suppress console logs
    driver = webdriver.Chrome(service=service, options=options)

    def get_element_text_or_none(driver, xpath):
        """Safely gets text from an element, returning None if not found."""
        try:
            return driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            return None

    repository_data = {}

    # Everything that drives the browser lives inside try/finally so that the
    # Chrome process is always terminated, even when a page fails to load.
    try:
        base_url = "https://repository.unhas.ac.id/view/divisions/statistika/"
        print(f"Navigating to {base_url}...")
        driver.get(base_url)
        time.sleep(3)

        # Read text and href of all year links up front: the elements become
        # stale as soon as the driver navigates away from the index page.
        year_elements = driver.find_elements(By.XPATH, "/html/body/div[1]/div/div[2]/div/ul/li/a")
        year_links = [(elem.text, elem.get_attribute('href')) for elem in year_elements]

        # Loop 1: Iterate through each year
        for year_text, year_url in year_links:
            if not year_url:
                # An anchor without href cannot be visited.
                continue

            print(f"\nProcessing Year: {year_text}")
            repository_data[year_text] = {}
            driver.get(year_url)
            time.sleep(2)

            thesis_urls = []
            thesis_index = 1
            # Loop 2: Find all thesis links for the current year by probing
            # p[1], p[2], ... until the first paragraph without a link.
            while True:
                try:
                    xpath = f"/html/body/div[1]/div/div[2]/div[2]/p[{thesis_index}]/a"
                    thesis_link_element = driver.find_element(By.XPATH, xpath)
                except NoSuchElementException:
                    break  # Exit loop when no more thesis links are found
                thesis_href = thesis_link_element.get_attribute('href')
                if thesis_href:
                    thesis_urls.append(thesis_href)
                thesis_index += 1

            # Loop 3: Visit each thesis page and scrape data
            for i, thesis_url in enumerate(thesis_urls):
                driver.get(thesis_url)
                time.sleep(1)

                title = get_element_text_or_none(driver, '//*[@id="page-title"]')
                if not title:
                    print(f"  - Skipping entry {i+1}/{len(thesis_urls)} (Title not found)")
                    continue

                print(f"  - Scraping [{i+1}/{len(thesis_urls)}]: {title[:60]}...")

                # Scrape all required details; fields missing on the page are
                # stored as None rather than aborting the entry.
                thesis_details = {
                    "author": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/p/span"),
                    "abstract": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/div[3]/p"),
                    "item_type": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[1]/td"),
                    "date_deposited": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[5]/td"),
                    "last_deposited": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[6]/td"),
                    "url": thesis_url
                }

                # NOTE: entries are keyed by title, so two theses of the same
                # year with identical titles overwrite each other.
                repository_data[year_text][title] = thesis_details
    finally:
        driver.quit()

    # Save the final data structure to a JSON file. The directory is created
    # first so a finished scrape is not lost to a missing folder.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f'{output_dir}/unhas_repository_{timestamp}.json'
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(repository_data, f, ensure_ascii=False, indent=4)

    print(f"\n✅ Scraping complete. Data has been saved to '{output_filename}'.")


# Run the scraper only when this file is executed directly, not when imported.
if __name__ == '__main__':
    scrape_repository()

Navigating to https://repository.unhas.ac.id/view/divisions/statistika/...

Processing Year: 2025
  - Scraping [1/6]: Perbandingan Model Threshold Generalized Autoregressive Cond...
  - Scraping [2/6]: PEMODELAN ROBUST MIXED GEOGRAPHICALLY AND TEMPORALLY WEIGHTE...
  - Scraping [3/6]: Penggunaan Peta Kendali Generally Weighted Moving Average Pa...
  - Scraping [4/6]: PENGGUNAAN METODE POSSIBILISTIC FUZZY C-MEANS UNTUK PENGELOM...
  - Scraping [5/6]: ANALISIS KORELASI KANONIK MENGGUNAKAN MATRIKS VARIAN KOVARIA...
  - Scraping [6/6]: PERAMALAN MODEL HYBRID METODE SEASONAL AUTOREGRESSIVE INTEGR...

Processing Year: 2024
  - Scraping [1/97]: Regresi Kuantil Elastic-Net dan Two-Step Robust Weighted Lea...
  - Scraping [2/97]: MODEL REGRESI ROBUST IMPROVED GEOGRAPHICALLY AND TEMPORALLY ...
  - Scraping [3/97]: Perbandingan Metode Random Forest dan Naive Bayes pada Klasi...
  - Scraping [4/97]: Analisis Periode Kekeringan Meteorologis Berbasis Standardiz...
  - Scraping [5/97]: PEMODELAN REGR