In [8]:
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import google.generativeai as genai
from dotenv import load_dotenv

In [1]:
def scrape_repository():
    """
    Scrapes thesis data from the UNHAS Statistics repository.

    Opens the division index page in headless Chrome, follows every year
    link found there, collects the thesis links listed for that year, and
    visits each thesis page to read its title, author, abstract, item type
    and the two deposit-date table cells.

    The collected data is a nested dict ``{year_text: {title: details}}`` and
    is written to ``output/unhas_repository_<YYYYmmdd_HHMMSS>.json`` (the
    ``output`` directory is created if missing).

    Returns:
        None. The result is only written to disk and reported via ``print``.

    Raises:
        Any exception raised by Selenium or file I/O is propagated; the
        browser is always shut down first.
    """
    def get_element_text_or_none(driver, xpath):
        """Safely gets text from an element, returning None if not found."""
        try:
            return driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            return None

    # Automatically install and set up the ChromeDriver
    service = ChromeService(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Optional: run in background
    options.add_argument("--log-level=3")  # Suppress console logs
    driver = webdriver.Chrome(service=service, options=options)

    base_url = "https://repository.unhas.ac.id/view/divisions/statistika/"
    repository_data = {}

    # Everything that touches the browser lives inside try/finally so the
    # Chrome process is released even when a page load or lookup raises.
    try:
        print(f"Navigating to {base_url}...")
        driver.get(base_url)
        time.sleep(3)

        # Read text/href of all year links up front: the elements become
        # stale as soon as the driver navigates away from the index page.
        year_elements = driver.find_elements(By.XPATH, "/html/body/div[1]/div/div[2]/div/ul/li/a")
        year_links = [
            (text, href)
            for text, href in ((elem.text, elem.get_attribute('href')) for elem in year_elements)
            if href  # a link without href cannot be visited
        ]

        # Loop 1: Iterate through each year
        for year_text, year_url in year_links:
            print(f"\nProcessing Year: {year_text}")
            repository_data[year_text] = {}
            driver.get(year_url)
            time.sleep(2)

            # Loop 2: collect the first link of every <p> entry in one query.
            # (Probing p[1], p[2], ... one by one would stop at the first
            # paragraph without a link and silently drop everything after it.)
            link_elements = driver.find_elements(
                By.XPATH, "/html/body/div[1]/div/div[2]/div[2]/p/a[1]"
            )
            thesis_urls = [
                href
                for href in (elem.get_attribute('href') for elem in link_elements)
                if href
            ]

            # Loop 3: Visit each thesis page and scrape data
            for i, thesis_url in enumerate(thesis_urls):
                driver.get(thesis_url)
                time.sleep(1)

                title = get_element_text_or_none(driver, '//*[@id="page-title"]')
                if not title:
                    print(f"  - Skipping entry {i+1}/{len(thesis_urls)} (Title not found)")
                    continue

                print(f"  - Scraping [{i+1}/{len(thesis_urls)}]: {title[:60]}...")

                # Scrape all required details; any field whose element is
                # missing on the page is stored as None.
                thesis_details = {
                    "author": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/p/span"),
                    "abstract": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/div[3]/p"),
                    "item_type": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[1]/td"),
                    "date_deposited": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[5]/td"),
                    "last_deposited": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/table/tbody/tr[6]/td"),
                    "url": thesis_url
                }

                # NOTE(review): entries are keyed by title, so two theses with
                # the same title in one year overwrite each other — confirm
                # this is acceptable for the repository's data.
                repository_data[year_text][title] = thesis_details
    finally:
        driver.quit()

    # Save the final data structure to a JSON file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f'output/unhas_repository_{timestamp}.json'
    # open(..., 'w') does not create missing parent directories.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(repository_data, f, ensure_ascii=False, indent=4)

    print(f"\n✅ Scraping complete. Data has been saved to '{output_filename}'.")


# Start the scrape only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == '__main__':
    scrape_repository()

Navigating to https://repository.unhas.ac.id/view/divisions/statistika/...

Processing Year: 2025
  - Scraping [1/6]: Perbandingan Model Threshold Generalized Autoregressive Cond...
  - Scraping [2/6]: PEMODELAN ROBUST MIXED GEOGRAPHICALLY AND TEMPORALLY WEIGHTE...
  - Scraping [3/6]: Penggunaan Peta Kendali Generally Weighted Moving Average Pa...
  - Scraping [4/6]: PENGGUNAAN METODE POSSIBILISTIC FUZZY C-MEANS UNTUK PENGELOM...
  - Scraping [5/6]: ANALISIS KORELASI KANONIK MENGGUNAKAN MATRIKS VARIAN KOVARIA...
  - Scraping [6/6]: PERAMALAN MODEL HYBRID METODE SEASONAL AUTOREGRESSIVE INTEGR...

Processing Year: 2024
  - Scraping [1/97]: Regresi Kuantil Elastic-Net dan Two-Step Robust Weighted Lea...
  - Scraping [2/97]: MODEL REGRESI ROBUST IMPROVED GEOGRAPHICALLY AND TEMPORALLY ...
  - Scraping [3/97]: Perbandingan Metode Random Forest dan Naive Bayes pada Klasi...
  - Scraping [4/97]: Analisis Periode Kekeringan Meteorologis Berbasis Standardiz...
  - Scraping [5/97]: PEMODELAN REGR

In [10]:
# --- Configuration ---
# 1. Set up your Google API Key
# The key is read from the environment (optionally populated from a local
# .env file) so it never has to be written into the notebook itself.
load_dotenv()  # This loads variables from .env into the environment

API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    # Fail early: without a key every later Gemini call would fail.
    raise ValueError("GOOGLE_API_KEY environment variable not set. Please set your API key.")

genai.configure(api_key=API_KEY)

# 2. Define filenames and batch size
# `timestamp` is evaluated once, when this cell runs, so OUTPUT_FILENAME stays
# the same for every classify_theses() call until the cell is re-executed.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Hard-coded to the JSON produced by one specific scraper run; point this at
# the file written by scrape_repository() that should be classified.
INPUT_FILENAME = 'output/unhas_repository_20250730_004812.json'
OUTPUT_FILENAME = f'output/unhas_repository_classified_{timestamp}.json'
# Number of theses sent to the model in a single request.
BATCH_SIZE = 20 

# 3. Define the categories and instructions for the Gemini model
# Maps each category name to its description (written in Indonesian).
# The keys are the only labels accepted as a valid classification; the
# descriptions are inserted into the prompt to guide the model.
CATEGORIES = {
    "Regresi": "Fokus pada pemodelan hubungan antara variabel yang bentuknya telah ditentukan sebelumnya (misalnya linear atau logistik). Tujuan utamanya adalah untuk inferensi, yaitu memahami dan mengukur seberapa besar pengaruh satu variabel terhadap variabel lainnya.",
    "Regresi Nonparametrik": "Metode regresi yang digunakan ketika bentuk hubungan antara variabel tidak diasumsikan mengikuti model matematis tertentu. Pendekatan ini lebih fleksibel dan cocok untuk data dengan pola yang kompleks dan tidak linear. Contoh utamanya meliputi Regresi Spline, Regresi Kernel, dan Local Regression (LOESS).",
    "Pengendalian Kualitas Statistika": "Fokus pada penggunaan alat statistik, terutama peta kendali (control chart), untuk memantau, mengendalikan, dan meningkatkan kualitas suatu proses yang sedang berjalan. Tujuannya adalah untuk mendeteksi variasi tak wajar secara visual agar proses tetap stabil dan outputnya konsisten sesuai standar.",
    "Perancangan Percobaan": "Metodologi untuk merancang eksperimen dari awal secara efisien. Berbeda dengan Pengendalian Kualitas Statistika yang memantau proses, Perancangan Percobaan bertujuan untuk menguji dan membandingkan pengaruh berbagai perlakuan secara aktif dalam sebuah percobaan terkontrol (misalnya RAL, RAK) untuk menemukan pengaturan yang optimal.",
    "Analisis Runtun Waktu": "Analisis data yang memiliki ketergantungan temporal, di mana urutan waktu pengamatan sangat penting. Tujuan utamanya adalah untuk memahami pola historis (tren, musiman) dan melakukan peramalan (forecasting) ke masa depan.",
    "Machine Learning": "Bidang yang berfokus pada pengembangan algoritma untuk membuat prediksi atau klasifikasi seakurat mungkin dengan belajar dari data. ML lebih mengutamakan kemampuan prediktif daripada interpretasi model. Cakupannya luas, mulai dari metode seperti Support Vector Machine (SVM) dan aplikasi praktis seperti analisis sentimen, hingga model yang sangat kompleks dan seringkali bersifat black box.",
    "Analisis Data Spasial": "Metode analisis khusus untuk data yang memiliki ketergantungan spasial, di mana lokasi geografis menjadi kunci. Fokusnya adalah memodelkan bagaimana nilai pada satu lokasi berhubungan dengan nilai di lokasi tetangganya. Ini seringkali melibatkan pengujian autokorelasi spasial (misalnya dengan Moran's I) dan penerapan model regresi yang disesuaikan untuk data spasial, seperti Geographically Weighted Regression (GWR), yang menghasilkan model lokal untuk setiap lokasi pengamatan.",
    "Lainnya": "Kategori untuk topik penelitian skripsi yang tidak termasuk dalam klasifikasi fokus yang telah disebutkan di atas, seperti analisis survival, psikometri, atau bioinformatika."
}

def generate_classification_prompt(batch_items):
    """Build the instruction text sent to the Gemini model for one batch.

    Args:
        batch_items: JSON-serialisable list of items to classify; it is
            embedded verbatim (pretty-printed) in the prompt.

    Returns:
        The prompt string: role description, one bullet per entry of
        ``CATEGORIES``, the answering rules, the serialised items, and an
        example of the expected JSON output.
    """
    # One markdown bullet per category: "- **name**: description".
    category_bullets = "\n".join(
        f"- **{name}**: {description}" for name, description in CATEGORIES.items()
    )
    # Comma-separated list of the only labels the model may answer with.
    allowed_names = ', '.join(CATEGORIES.keys())
    # Keep non-ASCII characters readable instead of \u-escaping them.
    batch_json = json.dumps(batch_items, indent=2, ensure_ascii=False)

    return f"""
    You are an expert academic classifier specializing in statistics. Your task is to classify each research item into one of the following categories based on its title and abstract.

    **Categories and Descriptions:**
    {category_bullets}

    **Instructions:**
    1. Analyze the title and abstract for each item in the JSON array below.
    2. For each item, determine the most fitting category from the list provided.
    3. Your response MUST be a valid JSON object that maps each 'id' to its corresponding category name.
    4. The category name MUST be one of these exact strings: {allowed_names}.
    5. Do NOT include any explanations, comments, or markdown formatting (like ```json) in your response.

    **Research Items to Classify:**
    {batch_json}

    **Required Output Format (JSON object):**
    {{
      "id_1": "CategoryName",
      "id_2": "CategoryName",
      ...
    }}
    """


def _parse_classification_response(raw_text):
    """Convert the model's raw reply into a ``{task_id: category}`` dict.

    Strips surrounding whitespace and markdown code fences before parsing.

    Raises:
        ValueError: if the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or is valid JSON
            but not an object.
    """
    if not raw_text or not raw_text.strip():
        raise ValueError("API returned an empty response.")

    cleaned_text = raw_text.strip().replace("```json", "").replace("```", "").strip()
    parsed = json.loads(cleaned_text)
    if not isinstance(parsed, dict):
        # A list or scalar would break the per-id lookup done by the caller.
        raise ValueError(
            f"Expected a JSON object mapping ids to categories, got {type(parsed).__name__}."
        )
    return parsed


def _request_batch_classifications(model, prompt, retries=3, retry_delay=5):
    """Ask the model to classify one batch, retrying on failure.

    Returns:
        The parsed ``{task_id: category}`` dict, or an empty dict when every
        attempt failed (the caller then marks the batch as failed).
    """
    for attempt in range(retries):
        # Reset per attempt so a failure never logs text from an earlier call.
        raw_text = None
        try:
            response = model.generate_content(prompt)
            # Read `.text` exactly once: the accessor itself may raise, and
            # touching it again inside an except handler would let that
            # exception escape and abort the whole run.
            raw_text = response.text
            classifications = _parse_classification_response(raw_text)
            print("    - Batch successfully processed by API.")
            return classifications
        except ValueError as e:  # also covers json.JSONDecodeError
            print(f"    - Warning: API call or parsing failed on attempt {attempt + 1}. Error: {e}")
            # Log the problematic response for debugging
            if raw_text is not None:
                print(f"    - Problematic API response text: '{raw_text}'")
        except Exception as e:
            print(f"    - Warning: An unexpected error occurred on attempt {attempt + 1}. Error: {e}")

        if attempt < retries - 1:
            time.sleep(retry_delay)  # Wait before retrying
        else:
            print(f"    - Error: Batch failed after {retries} attempts. Items will be marked 'Classification Failed'.")
    return {}


def _write_classified_data(data, filename):
    """Write ``data`` as JSON to ``filename``, creating its directory if needed."""
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Write to a sibling temp file and swap it in, so an interruption during
    # the dump cannot leave a truncated result file behind.
    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_filename, filename)


def classify_theses():
    """Loads, classifies, and saves thesis data with improved robustness.

    Reads ``INPUT_FILENAME`` (``{year: {title: details}}``), sends every
    thesis that has no usable ``study_focus`` yet to the Gemini model in
    batches of ``BATCH_SIZE``, stores the returned category (or
    ``"Classification Failed"``) under ``details["study_focus"]``, and writes
    the whole structure to ``OUTPUT_FILENAME``.

    The output file is rewritten after every batch, so an interrupted run
    keeps the batches that were already classified.

    Returns:
        None. Progress and problems are reported via ``print``.
    """
    failed_label = "Classification Failed"

    try:
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILENAME}' not found. Please run the scraper first.")
        return

    model = genai.GenerativeModel('gemini-1.5-pro-latest')

    tasks = []
    for theses in data.values():
        for title, details in theses.items():
            # Skip items that already carry a real category; items that
            # failed in an earlier run are picked up again.
            if "study_focus" in details and details["study_focus"] != failed_label:
                continue

            abstract = details.get("abstract", "") or ""
            # Placeholder abstracts carry no information for the classifier.
            if "LIHAT DI FULL TEXT" in abstract.upper():
                abstract = ""

            tasks.append({
                "id": f"task_{len(tasks)}",
                "title": title,
                "abstract": abstract,
                "original_object": details  # same dict as in `data`; updated in place
            })

    if not tasks:
        print("✅ All items are already classified. No action needed.")
        return

    print(f"Found {len(tasks)} items to classify. Starting process in batches of {BATCH_SIZE}...")

    total_batches = (len(tasks) + BATCH_SIZE - 1) // BATCH_SIZE
    for i in range(0, len(tasks), BATCH_SIZE):
        batch = tasks[i:i + BATCH_SIZE]
        batch_input_for_prompt = [{"id": t["id"], "title": t["title"], "abstract": t["abstract"]} for t in batch]

        print(f"  - Processing batch {i//BATCH_SIZE + 1}/{total_batches}...")

        prompt = generate_classification_prompt(batch_input_for_prompt)
        classifications = _request_batch_classifications(model, prompt)

        # Validate each classification before accepting it.
        for task in batch:
            task_id = task["id"]
            category = classifications.get(task_id)
            if isinstance(category, str):
                category = category.strip()

            # The isinstance check also guards against unhashable values
            # (lists/dicts), which would raise on the membership test.
            if isinstance(category, str) and category in CATEGORIES:
                task["original_object"]["study_focus"] = category
            else:
                # Missing from the response, or not one of the known categories.
                task["original_object"]["study_focus"] = failed_label
                if category:  # Log if the category was invalid
                    print(f"    - Warning: Invalid category '{category}' for {task_id}. Defaulting to failed.")

        # Checkpoint after every batch so finished work survives an interruption.
        _write_classified_data(data, OUTPUT_FILENAME)

    print(f"\n✅ Classification complete! Results saved to '{OUTPUT_FILENAME}'.")


# Run the classification only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == '__main__':
    classify_theses()

Found 252 items to classify. Starting process in batches of 20...
  - Processing batch 1/13...


KeyboardInterrupt: 