In [8]:
import json
import time
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import google.generativeai as genai
from dotenv import load_dotenv

In [None]:
def scrape_repository():
    """
    Scrapes thesis data from the UNHAS Statistics repository.

    The division page is walked year by year; every thesis page found for a
    year is visited and its fields are collected into a nested dict of the
    form ``{year_text: {title: details}}``. The result is written to
    ``output/unhas_repository_<timestamp>.json`` (the ``output`` directory is
    created when missing).

    The Chrome driver is always shut down, even if scraping or saving raises.

    Returns:
        None. The scraped data is only persisted to disk.
    """
    # Automatically install and set up the ChromeDriver
    service = ChromeService(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless") # Optional: run in background
    options.add_argument("--log-level=3") # Suppress console logs
    driver = webdriver.Chrome(service=service, options=options)

    def get_element_text_or_none(driver, xpath):
        """Safely gets text from an element by its full XPath."""
        try:
            return driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            return None

    def get_table_value_by_header(driver, header_text):
        """
        Finds a table row by its header text and returns the value from the next cell.
        This is more reliable than using a fixed row index.

        NOTE: header_text is interpolated into a single-quoted XPath literal,
        so it must not contain an apostrophe.
        """
        try:
            # This XPath finds a <th> containing the header_text, then gets the text of the <td> next to it.
            xpath = f"//th[contains(text(), '{header_text}')]/following-sibling::td"
            return driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            return None

    # Everything that uses the browser lives inside try/finally so that a
    # failure half-way through does not leave a headless Chrome process behind.
    try:
        base_url = "https://repository.unhas.ac.id/view/divisions/statistika/"
        print(f"Navigating to {base_url}...")
        driver.get(base_url)
        time.sleep(3)

        repository_data = {}

        # Collect (text, href) pairs up front: the elements themselves become
        # stale as soon as the driver navigates to another page.
        year_elements = driver.find_elements(By.XPATH, "/html/body/div[1]/div/div[2]/div/ul/li/a")
        year_links = [(elem.text, elem.get_attribute('href')) for elem in year_elements]

        # Loop 1: Iterate through each year
        for year_text, year_url in year_links:
            print(f"\nProcessing Year: {year_text}")
            repository_data[year_text] = {}
            driver.get(year_url)
            time.sleep(2)

            thesis_urls = []
            thesis_index = 1
            # Loop 2: Find all thesis links for the current year
            while True:
                try:
                    xpath = f"/html/body/div[1]/div/div[2]/div[2]/p[{thesis_index}]/a"
                    thesis_link_element = driver.find_element(By.XPATH, xpath)
                    thesis_urls.append(thesis_link_element.get_attribute('href'))
                    thesis_index += 1
                except NoSuchElementException:
                    break # Exit loop when no more thesis links are found

            # Loop 3: Visit each thesis page and scrape data
            for i, thesis_url in enumerate(thesis_urls):
                driver.get(thesis_url)
                time.sleep(1)

                title = get_element_text_or_none(driver, '//*[@id="page-title"]')
                if not title:
                    print(f"  - Skipping entry {i+1}/{len(thesis_urls)} (Title not found)")
                    continue

                print(f"  - Scraping [{i+1}/{len(thesis_urls)}]: {title[:60]}...")

                # Table fields are located by header text rather than row index
                thesis_details = {
                    "author": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/p/span"),
                    "abstract": get_element_text_or_none(driver, "/html/body/div[1]/div/div[2]/div/div[4]/div[3]/p"),
                    "item_type": get_table_value_by_header(driver, "Item Type:"),
                    "date_deposited": get_table_value_by_header(driver, "Date Deposited:"),
                    "last_modified": get_table_value_by_header(driver, "Last Modified:"),
                    "url": thesis_url
                }

                # Titles are the dict keys, so a repeated title within one year
                # replaces the earlier entry; make that visible in the log.
                if title in repository_data[year_text]:
                    print(f"    - Warning: duplicate title in {year_text}; replacing the earlier entry.")
                repository_data[year_text][title] = thesis_details

        # Save the final data structure to a JSON file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f'output/unhas_repository_{timestamp}.json'
        # A fresh checkout may not have the output directory yet
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(repository_data, f, ensure_ascii=False, indent=4)

        print(f"\n✅ Scraping complete. Data has been saved to '{output_filename}'.")
    finally:
        driver.quit()


# Entry point for the scraping step. Inside a notebook kernel __name__ is
# "__main__", so executing this cell starts a full scrape.
if __name__ == "__main__":
    scrape_repository()

In [None]:
# --- Configuration ---
# Pull variables from a local .env file into the process environment
load_dotenv()

# Gemini credentials come from the environment; fail fast when they are missing
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable not set. Please set your API key.")

genai.configure(api_key=API_KEY)

# File locations and batch size used by the classification step.
# INPUT_FILENAME points at one specific scraper run; OUTPUT_FILENAME is
# stamped with the time this cell was executed.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
INPUT_FILENAME = 'output/unhas_repository_20250730_004812.json'
OUTPUT_FILENAME = 'output/unhas_repository_classified_{}.json'.format(timestamp)
BATCH_SIZE = 20

# Category name -> description (in Indonesian) handed to the Gemini model.
# The keys are the only values accepted as a valid classification result.
CATEGORIES = {
    "Regresi": "Fokus pada **inferensi statistik** untuk memahami dan mengukur hubungan antar variabel menggunakan model dengan **bentuk fungsional yang telah ditentukan** (misalnya, linear, logistik). Tujuan utamanya adalah menjelaskan *seberapa besar* pengaruh satu variabel terhadap variabel lain.",
    "Regresi Nonparametrik": "Fokus pada pemodelan hubungan antar variabel **TANPA asumsi bentuk fungsional tertentu**. Metode ini sangat fleksibel dan digunakan ketika pola data kompleks, non-linear, dan tidak diketahui sebelumnya. Tujuannya adalah membiarkan data 'berbicara' untuk membentuk modelnya sendiri.",
    "Pengendalian Kualitas Statistika": "Fokus pada **pemantauan (monitoring) proses yang sedang berjalan** untuk memastikan stabilitas dan konsistensi output. Alat utamanya adalah **peta kendali (control chart)** untuk mendeteksi variasi yang tidak wajar secara visual dan menjaga proses tetap dalam spesifikasi.",
    "Perancangan Percobaan": "Fokus pada **perancangan eksperimen secara proaktif SEBELUM data dikumpulkan**. Tujuannya adalah untuk secara efisien membandingkan efek dari berbagai **perlakuan (treatments)** melalui intervensi aktif untuk menemukan pengaturan atau kondisi yang paling optimal.",
    "Analisis Runtun Waktu": "Analisis data yang variabel utamanya adalah **waktu**. Metode ini secara khusus menangani data dengan **ketergantungan temporal** (nilai saat ini dipengaruhi oleh nilai sebelumnya). Tujuan utamanya adalah memahami pola historis dan melakukan **peramalan (forecasting)**.",
    "Machine Learning": "Fokus utama pada **akurasi prediksi**. Tujuannya adalah membangun algoritma yang dapat belajar dari data untuk membuat prediksi atau klasifikasi seakurat mungkin, seringkali **mengorbankan interpretasi model** demi performa prediktif yang superior.",
    "Analisis Data Spasial": "Analisis data yang variabel utamanya adalah **lokasi geografis**. Metode ini secara khusus menangani data dengan **ketergantungan spasial** (nilai di satu lokasi dipengaruhi oleh nilai di lokasi tetangganya). Fokus utamanya adalah pemetaan dan pemodelan **autokorelasi spasial**.",
    "Analisis Survival": "Metode statistik khusus untuk menganalisis data **'waktu-ke-kejadian' (time-to-event)**. Fokusnya adalah memodelkan waktu hingga suatu peristiwa terjadi dan menangani **data tersensor (censored data)**, di mana peristiwa tersebut tidak diamati untuk semua subjek.",
    "Ekonometrika dan Manajemen Risiko": "Aplikasi statistik khusus pada **data keuangan dan ekonomi** untuk mengukur dan mengelola risiko. Fokus utamanya adalah kuantifikasi risiko investasi melalui metrik seperti **Value at Risk (VaR) dan CVaR**, pemodelan portofolio, dan analisis dependensi aset.",
    "Lainnya": "Kategori untuk metodologi statistik yang tidak memiliki karakteristik unik dari kategori lain yang telah disebutkan. Contohnya meliputi **psikometri, bioinformatika, atau analisis data kategorik murni**.",
}

def generate_classification_prompt(batch_items):
    """Build the prompt text sent to Gemini for one batch of items.

    Args:
        batch_items: JSON-serialisable list of items; it is embedded in the
            prompt verbatim via ``json.dumps``.

    Returns:
        str: the prompt, containing one bullet per entry of ``CATEGORIES``,
        the list of allowed category names, the serialised items and the
        required output format.
    """
    # One markdown bullet per category: "- **name**: description"
    category_bullets = "\n".join(
        f"- **{name}**: {description}" for name, description in CATEGORIES.items()
    )
    # Comma-separated list of the exact strings the model may answer with
    allowed_names = ', '.join(CATEGORIES)
    serialized_items = json.dumps(batch_items, indent=2, ensure_ascii=False)

    return f"""
    You are an expert academic classifier specializing in statistics. Your task is to classify each research item into one of the following categories based on its title and abstract.

    **Categories and Descriptions:**
    {category_bullets}

    **Instructions:**
    1. Analyze the title and abstract for each item in the JSON array below.
    2. For each item, determine the most fitting category from the list provided.
    3. Your response MUST be a valid JSON object that maps each 'id' to its corresponding category name.
    4. The category name MUST be one of these exact strings: {allowed_names}.
    5. Do NOT include any explanations, comments, or markdown formatting (like ```json) in your response.

    **Research Items to Classify:**
    {serialized_items}

    **Required Output Format (JSON object):**
    {{
      "id_1": "CategoryName",
      "id_2": "CategoryName",
      ...
    }}
    """


# Marker written to "study_focus" when an item could not be classified.
_CLASSIFICATION_FAILED = "Classification Failed"


def _parse_classification_response(raw_text):
    """Turn the raw model reply into a dict mapping task id -> category.

    Markdown code fences are stripped before parsing.

    Raises:
        ValueError: if the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass) or is
            valid JSON but not an object.
    """
    if not raw_text:
        raise ValueError("API returned an empty response.")

    cleaned_text = raw_text.strip().replace("```json", "").replace("```", "").strip()
    parsed = json.loads(cleaned_text)
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object mapping ids to categories, got {type(parsed).__name__}."
        )
    return parsed


def _request_batch_classifications(model, prompt, retries=3, retry_delay=5):
    """Ask the model to classify one batch, retrying on failure.

    Returns the parsed id -> category mapping, or an empty dict when all
    ``retries`` attempts failed (the caller then marks the items as failed).
    """
    for attempt in range(retries):
        # Reset per attempt so a reply from an earlier attempt/batch is never
        # logged as if it belonged to this one.
        raw_text = None
        try:
            response = model.generate_content(prompt)
            # Read `.text` exactly once, inside the try: the accessor itself
            # may raise (e.g. when the reply carries no usable content), and
            # that must count as a failed attempt instead of escaping later.
            raw_text = response.text
            classifications = _parse_classification_response(raw_text)
            print("    - Batch successfully processed by API.")
            return classifications
        except ValueError as e:
            print(f"    - Warning: API call or parsing failed on attempt {attempt + 1}. Error: {e}")
            # Log the problematic response for debugging
            if raw_text is not None:
                print(f"    - Problematic API response text: '{raw_text}'")
        except Exception as e:
            print(f"    - Warning: An unexpected error occurred on attempt {attempt + 1}. Error: {e}")

        if attempt < retries - 1:
            time.sleep(retry_delay) # Wait before retrying
        else:
            print(f"    - Error: Batch failed after {retries} attempts. Items will be marked 'Classification Failed'.")

    return {}


def classify_theses():
    """Loads, classifies, and saves thesis data with improved robustness.

    Reads ``INPUT_FILENAME`` (``{year: {title: details}}``), sends every item
    that still needs a category to Gemini in batches of ``BATCH_SIZE`` and
    stores the answer in ``details["study_focus"]``. Items whose category is
    missing or not a key of ``CATEGORIES`` are marked
    ``"Classification Failed"``; such items are picked up again when an
    already classified file is used as input. The full data set is written to
    ``OUTPUT_FILENAME``, also when the batch loop is interrupted, so finished
    batches are not lost.

    Returns:
        None.
    """
    try:
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILENAME}' not found. Please run the scraper first.")
        return

    model = genai.GenerativeModel('gemini-2.5-pro')

    tasks = []
    for year, theses in data.items():
        for title, details in theses.items():
            # Skip items that already carry a real classification. A missing
            # key and a previous failure are both treated as "still to do".
            if details.get("study_focus", _CLASSIFICATION_FAILED) != _CLASSIFICATION_FAILED:
                continue

            abstract = details.get("abstract", "") or ""
            # Placeholder abstracts carry no information for the classifier
            if "LIHAT DI FULL TEXT" in abstract.upper():
                abstract = ""

            tasks.append({
                "id": f"task_{len(tasks)}",
                "title": title,
                "abstract": abstract,
                "original_object": details
            })

    if not tasks:
        print("✅ All items are already classified. No action needed.")
        return

    print(f"Found {len(tasks)} items to classify. Starting process in batches of {BATCH_SIZE}...")

    total_batches = (len(tasks) + BATCH_SIZE - 1) // BATCH_SIZE
    try:
        for i in range(0, len(tasks), BATCH_SIZE):
            batch = tasks[i:i + BATCH_SIZE]
            batch_input_for_prompt = [{"id": t["id"], "title": t["title"], "abstract": t["abstract"]} for t in batch]

            print(f"  - Processing batch {i//BATCH_SIZE + 1}/{total_batches}...")

            prompt = generate_classification_prompt(batch_input_for_prompt)
            classifications = _request_batch_classifications(model, prompt)

            # Validate each classification before accepting it
            for task in batch:
                task_id = task["id"]
                category = classifications.get(task_id)

                # The isinstance check keeps an unhashable value (list/dict)
                # from raising inside the membership test.
                if isinstance(category, str) and category in CATEGORIES:
                    task["original_object"]["study_focus"] = category
                else:
                    # Missing from the response or not a valid category
                    task["original_object"]["study_focus"] = _CLASSIFICATION_FAILED
                    if category: # Log if the category was invalid
                        print(f"    - Warning: Invalid category '{category}' for {task_id}. Defaulting to failed.")
    finally:
        # Persist whatever has been classified so far, even if the loop above
        # was interrupted; the task dicts reference the objects inside `data`.
        os.makedirs(os.path.dirname(OUTPUT_FILENAME) or ".", exist_ok=True)
        with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"\n✅ Classification complete! Results saved to '{OUTPUT_FILENAME}'.")


# Entry point for the classification step. Inside a notebook kernel __name__
# is "__main__", so executing this cell starts the Gemini batch run.
if __name__ == "__main__":
    classify_theses()

In [None]:
# Load the classified results written by the classification step
with open(OUTPUT_FILENAME, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten {year: {title: details}} into one record per thesis; keys coming
# from `details` are applied last, exactly like dict.update would.
rows = [
    {'year': year, 'title': title, **details}
    for year, theses in data.items()
    for title, details in theses.items()
]

df = pd.DataFrame(rows)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
excel_path = f'output/unhas_repository_classified_{timestamp}.xlsx'
df.to_excel(excel_path, index=False)
print(f"✅ Data exported to '{excel_path}'")

In [None]:
def simplify_repository_data(input_path):
    """
    Reads a nested JSON repository file, flattens it, and extracts
    only the title, abstract, and study focus for each entry.

    The simplified file is written next to the input file as
    ``unhas_repository_simplified_<timestamp>.json``. A field that is absent
    or stored as JSON ``null`` is reported as ``'Not Available'``.

    Args:
        input_path (str): The path to the source JSON file. A bare filename
            (no directory component) is accepted; the output then lands in
            the current working directory.

    Returns:
        str: The path to the newly created simplified JSON file.
    """
    # Define output directory and create a timestamped filename
    output_dir = os.path.dirname(input_path)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f'unhas_repository_simplified_{timestamp}.json'
    output_path = os.path.join(output_dir, output_filename)

    # dirname is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def field_or_placeholder(details, key):
        """Return details[key], or 'Not Available' when missing or null."""
        value = details.get(key)
        return 'Not Available' if value is None else value

    # Top-level keys are year labels (e.g. '2002', 'NULL'); each maps
    # paper titles to their detail dicts.
    simplified_list = [
        {
            'title': title,
            'abstract': field_or_placeholder(details, 'abstract'),
            'study_focus': field_or_placeholder(details, 'study_focus'),
        }
        for theses in data.values()
        for title, details in theses.items()
    ]

    # Write the new flat list to the output file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(simplified_list, f, indent=4, ensure_ascii=False)

    return output_path

# --- Execution ---
# Inside a notebook kernel __name__ is "__main__", so executing this cell
# runs the simplification on the classified file named below.
if __name__ == "__main__":
    # Classified JSON produced by an earlier run
    input_file = 'output/20250730/unhas_repository_classified_20250730_114427.json'

    # Write the simplified copy next to the input and report where it went
    new_file_path = simplify_repository_data(input_file)
    print(f"✅ Successfully created simplified JSON file at: {new_file_path}")