In [1]:
%pip install pdfminer.six pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Part 1: Import Libraries and Initialize Directories
import os
import re
import csv
import logging
from datetime import datetime
import pandas as pd

# Define base directory (aligned with 01_scraper.py)
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be run from the 'notebooks' folder.
# (The previous `except NameError` fallback could never fire: os.getcwd()
# does not raise NameError.)
BASE_DIR = os.path.dirname(os.getcwd())  # Parent of 'notebooks'

# Define paths
PATH_OUTPUT = os.path.join(BASE_DIR, 'data', 'raw')  # input .txt files are read from here
PATH_CSV = os.path.join(BASE_DIR, 'data', 'processed', 'cases.csv')  # extracted metadata table
LOG_DIR = os.path.join(BASE_DIR, 'logs')
LOG_PATH = os.path.join(LOG_DIR, 'metadata_extraction.log')

# Upper bound on path length (in characters) enforced by validate_path
MAX_PATH_LENGTH = 260


def validate_path(path):
    """Return ``path`` unchanged if it is not longer than MAX_PATH_LENGTH.

    Args:
        path: Path string to check.

    Returns:
        The same ``path`` object that was passed in.

    Raises:
        ValueError: If ``len(path)`` is greater than MAX_PATH_LENGTH.
    """
    fits_within_limit = len(path) <= MAX_PATH_LENGTH
    if fits_within_limit:
        return path
    raise ValueError(f"Path {path} exceeds Windows maximum length of {MAX_PATH_LENGTH} characters")

# The log directory has to exist before logging.FileHandler can open LOG_PATH,
# so create it up front. Nothing is logged here because logging is not
# configured yet; any failure simply propagates.
validate_path(LOG_DIR)
os.makedirs(LOG_DIR, exist_ok=True)

# Initialize logging
# force=True replaces handlers left over from a previous run of this cell, and
# mode='w' starts a fresh log file on every run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_PATH, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ],
    force=True
)
logging.info("Starting metadata extraction process at %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Ensure directories exist
# This runs after logging is configured so that the records below actually
# reach both the console and the log file.
for path in [LOG_DIR, os.path.dirname(PATH_CSV), PATH_OUTPUT]:
    try:
        validate_path(path)
        os.makedirs(path, exist_ok=True)
    except ValueError as e:
        logging.error(f"Path validation failed: {e}")
        raise
    except Exception as e:
        logging.error(f"Failed to create directory {path}: {e}")
        raise
    else:
        logging.info(f"Directory ensured: {path}")

# Part 2: Utility Functions
# Titles/degrees stripped by clean_name. Each one must start at a word boundary
# and end with a literal dot, so that name endings such as "Abdullah." or
# "Munir." are not mistaken for the titles "H." / "Ir.".
_TITLE_PATTERN = re.compile(
    r'\b(?:drh|dr|prof|ir|hj|s\.h|m\.h|sh|mh|h)\.',
    re.IGNORECASE,
)


def clean_name(name):
    """Strip titles/degrees from a person's name and normalize whitespace.

    The removed tokens (case-insensitive, each followed by a dot) are:
    dr., drh., sh., mh., m.h., s.h., prof., ir., hj., h.

    Args:
        name: Raw name string. ``None`` or an empty string is accepted.

    Returns:
        The name with the listed titles removed and runs of whitespace
        collapsed to single spaces; ``''`` when ``name`` is empty or ``None``.
        Other punctuation (for example commas) is left untouched.
    """
    if not name:
        return ''
    without_titles = _TITLE_PATTERN.sub('', name)
    # split()/join collapses internal whitespace and trims both ends.
    return ' '.join(without_titles.split())

# Indonesian month name -> English month name, declared in calendar order.
month_map = {
    'januari': 'January', 'februari': 'February', 'maret': 'March',
    'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
    'agustus': 'August', 'september': 'September', 'oktober': 'October',
    'november': 'November', 'desember': 'December'
}

# Lower-cased month name (Indonesian or English) -> (month number, English name).
# Relies on month_map being listed from January to December.
_MONTH_LOOKUP = {
    alias: (number, english)
    for number, (indonesian, english) in enumerate(month_map.items(), start=1)
    for alias in (indonesian, english.lower())
}

# Case-number patterns, tried in order. The captured group ends at the first
# four-digit run; '.' and '-' are allowed inside the number so that values such
# as "123/Pid.B/2023" match, and an optional colon may follow the word "nomor".
_NOMOR_PATTERNS = [
    r'(?:putusan|penetapan)\s*nomor\s*:?\s*([\w/.\-\s]+?\d{4})',
    r'nomor\s*:?\s*([\w/.\-\s]+?\d{4})',
]

# "<day> <month word> <4-digit year>"
_DATE_PATTERN = re.compile(r'(\d{1,2})\s+(\w+)\s+(\d{4})')

# (keyword, label) tables; the first keyword found in the lower-cased text wins.
# NOTE(review): 'pidana' is checked first, so 'Pidana Anak' / 'Pidana Narkotika'
# are only chosen when the text does not contain 'pidana' at all. The original
# priority is kept as-is -- confirm whether that is intended.
_JENIS_PERKARA = [
    ('pidana', 'Pidana'),
    ('perdata', 'Perdata'),
    ('anak', 'Pidana Anak'),
    ('narkotika', 'Pidana Narkotika'),
]
_TINGKAT_PEMERIKSAAN = [
    ('peninjauan kembali', 'Peninjauan Kembali'),
    ('kasasi', 'Kasasi'),
    ('banding', 'Banding'),
]
_LEMBAGA_PERADILAN = [
    ('mahkamah agung', 'Mahkamah Agung'),
    ('pengadilan tinggi', 'Pengadilan Tinggi'),
    ('pengadilan agama', 'Pengadilan Agama'),
    ('pengadilan negeri', 'Pengadilan Negeri'),
]

# Patterns for the facts summary, tried in order.
_RINGKASAN_PATTERNS = [
    r'(?:terdakwa|terpidana).*?dakwaan.*?berikut\s+(.+?)\s+(?:menimbang|demikian)',
    r'mengadili\s+(.+?)\s+(?:menimbang|demikian)',
]


def _find_nomor_perkara(text):
    """Return the first case number matched by _NOMOR_PATTERNS, or ''."""
    for pattern in _NOMOR_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return re.sub(r'\s+', ' ', match.group(1)).strip()
    return ''


def _find_first_valid_date(text):
    """Return (datetime, English month name) for the first valid date, or None.

    Candidates whose month word is unknown or whose day/month/year do not form
    a real calendar date (e.g. 31 Februari) are skipped.
    """
    for candidate in _DATE_PATTERN.finditer(text):
        day, month_word, year = candidate.groups()
        month = _MONTH_LOOKUP.get(month_word.lower())
        if month is None:
            continue
        month_number, english_name = month
        try:
            return datetime(int(year), month_number, int(day)), english_name
        except ValueError:
            # Not a real calendar date; keep scanning.
            continue
    return None


def _first_label(lowered_text, keyword_labels, default=''):
    """Return the label of the first keyword contained in lowered_text."""
    for keyword, label in keyword_labels:
        if keyword in lowered_text:
            return label
    return default


# Part 3: Metadata Extraction Function
def extract_metadata(text, file_name):
    """Extract case metadata from the text of one court decision.

    All fields are derived with regular expressions and keyword lookups on the
    whitespace-normalized text, so they are best-effort heuristics.

    Args:
        text: Full text of the decision.
        file_name: Name of the source file; a trailing '.txt' is dropped to
            form ``case_id``.

    Returns:
        dict with the keys:
            case_id, nomor_perkara, tahun_putusan, bulan_putusan,
            tanggal_putusan, jenis_perkara, tingkat_pemeriksaan,
            lembaga_peradilan, pasal, hakim_ketua, ringkasan_fakta,
            jumlah_kata_putusan, full_text.
        Fields that cannot be found are left as ``''``, except
        ``tingkat_pemeriksaan`` (defaults to 'Pertama') and
        ``lembaga_peradilan`` (defaults to 'Unknown'). ``full_text`` holds the
        text exactly as passed in; ``jumlah_kata_putusan`` is the number of
        whitespace-separated tokens.
    """
    # Drop only the extension, not every '.txt' occurrence inside the name.
    case_id = file_name[:-len('.txt')] if file_name.endswith('.txt') else file_name
    metadata = {
        'case_id': case_id,
        'nomor_perkara': '', 'tahun_putusan': '', 'bulan_putusan': '',
        'tanggal_putusan': '', 'jenis_perkara': '', 'tingkat_pemeriksaan': '',
        'lembaga_peradilan': '', 'pasal': '', 'hakim_ketua': '',
        'ringkasan_fakta': '', 'jumlah_kata_putusan': 0, 'full_text': text
    }

    text = ' '.join(text.split())  # Normalize spacing (also removes newlines)
    lowered = text.lower()  # computed once for all keyword lookups

    # Nomor perkara
    metadata['nomor_perkara'] = _find_nomor_perkara(text)
    if not metadata['nomor_perkara']:
        logging.warning(f"No nomor_perkara found in {file_name}. Sample: {text[:200]}")

    # Case year: the case number is captured so that it ends with the year.
    match = re.search(r'(\d{4})$', metadata['nomor_perkara'])
    if match:
        metadata['tahun_putusan'] = match.group(1)

    # Tanggal putusan: the first calendar-valid "<day> <month> <year>" in the text.
    found_date = _find_first_valid_date(text)
    if found_date is not None:
        date_obj, english_month = found_date
        metadata['tanggal_putusan'] = str(date_obj.day)
        metadata['bulan_putusan'] = english_month
        # The year from the case number takes precedence over the date's year.
        metadata['tahun_putusan'] = metadata['tahun_putusan'] or str(date_obj.year)

    # Jenis perkara / tingkat pemeriksaan / lembaga peradilan
    metadata['jenis_perkara'] = _first_label(lowered, _JENIS_PERKARA)
    metadata['tingkat_pemeriksaan'] = _first_label(lowered, _TINGKAT_PEMERIKSAAN, default='Pertama')
    metadata['lembaga_peradilan'] = _first_label(lowered, _LEMBAGA_PERADILAN, default='Unknown')

    # Pasal: first article reference, with an optional "ayat N" or "ayat (N)".
    match = re.search(r'(pasal\s+\d+(?:\s+ayat\s+\(?\d+\)?)?)', text, re.IGNORECASE)
    if match:
        metadata['pasal'] = match.group(1)

    # Hakim Ketua
    # NOTE(review): newlines were already collapsed above, so this capture runs
    # up to the next comma and may include more than the name -- confirm
    # against real documents.
    match = re.search(r'(?:ketua\s+majelis|hakim\s+ketua)[:\s]*([^\n,]+)', text, re.IGNORECASE)
    if match:
        metadata['hakim_ketua'] = clean_name(match.group(1))

    # Ringkasan fakta: first matching pattern, truncated to 1000 characters.
    for pattern in _RINGKASAN_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            summary = ' '.join(match.group(1).split())
            metadata['ringkasan_fakta'] = summary[:1000] + '...' if len(summary) > 1000 else summary
            break

    metadata['jumlah_kata_putusan'] = len(text.split())
    return metadata

# Part 4: CSV Saving Function
def save_to_csv(metadata_list):
    """Write the extracted metadata rows to PATH_CSV, replacing any existing file.

    Args:
        metadata_list: List of dicts, one per case, whose keys are the column
            names listed below.

    The file is written as UTF-8 with a header row, and an INFO record with
    the number of rows is logged once all rows have been written.
    """
    columns = [
        'case_id', 'nomor_perkara', 'tahun_putusan', 'bulan_putusan', 'tanggal_putusan',
        'jenis_perkara', 'tingkat_pemeriksaan', 'lembaga_peradilan', 'pasal',
        'hakim_ketua', 'ringkasan_fakta', 'jumlah_kata_putusan', 'full_text'
    ]
    # newline='' lets the csv module control line endings itself.
    with open(PATH_CSV, 'w', newline='', encoding='utf-8') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(metadata_list)
        logging.info(f"Saved {len(metadata_list)} metadata entries to {PATH_CSV}")

# Part 5: Main Processing

def _read_text_file(path):
    """Return the stripped contents of ``path``, read as UTF-8 or, failing that, Latin-1."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except UnicodeDecodeError:
        logging.warning(f"{path} is not valid UTF-8; re-reading as latin-1")
        with open(path, 'r', encoding='latin-1') as f:
            return f.read().strip()


def process_text_files():
    """Extract metadata from every .txt file in PATH_OUTPUT and save it to PATH_CSV.

    Files are processed in sorted name order so the CSV rows come out in the
    same order on every run (os.listdir makes no ordering guarantee). Empty
    files and files that cannot be read are logged and skipped instead of
    stopping the run. Nothing is written when no metadata was extracted.
    """
    if not os.path.exists(PATH_OUTPUT):
        logging.error(f"Directory {PATH_OUTPUT} does not exist.")
        return

    files = sorted(f for f in os.listdir(PATH_OUTPUT) if f.endswith('.txt'))
    if not files:
        logging.warning(f"No .txt files found in {PATH_OUTPUT}")
        return

    metadata_list = []
    for file in files:
        path = os.path.join(PATH_OUTPUT, file)
        try:
            text = _read_text_file(path)
        except OSError as e:
            logging.error(f"Could not read {file}: {e}")
            continue
        if not text:
            logging.warning(f"Skipping empty file: {file}")
            continue
        metadata_list.append(extract_metadata(text, file))
        logging.info(f"Processed file: {file}")

    if metadata_list:
        save_to_csv(metadata_list)
        print(f"Processed {len(metadata_list)} files and saved to {PATH_CSV}")
    else:
        logging.warning("No metadata extracted")

# Part 6: Entry Point
if __name__ == "__main__":
    process_text_files()

2025-06-26 02:34:39,553 - INFO - Directory ensured: c:\Users\Faliq\Pictures\PENALARAN-KOMPUTER\CBR_Penalararan_Komputer\CBR\logs
2025-06-26 02:34:39,554 - INFO - Directory ensured: c:\Users\Faliq\Pictures\PENALARAN-KOMPUTER\CBR_Penalararan_Komputer\CBR\data\processed
2025-06-26 02:34:39,555 - INFO - Directory ensured: c:\Users\Faliq\Pictures\PENALARAN-KOMPUTER\CBR_Penalararan_Komputer\CBR\data\raw
2025-06-26 02:34:39,557 - INFO - Starting metadata extraction process at 2025-06-26 02:34:39
2025-06-26 02:34:39,566 - INFO - Processed file: case_001.txt
2025-06-26 02:34:39,573 - INFO - Processed file: case_002.txt
2025-06-26 02:34:39,579 - INFO - Processed file: case_003.txt
2025-06-26 02:34:39,585 - INFO - Processed file: case_004.txt
2025-06-26 02:34:39,589 - INFO - Processed file: case_005.txt
2025-06-26 02:34:39,594 - INFO - Processed file: case_007.txt
2025-06-26 02:34:39,598 - INFO - Processed file: case_008.txt
2025-06-26 02:34:39,613 - INFO - Processed file: case_009.txt
2025-06-26

Processed 48 files and saved to c:\Users\Faliq\Pictures\PENALARAN-KOMPUTER\CBR_Penalararan_Komputer\CBR\data\processed\cases.csv
