In [1]:
import os
import re
import PyPDF2
import numpy as np
from nltk.tokenize import sent_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# summarizer.py

# --- 1. KONFIGURASI ---
# Sesuaikan path ini jika struktur folder Anda berbeda
PDF_DIR = os.path.join('data_putusan', 'dok_putusan_pdf')
REF_DIR = os.path.join('data_putusan', 'referensi_ringkasan')

# [cite_start]Compression rates yang akan diuji, sesuai paper [cite: 345]
COMPRESSION_RATES = [75, 50, 25]

# --- 2. SETUP PUSTAKA ---
# [cite_start]Inisialisasi Stemmer dan Stopword Remover dari Sastrawi [cite: 29, 146]
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()


In [4]:
# --- 3. FUNGSI-FUNGSI PREPROCESSING ---

def parse_pdf(file_path):
    """Mengekstrak teks mentah dari file PDF menggunakan PyPDF2."""
    text = ""
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                text += page.extract_text() or ""
    except Exception as e:
        print(f"  - Error parsing {os.path.basename(file_path)}: {e}")
    return text

def preprocess_text(text):
    """
    Menjalankan pipeline preprocessing lengkap sesuai metodologi paper.
    """
    # [cite_start]a. Pembersihan Teks: Menghapus watermark dan spasi berlebih [cite: 132, 134]
    text = re.sub(r'(?i)mahkamah agung republik indonesia', '', text)
    text = re.sub(r'\n+', '\n', text).strip()

    # b. [cite_start]Normalisasi Singkatan: Mencegah salah deteksi akhir kalimat [cite: 138, 139]
    text = re.sub(r'\b(Kec)\.\s*', r'\1 ', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(Jln)\.\s*', r'\1 ', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(No)\.\s*', r'\1 ', text, flags=re.IGNORECASE)

    # c. [cite_start]Segmentasi Kalimat [cite: 136]
    original_sentences = sent_tokenize(text, language='indonesian')

    processed_sentences = []
    for sentence in original_sentences:
        # d. [cite_start]Stopwords Removal [cite: 149]
        temp_sentence = stopword_remover.remove(sentence.lower())
        # e. [cite_start]Stemming [cite: 145]
        temp_sentence = stemmer.stem(temp_sentence)
        processed_sentences.append(temp_sentence)

    return original_sentences, processed_sentences

# --- 4. FUNGSI PERINGKASAN & EVALUASI ---

def summarize_textrank(processed_sents, original_sents, compression_rate):
    """Meringkas teks menggunakan algoritma TextRank. """
    if not processed_sents or not any(processed_sents):
        return ""

    # [cite_start]Representasi kalimat (TF-IDF) [cite: 175]
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_sents)
    except ValueError:
        return "" # Terjadi jika semua kalimat kosong setelah preprocessing

    # [cite_start]Perhitungan matriks kemiripan (Cosine Similarity) [cite: 178, 182]
    sim_matrix = cosine_similarity(tfidf_matrix)
    np.fill_diagonal(sim_matrix, 0) # Hapus self-links

    # Konversi matriks ke graf dan penerapan algoritma TextRank 
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Pemilihan kalimat terbaik
    summary_ratio = (100 - compression_rate) / 100.0
    num_summary_sents = max(1, int(len(original_sents) * summary_ratio))
    
    ranked_sents = sorted(((scores[i], s) for i, s in enumerate(original_sents)), reverse=True)
    top_sents = [s for score, s in ranked_sents[:num_summary_sents]]

    # Mengurutkan kembali kalimat ringkasan sesuai urutan asli
    summary = sorted(top_sents, key=lambda s: original_sents.index(s))
    return " ".join(summary)

def evaluate_summary(system_summary, reference_summary):
    """
    [cite_start]Menghitung Precision, Recall, dan F-measure. [cite: 228]
    """
    system_sents = set(sent_tokenize(system_summary.lower()))
    reference_sents = set(sent_tokenize(reference_summary.lower()))

    tp = len(system_sents.intersection(reference_sents)) # True Positive 
    fp = len(system_sents.difference(reference_sents))   # False Positive
    fn = len(reference_sents.difference(system_sents))   # False Negative

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 # Rumus Precision [cite: 232]
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0     # Rumus Recall 
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 # Rumus F-measure 

    return precision, recall, f_measure

# --- 5. FUNGSI UTAMA ---

def main():
    """Fungsi utama untuk menjalankan seluruh alur replikasi."""
    if not os.path.exists(PDF_DIR) or not os.path.exists(REF_DIR):
        print("❌ Error: Pastikan folder 'Dataset/dok_putusan_pdf' dan 'Dataset/referensi_ringkasan' ada.")
        return

    pdf_files = sorted([f for f in os.listdir(PDF_DIR) if f.endswith('.pdf')])
    
    # Inisialisasi dictionary untuk menyimpan hasil
    results = {rate: {'precision': [], 'recall': [], 'f_measure': []} for rate in COMPRESSION_RATES}

    print(f"🚀 Memulai replikasi pada {len(pdf_files)} dokumen...")

    for i, pdf_file in enumerate(pdf_files[:10]):
        print(f"\n[Dokumen(10) {i+1}/{len(pdf_files)}] Memproses: {pdf_file}")

        # 1. Baca PDF dan Teks Referensi
        pdf_path = os.path.join(PDF_DIR, pdf_file)
        ref_path = os.path.join(REF_DIR, os.path.splitext(pdf_file)[0] + '.txt')

        if not os.path.exists(ref_path):
            print(f"  - Peringatan: File referensi '{os.path.basename(ref_path)}' tidak ditemukan.")
            continue
            
        raw_text = parse_pdf(pdf_path)
        with open(ref_path, 'r', encoding='utf-8') as f:
            ref_summary = f.read()

        # 2. Preprocessing
        original_sents, processed_sents = preprocess_text(raw_text)
        print(f"  - Ditemukan {len(original_sents)} kalimat.")

        # 3. Peringkasan dan Evaluasi untuk setiap compression rate
        for rate in COMPRESSION_RATES:
            # Peringkasan
            system_summary = summarize_textrank(processed_sents, original_sents, rate)
            
            # Evaluasi
            p, r, f1 = evaluate_summary(system_summary, ref_summary)
            
            # Simpan hasil
            results[rate]['precision'].append(p)
            results[rate]['recall'].append(r)
            results[rate]['f_measure'].append(f1)
    
    print("\n\n---" + "="*50)
    print("📊 HASIL AKHIR REPLIKASI (Rata-rata dari semua dokumen)")
    print("---" + "="*50)

    for rate in COMPRESSION_RATES:
        avg_p = np.mean(results[rate]['precision'])
        avg_r = np.mean(results[rate]['recall'])
        avg_f1 = np.mean(results[rate]['f_measure'])
        
        print(f"\n# Compression Rate: {rate}%")
        print(f"  - Rata-rata Precision: {avg_p:.2f}")
        print(f"  - Rata-rata Recall:    {avg_r:.2f}")
        print(f"  - Rata-rata F-measure: {avg_f1:.2f}")
        
    print("\n✅ Replikasi Selesai.")


In [6]:
import nltk
nltk.download('punkt') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:

if __name__ == "__main__":
    main()

🚀 Memulai replikasi pada 50 dokumen...

[Dokumen(10) 1/50] Memproses: doc01.pdf


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/indonesian/[0m

  Searched in:
    - 'C:\\Users\\erwin/nltk_data'
    - 'c:\\Users\\erwin\\VDBQdrant\\nltk_data'
    - 'c:\\Users\\erwin\\VDBQdrant\\share\\nltk_data'
    - 'c:\\Users\\erwin\\VDBQdrant\\lib\\nltk_data'
    - 'C:\\Users\\erwin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
