In [None]:
# # 1. Mount Google Drive (jika menggunakan Colab)
# from google.colab import drive
# drive.mount('/content/drive')


In [1]:

# ===== 2. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# --- Penambahan Pustaka Stemming Sastrawi ---
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


In [None]:

# Unduh resource NLTK jika belum ada
try:
    stopwords.words('indonesian')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


# Inisialisasi Stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Inisialisasi Stopwords
stop_words = set(stopwords.words('indonesian'))


In [3]:
# ===== 3. Fungsi-fungsi Helper =====

def clean_and_stem_sentence(text):
    """
    Membersihkan dan melakukan stemming pada SATU kalimat.
    Fungsi ini dimodifikasi untuk menyertakan stemming.
    """
    # Hilangkan noise & simbol
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Hanya simpan huruf dan spasi
    text = text.lower()

    # Stemming
    text_stemmed = stemmer.stem(text)

    # Tokenisasi kata + filter stopwords
    tokens = word_tokenize(text_stemmed)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(filtered)

def textrank_summarizer(text, ratio=0.25):
    """Fungsi utama TextRank, tidak ada perubahan signifikan di sini."""
    raw_sentences = sent_tokenize(text)

    # Bersihkan setiap kalimat dengan fungsi baru
    indexed_cleaned = [
        (i, clean_and_stem_sentence(sent))
        for i, sent in enumerate(raw_sentences)
    ]
    indexed_cleaned = [(i, sent) for i, sent in indexed_cleaned if sent.strip()]

    if len(indexed_cleaned) < 2:
        return " ".join(raw_sentences) # Kembalikan teks asli jika terlalu pendek

    cleaned_sentences = [sent for _, sent in indexed_cleaned]
    original_indices = [i for i, _ in indexed_cleaned]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Mapping skor kembali ke indeks kalimat asli
    ranked_sentences = sorted(
        ((scores[i], raw_sentences[original_indices[i]]) for i in range(len(original_indices))),
        reverse=True
    )

    n_summary = max(1, int(len(raw_sentences) * ratio))
    # Ambil kalimat terbaik dan urutkan sesuai urutan asli
    top_sentences = [s for _, s in ranked_sentences[:n_summary]]
    summary = sorted(top_sentences, key=lambda s: raw_sentences.index(s))

    return ' '.join(summary)

def evaluate_rouge(system_summary, reference_summary):
    """Fungsi evaluasi ROUGE, tidak ada perubahan."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, system_summary)
    return {metric: score for metric, score in scores.items()}


In [4]:
!pip show sastrawi


Name: Sastrawi
Version: 1.0.1
Summary: Library for stemming Indonesian (Bahasa) text
Home-page: https://github.com/har07/sastrawi
Author: Hanif Amal Robbani
Author-email: dev.har07@gmail.com
License: MIT
Location: C:\Users\erwin\VDBQdrant\Lib\site-packages
Requires: 
Required-by: 


In [None]:

# ===== 3. Fungsi-fungsi Helper =====

def clean_and_stem_sentence(text):
    """
    Membersihkan dan melakukan stemming pada SATU kalimat.
    Fungsi ini dimodifikasi untuk menyertakan stemming.
    """
    # Hilangkan noise & simbol
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Hanya simpan huruf dan spasi
    text = text.lower()

    # Stemming
    text_stemmed = stemmer.stem(text)

    # Tokenisasi kata + filter stopwords
    tokens = word_tokenize(text_stemmed)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(filtered)

def textrank_summarizer(text, ratio=0.25):
    """Fungsi utama TextRank, tidak ada perubahan signifikan di sini."""
    raw_sentences = sent_tokenize(text)

    # Bersihkan setiap kalimat dengan fungsi baru
    indexed_cleaned = [
        (i, clean_and_stem_sentence(sent))
        for i, sent in enumerate(raw_sentences)
    ]
    indexed_cleaned = [(i, sent) for i, sent in indexed_cleaned if sent.strip()]

    if len(indexed_cleaned) < 2:
        return " ".join(raw_sentences) # Kembalikan teks asli jika terlalu pendek

    cleaned_sentences = [sent for _, sent in indexed_cleaned]
    original_indices = [i for i, _ in indexed_cleaned]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Mapping skor kembali ke indeks kalimat asli
    ranked_sentences = sorted(
        ((scores[i], raw_sentences[original_indices[i]]) for i in range(len(original_indices))),
        reverse=True
    )

    n_summary = max(1, int(len(raw_sentences) * ratio))
    # Ambil kalimat terbaik dan urutkan sesuai urutan asli
    top_sentences = [s for _, s in ranked_sentences[:n_summary]]
    summary = sorted(top_sentences, key=lambda s: raw_sentences.index(s))

    return ' '.join(summary)

def evaluate_rouge(system_summary, reference_summary):
    """Fungsi evaluasi ROUGE, tidak ada perubahan."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, system_summary)
    return {metric: score for metric, score in scores.items()}


In [None]:
# ===== 4. Proses Utama =====

MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]  # Sesuai paper

# Sesuaikan dengan path di Google Drive Anda
# dokumen_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/dok_putusan_txt'
# referensi_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/referensi_ringkasan2'

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'


dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

# --- Struktur baru untuk menyimpan hasil agregat ---
aggregated_results = {rate: {'rouge1': [], 'rouge2': [], 'rougeL': []} for rate in COMPRESSION_RATES}

for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"\n📄 Dokumen ke-{idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    # --- Loop untuk setiap compression rate ---
    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0  # Konversi compression rate ke ratio
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        rouge_scores = evaluate_rouge(summary, referensi_text)

        print(f"\n--- Hasil untuk Compression Rate: {rate}% (Ratio: {summary_ratio}) ---")
        for metric, score in rouge_scores.items():
            p, r, f1 = score.precision, score.recall, score.fmeasure
            print(f"  {metric.upper()}: P={p:.4f}, R={r:.4f}, F1={f1:.4f}")

            # Simpan skor F1 untuk agregasi
            aggregated_results[rate][metric].append(f1)
    
    print("-" * 80)



📄 Dokumen ke-1: doc01.txt

--- Hasil untuk Compression Rate: 75% (Ratio: 0.25) ---
  ROUGE1: P=0.3475, R=0.7204, F1=0.4688
  ROUGE2: P=0.2753, R=0.5710, F1=0.3715
  ROUGEL: P=0.2631, R=0.5455, F1=0.3550

--- Hasil untuk Compression Rate: 50% (Ratio: 0.5) ---
  ROUGE1: P=0.2467, R=0.8711, F1=0.3846
  ROUGE2: P=0.2057, R=0.7266, F1=0.3206
  ROUGEL: P=0.1803, R=0.6366, F1=0.2810

--- Hasil untuk Compression Rate: 25% (Ratio: 0.75) ---
  ROUGE1: P=0.2006, R=0.9355, F1=0.3304
  ROUGE2: P=0.1816, R=0.8476, F1=0.2992
  ROUGEL: P=0.1571, R=0.7325, F1=0.2587
--------------------------------------------------------------------------------

📄 Dokumen ke-2: doc02.txt

--- Hasil untuk Compression Rate: 75% (Ratio: 0.25) ---
  ROUGE1: P=0.3463, R=0.9155, F1=0.5025
  ROUGE2: P=0.3071, R=0.8123, F1=0.4457
  ROUGEL: P=0.2705, R=0.7153, F1=0.3926

--- Hasil untuk Compression Rate: 50% (Ratio: 0.5) ---
  ROUGE1: P=0.3044, R=0.9473, F1=0.4607
  ROUGE2: P=0.2799, R=0.8716, F1=0.4237
  ROUGEL: P=0.2430, R=

In [7]:
# ===== 5. Hasil Akhir Evaluasi (Rata-rata) =====
print("\n\n" + "="*30)
print("📊 HASIL AKHIR EVALUASI RATA-RATA")
print("="*30)

for rate, metrics in aggregated_results.items():
    print(f"\n# Rata-rata untuk Compression Rate: {rate}%")
    for metric, f1_scores in metrics.items():
        avg_f1 = np.mean(f1_scores) if f1_scores else 0.0
        print(f"  - Rata-rata F1-Score {metric.upper()}: {avg_f1:.4f}")



📊 HASIL AKHIR EVALUASI RATA-RATA

# Rata-rata untuk Compression Rate: 75%
  - Rata-rata F1-Score ROUGE1: 0.4020
  - Rata-rata F1-Score ROUGE2: 0.3265
  - Rata-rata F1-Score ROUGEL: 0.2793

# Rata-rata untuk Compression Rate: 50%
  - Rata-rata F1-Score ROUGE1: 0.3335
  - Rata-rata F1-Score ROUGE2: 0.2928
  - Rata-rata F1-Score ROUGEL: 0.2495

# Rata-rata untuk Compression Rate: 25%
  - Rata-rata F1-Score ROUGE1: 0.2998
  - Rata-rata F1-Score ROUGE2: 0.2783
  - Rata-rata F1-Score ROUGEL: 0.2399


## Perbandingan dengan data referensi 2

In [None]:
# sss

In [8]:
# ===== 4. Proses Utama =====

MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]  # Sesuai paper

# Sesuaikan dengan path di Google Drive Anda
# dokumen_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/dok_putusan_txt'
# referensi_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/referensi_ringkasan2'

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'


dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

# --- Struktur baru untuk menyimpan hasil agregat ---
aggregated_results = {rate: {'rouge1': [], 'rouge2': [], 'rougeL': []} for rate in COMPRESSION_RATES}

for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"\n📄 Dokumen ke-{idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    # --- Loop untuk setiap compression rate ---
    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0  # Konversi compression rate ke ratio
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        rouge_scores = evaluate_rouge(summary, referensi_text)

        print(f"\n--- Hasil untuk Compression Rate: {rate}% (Ratio: {summary_ratio}) ---")
        for metric, score in rouge_scores.items():
            p, r, f1 = score.precision, score.recall, score.fmeasure
            print(f"  {metric.upper()}: P={p:.4f}, R={r:.4f}, F1={f1:.4f}")

            # Simpan skor F1 untuk agregasi
            aggregated_results[rate][metric].append(f1)
    
    print("-" * 80)



📄 Dokumen ke-1: doc01.txt

--- Hasil untuk Compression Rate: 75% (Ratio: 0.25) ---
  ROUGE1: P=0.3346, R=0.7667, F1=0.4659
  ROUGE2: P=0.2687, R=0.6159, F1=0.3741
  ROUGEL: P=0.2596, R=0.5948, F1=0.3615

--- Hasil untuk Compression Rate: 50% (Ratio: 0.5) ---
  ROUGE1: P=0.2324, R=0.9065, F1=0.3699
  ROUGE2: P=0.1973, R=0.7701, F1=0.3141
  ROUGEL: P=0.1710, R=0.6670, F1=0.2722

--- Hasil untuk Compression Rate: 25% (Ratio: 0.75) ---
  ROUGE1: P=0.1873, R=0.9653, F1=0.3137
  ROUGE2: P=0.1723, R=0.8886, F1=0.2887
  ROUGEL: P=0.1505, R=0.7756, F1=0.2521
--------------------------------------------------------------------------------

📄 Dokumen ke-2: doc02.txt

--- Hasil untuk Compression Rate: 75% (Ratio: 0.25) ---
  ROUGE1: P=0.3376, R=0.9421, F1=0.4970
  ROUGE2: P=0.3023, R=0.8441, F1=0.4452
  ROUGEL: P=0.2680, R=0.7481, F1=0.3947

--- Hasil untuk Compression Rate: 50% (Ratio: 0.5) ---
  ROUGE1: P=0.2950, R=0.9693, F1=0.4524
  ROUGE2: P=0.2750, R=0.9038, F1=0.4217
  ROUGEL: P=0.2393, R=

In [9]:
# ===== 5. Hasil Akhir Evaluasi (Rata-rata) =====
print("\n\n" + "="*30)
print("📊 HASIL AKHIR EVALUASI RATA-RATA")
print("="*30)

for rate, metrics in aggregated_results.items():
    print(f"\n# Rata-rata untuk Compression Rate: {rate}%")
    for metric, f1_scores in metrics.items():
        avg_f1 = np.mean(f1_scores) if f1_scores else 0.0
        print(f"  - Rata-rata F1-Score {metric.upper()}: {avg_f1:.4f}")



📊 HASIL AKHIR EVALUASI RATA-RATA

# Rata-rata untuk Compression Rate: 75%
  - Rata-rata F1-Score ROUGE1: 0.3952
  - Rata-rata F1-Score ROUGE2: 0.3251
  - Rata-rata F1-Score ROUGEL: 0.2815

# Rata-rata untuk Compression Rate: 50%
  - Rata-rata F1-Score ROUGE1: 0.3232
  - Rata-rata F1-Score ROUGE2: 0.2886
  - Rata-rata F1-Score ROUGEL: 0.2465

# Rata-rata untuk Compression Rate: 25%
  - Rata-rata F1-Score ROUGE1: 0.2872
  - Rata-rata F1-Score ROUGE2: 0.2706
  - Rata-rata F1-Score ROUGEL: 0.2333


## Ini yng lain: 

In [10]:
# # 1. Mount Google Drive (jika menggunakan Colab)
# from google.colab import drive
# drive.mount('/content/drive')

# ===== 2. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [11]:
# Pastikan resource NLTK sudah ada
# Blok ini ditambahkan untuk mengatasi LookupError secara otomatis
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Inisialisasi Stemmer dan Stopwords
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))


In [12]:

# ===== 3. Fungsi-fungsi Helper =====

def clean_and_stem_sentence(text):
    """Membersihkan dan melakukan stemming pada SATU kalimat."""
    text = re.sub(r'\n+', ' ', text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text_stemmed = stemmer.stem(text)
    tokens = word_tokenize(text_stemmed)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(filtered)

def textrank_summarizer(text, ratio=0.25):
    """Fungsi utama TextRank."""
    raw_sentences = sent_tokenize(text)
    if len(raw_sentences) < 3:
        return text # Kembalikan teks asli jika terlalu pendek

    indexed_cleaned = [(i, clean_and_stem_sentence(sent)) for i, sent in enumerate(raw_sentences)]
    indexed_cleaned = [(i, sent) for i, sent in indexed_cleaned if sent.strip()]

    if not indexed_cleaned: return ""

    cleaned_sentences = [sent for _, sent in indexed_cleaned]
    original_indices = [i for i, _ in indexed_cleaned]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(
        ((scores[i], raw_sentences[original_indices[i]]) for i in range(len(original_indices))),
        reverse=True
    )

    n_summary = max(1, int(len(raw_sentences) * ratio))
    top_sentences = [s for _, s in ranked_sentences[:n_summary]]
    summary = sorted(top_sentences, key=lambda s: raw_sentences.index(s))

    return ' '.join(summary)

def evaluate_sentence_overlap(system_summary, reference_summary):
    """
    MODIFIKASI: Menghitung TP, FP, FN, P, R, F1 berdasarkan tumpang tindih kalimat.
    """
    system_sents = set(sent_tokenize(system_summary.lower()))
    reference_sents = set(sent_tokenize(reference_summary.lower()))

    # True Positive: Kalimat yang ada di kedua ringkasan
    tp = len(system_sents.intersection(reference_sents))
    # False Positive: Kalimat yang ada di ringkasan sistem, tapi tidak di referensi
    fp = len(system_sents.difference(reference_sents))
    # False Negative: Kalimat yang ada di referensi, tapi tidak di ringkasan sistem
    fn = len(reference_sents.difference(system_sents))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {'tp': tp, 'fp': fp, 'fn': fn, 'p': precision, 'r': recall, 'f1': f_measure}


In [13]:

# ===== 4. Proses Utama =====

MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

# dokumen_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/dok_putusan_txt'
# referensi_dir = '/content/drive/MyDrive/Tugas/PT/data_putusan/referensi_ringkasan2'

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

# Struktur untuk menyimpan hasil rinci
results = {rate: [] for rate in COMPRESSION_RATES}

print(f"🚀 Memulai Proses Evaluasi untuk {len(dokumen_files)} Dokumen...")
for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"  - Memproses Dokumen {idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        
        # Gunakan fungsi evaluasi baru
        metrics = evaluate_sentence_overlap(summary, referensi_text)
        
        # Simpan hasil rinci
        results[rate].append(metrics)

print("\nProses Selesai. Menampilkan Hasil...")

# ===== 5. Tampilan Hasil Akhir Evaluasi (Format Tabel) =====

print("\n\n" + "="*60)
print("📊 HASIL AKHIR EVALUASI RINCI")
print("="*60)

for rate in COMPRESSION_RATES:
    print(f"\n\n### Tabel Evaluasi untuk Compression Rate: {rate}% ###\n")
    # Header Tabel
    print(f"{'Dok':<5} {'TP':>4} {'FP':>4} {'FN':>4} {'P':>7} {'R':>7} {'F-1':>7}")
    print("-" * 42)

    p_scores, r_scores, f1_scores = [], [], []

    # Baris untuk setiap dokumen
    for i, doc_metrics in enumerate(results[rate]):
        p, r, f1 = doc_metrics['p'], doc_metrics['r'], doc_metrics['f1']
        p_scores.append(p)
        r_scores.append(r)
        f1_scores.append(f1)
        
        print(f"{i+1:<5} {doc_metrics['tp']:>4} {doc_metrics['fp']:>4} {doc_metrics['fn']:>4} {p:>7.2f} {r:>7.2f} {f1:>7.2f}")

    # Baris Rata-rata
    avg_p = np.mean(p_scores)
    avg_r = np.mean(r_scores)
    avg_f1 = np.mean(f1_scores)
    print("-" * 42)
    print(f"{'Rata-rata':<5} {'':>4} {'':>4} {'':>4} {avg_p:>7.2f} {avg_r:>7.2f} {avg_f1:>7.2f}")

🚀 Memulai Proses Evaluasi untuk 10 Dokumen...
  - Memproses Dokumen 1: doc01.txt
  - Memproses Dokumen 2: doc02.txt
  - Memproses Dokumen 3: doc03.txt
  - Memproses Dokumen 4: doc04.txt
  - Memproses Dokumen 5: doc05.txt
  - Memproses Dokumen 6: doc06.txt
  - Memproses Dokumen 7: doc07.txt
  - Memproses Dokumen 8: doc08.txt
  - Memproses Dokumen 9: doc09.txt
  - Memproses Dokumen 10: doc10.txt

Proses Selesai. Menampilkan Hasil...


📊 HASIL AKHIR EVALUASI RINCI


### Tabel Evaluasi untuk Compression Rate: 75% ###

Dok     TP   FP   FN       P       R     F-1
------------------------------------------
1        0   44   30    0.00    0.00    0.00
2        0   23   20    0.00    0.00    0.00
3        0   49   30    0.00    0.00    0.00
4        0   98   26    0.00    0.00    0.00
5        0  138   42    0.00    0.00    0.00
6        0   30   19    0.00    0.00    0.00
7        0   22   12    0.00    0.00    0.00
8        4   53   47    0.07    0.08    0.07
9        0   19   29    0.00    

In [15]:
# # 1. Mount Google Drive (jika menggunakan Colab)
# from google.colab import drive
# drive.mount('/content/drive')

# ===== 2. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Pastikan resource NLTK sudah ada
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# ===== 3. Fungsi-fungsi Helper (REVISI) =====

def preprocess_for_tfidf(sentence):
    """Membersihkan satu kalimat SECARA AGRESIF HANYA untuk perhitungan TF-IDF."""
    # Menghapus semua selain huruf, lalu stemming & stopword removal
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', sentence).lower()
    stemmed_text = stemmer.stem(cleaned_text)
    tokens = word_tokenize(stemmed_text)
    return ' '.join([word for word in tokens if word not in stop_words])

def normalize_for_comparison(text):
    """
    Normalisasi ringan HANYA untuk perbandingan:
    - Ubah ke huruf kecil
    - Hapus spasi berlebih di awal/akhir
    - Hapus semua karakter non-alfanumerik
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower().strip())

def textrank_summarizer(text, ratio=0.25):
    """Fungsi utama TextRank dengan alur preprocessing yang diperbaiki."""
    # 1. Lakukan segmentasi kalimat pada teks yang bersih dari noise awal
    clean_text_for_sent_tokenize = re.sub(r'\n+', ' ', text)
    raw_sentences = sent_tokenize(clean_text_for_sent_tokenize)

    if len(raw_sentences) < 3:
        return text

    # 2. Buat versi super-bersih untuk TF-IDF
    processed_for_tfidf = [preprocess_for_tfidf(s) for s in raw_sentences]

    # Pastikan kalimat yang kosong setelah diproses tidak diikutkan
    # sambil melacak indeks aslinya
    indexed_sentences = [
        (i, processed) for i, processed in enumerate(processed_for_tfidf) if processed
    ]
    if not indexed_sentences: return ""

    original_indices = [i for i, _ in indexed_sentences]
    cleaned_sentences_for_tfidf = [s for _, s in indexed_sentences]

    # 3. Proses TF-IDF dan TextRank seperti biasa
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences_for_tfidf)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # 4. Ambil kalimat dari 'raw_sentences' berdasarkan ranking
    ranked_indices = sorted(
        range(len(scores)), key=lambda i: scores[i], reverse=True
    )
    
    n_summary = max(1, int(len(raw_sentences) * ratio))
    
    # Pilih N kalimat teratas berdasarkan indeks aslinya
    top_indices = [original_indices[i] for i in ranked_indices[:n_summary]]

    # Urutkan kembali sesuai urutan kemunculan di teks asli
    summary_sentences = sorted([raw_sentences[i] for i in top_indices])

    return ' '.join(summary_sentences)

def evaluate_sentence_overlap(system_summary, reference_summary):
    """REVISI: Evaluasi dengan menormalkan kedua sisi terlebih dahulu."""
    # Normalisasi kedua ringkasan sebelum membandingkan
    system_sents = {normalize_for_comparison(s) for s in sent_tokenize(system_summary)}
    reference_sents = {normalize_for_comparison(s) for s in sent_tokenize(reference_summary)}
    
    # Hapus string kosong jika ada
    system_sents.discard('')
    reference_sents.discard('')

    tp = len(system_sents.intersection(reference_sents))
    fp = len(system_sents.difference(reference_sents))
    fn = len(reference_sents.difference(system_sents))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {'tp': tp, 'fp': fp, 'fn': fn, 'p': precision, 'r': recall, 'f1': f_measure}

# ===== 4. Proses Utama (Tidak ada perubahan signifikan) =====
MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

results = {rate: [] for rate in COMPRESSION_RATES}

print(f"🚀 Memulai Proses Evaluasi untuk {len(dokumen_files)} Dokumen...")
for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"  - Memproses Dokumen {idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        metrics = evaluate_sentence_overlap(summary, referensi_text)
        results[rate].append(metrics)
print("\nProses Selesai. Menampilkan Hasil...")

# ===== 5. Tampilan Hasil Akhir Evaluasi (Tidak ada perubahan) =====
print("\n\n" + "="*60)
print("📊 HASIL AKHIR EVALUASI RINCI")
print("="*60)
for rate in COMPRESSION_RATES:
    print(f"\n\n### Tabel Evaluasi untuk Compression Rate: {rate}% ###\n")
    print(f"{'Dok':<5} {'TP':>4} {'FP':>4} {'FN':>4} {'P':>7} {'R':>7} {'F-1':>7}")
    print("-" * 42)
    p_scores, r_scores, f1_scores = [], [], []
    for i, doc_metrics in enumerate(results[rate]):
        p, r, f1 = doc_metrics['p'], doc_metrics['r'], doc_metrics['f1']
        p_scores.append(p)
        r_scores.append(r)
        f1_scores.append(f1)
        print(f"{i+1:<5} {doc_metrics['tp']:>4} {doc_metrics['fp']:>4} {doc_metrics['fn']:>4} {p:>7.2f} {r:>7.2f} {f1:>7.2f}")
    avg_p = np.mean(p_scores)
    avg_r = np.mean(r_scores)
    avg_f1 = np.mean(f1_scores)
    print("-" * 42)
    print(f"{'Rata-rata':<5} {'':>4} {'':>4} {'':>4} {avg_p:>7.2f} {avg_r:>7.2f} {avg_f1:>7.2f}")

🚀 Memulai Proses Evaluasi untuk 10 Dokumen...
  - Memproses Dokumen 1: doc01.txt
  - Memproses Dokumen 2: doc02.txt
  - Memproses Dokumen 3: doc03.txt
  - Memproses Dokumen 4: doc04.txt
  - Memproses Dokumen 5: doc05.txt
  - Memproses Dokumen 6: doc06.txt
  - Memproses Dokumen 7: doc07.txt
  - Memproses Dokumen 8: doc08.txt
  - Memproses Dokumen 9: doc09.txt
  - Memproses Dokumen 10: doc10.txt

Proses Selesai. Menampilkan Hasil...


📊 HASIL AKHIR EVALUASI RINCI


### Tabel Evaluasi untuk Compression Rate: 75% ###

Dok     TP   FP   FN       P       R     F-1
------------------------------------------
1        0   44   42    0.00    0.00    0.00
2        1   21   31    0.05    0.03    0.04
3        0   49   40    0.00    0.00    0.00
4        0  104   38    0.00    0.00    0.00
5        0  139   50    0.00    0.00    0.00
6        0   31   29    0.00    0.00    0.00
7        0   26   22    0.00    0.00    0.00
8        5   53   55    0.09    0.08    0.08
9        1   18   28    0.05    

# Perbaikan 1

In [16]:
# perbaikan 

# ===== 2. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer # Mengembalikan ROUGE


In [17]:

# Pastikan resource NLTK sudah ada
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))


In [18]:

# ===== 3. Fungsi-fungsi Helper (REVISI) =====

def preprocess_for_tfidf(sentence):
    """Membersihkan kalimat secara agresif HANYA untuk perhitungan TF-IDF."""
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', sentence).lower()
    stemmed_text = stemmer.stem(cleaned_text)
    tokens = word_tokenize(stemmed_text)
    return ' '.join([word for word in tokens if word not in stop_words])

def normalize_for_comparison(text):
    """Normalisasi ringan untuk perbandingan: huruf kecil dan hapus spasi berlebih."""
    return text.lower().strip()

def textrank_summarizer(text, ratio=0.25):
    """Fungsi TextRank dengan perbaikan pada pengurutan kalimat akhir."""
    clean_text_for_sent_tokenize = re.sub(r'\n+', ' ', text)
    raw_sentences = sent_tokenize(clean_text_for_sent_tokenize)
    if len(raw_sentences) < 3: return text

    processed_for_tfidf = [preprocess_for_tfidf(s) for s in raw_sentences]
    
    indexed_sentences = [(i, processed) for i, processed in enumerate(processed_for_tfidf) if processed]
    if not indexed_sentences: return ""
    
    original_indices = [i for i, _ in indexed_sentences]
    cleaned_sentences_for_tfidf = [s for _, s in indexed_sentences]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences_for_tfidf)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    n_summary = max(1, int(len(raw_sentences) * ratio))
    top_original_indices = [original_indices[i] for i in ranked_indices[:n_summary]]
    
    # PERBAIKAN PENTING: Mengurutkan kalimat ringkasan berdasarkan urutan asli
    top_sentences = [raw_sentences[i] for i in top_original_indices]
    summary_sentences = sorted(top_sentences, key=lambda s: raw_sentences.index(s))

    return ' '.join(summary_sentences)

def evaluate(system_summary, reference_summary):
    """Fungsi evaluasi ganda: Sentence Overlap dan ROUGE."""
    # 1. Evaluasi Sentence Overlap
    system_sents = {normalize_for_comparison(s) for s in sent_tokenize(system_summary)}
    reference_sents = {normalize_for_comparison(s) for s in sent_tokenize(reference_summary)}
    system_sents.discard('')
    reference_sents.discard('')

    tp = len(system_sents.intersection(reference_sents))
    fp = len(system_sents.difference(reference_sents))
    fn = len(reference_sents.difference(system_sents))
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    paper_metrics = {'tp': tp, 'fp': fp, 'fn': fn, 'p': precision, 'r': recall, 'f1': f1}

    # 2. Evaluasi ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_summary, system_summary)

    return paper_metrics, rouge_scores


In [None]:

# ===== 4. Proses Utama =====
MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

# Struktur untuk menyimpan semua hasil
results = {rate: {'paper': [], 'rouge': []} for rate in COMPRESSION_RATES}

print(f"🚀 Memulai Proses Evaluasi untuk {len(dokumen_files)} Dokumen...")
for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"  - Memproses Dokumen {idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        
        paper_metrics, rouge_scores = evaluate(summary, referensi_text)
        
        results[rate]['paper'].append(paper_metrics)
        results[rate]['rouge'].append(rouge_scores)
print("\nProses Selesai. Menampilkan Hasil...")


🚀 Memulai Proses Evaluasi untuk 10 Dokumen...
  - Memproses Dokumen 1: doc01.txt
  - Memproses Dokumen 2: doc02.txt
  - Memproses Dokumen 3: doc03.txt
  - Memproses Dokumen 4: doc04.txt
  - Memproses Dokumen 5: doc05.txt
  - Memproses Dokumen 6: doc06.txt
  - Memproses Dokumen 7: doc07.txt
  - Memproses Dokumen 8: doc08.txt
  - Memproses Dokumen 9: doc09.txt
  - Memproses Dokumen 10: doc10.txt

Proses Selesai. Menampilkan Hasil...


In [21]:
# ===== 5. Tampilan Hasil Akhir Evaluasi =====
print("\n\n" + "="*60)
print("📊 HASIL AKHIR EVALUASI")
print("="*60)

# --- Tabel 1: Metrik Evaluasi Sesuai Paper (Sentence Overlap) ---
print("\n\n--- 1. Metrik Evaluasi Paper (TP, FP, FN) ---")
for rate in COMPRESSION_RATES:
    print(f"\n### Tabel untuk Compression Rate: {rate}% ###")
    print(f"{'Dok':<5} {'TP':>4} {'FP':>4} {'FN':>4} {'P':>7} {'R':>7} {'F-1':>7}")
    print("-" * 42)
    p_scores, r_scores, f1_scores = [], [], []
    for i, metrics in enumerate(results[rate]['paper']):
        p, r, f1 = metrics['p'], metrics['r'], metrics['f1']
        p_scores.append(p)
        r_scores.append(r)
        f1_scores.append(f1)
        print(f"{i+1:<5} {metrics['tp']:>4} {metrics['fp']:>4} {metrics['fn']:>4} {p:>7.2f} {r:>7.2f} {f1:>7.2f}")
    avg_p, avg_r, avg_f1 = np.mean(p_scores), np.mean(r_scores), np.mean(f1_scores)
    print("-" * 42)
    print(f"{'Rata-rata':<5} {'':>4} {'':>4} {'':>4} {avg_p:>7.2f} {avg_r:>7.2f} {avg_f1:>7.2f}")

# --- Tabel 2: Metrik Evaluasi ROUGE (F1-Score) ---
print("\n\n--- 2. Metrik Evaluasi ROUGE (Rata-rata F1-Score) ---")
for rate in COMPRESSION_RATES:
    print(f"\n### Rata-rata F1-Score untuk Compression Rate: {rate}% ###")
    avg_r1 = np.mean([s['rouge1'].fmeasure for s in results[rate]['rouge']])
    avg_r2 = np.mean([s['rouge2'].fmeasure for s in results[rate]['rouge']])
    avg_rl = np.mean([s['rougeL'].fmeasure for s in results[rate]['rouge']])
    print(f"  - Rata-rata ROUGE-1: {avg_r1:.4f}")
    print(f"  - Rata-rata ROUGE-2: {avg_r2:.4f}")
    print(f"  - Rata-rata ROUGE-L: {avg_rl:.4f}")



📊 HASIL AKHIR EVALUASI


--- 1. Metrik Evaluasi Paper (TP, FP, FN) ---

### Tabel untuk Compression Rate: 75% ###
Dok     TP   FP   FN       P       R     F-1
------------------------------------------
1        0   44   42    0.00    0.00    0.00
2        1   21   31    0.05    0.03    0.04
3        0   51   40    0.00    0.00    0.00
4        0  106   38    0.00    0.00    0.00
5        0  140   51    0.00    0.00    0.00
6        0   30   30    0.00    0.00    0.00
7        0   26   22    0.00    0.00    0.00
8        4   54   56    0.07    0.07    0.07
9        1   18   29    0.05    0.03    0.04
10      13   46   62    0.22    0.17    0.19
------------------------------------------
Rata-rata                   0.04    0.03    0.03

### Tabel untuk Compression Rate: 50% ###
Dok     TP   FP   FN       P       R     F-1
------------------------------------------
1        0   85   42    0.00    0.00    0.00
2        2   42   30    0.05    0.06    0.05
3        1   96   39    0.01    0

In [22]:
# VERSI FINAL - FOKUS PADA EVALUASI ROUGE

# # 1. Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# ===== 2. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# ===== 3. Fungsi-fungsi Helper =====

def preprocess_for_tfidf(sentence):
    """Membersihkan kalimat secara agresif HANYA untuk perhitungan TF-IDF."""
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', sentence).lower()
    stemmed_text = stemmer.stem(cleaned_text)
    tokens = word_tokenize(stemmed_text)
    return ' '.join([word for word in tokens if word not in stop_words])

def textrank_summarizer(text, ratio=0.25):
    """Fungsi TextRank dengan perbaikan pada pengurutan kalimat akhir."""
    clean_text_for_sent_tokenize = re.sub(r'\n+', ' ', text)
    raw_sentences = sent_tokenize(clean_text_for_sent_tokenize)
    if len(raw_sentences) < 3: return text

    processed_for_tfidf = [preprocess_for_tfidf(s) for s in raw_sentences]
    
    indexed_sentences = [(i, processed) for i, processed in enumerate(processed_for_tfidf) if processed]
    if not indexed_sentences: return ""
    
    original_indices = [i for i, _ in indexed_sentences]
    cleaned_sentences_for_tfidf = [s for _, s in indexed_sentences]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences_for_tfidf)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    n_summary = max(1, int(len(raw_sentences) * ratio))
    top_original_indices = [original_indices[i] for i in ranked_indices[:n_summary]]
    
    top_sentences = [raw_sentences[i] for i in top_original_indices]
    summary_sentences = sorted(top_sentences, key=lambda s: raw_sentences.index(s))

    return ' '.join(summary_sentences)

# ===== 4. Proses Utama =====
MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'
referensi_dir2 = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

results = {rate: [] for rate in COMPRESSION_RATES}
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

print(f"🚀 Memulai Proses Evaluasi untuk {len(dokumen_files)} Dokumen...")
for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"  - Memproses Dokumen {idx}: {dok_file}")
    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary = textrank_summarizer(dokumen_text, ratio=summary_ratio)
        rouge_scores = scorer.score(referensi_text, summary)
        results[rate].append(rouge_scores)
print("\nProses Selesai. Menampilkan Hasil...")

# ===== 5. Tampilan Hasil Akhir Evaluasi ROUGE =====
print("\n\n" + "="*60)
print("📊 HASIL AKHIR EVALUASI ROUGE (Rata-rata F1-Score)")
print("="*60)

for rate in COMPRESSION_RATES:
    avg_r1 = np.mean([s['rouge1'].fmeasure for s in results[rate]])
    avg_r2 = np.mean([s['rouge2'].fmeasure for s in results[rate]])
    avg_rl = np.mean([s['rougeL'].fmeasure for s in results[rate]])
    print(f"\n### Hasil untuk Compression Rate: {rate}% ###")
    print(f"  - Rata-rata ROUGE-1: {avg_r1:.4f}")
    print(f"  - Rata-rata ROUGE-2: {avg_r2:.4f}")
    print(f"  - Rata-rata ROUGE-L: {avg_rl:.4f}")

🚀 Memulai Proses Evaluasi untuk 10 Dokumen...
  - Memproses Dokumen 1: doc01.txt
  - Memproses Dokumen 2: doc02.txt
  - Memproses Dokumen 3: doc03.txt
  - Memproses Dokumen 4: doc04.txt
  - Memproses Dokumen 5: doc05.txt
  - Memproses Dokumen 6: doc06.txt
  - Memproses Dokumen 7: doc07.txt
  - Memproses Dokumen 8: doc08.txt
  - Memproses Dokumen 9: doc09.txt
  - Memproses Dokumen 10: doc10.txt

Proses Selesai. Menampilkan Hasil...


📊 HASIL AKHIR EVALUASI ROUGE (Rata-rata F1-Score)

### Hasil untuk Compression Rate: 75% ###
  - Rata-rata ROUGE-1: 0.3947
  - Rata-rata ROUGE-2: 0.3234
  - Rata-rata ROUGE-L: 0.2772

### Hasil untuk Compression Rate: 50% ###
  - Rata-rata ROUGE-1: 0.3235
  - Rata-rata ROUGE-2: 0.2887
  - Rata-rata ROUGE-L: 0.2486

### Hasil untuk Compression Rate: 25% ###
  - Rata-rata ROUGE-1: 0.2869
  - Rata-rata ROUGE-2: 0.2701
  - Rata-rata ROUGE-L: 0.2329


# GPT1

In [23]:
# ===== 1. Setup Pustaka & Inisialisasi =====
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('punkt')
nltk.download('stopwords')

stemmer = StemmerFactory().create_stemmer()
stop_words = set(stopwords.words('indonesian'))


# ===== 2. Preprocessing dan Helper Function =====
def clean_and_stem_sentence(text):
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    stemmed = stemmer.stem(text)
    tokens = word_tokenize(stemmed)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(filtered)


def preprocess_text(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = [clean_and_stem_sentence(s) for s in sentences if s.strip()]
    return sentences, cleaned_sentences


# ===== 3. Implementasi TextRank =====
def textrank_summarizer(raw_sentences, cleaned_sentences, ratio=0.25):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(raw_sentences)), reverse=True)
    summary_size = max(1, int(len(raw_sentences) * ratio))
    summary = sorted([s for _, s in ranked_sentences[:summary_size]], key=lambda x: raw_sentences.index(x))
    return summary


# ===== 4. Evaluasi Metrik seperti Paper (TP, FP, FN, Precision, Recall, F1) =====
def evaluate_metrics(system_summary, expert_summary):
    system_set = set(system_summary)
    expert_set = set(expert_summary)

    tp = len(system_set & expert_set)
    fp = len(system_set - expert_set)
    fn = len(expert_set - system_set)

    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0

    return {'TP': tp, 'FP': fp, 'FN': fn, 'Precision': precision, 'Recall': recall, 'F1': f1}


# ===== 5. Proses Utama =====
MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

aggregated_results = {rate: {'Precision': [], 'Recall': [], 'F1': []} for rate in COMPRESSION_RATES}

for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"\n📄 Dokumen ke-{idx}: {dok_file}")

    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    raw_sentences, cleaned_sentences = preprocess_text(dokumen_text)
    expert_sentences, _ = preprocess_text(referensi_text)

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary_sentences = textrank_summarizer(raw_sentences, cleaned_sentences, ratio=summary_ratio)
        metrics = evaluate_metrics(summary_sentences, expert_sentences)

        print(f"\n--- Compression Rate: {rate}% ---")
        print(f"TP={metrics['TP']} FP={metrics['FP']} FN={metrics['FN']}")
        print(f"Precision={metrics['Precision']:.2f} Recall={metrics['Recall']:.2f} F1={metrics['F1']:.2f}")

        aggregated_results[rate]['Precision'].append(metrics['Precision'])
        aggregated_results[rate]['Recall'].append(metrics['Recall'])
        aggregated_results[rate]['F1'].append(metrics['F1'])

# ===== 6. Hasil Akhir Evaluasi (Rata-rata) =====
print("\n\n📊 HASIL AKHIR EVALUASI RATA-RATA")
for rate, metrics in aggregated_results.items():
    print(f"\n# Compression Rate: {rate}%")
    for metric, scores in metrics.items():
        avg_score = np.mean(scores)
        print(f"  Rata-rata {metric}: {avg_score:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erwin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



📄 Dokumen ke-1: doc01.txt

--- Compression Rate: 75% ---
TP=0 FP=44 FN=42
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=0 FP=85 FN=42
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 25% ---
TP=1 FP=123 FN=41
Precision=0.01 Recall=0.02 F1=0.01

📄 Dokumen ke-2: doc02.txt

--- Compression Rate: 75% ---
TP=0 FP=25 FN=32
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=2 FP=42 FN=30
Precision=0.05 Recall=0.06 F1=0.05

--- Compression Rate: 25% ---
TP=5 FP=63 FN=27
Precision=0.07 Recall=0.16 F1=0.10

📄 Dokumen ke-3: doc03.txt

--- Compression Rate: 75% ---
TP=0 FP=51 FN=40
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=1 FP=97 FN=39
Precision=0.01 Recall=0.03 F1=0.01

--- Compression Rate: 25% ---
TP=1 FP=141 FN=39
Precision=0.01 Recall=0.03 F1=0.01

📄 Dokumen ke-4: doc04.txt

--- Compression Rate: 75% ---
TP=0 FP=104 FN=38
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=1 FP=242 FN=37
Precision=

In [None]:
# 2
stemmer = StemmerFactory().create_stemmer()
stop_words = set(stopwords.words('indonesian'))


# ===== 2. Preprocessing dan Helper Function =====
def clean_and_stem_sentence(text):
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    stemmed = stemmer.stem(text)
    tokens = word_tokenize(stemmed)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(filtered)


def preprocess_text(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = [clean_and_stem_sentence(s) for s in sentences if s.strip()]
    return sentences, cleaned_sentences


# ===== 3. Implementasi TextRank =====
def textrank_summarizer(raw_sentences, cleaned_sentences, ratio=0.25):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cleaned_sentences)
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, 0)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(raw_sentences)), reverse=True)
    summary_size = max(1, int(len(raw_sentences) * ratio))
    summary = sorted([s for _, s in ranked_sentences[:summary_size]], key=lambda x: raw_sentences.index(x))
    return summary


# ===== 4. Evaluasi Metrik seperti Paper (TP, FP, FN, Precision, Recall, F1) =====
def evaluate_metrics(system_summary, expert_summary):
    system_set = set(system_summary)
    expert_set = set(expert_summary)

    tp = len(system_set & expert_set)
    fp = len(system_set - expert_set)
    fn = len(expert_set - system_set)

    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0

    return {'TP': tp, 'FP': fp, 'FN': fn, 'Precision': precision, 'Recall': recall, 'F1': f1}


# ===== 5. Proses Utama =====
MAX_DOCS = 10
COMPRESSION_RATES = [75, 50, 25]

dokumen_dir = 'data_putusan/dok_putusan_txt'
referensi_dir = 'data_putusan/referensi_ringkasan'
referensi_dir = 'data_putusan/referensi_ringkasan2'

dokumen_files = sorted([f for f in os.listdir(dokumen_dir) if f.endswith('.txt')])[:MAX_DOCS]
referensi_files = sorted([f for f in os.listdir(referensi_dir) if f.endswith('.txt')])[:MAX_DOCS]

aggregated_results = {rate: {'Precision': [], 'Recall': [], 'F1': []} for rate in COMPRESSION_RATES}

for idx, (dok_file, ref_file) in enumerate(zip(dokumen_files, referensi_files), 1):
    print(f"\n📄 Dokumen ke-{idx}: {dok_file}")

    with open(os.path.join(dokumen_dir, dok_file), 'r', encoding='utf-8') as f:
        dokumen_text = f.read()
    with open(os.path.join(referensi_dir, ref_file), 'r', encoding='utf-8') as f:
        referensi_text = f.read()

    raw_sentences, cleaned_sentences = preprocess_text(dokumen_text)
    expert_sentences, _ = preprocess_text(referensi_text)

    for rate in COMPRESSION_RATES:
        summary_ratio = (100 - rate) / 100.0
        summary_sentences = textrank_summarizer(raw_sentences, cleaned_sentences, ratio=summary_ratio)
        metrics = evaluate_metrics(summary_sentences, expert_sentences)

        print(f"\n--- Compression Rate: {rate}% ---")
        print(f"TP={metrics['TP']} FP={metrics['FP']} FN={metrics['FN']}")
        print(f"Precision={metrics['Precision']:.2f} Recall={metrics['Recall']:.2f} F1={metrics['F1']:.2f}")

        aggregated_results[rate]['Precision'].append(metrics['Precision'])
        aggregated_results[rate]['Recall'].append(metrics['Recall'])
        aggregated_results[rate]['F1'].append(metrics['F1'])

# ===== 6. Hasil Akhir Evaluasi (Rata-rata) =====
print("\n\n📊 HASIL AKHIR EVALUASI RATA-RATA")
for rate, metrics in aggregated_results.items():
    print(f"\n# Compression Rate: {rate}%")
    for metric, scores in metrics.items():
        avg_score = np.mean(scores)
        print(f"  Rata-rata {metric}: {avg_score:.4f}")


📄 Dokumen ke-1: doc01.txt

--- Compression Rate: 75% ---
TP=0 FP=44 FN=48
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=0 FP=85 FN=48
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 25% ---
TP=1 FP=123 FN=47
Precision=0.01 Recall=0.02 F1=0.01

📄 Dokumen ke-2: doc02.txt

--- Compression Rate: 75% ---
TP=0 FP=25 FN=34
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=2 FP=42 FN=32
Precision=0.05 Recall=0.06 F1=0.05

--- Compression Rate: 25% ---
TP=5 FP=63 FN=29
Precision=0.07 Recall=0.15 F1=0.10

📄 Dokumen ke-3: doc03.txt

--- Compression Rate: 75% ---
TP=0 FP=51 FN=44
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=1 FP=97 FN=43
Precision=0.01 Recall=0.02 F1=0.01

--- Compression Rate: 25% ---
TP=2 FP=140 FN=42
Precision=0.01 Recall=0.05 F1=0.02

📄 Dokumen ke-4: doc04.txt

--- Compression Rate: 75% ---
TP=0 FP=104 FN=46
Precision=0.00 Recall=0.00 F1=0.00

--- Compression Rate: 50% ---
TP=1 FP=242 FN=45
Precision=

In [7]:
import os
# Let's check the contents of the folder that may contain the raw text documents
data_folder_path = "data_putusan"
dok_putusan_txt_folder = os.path.join(data_folder_path, 'dok_putusan_txt')
os.listdir(dok_putusan_txt_folder)


['.DS_Store',
 '.ipynb_checkpoints',
 'doc01.txt',
 'doc02.txt',
 'doc03.txt',
 'doc04.txt',
 'doc05.txt',
 'doc06.txt',
 'doc07.txt',
 'doc08.txt',
 'doc09.txt',
 'doc10.txt',
 'doc11.txt',
 'doc12.txt',
 'doc13.txt',
 'doc14.txt',
 'doc15.txt',
 'doc16.txt',
 'doc17.txt',
 'doc18.txt',
 'doc19.txt',
 'doc20.txt',
 'doc21.txt',
 'doc22.txt',
 'doc23.txt',
 'doc24.txt',
 'doc25.txt',
 'doc26.txt',
 'doc27.txt',
 'doc28.txt',
 'doc29.txt',
 'doc30.txt',
 'doc31.txt',
 'doc32.txt',
 'doc33.txt',
 'doc34.txt',
 'doc35.txt',
 'doc36.txt',
 'doc37.txt',
 'doc38.txt',
 'doc39.txt',
 'doc40.txt',
 'doc41.txt',
 'doc42.txt',
 'doc43.txt',
 'doc44.txt',
 'doc45.txt',
 'doc46.txt',
 'doc47.txt',
 'doc48.txt',
 'doc49.txt',
 'doc50.txt']

In [29]:
# Load the content of 'doc23.txt' as a sample document for summarization
sample_doc_path = os.path.join(dok_putusan_txt_folder, 'doc23.txt')

with open(sample_doc_path, 'r', encoding='utf-8') as file:
    document_text = file.read()

# Display the first 500 characters of the document to understand its structure
document_text[:500]

'  PUTUSAN Nomor 374/Pid.Sus/2017/PN Sim  DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA  Pengadilan Negeri Simalungun yang mengadili perkara pidana  dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa :  2. Tempat lahir  : Dosin  3. Umur/Tanggal lahir  : 39/23 Januari 1978  4. Jenis kelamin  : Laki-laki  5. Kebangsaan  : Indonesia  6. Tempat tinggal  :Kampung Tengah nagori Maligas Bayu Kec. ub lik  : Surianto Alias Gundol  1. Nama le'

In [30]:
# Reading all text files in 'dok_putusan_txt' folder
texts = []
for filename in os.listdir(dok_putusan_txt_folder):
    if filename.endswith(".txt") and filename != '.DS_Store':
        file_path = os.path.join(dok_putusan_txt_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            texts.append(text)

# Display the number of text documents loaded
len(texts)  # Showing the number of documents processed


50

In [31]:
print(texts[0][:500])


ep  P U T U SN Nomor 5/Pid.Sus/2020/PN Kag  DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA  Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan  acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa :  Tempat lahir  : Palembang  Umur/Tanggal lahir  : 34 Tahun / 24 April 1985  Jenis kelamin  : Laki-laki  Kebangsaan  : Indonesia  Tempat tinggal  : Jl. Tangga Takat No. 1029 Rt. 17 Rw. 07 Kel. Tangga ub lik  : KA Ibrahim Bin KH Abdul


In [32]:
# Function to clean text: remove non-relevant elements such as headers, footers, and special characters
def clean_text(text):
    # Remove extra spaces, newline characters and special characters like digits and non-ASCII characters
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text.strip()

# Clean all documents in the texts list
cleaned_texts = [clean_text(text) for text in texts]

# Show the first 500 characters of the first cleaned text document
cleaned_texts[0][:500]  # Display the first 500 characters of the first document


'ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Peke'

In [None]:
# Tokenization: Split the text into sentences and words
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize the cleaned documents into sentences
tokenized_sentences = [sent_tokenize(text) for text in cleaned_texts]

# Apply word tokenization to each sentence
tokenized_words = [[word_tokenize(sentence) for sentence in doc] for doc in tokenized_sentences]

['ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Pekerjaan  Belum Bekerja Pendidikan  SMA tidak tamatne Agama k Takat Kec Seberang Ulu II Kota Palembang Terdakwa KA Ibrahim Bin KH Abdullah Murod ditangkap pada tanggal 26 September 2019 dan ditahan dalam rumah tahanan negara oleh  1 Penyidik sejak tanggal 28 September 2019 sampai dengan tanggal 18 Oktober 2019 2 Penyidik Perpanjangan Oleh Penuntut Umum sejak tanggal 19 Oktober 2019 sampai dengan tanggal 27 November 2019 3 Penuntut Umum sejak tanggal 26 November 2019 sampai dengan tanggal lik seja

In [34]:
# Show the first 5 tokenized sentences of the first document
tokenized_sentences[0][:500]  # Display the first 5 sentences from the first document

['ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Pekerjaan  Belum Bekerja Pendidikan  SMA tidak tamatne Agama k Takat Kec Seberang Ulu II Kota Palembang Terdakwa KA Ibrahim Bin KH Abdullah Murod ditangkap pada tanggal 26 September 2019 dan ditahan dalam rumah tahanan negara oleh  1 Penyidik sejak tanggal 28 September 2019 sampai dengan tanggal 18 Oktober 2019 2 Penyidik Perpanjangan Oleh Penuntut Umum sejak tanggal 19 Oktober 2019 sampai dengan tanggal 27 November 2019 3 Penuntut Umum sejak tanggal 26 November 2019 sampai dengan tanggal lik seja

In [38]:
print(len(tokenized_sentences[0][:500]))
print(len(cleaned_texts[0][:500]))

1
500


In [40]:
# Step 3: Stemming and Stopword Removal
from nltk.corpus import stopwords
stopwords_list = set(stopwords.words('indonesian'))  # Add custom stopwords if necessary

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stopwords_list]


In [41]:
# Apply stemming and stopword removal
processed_sentences = []
for sentence in sentences:
    tokens = simple_tokenize_words(sentence)
    tokens = apply_sastrawi_stemming(tokens)
    tokens = remove_stopwords(tokens)
    processed_sentences.append(' '.join(tokens))

NameError: name 'simple_tokenize_words' is not defined

In [None]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# xx

In [10]:
# Initialize the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()


In [11]:
# Text cleaning function: remove headers, footers, and irrelevant symbols
def clean_text(text):
    # Remove extra spaces, newline characters and special characters like digits
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text.strip()

# Tokenize sentences from the document
def tokenize_sentences(text):
    return sent_tokenize(text)

# Function to apply stemming
def apply_stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

# Function to remove stopwords from the text
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('indonesian'))
    return [word for word in tokens if word.lower() not in stop_words]

In [12]:
# Preprocess the document text
cleaned_text = clean_text(document_text)
sentences = tokenize_sentences(cleaned_text)

# Apply stemming and remove stopwords for each sentence
processed_sentences = []
for sentence in sentences:
    tokens = word_tokenize(sentence)
    tokens = apply_stemming(tokens)
    tokens = remove_stopwords(tokens)
    processed_sentences.append(' '.join(tokens))

# Display the first 5 processed sentences
processed_sentences[:5]

['putus nomor 374pidsus2017pn sim adil dasar tuhan maha esa adil negeri simalungun adil perkara pidana acara periksa tingkat jatuh putus perkara dakwa 2 lahir dosin 3 umurtanggal lahir 3923 januari 1978 4 jenis kelamin lakilaki 5 bangsa indonesia 6 tinggal kampung nagori maligas bayu kec ub lik surianto alias gundol 1 nama lengkap 7 agama islam 8 kerja wiraswasta k huta bayu raja kabupaten simalungunne dakwa tangkap sidik tanggal 10 april 2017 tanggal 12 april 2017 dakwa surianto alias gundol tahan tahan rutan 1 sidik tanggal 13 april 2017 tanggal 2 mei 2017 dakwa surianto alias gundol tahan tahan rutan 2 sidik tuntut tanggal 3 mei 2017 tanggal 11 juni 2017 dakwa surianto alias gundol tahan tahan rutan lik juni 2017 tanggal 11 juli 2017 dakwa surianto alias gundol tahan tahan rutan ub 4 sidik ketua adil negeri tanggal 12 juli 2017 tanggal 10 agustus 2017 dakwa surianto alias gundol tahan tahan rutan 5 tuntut tanggal 10 agustus 2017 tanggal 29 agustus 2017 6 hakim adil negeri tanggal 23

In [39]:
# Reimporting Sastrawi Stemmer for stemming the Indonesian text
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Initialize the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Function to apply stemming using Sastrawi
def apply_sastrawi_stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

# Apply stemming to the tokenized sentences
tokenized_sentences_stemmed = []
for doc in improved_tokenized_sentences:
    stemmed_sentences = []
    for sentence in doc:
        tokens = simple_tokenize_words(sentence)
        stemmed_tokens = apply_sastrawi_stemming(tokens)
        stemmed_sentences.append(' '.join(stemmed_tokens))
    tokenized_sentences_stemmed.append(stemmed_sentences)

# Show the first 5 sentences after stemming
tokenized_sentences_stemmed[0][:5]  # Show the first 5 sentences after stemming


NameError: name 'improved_tokenized_sentences' is not defined