In [1]:
import os
import re
import PyPDF2
import networkx as nx
from nltk.tokenize import sent_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from rouge_score import rouge_scorer  # Pustaka untuk ROUGE evaluation


In [2]:

# Fungsi untuk mengekstrak teks dari file PDF (jika diperlukan)
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
    return text

# Fungsi untuk membersihkan teks (hapus header/footer/watermark)
def clean_text(text):
    # Menghapus header, footer, watermark menggunakan regex
    text = re.sub(r'Header Pattern', '', text)
    text = re.sub(r'Footer Pattern', '', text)
    text = re.sub(r'Watermark Pattern', '', text)
    return text

# Fungsi untuk stemming
def apply_stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)

# Fungsi untuk menghapus stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('indonesian'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Fungsi untuk preprocessing teks (tokenisasi, stemming, stopwords removal)
def preprocess_text(text):
    cleaned_text = clean_text(text)
    stemmed_text = apply_stemming(cleaned_text)
    final_text = remove_stopwords(stemmed_text)
    return final_text

# Fungsi untuk memproses dokumen dalam folder
def process_txt_files_in_folder(folder_path):
    all_texts = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                processed_text = preprocess_text(text)
                all_texts.append(processed_text)
                filenames.append(filename)
    return all_texts, filenames

# Fungsi untuk menghitung TF-IDF
def compute_tfidf(texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix

# Fungsi untuk menghitung cosine similarity antara kalimat-kalimat
def compute_cosine_similarity(tfidf_matrix):
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    return cosine_sim_matrix

# Fungsi untuk menerapkan algoritma TextRank
def textrank_summarize(texts):
    # Tokenisasi kalimat
    sentences = [sent_tokenize(text) for text in texts]
    flattened_sentences = [sentence for sublist in sentences for sentence in sublist]
    
    # Menghitung TF-IDF
    tfidf_matrix = compute_tfidf(flattened_sentences)
    
    # Menghitung cosine similarity antar kalimat
    cosine_sim_matrix = compute_cosine_similarity(tfidf_matrix)
    
    # Membangun graf berdasarkan cosine similarity
    graph = nx.from_numpy_array(cosine_sim_matrix)
    scores = nx.pagerank(graph)
    
    # Menyusun kalimat berdasarkan skor tertinggi
    ranked_sentences = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
    
    summary = ' '.join([flattened_sentences[idx] for score, idx in ranked_sentences[:5]])  # Ambil 5 kalimat teratas
    return summary, flattened_sentences  # Mengembalikan kalimat terpilih untuk evaluasi

# Fungsi untuk menghitung evaluasi Precision, Recall, dan F-measure
def calculate_precision_recall_f1(true_summary, generated_summary):
    true_set = set(true_summary.split())
    generated_set = set(generated_summary.split())
    
    # Precision
    precision = len(true_set.intersection(generated_set)) / len(generated_set) if generated_set else 0
    
    # Recall
    recall = len(true_set.intersection(generated_set)) / len(true_set) if true_set else 0
    
    # F1 score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
    
    return precision, recall, f1

# Fungsi untuk menghitung ROUGE
def calculate_rouge(true_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(true_summary, generated_summary)
    return scores

In [3]:

# Contoh penggunaan untuk folder teks yang sudah diproses
folder_path = 'data_putusan/dok_putusan_txt'  # Ganti dengan path folder yang berisi dokumen putusan dalam format .txt
texts, filenames = process_txt_files_in_folder(folder_path)

# Menyaring dan merangkum setiap dokumen, serta mengevaluasi hasilnya
summaries = {}
precision_all = []
recall_all = []
f1_all = []
rouge_scores = []

for idx, text in enumerate(texts):
    # Hasil peringkasan untuk setiap dokumen
    summary, flattened_sentences = textrank_summarize([text])  # Peringkasan untuk setiap dokumen
    summaries[filenames[idx]] = summary
    
    # Membaca ringkasan referensi dari file referensi_ringkasan (asumsikan file ini tersedia)
    reference_summary_path = os.path.join(folder_path, 'referensi_ringkasan', f'{filenames[idx]}_ref.txt')
    with open(reference_summary_path, 'r', encoding='utf-8') as ref_file:
        reference_summary = ref_file.read().strip()

    # Evaluasi dengan Precision, Recall, F1
    precision, recall, f1 = calculate_precision_recall_f1(reference_summary, summary)
    precision_all.append(precision)
    recall_all.append(recall)
    f1_all.append(f1)

    # Evaluasi dengan ROUGE
    rouge_score = calculate_rouge(reference_summary, summary)
    rouge_scores.append(rouge_score)

# Menampilkan hasil evaluasi untuk setiap file
for filename, summary in summaries.items():
    print(f"Ringkasan untuk {filename}:\n{summary}\n")

# Menampilkan hasil evaluasi
print(f"Precision: {sum(precision_all) / len(precision_all)}")
print(f"Recall: {sum(recall_all) / len(recall_all)}")
print(f"F1 Score: {sum(f1_all) / len(f1_all)}")
print(f"ROUGE Scores: {rouge_scores}")


FileNotFoundError: [Errno 2] No such file or directory: 'data_putusan/dok_putusan_txt\\referensi_ringkasan\\doc01.txt_ref.txt'