In [1]:
# Import necessary libraries
import os
import re
import string
import numpy as np
import pandas as pd
import networkx as nx
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

# Confirm all libraries are loaded
print("✅ Pustaka berhasil diimpor.")


✅ Pustaka berhasil diimpor.


In [2]:
# Initialize the Sastrawi stemmer and stopword remover
factory = StemmerFactory()
stemmer = factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

# Confirm stemmer and stopword remover are initialized
print("✅ Stemmer dan Stopword Remover berhasil diinisialisasi.")


✅ Stemmer dan Stopword Remover berhasil diinisialisasi.


In [7]:
# Step 1: Define text preprocessing functions using Sastrawi

def remove_headers_footers(text):
    # Basic regular expression to remove common header/footer (watermarks, etc.)
    text = re.sub(r'\b(Mahkamah Agung|Nomor|Tanggal)\b', '', text)
    return text

def clean_text(text):
    # Remove punctuation, numbers, extra spaces
    text = text.lower()
    text = re.sub(f"[{string.punctuation}0-9]", "", text)
    text = ' '.join(text.split())  # Remove extra spaces
    return text

def sentence_tokenize(text):
    # A simple sentence tokenizer based on punctuation
    return re.split(r'(?<!\w\.\w.)(?<=\.|\?)\s', text)

def apply_stemming(sentence):
    # Apply Sastrawi stemmer to the sentence
    return ' '.join([stemmer.stem(word) for word in sentence.split()])

def preprocess_text_with_sastrawi(text):
    # Remove non-text elements like watermarks, header/footer
    text = remove_headers_footers(text)
    
    # Tokenization and cleaning
    text = clean_text(text)
    
    # Tokenize sentences and words
    sentences = sentence_tokenize(text)
    
    # Apply stemming using Sastrawi
    stemmed_sentences = [apply_stemming(sentence) for sentence in sentences]
    
    # Remove stopwords using Sastrawi stopword remover
    cleaned_sentences = [stopword_remover.remove(sentence) for sentence in stemmed_sentences]
    
    return cleaned_sentences

# Test preprocessing with a sample text
sample_text = "Ini adalah contoh kalimat. Ada banyak hal yang perlu dipertimbangkan."
processed_sample = preprocess_text_with_sastrawi(sample_text)
processed_sample


['adalah contoh kalimat banyak yang perlu timbang']

In [8]:
# Step 2: TF-IDF Calculation

def compute_tfidf(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    return tfidf_matrix, vectorizer

# Test TF-IDF on processed sample text
tfidf_matrix, vectorizer = compute_tfidf(processed_sample)
tfidf_matrix.toarray()  # Display the TF-IDF matrix as an array


array([[0.37796447, 0.37796447, 0.37796447, 0.37796447, 0.37796447,
        0.37796447, 0.37796447]])

In [9]:
# Step 3: Cosine Similarity Calculation

def cosine_similarity(tfidf_matrix):
    cosine_similarities = np.dot(tfidf_matrix, tfidf_matrix.T).toarray()
    return cosine_similarities

# Calculate cosine similarity for the sample text's TF-IDF matrix
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim  # Display cosine similarity matrix


array([[1.]])

In [10]:
# Step 4: TextRank Algorithm with Explanation of Sentence Ranking

def textrank(cosine_similarities, sentences, top_n=5):
    # Create similarity graph
    nx_graph = nx.from_numpy_matrix(cosine_similarities)
    scores = nx.pagerank(nx_graph)  # Compute TextRank scores
    
    # Rank sentences based on the TextRank scores
    ranked_sentences = [sentences[i] for i in sorted(scores, key=scores.get, reverse=True)[:top_n]]
    
    # Display ranking information for analysis
    ranked_info = [(sentences[i], scores[i]) for i in sorted(scores, key=scores.get, reverse=True)]
    print("\nRanking of Sentences Based on TextRank Scores:")
    for rank, (sentence, score) in enumerate(ranked_info, 1):
        print(f"Rank {rank}: Score {score:.4f} | Sentence: {sentence[:100]}...")

    return ' '.join(ranked_sentences)

# Apply TextRank on processed sample text
generated_summary = textrank(cosine_sim, processed_sample, top_n=3)
generated_summary


AttributeError: module 'networkx' has no attribute 'from_numpy_matrix'

In [11]:
# Step 5: ROUGE Evaluation

def evaluate_rouge(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(metrics=['rouge1', 'rouge2', 'rougeL'], lang='en')
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Evaluate using ROUGE
reference_summary = "Ini adalah ringkasan referensi yang dibuat oleh ahli."
rouge_scores = evaluate_rouge(reference_summary, generated_summary)
rouge_scores


# Step 6: Precision, Recall, F-Measure Calculation

def evaluate_precision_recall_fmeasure(reference_summary, generated_summary):
    reference_tokens = set(reference_summary.split())
    generated_tokens = set(generated_summary.split())
    
    true_positive = len(reference_tokens & generated_tokens)
    false_positive = len(generated_tokens - reference_tokens)
    false_negative = len(reference_tokens - generated_tokens)
    
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f_measure

# Evaluate using Precision, Recall, F-Measure
precision, recall, f_measure = evaluate_precision_recall_fmeasure(reference_summary, generated_summary)
precision, recall, f_measure


NameError: name 'generated_summary' is not defined