In [1]:
import os
import re
import string
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge_score import rouge_scorer
from sastrawi.stemmer import Stemmer
from sastrawi.stopwordremover import StopWordRemoverFactory

ModuleNotFoundError: No module named 'sastrawi'

In [3]:
!pip show sastrawi


Name: Sastrawi
Version: 1.0.1
Summary: Library for stemming Indonesian (Bahasa) text
Home-page: https://github.com/har07/sastrawi
Author: Hanif Amal Robbani
Author-email: dev.har07@gmail.com
License: MIT
Location: C:\Users\erwin\VDBQdrant\Lib\site-packages
Requires: 
Required-by: 


In [None]:


# Step 1: Define text preprocessing functions using Sastrawi
def remove_headers_footers(text):
    # Basic regular expression to remove common header/footer (watermarks, etc.)
    text = re.sub(r'\b(Mahkamah Agung|Nomor|Tanggal)\b', '', text)
    return text

def clean_text(text):
    # Remove punctuation, numbers, extra spaces
    text = text.lower()
    text = re.sub(f"[{string.punctuation}0-9]", "", text)
    text = ' '.join(text.split())  # Remove extra spaces
    return text

def sentence_tokenize(text):
    # A simple sentence tokenizer based on punctuation (could be expanded)
    return re.split(r'(?<!\w\.\w.)(?<=\.|\?)\s', text)

def apply_stemming(sentence):
    stemmer = Stemmer()
    return ' '.join([stemmer.stem(word) for word in sentence.split()])

def preprocess_text_with_sastrawi(text):
    # Remove non-text elements like watermarks, header/footer
    text = remove_headers_footers(text)
    
    # Tokenization and cleaning
    text = clean_text(text)
    
    # Tokenize sentences and words
    sentences = sentence_tokenize(text)
    
    # Apply stemming using Sastrawi
    stemmed_sentences = [apply_stemming(sentence) for sentence in sentences]
    
    # Remove stopwords using Sastrawi stopword remover
    stopwords_factory = StopWordRemoverFactory()
    stopwords = stopwords_factory.get_stopwords()  # Sastrawi's stopword list
    cleaned_sentences = [' '.join([word for word in sentence.split() if word not in stopwords]) for sentence in stemmed_sentences]
    
    return cleaned_sentences

# Step 2: TF-IDF Calculation
def compute_tfidf(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    return tfidf_matrix, vectorizer

# Step 3: Cosine Similarity Calculation
def cosine_similarity(tfidf_matrix):
    cosine_similarities = np.dot(tfidf_matrix, tfidf_matrix.T).toarray()
    return cosine_similarities

# Step 4: TextRank Algorithm
def textrank(cosine_similarities, sentences, top_n=5):
    # Create similarity graph
    nx_graph = nx.from_numpy_matrix(cosine_similarities)
    scores = nx.pagerank(nx_graph)  # Compute TextRank scores
    ranked_sentences = [sentences[i] for i in sorted(scores, key=scores.get, reverse=True)[:top_n]]
    return ' '.join(ranked_sentences)

# Step 5: ROUGE Evaluation
def evaluate_rouge(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(metrics=['rouge1', 'rouge2', 'rougeL'], lang='en')
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Step 6: Precision, Recall, F-Measure Calculation
def evaluate_precision_recall_fmeasure(reference_summary, generated_summary):
    reference_tokens = set(reference_summary.split())
    generated_tokens = set(generated_summary.split())
    
    true_positive = len(reference_tokens & generated_tokens)
    false_positive = len(generated_tokens - reference_tokens)
    false_negative = len(reference_tokens - generated_tokens)
    
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f_measure

# Step 7: Load and process all text files from the folder
data_folder_path = "data_putusan"
dok_putusan_txt_folder = os.path.join(data_folder_path, 'dok_putusan_txt')

# Reading all text files in 'dok_putusan_txt' folder
texts = []
for filename in os.listdir(dok_putusan_txt_folder):
    if filename.endswith(".txt") and filename != '.DS_Store':
        file_path = os.path.join(dok_putusan_txt_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            texts.append(text)

# Display the number of text documents loaded
print(f"Number of documents loaded: {len(texts)}")

# Process the first document for summarization
sample_doc_text = texts[0]
print(f"Sample document (first 500 characters): {sample_doc_text[:500]}")

# Example Usage - Summarize and Evaluate the First Document
reference_summary = "Masukkan ringkasan referensi di sini."  # Replace with your reference summary

# Preprocess the text
sentences = preprocess_text_with_sastrawi(sample_doc_text)

# Compute TF-IDF matrix
tfidf_matrix, vectorizer = compute_tfidf(sentences)

# Calculate Cosine Similarity
cosine_similarities = cosine_similarity(tfidf_matrix)

# Apply TextRank
generated_summary = textrank(cosine_similarities, sentences, top_n=5)

# Evaluate using ROUGE
rouge_scores = evaluate_rouge(reference_summary, generated_summary)

# Evaluate using Precision, Recall, F-Measure
precision, recall, f_measure = evaluate_precision_recall_fmeasure(reference_summary, generated_summary)

# Print Results
print("\nGenerated Summary:", generated_summary)
print("\nROUGE Scores:", rouge_scores)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F-Measure:", f_measure)


In [None]:
# sss

In [7]:
import os
# Let's check the contents of the folder that may contain the raw text documents
data_folder_path = "data_putusan"
dok_putusan_txt_folder = os.path.join(data_folder_path, 'dok_putusan_txt')
os.listdir(dok_putusan_txt_folder)


['.DS_Store',
 '.ipynb_checkpoints',
 'doc01.txt',
 'doc02.txt',
 'doc03.txt',
 'doc04.txt',
 'doc05.txt',
 'doc06.txt',
 'doc07.txt',
 'doc08.txt',
 'doc09.txt',
 'doc10.txt',
 'doc11.txt',
 'doc12.txt',
 'doc13.txt',
 'doc14.txt',
 'doc15.txt',
 'doc16.txt',
 'doc17.txt',
 'doc18.txt',
 'doc19.txt',
 'doc20.txt',
 'doc21.txt',
 'doc22.txt',
 'doc23.txt',
 'doc24.txt',
 'doc25.txt',
 'doc26.txt',
 'doc27.txt',
 'doc28.txt',
 'doc29.txt',
 'doc30.txt',
 'doc31.txt',
 'doc32.txt',
 'doc33.txt',
 'doc34.txt',
 'doc35.txt',
 'doc36.txt',
 'doc37.txt',
 'doc38.txt',
 'doc39.txt',
 'doc40.txt',
 'doc41.txt',
 'doc42.txt',
 'doc43.txt',
 'doc44.txt',
 'doc45.txt',
 'doc46.txt',
 'doc47.txt',
 'doc48.txt',
 'doc49.txt',
 'doc50.txt']

In [29]:
# Load the content of 'doc23.txt' as a sample document for summarization
sample_doc_path = os.path.join(dok_putusan_txt_folder, 'doc23.txt')

with open(sample_doc_path, 'r', encoding='utf-8') as file:
    document_text = file.read()

# Display the first 500 characters of the document to understand its structure
document_text[:500]

'  PUTUSAN Nomor 374/Pid.Sus/2017/PN Sim  DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA  Pengadilan Negeri Simalungun yang mengadili perkara pidana  dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa :  2. Tempat lahir  : Dosin  3. Umur/Tanggal lahir  : 39/23 Januari 1978  4. Jenis kelamin  : Laki-laki  5. Kebangsaan  : Indonesia  6. Tempat tinggal  :Kampung Tengah nagori Maligas Bayu Kec. ub lik  : Surianto Alias Gundol  1. Nama le'

In [30]:
# Reading all text files in 'dok_putusan_txt' folder
texts = []
for filename in os.listdir(dok_putusan_txt_folder):
    if filename.endswith(".txt") and filename != '.DS_Store':
        file_path = os.path.join(dok_putusan_txt_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            texts.append(text)

# Display the number of text documents loaded
len(texts)  # Showing the number of documents processed


50

In [31]:
print(texts[0][:500])


ep  P U T U SN Nomor 5/Pid.Sus/2020/PN Kag  DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA  Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan  acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa :  Tempat lahir  : Palembang  Umur/Tanggal lahir  : 34 Tahun / 24 April 1985  Jenis kelamin  : Laki-laki  Kebangsaan  : Indonesia  Tempat tinggal  : Jl. Tangga Takat No. 1029 Rt. 17 Rw. 07 Kel. Tangga ub lik  : KA Ibrahim Bin KH Abdul


In [32]:
# Function to clean text: remove non-relevant elements such as headers, footers, and special characters
def clean_text(text):
    # Remove extra spaces, newline characters and special characters like digits and non-ASCII characters
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text.strip()

# Clean all documents in the texts list
cleaned_texts = [clean_text(text) for text in texts]

# Show the first 500 characters of the first cleaned text document
cleaned_texts[0][:500]  # Display the first 500 characters of the first document


'ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Peke'

In [None]:
# Tokenization: Split the text into sentences and words
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize the cleaned documents into sentences
tokenized_sentences = [sent_tokenize(text) for text in cleaned_texts]

# Apply word tokenization to each sentence
tokenized_words = [[word_tokenize(sentence) for sentence in doc] for doc in tokenized_sentences]

['ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Pekerjaan  Belum Bekerja Pendidikan  SMA tidak tamatne Agama k Takat Kec Seberang Ulu II Kota Palembang Terdakwa KA Ibrahim Bin KH Abdullah Murod ditangkap pada tanggal 26 September 2019 dan ditahan dalam rumah tahanan negara oleh  1 Penyidik sejak tanggal 28 September 2019 sampai dengan tanggal 18 Oktober 2019 2 Penyidik Perpanjangan Oleh Penuntut Umum sejak tanggal 19 Oktober 2019 sampai dengan tanggal 27 November 2019 3 Penuntut Umum sejak tanggal 26 November 2019 sampai dengan tanggal lik seja

In [34]:
# Show the first 5 tokenized sentences of the first document
tokenized_sentences[0][:500]  # Display the first 5 sentences from the first document

['ep P U T U SN Nomor 5PidSus2020PN Kag DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Kayuagung yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa  Tempat lahir  Palembang UmurTanggal lahir  34 Tahun  24 April 1985 Jenis kelamin  Lakilaki Kebangsaan  Indonesia Tempat tinggal  Jl Tangga Takat No 1029 Rt 17 Rw 07 Kel Tangga ub lik  KA Ibrahim Bin KH Abdullah Murod Nama lengkap  Islam Pekerjaan  Belum Bekerja Pendidikan  SMA tidak tamatne Agama k Takat Kec Seberang Ulu II Kota Palembang Terdakwa KA Ibrahim Bin KH Abdullah Murod ditangkap pada tanggal 26 September 2019 dan ditahan dalam rumah tahanan negara oleh  1 Penyidik sejak tanggal 28 September 2019 sampai dengan tanggal 18 Oktober 2019 2 Penyidik Perpanjangan Oleh Penuntut Umum sejak tanggal 19 Oktober 2019 sampai dengan tanggal 27 November 2019 3 Penuntut Umum sejak tanggal 26 November 2019 sampai dengan tanggal lik seja

In [38]:
print(len(tokenized_sentences[0][:500]))
print(len(cleaned_texts[0][:500]))

1
500


In [40]:
# Step 3: Stemming and Stopword Removal
from nltk.corpus import stopwords
stopwords_list = set(stopwords.words('indonesian'))  # Add custom stopwords if necessary

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stopwords_list]


In [41]:
# Apply stemming and stopword removal
processed_sentences = []
for sentence in sentences:
    tokens = simple_tokenize_words(sentence)
    tokens = apply_sastrawi_stemming(tokens)
    tokens = remove_stopwords(tokens)
    processed_sentences.append(' '.join(tokens))

NameError: name 'simple_tokenize_words' is not defined

In [None]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# xx

In [10]:
# Initialize the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()


In [11]:
# Text cleaning function: remove headers, footers, and irrelevant symbols
def clean_text(text):
    # Remove extra spaces, newline characters and special characters like digits
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text.strip()

# Tokenize sentences from the document
def tokenize_sentences(text):
    return sent_tokenize(text)

# Function to apply stemming
def apply_stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

# Function to remove stopwords from the text
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('indonesian'))
    return [word for word in tokens if word.lower() not in stop_words]

In [12]:
# Preprocess the document text
cleaned_text = clean_text(document_text)
sentences = tokenize_sentences(cleaned_text)

# Apply stemming and remove stopwords for each sentence
processed_sentences = []
for sentence in sentences:
    tokens = word_tokenize(sentence)
    tokens = apply_stemming(tokens)
    tokens = remove_stopwords(tokens)
    processed_sentences.append(' '.join(tokens))

# Display the first 5 processed sentences
processed_sentences[:5]

['putus nomor 374pidsus2017pn sim adil dasar tuhan maha esa adil negeri simalungun adil perkara pidana acara periksa tingkat jatuh putus perkara dakwa 2 lahir dosin 3 umurtanggal lahir 3923 januari 1978 4 jenis kelamin lakilaki 5 bangsa indonesia 6 tinggal kampung nagori maligas bayu kec ub lik surianto alias gundol 1 nama lengkap 7 agama islam 8 kerja wiraswasta k huta bayu raja kabupaten simalungunne dakwa tangkap sidik tanggal 10 april 2017 tanggal 12 april 2017 dakwa surianto alias gundol tahan tahan rutan 1 sidik tanggal 13 april 2017 tanggal 2 mei 2017 dakwa surianto alias gundol tahan tahan rutan 2 sidik tuntut tanggal 3 mei 2017 tanggal 11 juni 2017 dakwa surianto alias gundol tahan tahan rutan lik juni 2017 tanggal 11 juli 2017 dakwa surianto alias gundol tahan tahan rutan ub 4 sidik ketua adil negeri tanggal 12 juli 2017 tanggal 10 agustus 2017 dakwa surianto alias gundol tahan tahan rutan 5 tuntut tanggal 10 agustus 2017 tanggal 29 agustus 2017 6 hakim adil negeri tanggal 23

In [39]:
# Reimporting Sastrawi Stemmer for stemming the Indonesian text
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Initialize the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Function to apply stemming using Sastrawi
def apply_sastrawi_stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

# Apply stemming to the tokenized sentences
tokenized_sentences_stemmed = []
for doc in improved_tokenized_sentences:
    stemmed_sentences = []
    for sentence in doc:
        tokens = simple_tokenize_words(sentence)
        stemmed_tokens = apply_sastrawi_stemming(tokens)
        stemmed_sentences.append(' '.join(stemmed_tokens))
    tokenized_sentences_stemmed.append(stemmed_sentences)

# Show the first 5 sentences after stemming
tokenized_sentences_stemmed[0][:5]  # Show the first 5 sentences after stemming


NameError: name 'improved_tokenized_sentences' is not defined