In [35]:
try:
    self.stopwords = set(nltk.corpus.stopwords.words(nltk_lang))
except OSError as e:
    print(f"Warning: Stopwords for {nltk_lang} not found. Using empty stopwords set.")
    self.stopwords = set()

# 2. In segment_text method, fix the pattern matching warning:
try:
    match = re.search(pattern, text, re.DOTALL)
    if match:
        section_text = match.group()
        section_text = re.sub(r'^.*?\n', '', section_text, 1)
        sections[section_name] = section_text.strip()
    else:
        sections[section_name] = None
except Exception as e:
    print(f"Warning: Pattern matching failed for section {section_name}: {str(e)}")
    sections[section_name] = None

NameError: name 'nltk_lang' is not defined

In [37]:
import os
import re
from typing import Dict, List, Optional, Union
from PyPDF2 import PdfReader
from transformers import AutoTokenizer
import nltk
import ssl

# Download required NLTK data
try:
    _create_unverified_https_context = ssl._create_unverified_context
    def __init__(self, language: str = 'indonesia'):
        self.language = language
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
        
        try:
            nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
            os.makedirs(nltk_data_dir, exist_ok=True)
            nltk.data.path.append(nltk_data_dir)
        nltk.data.find(f'tokenizers/{package}')
    except LookupError:
        print(f"Downloading {package}...")
        nltk.download(package, download_dir=nltk_data_dir, quiet=True)

class JournalPreprocessor:
    """A class to handle preprocessing of academic journal PDFs."""
    
    def __init__(self, language: str = 'indonesia'):
        self.language = language
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
        
        try:
            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                pass
            else:
                ssl._create_default_https_context = _create_unverified_https_context

            nltk.data.path.append(os.path.join(os.getcwd(), 'nltk_data'))
            
            for package in ['punkt', 'stopwords']:
                try:
                    nltk.data.find(f'tokenizers/{package}')
                except LookupError:
                    print(f"Downloading {package}...")
                    nltk.download(package, download_dir=os.path.join(os.getcwd(), 'nltk_data'))

            nltk_lang = 'indonesian' if language == 'indonesia' else language
            
            try:
                self.stopwords = set(nltk.corpus.stopwords.words(nltk_lang))
            except OSError as e:
                print(f"Warning: Stopwords for {nltk_lang} not found. Using empty stopwords set.")
                self.stopwords = set()
                
        except Exception as e:
            print(f"Warning: NLTK initialization failed: {str(e)}")
            print("Continuing with basic tokenization...")
            self.stopwords = set()

        self.section_patterns = {
            'abstrak': r'(?i)(ABSTRAK|ABSTRACT|Abstrak|Abstract)[\s\S]*?(?=\n(?:PENDAHULUAN|INTRODUCTION|BAB\s+I|I\.|1\.|\d{1,2}\.?))',
            'pendahuluan': r'(?i)(PENDAHULUAN|INTRODUCTION|BAB\s+I|I\.|1\.)[\s\S]*?(?=\n(?:METODE|METHODOLOGY|BAB\s+II|II\.|2\.|\d{1,2}\.?))',
            'metode': r'(?i)(METODE(?:\s+PENELITIAN)?|METHODOLOGY|METHODS|BAB\s+II|II\.|2\.)[\s\S]*?(?=\n(?:HASIL|RESULTS|BAB\s+III|III\.|3\.|\d{1,2}\.?))',
            'hasil': r'(?i)(HASIL(?:\s+DAN\s+PEMBAHASAN)?|RESULTS(?:\s+AND\s+DISCUSSION)?|BAB\s+III|III\.|3\.)[\s\S]*?(?=\n(?:KESIMPULAN|CONCLUSION|BAB\s+IV|IV\.|4\.|\d{1,2}\.?))',
            'kesimpulan': r'(?i)(KESIMPULAN(?:\s+DAN\s+SARAN)?|CONCLUSION(?:S)?(?:\s+AND\s+RECOMMENDATION(?:S)?)?|BAB\s+[IVX]+|[IVX]+\.|[4-9]\.)'
        }

    def extract_text_from_pdf(self, file_path: str) -> str:
        try:
            reader = PdfReader(file_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            print(f"Error extracting text from {file_path}: {str(e)}")
            return ""

    def clean_text(self, text: str) -> str:
        if not text:
            return ""
        text = re.sub(r'\b\d+\s*\|\s*Page|\b\d+\s*/\s*\d+|^\d+$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\[[\d,\s-]+\]', '', text)
        text = re.sub(r'\((?:\w+\s*(?:et al\.?|dkk\.?),?\s*\d{4}(?:;\s*)?)+\)', '', text)
        text = re.sub(r'\S+@\S+\.\S+', '', text)
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def tokenize_and_filter(self, text: str) -> List[str]:
        if not text:
            return []
            
        try:
            sentences = nltk.sent_tokenize(text)
        except Exception as e:
            print(f"Warning: NLTK sentence tokenization failed: {str(e)}")
            sentences = [s.strip() for s in text.split('.') if s.strip()]

        tokens = []
        for sentence in sentences:
            try:
                sentence = re.sub(r'[^\w\s]|[\d]', '', sentence)
                sentence_tokens = self.tokenizer.tokenize(sentence)
                filtered_tokens = [
                    token for token in sentence_tokens
                    if token.lower() not in self.stopwords
                    and len(token) > 2
                    and not token.startswith('##')
                ]
                tokens.extend(filtered_tokens)
            except Exception as e:
                print(f"Warning: Tokenization failed for sentence: {str(e)}")
                continue
                
        return tokens

    def segment_text(self, text: str) -> Dict[str, Optional[str]]:
        sections = {key: None for key in self.section_patterns.keys()}
        
        for section_name, pattern in self.section_patterns.items():
            try:
                match = re.search(pattern, text, re.DOTALL)
                if match:
                    section_text = match.group()
                    section_text = re.sub(r'^.*?\n', '', section_text, 1)
                    sections[section_name] = section_text.strip()
            except Exception as e:
                print(f"Warning: Pattern matching failed for section {section_name}: {str(e)}")
                continue
                
        return sections

    def preprocess_single_journal(self, file_path: str) -> Dict[str, Union[List[str], None]]:
        try:
            raw_text = self.extract_text_from_pdf(file_path)
            if not raw_text:
                print(f"Warning: No text extracted from {file_path}")
                return {}
                
            sections = self.segment_text(raw_text)
            processed_sections = {}
            for section_name, content in sections.items():
                if content:
                    cleaned_text = self.clean_text(content)
                    tokens = self.tokenize_and_filter(cleaned_text)
                    processed_sections[section_name] = tokens
                else:
                    processed_sections[section_name] = None
                    
            return processed_sections
        except Exception as e:
            print(f"Error in preprocessing {file_path}: {str(e)}")
            return {}

    def preprocess_multiple_journals(self, folder_path: str) -> Dict[str, Dict[str, Union[List[str], None]]]:
        results = {}
        if not os.path.exists(folder_path):
            print(f"Error: Folder not found: {folder_path}")
            return results
            
        for filename in os.listdir(folder_path):
            if filename.endswith('.pdf'):
                file_path = os.path.join(folder_path, filename)
                print(f"Processing: {filename}")
                try:
                    results[filename] = self.preprocess_single_journal(file_path)
                except Exception as e:
                    print(f"Error processing {filename}: {str(e)}")
                    results[filename] = {}
        return results

# Example usage
if __name__ == "__main__":
    nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
    os.makedirs(nltk_data_dir, exist_ok=True)
    nltk.data.path.insert(0, nltk_data_dir)
    
    preprocessor = JournalPreprocessor(language='indonesia')
    folder_path = "datasets/"
    results = preprocessor.preprocess_multiple_journals(folder_path)
    
    for doc_name, sections in results.items():
        print(f"\n=== Document: {doc_name} ===")
        for section_name, tokens in sections.items():
            print(f"\n--- {section_name.upper()} ---")
            if tokens:
                print(f"Token count: {len(tokens)}")
                print(f"Sample tokens: {' '.join(tokens[:50])}...")
            else:
                print("No content found.")

SyntaxError: expected 'except' or 'finally' block (657698841.py, line 20)

In [23]:
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM

# Pilih model yang kompatibel dengan TensorFlow
model_name = "t5-small"  # Atau "t5-base", tergantung kebutuhan
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

# Buat pipeline untuk summarization
summarizer = pipeline("summarization", model=model,
                      tokenizer=tokenizer, framework="tf")

# Fungsi untuk merangkum teks


def summarize_text(text, max_length=200, min_length=50):
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        truncation=True
    )
    return summary[0]['summary_text']

# Pipeline untuk merangkum seluruh dokumen


def summarize_documents(preprocessed_documents):
    summaries = {}
    for doc_name, sections in preprocessed_documents.items():
        summaries[doc_name] = {}
        print(f"Processing Document: {doc_name}")
        for section, tokens in sections.items():
            if tokens:
                text = " ".join(tokens)  # Gabungkan kembali token menjadi teks
                try:
                    summary = summarize_text(
                        text, max_length=150, min_length=50)
                    summaries[doc_name][section] = summary
                except Exception as e:
                    print(f"Error summarizing {section} in {doc_name}: {e}")
                    summaries[doc_name][section] = None
            else:
                summaries[doc_name][section] = None
    return summaries


# Contoh penggunaan dengan hasil preprocessing sebelumnya
summaries = summarize_documents(preprocessed_documents)

# Tampilkan ringkasan
for doc_name, sections in summaries.items():
    print(f"\n=== Document: {doc_name} ===")
    for section, summary in sections.items():
        print(f"\n--- {section.upper()} ---")
        if summary:
            print(f"Summary: {summary}")
        else:
            print("No content found or unable to summarize.")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]




All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0
Your max_length is set to 150, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


Processing Document: ID+8098.pdf


Your max_length is set to 150, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 150, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 150, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 150, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your m

Processing Document: ID+8102.pdf


Your max_length is set to 150, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 150, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 150, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 150, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


Processing Document: ID+8108.pdf


Your max_length is set to 150, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 150, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 150, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 150, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your 

Processing Document: ID+8117.pdf


Your max_length is set to 150, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 150, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 150, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 150, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max

Processing Document: ID+8208.pdf


Your max_length is set to 150, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 150, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 150, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 150, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)



=== Document: ID+8098.pdf ===

--- ABSTRAK ---
Summary: abs ##tra ##k's 'abs' is the latest in a series of 'boss' series . 'i'm not a big fan of this blog,' says abs' 'tra'

--- PENDAHULUAN ---
Summary: pen ##dah ##ulu ##an . ##nan ## an ##ang ##a ##in a 'stupid' ##un ## . an 'a' .

--- METODE ---
Summary: met ##ode con ##vo ##lu ##tion ##al neural network . yang mer ##up ##aka ##n bag ##ian dar ##u deep learning ata .

--- HASIL ---
Summary: has ##il un ##tu ##k ## . ##c ##l ##un ##e ##n ##a ##w ##p ##h ##g ##j ##d ##i ##f ##m ##b ##x .

--- KESIMPULAN ---
Summary: pen ##eli ##tian in ##i ada ##lah un ##tu ##k da ##pa . ke ##si's ##mp ##ula' ##n dar .

=== Document: ID+8102.pdf ===

--- ABSTRAK ---
Summary: abs ##tra ##k's 'abs' is the latest in a series of 'boss' series . 'i'm not a big fan of this blog,' says abs' 'tra'

--- PENDAHULUAN ---
No content found or unable to summarize.

--- METODE ---
Summary: met design thinking dan lean startup yang set ##ia ##p met ##ode ##nya . met 

In [21]:
pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
     ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/203.1 MB 2.0 MB/s eta 0:01:40
     ---------------------------------------- 0.2/203.1 MB 2.3 MB/s eta 0:01:28
     ---------------------------------------- 0.3/203.1 MB 2.2 MB/s eta 0:01:32
     ---------------------------------------- 0.4/203.1 MB 2.4 MB/s eta 0:01:26
     ---------------------------------------- 0.5/203.1 MB 2.2 MB/s eta 0:01:31
     ---------------------------------------- 0.6/203.1 MB 2.1 MB/s eta 0:01:36
     ---------------------------------------- 0.7/203.1 MB 2.2 MB/s eta 0:01:34
     ---------------------------------------- 0.7/203.1 MB 2.0 MB/s eta 0:01:42
     ---------------------------------------- 0.8/203.1 MB 1.9 MB/s eta 0:01:45
     ---------------------------------------- 0.9/203.1 MB 1.9 MB/s eta 0:01:49
     ---------------------------------------- 1