In [6]:
# Required Libraries
import re
from transformers import pipeline, AutoTokenizer
import pdfplumber
import spacy



# File paths
# Input PDF; the path is relative, so it depends on the directory the
# script/notebook is run from.
pdf_path = "../../2.Phishing/78_Teach_GPT_To_Phish.pdf"
# Destination for the cleaned text extracted from the PDF (written in Step 1).
output_text_file = "extracted_text.txt"
# Destination for the combined summary of all chunks (written in Step 5).
summary_output_file = "new_summary.txt"

# Load SpaCy for sentence tokenization
# The 'en_core_web_sm' model must already be installed; `nlp` is used in
# Step 2 to split the cleaned text into sentences.
nlp = spacy.load('en_core_web_sm')

# Step 1: Extract and clean text from the PDF
def normalize_extracted_text(raw_text):
    """Return *raw_text* reduced to single-spaced ASCII.

    Runs of non-ASCII characters are replaced by a space first, and only then
    is whitespace collapsed, so a removed character never leaves a double
    space behind. Leading and trailing whitespace is stripped.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', raw_text)
    return re.sub(r'\s+', ' ', ascii_only).strip()


# Defined up front so the later steps see an empty document (instead of an
# undefined name) when extraction fails.
extracted_text = ""
cleaned_text = ""
try:
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Skip pages that yield no text (None or "")
                page_texts.append(page_text)

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next; the newline is collapsed to a space
    # by the normalization below.
    extracted_text = "\n".join(page_texts)
    cleaned_text = normalize_extracted_text(extracted_text)

    # Save the cleaned text to a file
    with open(output_text_file, "w", encoding='utf-8') as text_file:
        text_file.write(cleaned_text)

    print(f"Text extracted, cleaned, and saved to {output_text_file}")
except Exception as e:
    # Best-effort script boundary: report the problem and continue with
    # whatever text (possibly none) was obtained.
    print(f"An error occurred while extracting text: {e}")

# Step 2: Tokenize sentences using SpaCy
# Collect the text of every sentence span spaCy detects in the cleaned text.
# Any failure is reported and leaves an empty sentence list, so the later
# steps simply have nothing to process.
sentences = []
try:
    sentences.extend(span.text for span in nlp(cleaned_text).sents)
except Exception as err:
    print(f"Error during sentence tokenization: {err}")
    sentences = []

# Step 3: Load summarization pipeline and tokenizer
try:
    # Single source of truth so the pipeline and the tokenizer used for
    # chunk sizing always refer to the same checkpoint.
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline("summarization", model=model_name, device=-1)  # Use CPU
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_token_length = 1024  # Token limit for the model
except Exception as e:
    print(f"Error loading summarization pipeline: {e}")
    # Nothing downstream can work without the model. `exit()` is an
    # interactive helper injected by `site` (it may be absent) and would
    # report success; raise SystemExit with a non-zero status instead.
    raise SystemExit(1) from e

# Step 4: Create chunks based on token length
def split_long_sentence(sentence, count_tokens, max_tokens):
    """Split *sentence* on whitespace into pieces of at most *max_tokens* tokens.

    A sentence that already fits is returned unchanged as a one-item list.
    Piece sizes are estimated by summing ``count_tokens(word)`` per word. A
    single word that alone exceeds *max_tokens* cannot be split here and is
    returned as its own (over-sized) piece.
    """
    if count_tokens(sentence) <= max_tokens:
        return [sentence]

    pieces = []
    words, used = [], 0
    for word in sentence.split():
        cost = count_tokens(word)
        if words and used + cost > max_tokens:
            pieces.append(' '.join(words))
            words, used = [], 0
        words.append(word)
        used += cost
    if words:
        pieces.append(' '.join(words))
    return pieces


def chunk_sentences(sentences, count_tokens, max_tokens):
    """Greedily pack *sentences* into space-joined chunks of <= *max_tokens* tokens.

    Args:
        sentences: Iterable of sentence strings, in document order.
        count_tokens: Callable returning the token count of a string.
        max_tokens: Token budget per chunk; must be positive.

    Returns:
        List of chunk strings. Blank sentences are skipped, and sentences
        longer than the budget are first split with ``split_long_sentence``.

    Raises:
        ValueError: If *max_tokens* is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")

    packed = []
    current, current_length = [], 0
    for sentence in sentences:
        if not sentence.strip():
            continue
        for piece in split_long_sentence(sentence, count_tokens, max_tokens):
            piece_length = count_tokens(piece)
            # Close the running chunk when this piece would overflow it.
            if current and current_length + piece_length > max_tokens:
                packed.append(' '.join(current))
                current, current_length = [], 0
            current.append(piece)
            current_length += piece_length

    if current:  # Add the last chunk if not empty
        packed.append(' '.join(current))
    return packed


try:
    # tokenize() does not include the special tokens the tokenizer adds when
    # the text is actually encoded, so reserve room for them in the budget.
    token_budget = max_token_length - tokenizer.num_special_tokens_to_add()
    chunks = chunk_sentences(
        sentences,
        lambda text: len(tokenizer.tokenize(text)),
        token_budget,
    )
except Exception as e:
    print(f"Error during text chunking: {e}")
    chunks = []

# Step 5: Summarize each chunk and combine results
def summary_length_bounds(input_tokens, max_length=200, min_length=50):
    """Return ``(max_length, min_length)`` generation bounds scaled to the input.

    For inputs of at least *max_length* tokens the defaults are returned
    unchanged. For shorter inputs the upper bound is capped at the input size
    and the lower bound at half of that, so a short chunk is never forced to
    produce a "summary" longer than itself.
    """
    upper = max(1, min(max_length, input_tokens))
    lower = min(min_length, upper // 2)
    return upper, lower


try:
    summaries = []
    for chunk_number, chunk in enumerate(chunks, start=1):
        if not chunk.strip():  # Ensure chunk is not empty
            continue
        try:
            upper, lower = summary_length_bounds(len(tokenizer.tokenize(chunk)))
            summary = summarizer(
                chunk,
                max_length=upper,
                min_length=lower,
                do_sample=False,
                # Clip any chunk that still exceeds the model's input limit
                # instead of letting the model raise on it.
                truncation=True,
            )
            summaries.append(summary[0]["summary_text"])
        except Exception as chunk_error:
            # Keep the summaries already produced; one bad chunk should not
            # discard the rest of the document.
            print(f"Error summarizing chunk {chunk_number}: {chunk_error}")

    # Combine summaries into a full summary
    full_summary = " ".join(summaries)

    # Save the summary to a file
    with open(summary_output_file, "w", encoding="utf-8") as file:
        file.write(full_summary)

    print(f"Summary saved to {summary_output_file}")
except Exception as e:
    print(f"Error during summarization: {e}")




Text extracted, cleaned, and saved to extracted_text.txt



Device set to use cpu


Summary saved to new_summary.txt
