<a href="https://colab.research.google.com/github/BillyMuthiani/Ultra_fast-Book_summarizer/blob/main/Book_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymupdf==1.24.7 bert-extractive-summarizer==0.10.1 nltk sentence-transformers python-docx fpdf tqdm torch huggingface-hub transformers numpy safetensors

Collecting pymupdf==1.24.7
  Downloading PyMuPDF-1.24.7-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting bert-extractive-summarizer==0.10.1
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting PyMuPDFb==1.24.6 (from pymupdf==1.24.7)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-

In [5]:
import os
import fitz  # PyMuPDF for PDF processing
import nltk
import speech_recognition as sr
import requests
from transformers import pipeline
import multiprocessing as mp
import docx
from fpdf import FPDF
import uuid
import time
from tqdm import tqdm
import logging
import unicodedata
import torch

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download required NLTK data
nltk.download('punkt')

# Set multiprocessing start method for CUDA compatibility
if torch.cuda.is_available():
    mp.set_start_method('spawn', force=True)

# Cache summarization pipeline globally
MODEL_NAME = 'sshleifer/distilbart-cnn-6-6'

def load_summarizer():
    try:
        logging.info(f"Loading summarization pipeline: {MODEL_NAME}")
        # Use GPU if available (device=0), else CPU (device=-1)
        device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline("summarization", model=MODEL_NAME, tokenizer=MODEL_NAME, device=device)
        return summarizer
    except Exception as e:
        logging.error(f"Failed to load {MODEL_NAME}: {e}")
        return None

SUMMARIZER = load_summarizer()

# Function to preprocess text for PDF compatibility
def preprocess_text_for_pdf(text):
    try:
        # Normalize Unicode characters to ASCII
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        return text
    except Exception as e:
        logging.error(f"Error preprocessing text: {e}")
        return text

# Function to truncate text to token limit
def truncate_to_token_limit(text, max_tokens=512):
    try:
        sentences = nltk.sent_tokenize(text)
        truncated_text = ""
        token_count = 0
        for sentence in sentences:
            # Estimate tokens (rough: 1 word ~ 1.3 tokens)
            sentence_tokens = len(sentence.split()) * 1.3
            if token_count + sentence_tokens <= max_tokens:
                truncated_text += sentence + " "
                token_count += sentence_tokens
            else:
                break
        return truncated_text.strip()
    except Exception as e:
        logging.error(f"Error truncating text: {e}")
        return text[:max_tokens]

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    start_time = time.time()
    try:
        doc = fitz.open(pdf_path)
        text = ""
        # Sample every 10th page for more content
        for page_num in range(0, len(doc), 10):
            text += doc[page_num].get_text()
        doc.close()
        logging.info(f"PDF extraction time: {time.time() - start_time:.2f} seconds")
        return text
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return None

# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    start_time = time.time()
    try:
        doc = docx.Document(docx_path)
        text = ""
        total_paras = len(doc.paragraphs)
        # Extract first 10% of paragraphs
        for i, para in enumerate(doc.paragraphs):
            if i < total_paras * 0.10:
                text += para.text + "\n"
        logging.info(f"DOCX extraction time: {time.time() - start_time:.2f} seconds")
        return text
    except Exception as e:
        logging.error(f"Error reading DOCX: {e}")
        return None

# Function for speech-to-text conversion
def speech_to_text():
    start_time = time.time()
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Listening for input...")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)
        try:
            text = recognizer.recognize_google(audio)
            logging.info(f"Speech-to-text time: {time.time() - start_time:.2f} seconds")
            print("Transcribed text:", text)
            return text
        except sr.UnknownValueError:
            logging.error("Could not understand audio")
            return None
        except sr.RequestError as e:
            logging.error(f"Speech recognition error: {e}")
            return None


# Function to download a file with progress bar
def download_file(url, local_filename):
    start_time = time.time()
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(local_filename, 'wb') as f, tqdm(
                desc=f"Downloading {local_filename}",
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in r.iter_content(chunk_size=1024):
                    size = f.write(data)
                    bar.update(size)
        logging.info(f"Download time: {time.time() - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error downloading file: {e}")
        raise

# Function to summarize a single chunk
def summarize_chunk(chunk):
    try:
        if SUMMARIZER is None:
            logging.error("Summarizer pipeline not loaded")
            return ""
        # Ensure chunk is not empty and within model limits
        if not chunk.strip():
            logging.warning("Empty chunk received")
            return ""
        # Truncate to max length (DistilBART supports ~1024 tokens)
        max_input_length = 500
        chunk = chunk[:max_input_length]
        summary = SUMMARIZER(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
        logging.info(f"Successfully summarized chunk of length {len(chunk)}")
        return summary
    except Exception as e:
        logging.error(f"Error summarizing chunk: {e}")
        return ""

# Function to summarize text using parallel processing with progress bar
def summarize_text(text, max_pages=5):
    start_time = time.time()
    try:
        # Split text into chunks (500 characters each for speed)
        chunk_size = 500
        text_chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        # Limit to 15 chunks to balance content and speed
        text_chunks = text_chunks[:15]
        logging.info(f"Split text into {len(text_chunks)} chunks")

        # Parallel processing with multiprocessing and progress bar
        with mp.Pool(processes=4) as pool:
            summaries = list(tqdm(pool.imap(summarize_chunk, text_chunks), total=len(text_chunks), desc="Summarizing chunks"))

        # Combine summaries and refine
        combined_summary = " ".join([s for s in summaries if s])  # Filter out empty summaries
        if not combined_summary:
            logging.warning("No valid summaries generated")
            return None
        # Truncate combined summary to 512 tokens for final summarization
        combined_summary = truncate_to_token_limit(combined_summary, max_tokens=512)
        if SUMMARIZER is None:
            logging.error("Summarizer pipeline not loaded")
            return None
        final_summary = SUMMARIZER(combined_summary, max_length=300, min_length=50, do_sample=False)[0]['summary_text']
        logging.info(f"Summarization time: {time.time() - start_time:.2f} seconds")
        return final_summary
    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        return None

# Function to generate PDF summary
def generate_pdf_summary(summary, output_path, title="Summary"):
    start_time = time.time()
    try:
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)

        # Add title
        pdf.set_font("Arial", "B", 16)
        title = preprocess_text_for_pdf(title)  # Preprocess title
        pdf.cell(0, 10, title, 0, 1, "C")
        pdf.ln(10)

        # Add summary text
        pdf.set_font("Arial", size=12)
        summary = preprocess_text_for_pdf(summary)  # Preprocess summary
        pdf.multi_cell(0, 10, summary)

        # Save PDF
        pdf.output(output_path)
        logging.info(f"PDF generation time: {time.time() - start_time:.2f} seconds")
        logging.info(f"PDF summary saved to {output_path}")
    except Exception as e:
        logging.error(f"Error generating PDF: {e}")

# Main function to run the app
def main():
    total_start_time = time.time()
    print("Welcome to the Ultra-Fast Book Summarization App!")
    print("Summarize books in seconds, including locally downloaded PDFs.")
    print("1. Summarize from PDF")
    print("2. Summarize from DOCX")


    choice = input("Select an option (1-2): ")
    text = None
    title = "Summary"
    local_path = None

    if choice == "1":
        file_path = input("Enter PDF file path: ")
        if os.path.exists(file_path):
            text = extract_text_from_pdf(file_path)
            title = os.path.basename(file_path)
            local_path = file_path
        else:
            logging.error("File not found")
            print("File not found")
            return
    elif choice == "2":
        file_path = input("Enter DOCX file path: ")
        if os.path.exists(file_path):
            text = extract_text_from_docx(file_path)
            title = os.path.basename(file_path)
            local_path = file_path
        else:
            logging.error("File not found")
            print("File not found")
            return
    elif choice == "3":
        use_speech = input("Use speech-to-text? (y/n): ").lower() == 'y'
        if use_speech:
            text = speech_to_text()
            title = "Speech Summary"

    else:
        logging.error("Invalid choice")
        print("Invalid choice")
        return

    if text:
        summary = summarize_text(text)
        if summary:
            output_path = f"summary_{uuid.uuid4()}.pdf"
            generate_pdf_summary(summary, output_path, title)
        else:
            logging.error("Summarization failed")
            print("Summarization failed")
    else:
        logging.error("No text to summarize")
        print("No text to summarize")

    logging.info(f"Total execution time: {time.time() - total_start_time:.2f} seconds")
    print(f"Total execution time: {time.time() - total_start_time:.2f} seconds")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cpu


Welcome to the Ultra-Fast Book Summarization App!
Summarize books in seconds, including locally downloaded PDFs.
1. Summarize from PDF
2. Summarize from DOCX
Select an option (1-2): 1
Enter PDF file path: /content/The Alchemist (Paulo Coelho) (Z-Library).pdf


Summarizing chunks:   0%|          | 0/15 [00:00<?, ?it/s]Your max_length is set to 150, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 150, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 150, but your input_length is only 141. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)
Your max_length is set to 150, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing 

Total execution time: 192.88 seconds
