In [3]:
%pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
from PyPDF2 import PdfReader

def preprocess_text(text):
    """
    Clean raw extracted text before it is chunked and summarized.

    Every character that is not a word character, whitespace, or one of the
    punctuation marks ``. , ; ! ?`` is deleted first. Whitespace is collapsed
    afterwards, so deleting a symbol that sat between two spaces
    (e.g. ``"a - b"``) cannot leave a double space in the result.

    Args:
        text: Raw text. ``None`` or an empty string is treated as "no text".

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace; ``""`` when there is nothing to clean.
    """
    if not text:
        return ""
    # Strip unwanted symbols first ...
    text = re.sub(r'[^\w\s.,;!?]', '', text)
    # ... then collapse whitespace runs, including any gaps the removal created.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [2]:
def split_text(text, max_chunk_size=1000):
    """
    Lazily yield consecutive pieces of ``text``.

    The text is split on whitespace and regrouped so that every yielded string
    holds at most ``max_chunk_size`` words joined by single spaces. The size
    is counted in whitespace-separated words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_chunk_size)
    yield from (
        " ".join(tokens[start:start + max_chunk_size])
        for start in starts
    )


In [3]:
from transformers import pipeline

# Build the summarization pipeline once at module level; summarize_chunks
# reuses this object for every chunk. No `device` argument is passed here.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

def summarize_chunks(text_chunks, max_length=130, min_length=30):
    """
    Summarize each chunk and combine the summaries.

    Args:
        text_chunks: Iterable of text strings, summarized one at a time.
        max_length: Forwarded to the summarizer as its ``max_length`` option.
        min_length: Forwarded to the summarizer as its ``min_length`` option.

    Returns:
        The per-chunk summaries joined with single spaces. Blank chunks and
        chunks whose summarization raised are skipped (the error is printed),
        so the result can be an empty string.
    """
    summaries = []
    for i, chunk in enumerate(text_chunks):
        # Nothing to summarize in an empty / whitespace-only chunk.
        if not chunk or not chunk.strip():
            continue
        try:
            result = summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                # Ask the pipeline to truncate over-long inputs instead of
                # failing on them, which would drop the whole chunk below.
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Best effort: report the failing chunk and keep going.
            print(f"Error summarizing chunk {i}: {e}")
    return " ".join(summaries)


  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
def extract_and_summarize(pdf_file):
    """
    Extract text from a PDF file, preprocess, and summarize.

    Args:
        pdf_file: The PDF source; passed straight to ``PdfReader``.

    Returns:
        The combined summary string produced by ``summarize_chunks``, or
        ``""`` when no text could be extracted from the document.
    """
    # Step 1: Read the PDF. Pages are joined with a newline so the last word
    # of one page is not fused with the first word of the next; `or ""`
    # guards against a page that yields no text.
    reader = PdfReader(pdf_file)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    document_text = "\n".join(page_texts)

    # Step 2: Preprocess the text
    cleaned_text = preprocess_text(document_text)
    if not cleaned_text:
        # Nothing extractable: skip chunking and summarization.
        return ""

    # Step 3: Split the text into chunks
    # NOTE(review): 1000 words per chunk may exceed the summarization model's
    # input limit — confirm against the model in use.
    text_chunks = list(split_text(cleaned_text, max_chunk_size=1000))

    # Step 4: Summarize each chunk
    return summarize_chunks(text_chunks)


In [5]:
import streamlit as st

# Streamlit interface: page title and a one-line instruction for the user.
# NOTE(review): the recorded output of this cell points to `streamlit run`;
# this UI code presumably needs to be launched as a Streamlit script rather
# than executed inside the notebook kernel — confirm.
st.title("Intelligent PDF Summarizer")
st.write("Upload a large PDF document to summarize its content.")

# File upload widget, restricted to files with the "pdf" type.
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

# Only run the pipeline when the uploader returned something truthy.
if uploaded_file:
    # Show a spinner while extraction and summarization run.
    with st.spinner("Processing..."):
        try:
            # Extract and summarize the PDF content
            summary = extract_and_summarize(uploaded_file)
            st.success("Summarization complete!")
            st.subheader("Summary")
            st.write(summary)
        except Exception as e:
            # Report any failure in the page instead of raising.
            st.error(f"An error occurred: {e}")


2024-11-29 15:28:41.109 
  command:

    streamlit run /home/omnesvera45/miniconda3/envs/testingopenai/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]
