# Code for PDF Data Summarizer

In [3]:
import streamlit as st
import PyPDF2
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import os


### Set page configuration

In [4]:
# Browser-tab icon. The default image path only exists on the original author's
# machine, so it can be overridden with the PDF_SUMMARIZER_ICON environment
# variable. When no file is found at the chosen path, an emoji shortcode is used
# instead so the page configuration does not depend on a local file.
PAGE_ICON_PATH = os.environ.get("PDF_SUMMARIZER_ICON", "C://Users//Jai//Downloads//data.PNG")
PAGE_ICON = PAGE_ICON_PATH if os.path.isfile(PAGE_ICON_PATH) else ":page_facing_up:"
st.set_page_config(page_title="PDF Data Summarizer", page_icon=PAGE_ICON)


### CSS to center the file uploader

In [5]:
# Inline stylesheet that turns the main Streamlit container into a full-height
# flex box so its content (the file uploader) is centred on the page.
CENTERING_CSS = """
    <style>
    .block-container {
        display: flex;
        justify-content: center;
        align-items: center;
        height: 100vh;
    }
    </style>
    """
# unsafe_allow_html is needed so the <style> tag is rendered rather than escaped.
st.markdown(CENTERING_CSS, unsafe_allow_html=True)


2024-06-29 23:17:41.996 
  command:

    streamlit run C:\Users\Jai\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

### App title

In [6]:
# Main heading shown at the top of the app.
st.title(body="PDF Data Summarizer")

DeltaGenerator()

### Create 'uploads' directory if it doesn't exist

In [7]:
# Ensure the target folder for saved uploads exists. exist_ok=True makes this
# a single, race-free call: nothing happens if the directory is already there,
# and there is no window between "check" and "create" in which another process
# (or a concurrent Streamlit rerun) could create it and trigger FileExistsError.
# If 'uploads' exists but is a regular file, this raises FileExistsError here
# instead of failing later when the upload is written.
os.makedirs('uploads', exist_ok=True)

### File uploader widget

In [8]:
# Upload widget restricted to PDF files; the value is None until the user
# has picked a file.
uploaded_file = st.file_uploader(
    label="Upload a PDF file",
    type=["pdf"],
)

### NLTKTokenizer to separate sentences and words

In [9]:
class NLTKTokenizer:
    """Tokenizer adapter exposing NLTK's tokenizers under the method names
    ``to_sentences`` / ``to_words``.

    Both methods are static, so the class itself can be handed over wherever
    a tokenizer object with these two methods is expected.
    """

    @staticmethod
    def to_sentences(text):
        """Split ``text`` into sentences using ``nltk.tokenize.sent_tokenize``."""
        sentences = sent_tokenize(text)
        return sentences

    @staticmethod
    def to_words(text):
        """Split ``text`` into word tokens using ``nltk.tokenize.word_tokenize``."""
        words = word_tokenize(text)
        return words

In [10]:
def summarize_text(text, num_sentences=5):
    """Produce an extractive summary of ``text`` with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer
            (defaults to 5).

    Returns:
        The selected sentences converted to strings and joined with newlines.
    """
    # Parse the raw string into a sumy document, tokenizing with NLTK.
    document = PlaintextParser.from_string(text, NLTKTokenizer).document
    # The summarizer instance is callable: (document, sentence_count) -> sentences.
    chosen_sentences = LsaSummarizer()(document, num_sentences)
    return "\n".join(str(item) for item in chosen_sentences)

In [11]:
def _extract_pdf_text(pdf_reader):
    """Return the text of every page in ``pdf_reader``, joined by newlines.

    Pages are separated by a newline so the last sentence of one page does not
    fuse with the first sentence of the next, which would confuse sentence
    tokenization. A page whose ``extract_text()`` yields ``None`` or an empty
    string contributes an empty string instead of raising on concatenation.
    """
    return "\n".join((page.extract_text() or "") for page in pdf_reader.pages)


if uploaded_file is not None:
    try:
        # Parse the uploaded PDF directly from the in-memory upload object.
        pdf_reader = PyPDF2.PdfReader(uploaded_file)

        # Report the number of pages in the PDF.
        st.write(f'The PDF document has {len(pdf_reader.pages)} pages.')

        # Extract text from each page.
        full_text = _extract_pdf_text(pdf_reader)

        # Display the full text.
        st.subheader("Full Text")
        st.write(full_text)

        # Summarize only when there is text to work with; otherwise explain
        # why no summary is shown instead of rendering an empty section.
        if full_text.strip():
            summary = summarize_text(full_text)
            st.subheader("Summary")
            st.write(summary)
        else:
            st.warning("No extractable text was found in this PDF, so no summary could be generated.")

        # Save the uploaded file. The file name is supplied by the client, so
        # keep only its final path component (treating both '/' and '\\' as
        # separators) to prevent writing outside the 'uploads' directory.
        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        with open(os.path.join("uploads", safe_name), "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success("File saved successfully!")

    except Exception as e:
        # Surface any parsing, summarizing or saving failure in the UI rather
        # than crashing the app.
        st.error(f"An error occurred: {e}")

else:
    st.write("No file uploaded yet. Please upload a PDF file.")