<a href="https://colab.research.google.com/github/AkshathaNapanda/PDF_Insight_Extractor/blob/main/PDF_Insight_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install pdfplumber transformers spacy

# Import libraries
import pdfplumber
import re
from transformers import pipeline
import spacy
from google.colab import files

# Load the spaCy "en_core_web_sm" pipeline once at module level; extract_key_sections
# calls this object to split text into sentences.
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped. The remaining page texts are joined with a newline
    so that the last word of one page does not fuse with the first word of
    the next one.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page text, or ``""`` when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract while the file is still open; pages are read lazily.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

# Function to clean extracted text
def clean_text(text):
    """Normalize *text* for downstream sentence splitting.

    Steps, in order:
      1. every run of whitespace becomes a single space;
      2. every run of non-ASCII characters is removed;
      3. leading and trailing whitespace is stripped.

    Because non-ASCII characters are dropped after whitespace is collapsed,
    removing a character that sat between two spaces leaves both spaces.
    """
    whitespace_run = re.compile(r'\s+')
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')

    single_spaced = whitespace_run.sub(' ', text)
    ascii_only = non_ascii_run.sub('', single_spaced)
    return ascii_only.strip()

def chunk_text(text, chunk_size=400):
    """Yield consecutive pieces of *text*, each holding up to *chunk_size* words.

    Words are obtained with ``str.split()`` (any whitespace separates words)
    and re-joined with single spaces. The final piece may be shorter than
    *chunk_size*; empty or whitespace-only text yields nothing.
    """
    tokens = text.split()
    window_starts = range(0, len(tokens), chunk_size)
    yield from (
        " ".join(tokens[start:start + chunk_size]) for start in window_starts
    )

# Function to extract key sections
def extract_key_sections(text):
    """Summarize the sentences of *text* that mention each topic's keywords.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. For every section, the sentences containing at least one of the
    section's keywords (case-insensitive substring match) are joined, cut
    into word chunks via ``chunk_text`` and summarized chunk by chunk with a
    ``facebook/bart-large-cnn`` summarization pipeline.

    Args:
        text: The (already cleaned) document text.

    Returns:
        A dict mapping each section title to one of:
          * the concatenated chunk summaries;
          * ``"No sufficient information available."`` when fewer than 10
            words matched;
          * ``"Error during summarization: ..."`` when summarization raised.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    sections = {
        "Future Growth Prospects": ["growth", "expansion", "prospects", "opportunities"],
        "Key Changes in the Business": ["changes", "restructuring", "partnerships", "strategy"],
        "Material Effect on Earnings/Growth": ["earnings", "growth", "costs", "margins", "profits"]
    }

    # Sentence splitting does not depend on the section, so parse the text
    # once instead of re-running the spaCy pipeline for every section.
    sentences = [sent.text for sent in nlp(text).sents]
    lowered_sentences = [sentence.lower() for sentence in sentences]

    results = {}
    for section, keywords in sections.items():
        # Substring match: a keyword also matches inside longer words.
        section_text = " ".join(
            sentence
            for sentence, lowered in zip(sentences, lowered_sentences)
            if any(kw in lowered for kw in keywords)
        )
        # An empty string splits to zero words, so this also covers "no match".
        if len(section_text.split()) < 10:
            results[section] = "No sufficient information available."
            continue

        try:
            chunk_summaries = [
                summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
                for chunk in chunk_text(section_text)
            ]
        except Exception as e:
            # Best-effort: report the failure for this section and keep going
            # so the remaining sections are still produced.
            results[section] = f"Error during summarization: {str(e)}"
        else:
            results[section] = " ".join(chunk_summaries).strip()
    return results

# Main function to process the PDF
def process_pdf(pdf_path):
    """Build the printable summary report for the PDF at *pdf_path*.

    Extracts the PDF text, cleans it, and summarizes the key sections.
    Returns a fixed notice instead of a report when the extracted text is
    empty or whitespace-only; otherwise returns a report made of a
    ``=== Summary ===`` header followed by one titled entry per section.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    if raw_text.strip() == "":
        return "The PDF is empty or contains no readable text."

    section_summaries = extract_key_sections(clean_text(raw_text))

    report_parts = ["=== Summary ===\n"]
    report_parts.extend(
        f"=== {section} ===\n- {content}\n\n"
        for section, content in section_summaries.items()
    )
    return "".join(report_parts)

# Ask the user for a PDF through the Colab upload widget, then print its summary.
print("Please upload the PDF file:")
uploaded = files.upload()
if uploaded:
    # The first key of the upload result is used as the path of the file to process.
    pdf_path = next(iter(uploaded))
    result = process_pdf(pdf_path)
    print(result)
else:
    # The upload result is empty (e.g. the dialog was cancelled); indexing the
    # first key would raise IndexError, so report it instead.
    print("No file was uploaded.")


Please upload the PDF file:


Saving SJS Transcript Call.pdf to SJS Transcript Call (3).pdf
=== Summary ===
=== Future Growth Prospects ===
- Consolidated revenue at Rs.1,172.5 million has grown at 13.6% YoY on the back of strong growth in automotive segment and exports. Walter Pack Q1 witnessed a strong revenue growth of 21% Yo Pro forma revenue for Q1 FY24 would have been Rs.1,528.7 million, a YoY growth of 48.2%. Pro forma EBITDA margin were at 27.3%, witnessing 120 bps Sanjay Thapar: SJS will continue to deliver on its robust financial and operational guidance of FY24 with 50% YoY growth in the SJS consolidated revenues. Inorganic growth through Walter Pack will be over and above the Sanjay Thapar: We don't provide guidance specifically, but as I said earlier in my commentary, on QoQ basis, exports has jumped up double, about 90% growth overall, this last quarter versus the quarter a Walter Pack will be growing at a CAGR of 20% to 25%. Inorganic acquisitions will add to this growth above this. So that is a grea