# In [None]:
# ! pip install pymupdf transformers torch


# In [None]:
import fitz
from transformers import pipeline
import json
import os

# pdf
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator. Empty if the document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page raises; the join is fully consumed before the exit.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# summarize
def summarize_text(text, max_length=200, min_length=50, chunk_size=1024):
    """Summarize ``text`` chunk by chunk with ``facebook/bart-large-cnn``.

    The text is cut into consecutive slices of ``chunk_size`` characters,
    each non-blank slice is summarized independently, and the per-chunk
    summaries are joined with single spaces.

    Args:
        text: The text to summarize. ``None`` or an empty string yields ``""``.
        max_length: ``max_length`` forwarded to the summarization pipeline.
        min_length: ``min_length`` forwarded to the summarization pipeline.
        chunk_size: Number of characters per slice (default 1024).

    Returns:
        The joined chunk summaries with surrounding whitespace stripped, or
        ``""`` when there is nothing to summarize.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return ""

    # Slice on fixed character offsets, then drop slices that are only
    # whitespace: they carry nothing to summarize.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    chunks = [chunk for chunk in chunks if chunk.strip()]
    if not chunks:
        return ""

    # Build the pipeline only once we know there is work to do; creating it
    # loads the model, which is wasted effort for blank input.
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    pieces = []
    for chunk in chunks:
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # A character-based slice gives no guarantee on token count;
            # ask the tokenizer to clip anything over the model's limit.
            truncation=True,
        )
        pieces.append(result[0]['summary_text'])
    return " ".join(pieces).strip()

# output json
def save_to_json(pdf_path, original_text, summary, output_path="summary_output.json"):
    """Write the PDF's base name, its text and its summary to a JSON file.

    Args:
        pdf_path: Path of the source PDF; only its base name is stored.
        original_text: Stored under the ``"original_text"`` key.
        summary: Stored under the ``"summary"`` key.
        output_path: Destination file, overwritten if it already exists.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped. A confirmation line is printed after writing.
    """
    record = {}
    record["file_name"] = os.path.basename(pdf_path)
    record["original_text"] = original_text
    record["summary"] = summary

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(record, handle, indent=4, ensure_ascii=False)

    message = f"Output saved to {output_path}"
    print(message)

# Entry point 1: extract the text of "Base.pdf", summarize it, and save the
# file name, full text and summary as JSON.
# NOTE(review): a second `__main__` guard follows this one, so running the
# file as a script executes both blocks in sequence.
if __name__ == "__main__":
    pdf_path = "Base.pdf"  # input PDF; a relative path
    text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(text)
    # No output_path is given, so save_to_json uses its default file name.
    save_to_json(pdf_path, text, summary)

# Entry point 2: extract the text of "sample.pdf", summarize it, and print
# the summary to stdout instead of writing a file.
# NOTE(review): this is the second `__main__` guard in the file; it rebinds
# the module-level names pdf_path, text and summary set by the first one.
if __name__ == "__main__":
    pdf_path = "sample.pdf"  # input PDF; a relative path
    text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(text)
    print("Summary:\n", summary)


