<a href="https://colab.research.google.com/github/Anjali9888/Anjali/blob/main/py_keyword_summary_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber pymupdf

import os
import pdfplumber
import pandas as pd
import nltk
import fitz  # PyMuPDF
from nltk.tokenize import sent_tokenize
from google.colab import files

# --- Setup ---
# Names of the report files and working directories used by the rest of
# the notebook.
OUTPUT_SUMMARY = "summary.csv"
OUTPUT_FREQ = "keyword_frequency.csv"
HIGHLIGHT_DIR = "highlighted_pdfs"
UPLOAD_DIR = "uploaded_pdfs"

# Tokenizer data fetched before sent_tokenize is used further down.
nltk.download('punkt')
nltk.download('punkt_tab')

# Create both working directories; existing ones are left untouched.
for folder in (HIGHLIGHT_DIR, UPLOAD_DIR):
    os.makedirs(folder, exist_ok=True)

# --- Upload PDFs ---
# files.upload() opens the Colab upload widget; the returned mapping is
# iterated as (file name, content) pairs and each one is written into
# UPLOAD_DIR.
print("📤 Upload your PDF files")
uploaded = files.upload()
for name, data in uploaded.items():
    target_path = os.path.join(UPLOAD_DIR, name)
    with open(target_path, 'wb') as f:
        f.write(data)

# --- Get Search Term ---
# The term is normalised once here (trimmed, lower-cased) for the searches
# below. A blank entry is rejected: the empty string is a substring of every
# sentence, so it would flag every sentence as a match and inflate the
# frequency count.
keyword = input("🔍 Enter keyword or phrase: ").strip().lower()
while not keyword:
    print("⚠️ The search term cannot be empty.")
    keyword = input("🔍 Enter keyword or phrase: ").strip().lower()

# --- PDF Text Extraction ---
def extract_text(file_path):
    """Open a PDF with pdfplumber and return its text page by page.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A list of ``(page_number, text)`` tuples in document order, with
        page numbers starting at 1. When a page's ``extract_text()`` call
        yields a falsy value (such as ``None``), an empty string is stored
        for that page instead.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_texts.append((page_number, page.extract_text() or ''))
    return page_texts

# --- Keyword Match ---
def find_matches(pages, keyword):
    """Return every sentence that contains the keyword, with its page number.

    Each page's text is split into sentences with ``sent_tokenize``. The
    comparison is a case-insensitive substring test carried out on
    whitespace-collapsed text, so a multi-word phrase is still found when
    the extracted text wraps it across a line break.

    Args:
        pages: Iterable of ``(page_number, text)`` tuples.
        keyword: Word or phrase to look for. Case and surrounding or
            repeated whitespace are ignored.

    Returns:
        A list of ``(page_number, sentence)`` tuples in document order,
        each sentence stripped of leading and trailing whitespace. The
        list is empty when ``keyword`` is blank.
    """
    needle = ' '.join((keyword or '').lower().split())
    if not needle:
        # '' is a substring of every string; without this guard a blank
        # keyword would report every sentence of every page as a match.
        return []

    matches = []
    for pg, text in pages:
        for s in sent_tokenize(text):
            if needle in ' '.join(s.lower().split()):
                matches.append((pg, s.strip()))
    return matches

# --- Highlight PDF ---
def highlight_pdf(src, keyword, dest):
    """Write a copy of a PDF with every keyword hit highlighted.

    Each page is searched with ``page.search_for`` (using the
    ``fitz.TEXT_DEHYPHENATE`` flag) and a highlight annotation is added for
    every hit. The document is then saved to ``dest``, whether or not any
    hit was found.

    Args:
        src: Path of the PDF to read.
        keyword: Text passed to ``page.search_for``.
        dest: Path the annotated PDF is saved to.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    try:
        # The context manager closes the document on every path, including
        # when searching or saving raises; previously the handle was never
        # closed.
        with fitz.open(src) as doc:
            for page in doc:
                for match in page.search_for(keyword, flags=fitz.TEXT_DEHYPHENATE):
                    page.add_highlight_annot(match)
            doc.save(dest)
    except Exception as e:
        # Deliberately best-effort: a PDF that cannot be annotated must not
        # stop the remaining files from being processed.
        print(f"❌ Failed to highlight {src}: {e}")

# --- Frequency Count ---
def count_keyword(pages, keyword):
    """Count case-insensitive occurrences of the keyword across all pages.

    The text of all pages is lower-cased and flattened into single-space
    separated words before counting, so a multi-word phrase is counted even
    when the extracted text breaks it across lines (or pages). Occurrences
    are substring matches counted without overlap, as ``str.count`` does.

    Args:
        pages: Iterable of ``(page_number, text)`` tuples.
        keyword: Word or phrase to count. Case and surrounding or repeated
            whitespace are ignored.

    Returns:
        The number of occurrences as an ``int``; ``0`` when ``keyword`` is
        blank.
    """
    needle = ' '.join((keyword or '').lower().split())
    if not needle:
        # str.count('') returns len(text) + 1, which is not a meaningful
        # frequency, so a blank keyword is reported as zero hits.
        return 0

    haystack = ' '.join(
        word for _, text in pages for word in text.lower().split()
    )
    return haystack.count(needle)

# --- Process PDFs ---
# For every PDF in UPLOAD_DIR: collect the matching sentences, write a
# highlighted copy into HIGHLIGHT_DIR and record the keyword frequency.
summary_list, freq_list = [], []
# sorted() gives a stable processing/report order; os.listdir order is arbitrary.
for pdf_file in sorted(os.listdir(UPLOAD_DIR)):
    # Compare the extension case-insensitively so files such as
    # "REPORT.PDF" are not silently skipped.
    if not pdf_file.lower().endswith('.pdf'):
        continue
    print(f"📄 Processing: {pdf_file}")
    path = os.path.join(UPLOAD_DIR, pdf_file)

    # Best-effort like highlight_pdf: one unreadable PDF is reported and
    # skipped instead of aborting the whole batch and losing all results.
    try:
        pages = extract_text(path)
    except Exception as e:
        print(f"❌ Failed to read {pdf_file}: {e}")
        continue

    # Save summary
    matches = find_matches(pages, keyword)
    for pg, sent in matches:
        summary_list.append({"File": pdf_file, "Page": pg, "Sentence": sent})

    # Save highlights
    highlight_pdf(path, keyword, os.path.join(HIGHLIGHT_DIR, f"highlighted_{pdf_file}"))

    # Frequency count
    freq = count_keyword(pages, keyword)
    freq_list.append({"File": pdf_file, "Frequency": freq})

# --- Save Outputs ---
# Build the report tables from this run's results only. The CSV files are
# written when there is something to report; the display below uses the
# in-memory tables, so a CSV left on disk by an earlier run is never shown
# as if it were the current result.
summary_df = pd.DataFrame(summary_list) if summary_list else None
freq_df = pd.DataFrame(freq_list) if freq_list else None
if summary_df is not None:
    summary_df.to_csv(OUTPUT_SUMMARY, index=False)
if freq_df is not None:
    freq_df.to_csv(OUTPUT_FREQ, index=False)

# --- Display Results ---
print("\n✅ DONE!")
if summary_df is not None:
    print("📑 Summary:")
    display(summary_df)
else:
    print("📑 No keyword matches found, so no summary was generated.")

if freq_df is not None:
    print("\n📊 Keyword Frequency:")
    display(freq_df)
else:
    # freq_list receives one row per processed PDF (even with zero hits),
    # so it is empty only when no PDF was processed at all.
    print("\n📊 No PDF files were processed, so no frequency report was generated.")


print("\n📁 Highlighted PDFs:")
print(os.listdir(HIGHLIGHT_DIR))

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 kB 2.6 MB/s eta 0:00:00
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


📤 Upload your PDF files


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Saving artificial intelligence.pdf to artificial intelligence.pdf
🔍 Enter keyword or phrase: ai
📄 Processing: artificial intelligence.pdf

✅ DONE!
📑 Summary:


Unnamed: 0,File,Page,Sentence
0,artificial intelligence.pdf,2,"Secretary, U.S. Department of Education\nRober..."
1,artificial intelligence.pdf,2,These materials may\ncontain the views and rec...
2,artificial intelligence.pdf,2,Licensing and Availability\nThis report is in ...
3,artificial intelligence.pdf,2,Requests for alternate format documents such a...
4,artificial intelligence.pdf,2,Notice to Limited English Proficient Persons\n...
...,...,...,...
602,artificial intelligence.pdf,70,Computers and\nEducation: Artificial Intellige...
603,artificial intelligence.pdf,71,https://doi.org/10.3102/00028312031002369\nWhi...
604,artificial intelligence.pdf,71,https://www.whitehouse.gov/ostp/ai-bill-of-rig...
605,artificial intelligence.pdf,71,"http://hdl.handle.net/20.500.12265/159\nZhai, ..."



📊 Keyword Frequency:


Unnamed: 0,File,Frequency
0,artificial intelligence.pdf,873



📁 Highlighted PDFs:
['highlighted_artificial intelligence.pdf']
