# **Installing Required Packages**

In [1]:
!pip install --quiet PyMuPDF python-docx pdf2image pytesseract \
                  transformers keybert streamlit pyyaml sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m116.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m109.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# PyMuPDF: Extracts text from PDF documents (fast and accurate).
# python-docx: Reads .docx files and extracts text.
# pdf2image:	Converts scanned PDFs into images for OCR.
# pytesseract:	Optical Character Recognition (OCR) from images (extracted by pdf2image).
# transformers:	For summarization using facebook/bart-large-cnn.
# keybert:	Extracts top keywords using BERT embeddings.
# streamlit:	Used to create the web app frontend.
# pyyaml: Outputs metadata in YAML format (in addition to JSON).
# sentence-transformers:	Required by KeyBERT to use MiniLM or other semantic models.


Import Libraries

In [4]:
import fitz  # PyMuPDF
import docx
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline
from keybert import KeyBERT
import json
import yaml
import os
from pathlib import Path

**Load Summarization and Keyword Models**

In [5]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
kw_model = KeyBERT("sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Document Extraction Functions**


In [10]:
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    return "\n".join(page.get_text() for page in doc).strip()

def extract_text_from_docx(path):
    doc = docx.Document(path)
    return "\n".join(para.text for para in doc.paragraphs).strip()

def extract_text_via_ocr(path):
    images = convert_from_path(path)
    return "\n".join(pytesseract.image_to_string(img) for img in images).strip()


**Metadata Generation Function**

In [13]:
def summarize_long_text(text, chunk_size=1000, max_chunks=5):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []
    for chunk in chunks[:max_chunks]:
        result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)

def generate_metadata(text, doc_type="PDF"):
    summary = summarize_long_text(text)
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=8)
    metadata = {
        "title": summary.split('.')[0],
        "summary": summary,
        "keywords": [kw[0] for kw in keywords],
        "document_type": doc_type,
        "word_count": len(text.split())
    }
    return metadata

**Smart File Handler**


In [14]:
def process_file(file_path):
    ext = Path(file_path).suffix.lower()
    text, doc_type = "", "Unknown"

    if ext == ".pdf":
        text = extract_text_from_pdf(file_path)
        if not text:
            text = extract_text_via_ocr(file_path)
            doc_type = "Scanned PDF"
        else:
            doc_type = "PDF"

    elif ext == ".docx":
        text = extract_text_from_docx(file_path)
        doc_type = "DOCX"

    elif ext == ".txt":
        text = Path(file_path).read_text(encoding="utf-8")
        doc_type = "TXT"

    return text, doc_type


**Sample Run**

In [18]:
file_path = "pdf1.pdf"  # ← Replace with your document path

if os.path.exists(file_path):
    raw_text, doc_type = process_file(file_path)
    if raw_text:
        metadata = generate_metadata(raw_text, doc_type)

        print("\n JSON Metadata Output:\n")
        print(json.dumps(metadata, indent=4))

        print("\n YAML Metadata Output:\n")
        print(yaml.dump(metadata, sort_keys=False))
    else:
        print(" No text found in the document.")
else:
    print(" File not found. Please update the 'file_path' variable.")


 JSON Metadata Output:

{
    "title": "Of section 14, it seems to me, they would be bringing themselves under the bar of section 18 (2) They cannot therefore claim that the loss of the  \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 goods was explosion damage within the meaning of the",
    "summary": "Of section 14, it seems to me, they would be bringing themselves under the bar of section 18 (2) They cannot therefore claim that the loss of the  \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 goods was explosion damage within the meaning of the. section 14. The loss was not \"due  to or did not in any way arise ont of the explosion\" in  order to a void the bar under section 18. Appeal dismissed. Agent for the appellants: Mohan Behari Lal.Agent for the respondents: I. N. Shroff. Chief Justice of India: Shri Harilal KANIA. It is not p1\u00b7oper to depart fron1 this rule of law. It will convert the High Court into a fact finding authority. The parties concerned have 