**Frontend for AutoMeta**

In [3]:
!pip install streamlit PyMuPDF python-docx pdf2image pytesseract \
            transformers keybert pyyaml sentence-transformers


Collecting streamlit
  Downloading streamlit-1.46.0-py3-none-any.whl.metadata (9.0 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading n

In [4]:
import streamlit as st
import os
from pathlib import Path
from tempfile import NamedTemporaryFile
import json
import yaml

from transformers import pipeline
from keybert import KeyBERT
import fitz  # PyMuPDF
import docx
import pytesseract
from pdf2image import convert_from_path

**Load Models**

In [5]:
@st.cache_resource
def load_models():
    """Create the two models the app relies on.

    The function is wrapped in ``st.cache_resource``, so the (large) model
    objects are built once and handed back on later calls.

    Returns:
        tuple: ``(summarizer, kw_model)`` where ``summarizer`` is a
        transformers summarization pipeline backed by
        ``facebook/bart-large-cnn`` and ``kw_model`` is a KeyBERT instance
        backed by ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return (
        pipeline("summarization", model="facebook/bart-large-cnn"),
        KeyBERT("sentence-transformers/all-MiniLM-L6-v2"),
    )

# Module-level handles: `summarizer` is used by summarize_long_text() and
# `kw_model` by generate_metadata().
summarizer, kw_model = load_models()


2025-06-24 09:38:01.241 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



**Extract Text**

In [6]:
def extract_text_from_pdf(path):
    """Return the embedded text of every page of a PDF.

    Args:
        path: Filesystem path of the PDF, opened with PyMuPDF (``fitz``).

    Returns:
        str: The per-page texts joined with newlines, with leading and
        trailing whitespace removed. An empty string means no text was
        found on any page.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extracting a page raises.
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc).strip()

def extract_text_from_docx(path):
    """Return the text of all body paragraphs of a DOCX file.

    Args:
        path: Filesystem path of the .docx document.

    Returns:
        str: Paragraph texts joined with newlines, stripped of leading and
        trailing whitespace.
    """
    paragraph_texts = [para.text for para in docx.Document(path).paragraphs]
    joined = "\n".join(paragraph_texts)
    return joined.strip()

def extract_text_via_ocr(path):
    """Rasterize a PDF and run OCR on each resulting page image.

    Args:
        path: Filesystem path of the PDF to convert with pdf2image.

    Returns:
        str: The OCR output of every page joined with newlines, stripped of
        leading and trailing whitespace.
    """
    page_texts = []
    for page_image in convert_from_path(path):
        page_texts.append(pytesseract.image_to_string(page_image))
    return "\n".join(page_texts).strip()


**Chunking-based Summarization**

In [7]:
def summarize_long_text(text, chunk_size=1000, max_chunks=5):
    """Summarize long text window by window and join the partial summaries.

    The text is cut into consecutive windows of ``chunk_size`` characters.
    Only the first ``max_chunks`` windows are considered; anything beyond
    them is ignored. Each non-blank window is summarized with the
    module-level ``summarizer`` pipeline.

    Args:
        text: The document text to summarize.
        chunk_size: Window length in characters.
        max_chunks: Maximum number of leading windows to look at.

    Returns:
        str: The window summaries joined with single spaces, or an empty
        string when there is nothing to summarize.
    """
    summaries = []
    # Slicing the range (instead of a list of all chunks) avoids building
    # windows that would be thrown away by the max_chunks limit.
    for start in range(0, len(text), chunk_size)[:max_chunks]:
        chunk = text[start:start + chunk_size]
        if not chunk.strip():
            # A blank window carries no content; summarizing it would only
            # make the model produce invented text.
            continue
        # truncation=True keeps a token-dense window from exceeding the
        # model's maximum input length.
        result = summarizer(
            chunk,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)


**Metadata Generator**

In [8]:
def generate_metadata(text, doc_type="Unknown"):
    """Assemble the metadata record for one document.

    Args:
        text: Full extracted text of the document.
        doc_type: Label describing the source format; stored unchanged.

    Returns:
        dict: With keys, in this order, ``title`` (the summary up to its
        first period), ``summary``, ``keywords`` (up to 8 one- or two-word
        phrases from KeyBERT), ``document_type`` and ``word_count``
        (number of whitespace-separated tokens in ``text``).
    """
    summary = summarize_long_text(text)
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=8,
    )

    # Each KeyBERT entry is indexed at 0 to take the phrase and drop the score.
    phrases = []
    for scored in scored_phrases:
        phrases.append(scored[0])

    # Everything before the first '.' serves as the title.
    first_sentence = summary.partition('.')[0]

    metadata = {}
    metadata["title"] = first_sentence
    metadata["summary"] = summary
    metadata["keywords"] = phrases
    metadata["document_type"] = doc_type
    metadata["word_count"] = len(text.split())
    return metadata


**UI Layout**

In [10]:
# Page-level settings: browser-tab title and full-width layout.
st.set_page_config(
    page_title="AutoMeta Metadata Generator",
    layout="wide",
)

# Centered banner markup, rendered as raw HTML below.
_HEADER_HTML = """
        <h1 style='text-align: center; color: #4A6FA5;'>📄 AutoMeta</h1>
        <h4 style='text-align: center; color: #606060;'>AI-Powered Smart Metadata Generator for PDFs, DOCX, and TXT</h4>
        <hr style='margin-top:0;'>
    """

with st.container():
    st.markdown(_HEADER_HTML, unsafe_allow_html=True)

# Single-file upload widget; falsy until the user picks a file.
uploaded = st.file_uploader(
    " Upload Document",
    type=["pdf", "docx", "txt"],
    help="Supported formats: PDF, DOCX, TXT",
)

if uploaded:
    # The PDF/DOCX/OCR helpers take a filesystem path, so the upload is
    # written to a temporary file that keeps the original extension.
    with NamedTemporaryFile(delete=False, suffix=Path(uploaded.name).suffix) as tmp_file:
        # getvalue() returns the whole payload regardless of the buffer's
        # current read position.
        tmp_file.write(uploaded.getvalue())
        tmp_path = tmp_file.name

    ext = Path(uploaded.name).suffix.lower()
    text, doc_type = "", "Unknown"

    try:
        if ext == ".pdf":
            text = extract_text_from_pdf(tmp_path)
            if not text:
                # No embedded text was found: fall back to OCR on page images.
                text = extract_text_via_ocr(tmp_path)
                doc_type = "Scanned PDF"
            else:
                doc_type = "PDF"

        elif ext == ".docx":
            text = extract_text_from_docx(tmp_path)
            doc_type = "DOCX"

        elif ext == ".txt":
            # errors="replace" keeps a non-UTF-8 upload from raising
            # UnicodeDecodeError; strip() matches the other extractors so a
            # whitespace-only file is reported as empty.
            text = Path(tmp_path).read_text(encoding="utf-8", errors="replace").strip()
            doc_type = "TXT"
    finally:
        # The file was created with delete=False, so nothing else removes it.
        # Cleanup is best-effort: a failure to delete must not mask the
        # extraction result or an extraction error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass

    if text:
        st.success(" Document processed successfully.")
        metadata = generate_metadata(text, doc_type)

        # JSON and YAML renderings of the same metadata, side by side.
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("###  JSON Metadata")
            st.json(metadata)

        with col2:
            st.markdown("### YAML Metadata")
            # allow_unicode=True shows non-ASCII characters as-is instead of
            # as escape sequences.
            st.code(
                yaml.dump(metadata, sort_keys=False, allow_unicode=True),
                language="yaml",
            )

        with st.expander("Full Text Extracted"):
            st.text_area("Extracted Text", text, height=200)
    else:
        st.error(" Could not extract text from the file.")

else:
    st.info("Please upload a document to begin.")

