# 📄 Automated Metadata Generation System

This notebook demonstrates how to extract content from PDF, DOCX, and TXT files, generate semantic metadata using NLP, and export structured metadata for further use.

In [None]:
# 📦 Install required libraries (if running locally)
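# NOTE: `import fitz` is provided by PyMuPDF; do not install the separate `fitz` package from PyPI, which is unrelated and conflicts with it.
# !pip install PyMuPDF python-docx pytesseract spacy keybert textract Pillow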
# !python -m spacy download en_core_web_sm

In [None]:
# 📚 Import libraries
import os
import fitz  # PyMuPDF
import docx
import pytesseract
import textract
from keybert import KeyBERT
import spacy
import json
from PIL import Image

# Shared NLP resources, created once here and reused by generate_metadata().
# spaCy pipeline "en_core_web_sm" (see the `spacy download` step in the install cell).
nlp = spacy.load("en_core_web_sm")
# KeyBERT keyword extractor, constructed with its default settings.
kw_model = KeyBERT()

In [None]:
# 📄 Extract text from different file types
def extract_text(file_path):
    """Return the plain-text content of the document at *file_path*.

    The extraction backend is chosen from the file extension
    (compared case-insensitively):

    * ``.pdf``  -- opened with PyMuPDF; the text of each page is joined
      with newlines.
    * ``.docx`` -- opened with python-docx; the text of each entry in
      ``doc.paragraphs`` is joined with newlines.
    * ``.txt``  -- read directly, decoded as UTF-8.
    * anything else -- handed to ``textract.process`` and the resulting
      bytes are decoded as UTF-8.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text as a single string.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        # Use the document as a context manager so the underlying file is
        # closed as soon as the text has been collected, even on error.
        with fitz.open(file_path) as pdf:
            return "\n".join(page.get_text() for page in pdf)

    if ext == ".docx":
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)

    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    # Fallback for every other extension.
    return textract.process(file_path).decode("utf-8")

In [None]:
# 🧠 Generate semantic metadata
def generate_metadata(text):
    """Build a small metadata record describing *text*.

    Uses the module-level spaCy pipeline ``nlp`` for sentence splitting
    and the module-level KeyBERT model ``kw_model`` for keyword extraction.

    Args:
        text: The document text to analyse.

    Returns:
        A dict with three keys:

        * ``"word_count"`` -- number of whitespace-separated tokens in
          *text*.
        * ``"summary"`` -- the first three sentences reported by the spaCy
          pipeline (fewer if the text is shorter), joined with single
          spaces.
        * ``"keywords"`` -- the first element of each entry returned by
          ``kw_model.extract_keywords`` (requested with ``top_n=5`` and
          English stop words removed).
    """
    from itertools import islice

    doc = nlp(text)
    keywords = kw_model.extract_keywords(text, stop_words='english', top_n=5)

    # Only the leading three sentences are needed, so stop iterating there
    # instead of building a list of every sentence in the document.
    lead_sentences = islice(doc.sents, 3)

    return {
        "word_count": len(text.split()),
        "summary": " ".join(sent.text for sent in lead_sentences),
        "keywords": [kw[0] for kw in keywords],
    }

In [None]:
# 📤 Example Usage
# Run the pipeline end to end on a single document: read its text,
# derive the metadata dict, then pretty-print that dict as JSON.
file_path = "sample_docs/sample.txt"  # replace with your file path
text = extract_text(file_path)
meta = generate_metadata(text)
print(json.dumps(meta, indent=2))