**NOTEBOOK-BASED METADATA GENERATOR: RUNS LOCALLY ON CPU**

In [None]:
# Sanity check: confirm NumPy imports in this kernel and report its version.
import numpy as np
print(f"✅ NumPy version: {np.__version__}")


In [None]:
!pip install --quiet \
    transformers \
    pytesseract \
    torch \
    nltk \
    pillow \
    python-docx \
    pymupdf \
    sentencepiece \
    keybert \
    sentence-transformers


In [None]:
import re
import fitz
import pytesseract
from PIL import Image
from io import BytesIO
from transformers import pipeline
import docx
import nltk
from collections import Counter
from nltk.corpus import stopwords

# NLTK setup: fetch the resources in the same order on every run of this cell.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Text Extraction
def extract_text(file_input):
    """Extract plain text from a PDF, DOCX, or TXT file-like object.

    Args:
        file_input: Binary file-like object providing ``read()``, ``seek()``
            and a ``name`` attribute. The text after the last ``.`` in
            ``name`` (case-insensitive) selects the parser.

    Returns:
        The extracted text. For any extension other than ``pdf``, ``docx``
        or ``txt`` the literal string ``"Unsupported file"`` is returned
        (kept as-is for backward compatibility with existing callers).
    """
    ext = file_input.name.split('.')[-1].lower()
    file_data = file_input.read()
    # Rewind so the caller can read the same stream again afterwards.
    file_input.seek(0)
    if ext == 'pdf':
        with fitz.open(stream=file_data, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    if ext == 'docx':
        document = docx.Document(BytesIO(file_data))
        return "\n".join(par.text for par in document.paragraphs)
    if ext == 'txt':
        # "utf-8-sig" decodes exactly like UTF-8 but drops a leading
        # byte-order mark, which would otherwise leak into the text as
        # U+FEFF. Undecodable bytes are still skipped.
        return file_data.decode("utf-8-sig", errors="ignore")
    return "Unsupported file"

# Clean text
def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

# Summarization
# Shared summarization pipeline, built once when this cell runs and reused by
# summarize_text(); it is configured with the "facebook/bart-large-cnn" model.
# NOTE(review): presumably fetches the model weights on first use — confirm.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text, *, chunk_size=1000, max_chunks=2):
    """Summarize the leading part of ``text`` one fixed-size chunk at a time.

    The text is cut into consecutive ``chunk_size``-character slices, the
    first ``max_chunks`` of them are passed to the module-level
    ``summarizer`` pipeline, and the resulting summaries are joined with a
    single space. With the defaults only the first 2000 characters are
    summarized.

    Args:
        text: The text to summarize.
        chunk_size: Number of characters per chunk (must be positive).
        max_chunks: Maximum number of leading chunks to summarize (must not
            be negative).

    Returns:
        The joined chunk summaries, or ``""`` when there is nothing to
        summarize (empty text or ``max_chunks == 0``).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``max_chunks`` is
            negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if max_chunks < 0:
        raise ValueError("max_chunks must not be negative")
    # Slice the range of start offsets first so that only the chunks that
    # will actually be summarized are cut out of the text.
    starts = range(0, len(text), chunk_size)[:max_chunks]
    chunks = [text[start:start + chunk_size] for start in starts]
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in chunks]
    return " ".join(summaries)

# Keywords
def extract_keywords(text):
    """Return up to ten of the most frequent keyword tokens in ``text``.

    Tokens are lower-cased ``\\w+`` matches; English stopwords and tokens of
    three characters or fewer are discarded before counting.
    """
    tokens = re.findall(r'\w+', text.lower())
    english_stopwords = set(stopwords.words('english'))
    frequencies = Counter(
        token
        for token in tokens
        if len(token) > 3 and token not in english_stopwords
    )
    return [token for token, _count in frequencies.most_common(10)]

# Final metadata function
def generate_metadata(file_input):
    """Build title, summary and keyword metadata for an uploaded document.

    The document text is extracted and whitespace-normalized, then
    summarized; the title is the part of the summary before its first
    period, and the keywords are computed from the full cleaned text.
    """
    cleaned = clean_text(extract_text(file_input))
    summary = summarize_text(cleaned)
    # Everything before the first '.' (the whole summary if it has none).
    title = summary.partition('.')[0]
    metadata = {
        "📌 Title": title,
        "📄 Summary": summary,
        "🏷️ Keywords": extract_keywords(cleaned),
    }
    return metadata


In [None]:
from ipywidgets import FileUpload
from IPython.display import display
import io

def upload_file_widget():
    """Display a single-file upload widget and return it with a reader.

    Returns:
        A ``(uploader, get_file)`` tuple. ``uploader`` is the displayed
        ``FileUpload`` widget, restricted to .pdf, .docx and .txt files.
        ``get_file()`` returns ``(io.BytesIO, filename)`` for the first
        uploaded file, or prints a reminder and returns ``(None, None)``
        when nothing has been uploaded yet.
    """
    uploader = FileUpload(accept='.pdf,.docx,.txt', multiple=False)
    display(uploader)

    def get_file():
        uploads = uploader.value
        if not uploads:
            print("⚠️ Please upload a file.")
            return None, None
        if isinstance(uploads, dict):
            # ipywidgets 7.x: value is a dict mapping filename -> file info.
            fname, item = next(iter(uploads.items()))
        else:
            # ipywidgets 8.x: value is a sequence of per-file dicts that
            # carry the filename under 'name'; calling .items() on it fails.
            item = uploads[0]
            fname = item['name']
        print(f"📂 Uploaded: {fname}")
        # bytes() normalizes the content, which may be a memoryview.
        return io.BytesIO(bytes(item['content'])), fname

    return uploader, get_file


In [None]:
# Render the upload widget. Choose a file in it before running the next cell:
# get_file() only reads whatever the widget holds at the moment it is called.
uploader, get_file = upload_file_widget()


In [None]:
# Read the uploaded file (if any) and print its generated metadata.
file_obj, filename = get_file()
if file_obj:
    # extract_text() chooses its parser from the extension of ``.name``,
    # which a bare BytesIO does not have, so attach the uploaded filename.
    file_obj.name = filename  
    metadata = generate_metadata(file_obj)
    print(metadata)
