#**AUTOMATED METADATA GENERATOR**

##Live App: https://automated-metadata-generator-app.streamlit.app

##Video Demo: https://drive.google.com/file/d/1JzmD3WbEOZeGhpquPUpZegwIeKFOzQQf/view?usp=sharing

#Installing the necessary Libraries and Modules

In [None]:
# System-level dependencies for OCR
# NOTE(review): the metadata_gen.py written below reads PDFs with pdfminer.six
# and images with easyocr; confirm whether poppler-utils / tesseract-ocr are
# still required before keeping this step.
!apt-get update -qq && \
apt-get install -y -qq poppler-utils tesseract-ocr

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [None]:
# Python packages used by metadata_gen.py (text extraction, NLP, plotting),
# app.py (streamlit) and the launcher cell (pyngrok).
# NOTE(review): pytesseract is installed here but the code below imports
# easyocr for OCR — confirm whether pytesseract is still needed.
!pip install --quiet \
    pdfminer.six \
    python-docx \
    pytesseract \
    pillow \
    spacy \
    keybert \
    transformers \
    sentence-transformers \
    sentencepiece \
    matplotlib \
    numpy \
    wordcloud \
    streamlit \
    pyngrok \
    easyocr

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m969.6/969.6 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Download the spaCy pipeline that metadata_gen.py loads with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#Code for Metadata Generation

In [None]:
%%writefile metadata_gen.py

# Import modules
import re, io
from pdfminer.high_level import extract_text as extract_pdf_text
import docx
import easyocr
from PIL import Image
import spacy
from keybert import KeyBERT
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud

# Load models
# Each model is created once when this module is imported and then shared by
# the functions below through these module-level names.
nlp_model = spacy.load("en_core_web_sm")  # used for named entities and sentence splitting
keyword_extractor = KeyBERT()  # used for keyword/keyphrase scoring
text_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")  # used for summaries
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")  # used to embed paragraphs when clustering them into sections
ocr_reader = easyocr.Reader(['en'], gpu=False)  # used to read text from uploaded images

# Function for reading the document uploaded on the app
def read_document(path, extension):
    """Extract plain text from a document on disk.

    Args:
        path: Filesystem path of the file to read.
        extension: File extension including the leading dot (for example
            ".pdf"). Matching is case-insensitive.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions, and a string starting with "Could not read file: " is
        returned when the underlying reader raises.
    """
    ext = (extension or "").lower()
    try:
        if ext == ".pdf":
            return extract_pdf_text(path)
        if ext == ".docx":
            return "\n".join(para.text for para in docx.Document(path).paragraphs)
        if ext in (".png", ".jpg", ".jpeg"):
            # detail=0 makes easyocr return just the recognised strings.
            return "\n".join(ocr_reader.readtext(path, detail=0))
        if ext == ".txt":
            # Context manager so the handle is closed even if read() fails.
            with open(path, encoding="utf-8") as handle:
                return handle.read()
    except Exception as e:
        # Deliberate best-effort: callers receive the failure as text.
        return f"Could not read file: {e}"
    return ""

# Function to find out if the line is a Potential Heading
# Leading enumeration such as "1." or "2)".
_HEADING_NUMBER_RE = re.compile(r"^\d+[\.\)]")
# Section titles that count as headings regardless of their formatting.
_COMMON_HEADINGS = frozenset({
    "introduction", "background", "challenges", "limitations", "results",
    "discussion", "methodology", "methods", "conclusion", "summary",
    "references", "abstract", "future work",
})


def is_potential_heading(line):
    """Return True when *line* looks like a section heading.

    A line qualifies when, after stripping whitespace, it is between 1 and
    100 characters long and at least one of the following holds: it is all
    upper case, it ends with a colon, it starts with a number followed by
    "." or ")", it is one of the well-known section titles, or it has at
    most five words.

    Args:
        line: A single line of document text.

    Returns:
        A plain bool (never a regex match object).
    """
    text = line.strip()
    if not 0 < len(text) <= 100:
        return False
    return bool(
        text.isupper()
        or text.endswith(":")
        or _HEADING_NUMBER_RE.match(text)
        or text.lower() in _COMMON_HEADINGS
        or len(text.split()) <= 5
    )

# Function to divide the text into sections
def segment_text_into_sections(raw_text):
    """Split document text into a mapping of section title -> section body.

    Headings are detected line by line with ``is_potential_heading``; the
    non-heading lines that follow a heading become its body. Lines that
    appear before the first heading are not attached to any section.

    If that pass yields fewer than three sections, or every section has
    fewer than 50 words, the heading result is discarded and paragraphs
    (blank-line separated, more than 30 words each) are used instead: with
    fewer than three such paragraphs they are returned as a single
    "Main Content" section, otherwise they are grouped by embedding
    similarity into "Section N" entries.

    Args:
        raw_text: Full text of the document.

    Returns:
        dict mapping section title to section text.
    """
    structured = {}
    heading, buffer = None, []

    for line in raw_text.split("\n"):
        if not line.strip():
            continue
        if is_potential_heading(line):
            # Close the previous section before starting a new one.
            if heading and buffer:
                structured[heading] = "\n".join(buffer).strip()
            heading, buffer = line.strip(), []
        elif heading:
            buffer.append(line)

    if heading and buffer:
        structured[heading] = "\n".join(buffer).strip()

    headings_usable = len(structured) >= 3 and any(
        len(sec.split()) >= 50 for sec in structured.values()
    )
    if headings_usable:
        return structured

    paras = [p.strip() for p in raw_text.split("\n\n") if len(p.split()) > 30]
    if len(paras) < 3:
        # When no paragraph passes the 30-word filter, keep the whole text
        # instead of returning an empty section, so short documents still
        # produce keywords and entities downstream.
        body = "\n\n".join(paras) if paras else raw_text.strip()
        return {"Main Content": body}

    vectors = sbert_model.encode(paras, normalize_embeddings=True)
    groups = util.community_detection(vectors, min_community_size=1, threshold=0.75)
    return {
        f"Section {i+1}": "\n\n".join(paras[idx] for idx in cluster)
        for i, cluster in enumerate(groups)
    }

# Function to extract metadata from the text
def extract_metadata(text, fast=True):
    """Build summary, section, keyword and entity metadata for a document.

    Args:
        text: Full text of the document.
        fast: When True, produce one summary from the first 1024 characters
            of ``text``. When False, summarise the first 1024 characters of
            each of the three longest sections and prefix each summary with
            a "### <title>" line.

    Returns:
        dict with the keys:
            "summary": the summaries joined by blank lines;
            "structured_metadata": section title -> section text;
            "keywords": list of keyword strings;
            "keyword_scores": keyword -> relevance score;
            "named_entities": sorted list of (text, label) tuples.
    """
    sections = segment_text_into_sections(text)
    summaries, key_terms, keyword_scores = [], [], {}
    named_entities = set()

    if fast:
        try:
            result = text_summarizer(text[:1024], max_length=150, min_length=40, do_sample=False)
            summaries.append(result[0]['summary_text'])
        except Exception:
            # Best-effort: surface the failure as text rather than aborting.
            summaries.append("Summary generation failed.")
    else:
        longest = sorted(sections.items(), key=lambda x: len(x[1]), reverse=True)[:3]
        for title, body in longest:
            try:
                summary = text_summarizer(body[:1024], max_length=120, min_length=30, do_sample=False)[0]['summary_text']
            except Exception:
                # Skip sections the summariser cannot handle; `Exception`
                # (not a bare except) keeps Ctrl-C and SystemExit working.
                continue
            summaries.append(f"### {title}\n{summary}")

    if len(sections) > 6:
        # Many sections: score keywords once over the whole document.
        top_keywords = keyword_extractor.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=10)
        key_terms = [term for term, _ in top_keywords]
        keyword_scores = dict(top_keywords)
        named_entities = {(ent.text, ent.label_) for ent in nlp_model(text).ents}
    else:
        # Few sections: take the top three keywords of each, keeping the
        # first score seen for a repeated term.
        for content in sections.values():
            for term, score in keyword_extractor.extract_keywords(content, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=3):
                if term not in keyword_scores:
                    key_terms.append(term)
                    keyword_scores[term] = score
            named_entities.update((ent.text, ent.label_) for ent in nlp_model(content).ents)

    return {
        "summary": "\n\n".join(summaries),
        "structured_metadata": sections,
        "keywords": key_terms,
        "keyword_scores": keyword_scores,
        # Sorted so repeated runs on the same text give the same output.
        "named_entities": sorted(named_entities),
    }

# Function to get word count, sentence count etc. from the text
def get_document_stats(text):
    """Return word, sentence and entity counts for *text*.

    The word count is whitespace-based; sentence and entity counts come
    from running the shared spaCy pipeline over the text.
    """
    parsed = nlp_model(text)
    sentences = list(parsed.sents)
    stats = {}
    stats["word_count"] = len(text.split())
    stats["sentence_count"] = len(sentences)
    stats["entity_count"] = len(parsed.ents)
    return stats

# Function to create the Wordcloud for the document
def create_wordcloud_image(text):
    """Render an 800x400 word cloud of *text* on a white background and return it as an image."""
    cloud = WordCloud(width=800, height=400, background_color="white")
    rendered = cloud.generate(text)
    return rendered.to_image()

# Function to generate the keyword relevance chart
def keyword_score_bar_image(scores):
    """Render a horizontal bar chart of keyword relevance as a PIL image.

    Args:
        scores: Mapping of keyword -> relevance score, in display order
            (the first entry is drawn at the top).

    Returns:
        A PIL image of the chart. When ``scores`` is empty a placeholder
        chart with a "No keywords available" note is returned instead of
        raising.
    """
    keys, values = list(scores.keys()), list(scores.values())
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.set_title("Top Keywords (Relevance)")
        if values:
            # Shade each bar by its score relative to the min/max of the set.
            colors = plt.get_cmap("Blues")(plt.Normalize(min(values), max(values))(values))
            # Reverse so the first keyword ends up at the top of the chart.
            bars = ax.barh(keys[::-1], values[::-1], color=colors[::-1])
            for bar, val in zip(bars, values[::-1]):
                ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                        f"{val:.2f}", va='center', fontsize=9)
        else:
            # min()/max() of an empty list would raise; draw a note instead.
            ax.text(0.5, 0.5, "No keywords available",
                    ha="center", va="center", transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)

# Function to render NER tags with HTML highlighting
def visualize_named_entities(text):
    """Return *text* as HTML with each named entity wrapped in a coloured span.

    Entities are found with the shared spaCy pipeline. Each one is rendered
    as a highlighted ``<span>`` followed by its label in a ``<sub>``; labels
    without a dedicated colour use a neutral grey. All document text is
    HTML-escaped, because the result is meant to be rendered as raw HTML and
    the text comes from user-supplied files.

    Args:
        text: Document text to annotate.

    Returns:
        An HTML string.
    """
    from html import escape  # local import keeps this block self-contained

    highlight_colors = {
        "ORG": "#ffd966", "PERSON": "#f4cccc", "GPE": "#c9daf8",
        "DATE": "#d9ead3", "MONEY": "#e6b8af", "PRODUCT": "#b4a7d6", "EVENT": "#a2c4c9"
    }
    doc = nlp_model(text)
    parts, last_idx = [], 0
    for ent in doc.ents:
        # Plain text between the previous entity and this one. Slicing uses
        # the original offsets; escaping happens after the slice.
        parts.append(escape(text[last_idx:ent.start_char]))
        shade = highlight_colors.get(ent.label_, "#e0e0e0")
        parts.append(
            f"<span style='background:{shade};padding:2px 5px;border-radius:5px;margin:1px;'>"
            f"{escape(ent.text)}<sub style='font-size:10px;color:#333;'>({ent.label_})</sub></span>"
        )
        last_idx = ent.end_char
    parts.append(escape(text[last_idx:]))
    return "".join(parts)

Overwriting metadata_gen.py


#Code for UI

In [None]:
%%writefile app.py

#Import modules
import streamlit as st
import os, tempfile, json, hashlib
import metadata_gen as mg

#Streamlit page setup
# set_page_config is the first st.* call in this script.
st.set_page_config(
    page_title="Metadata Generator",
    layout="wide",
    initial_sidebar_state="collapsed"
)
st.markdown("# Automated Metadata Generator")

#File uploader
# Several files may be uploaded at once. The extensions allowed here are the
# ones mg.read_document has a branch for.
uploaded_files = st.file_uploader(
    "📁 Upload documents (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"],
    accept_multiple_files=True
)

#Processing the uploaded files
if uploaded_files:
    # Local import: only this block needs it, for escaping document-derived
    # text that is rendered with unsafe_allow_html=True.
    from html import escape

    for file_index, uploaded_file in enumerate(uploaded_files):
        file_ext = os.path.splitext(uploaded_file.name)[1].lower()
        # getvalue() returns the whole buffer regardless of stream position.
        file_bytes = uploaded_file.getvalue()

        # mg.read_document works on a path, so spill the upload to a
        # temporary file and always remove it afterwards.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext)
        try:
            tmp_file.write(file_bytes)
            tmp_file.close()
            #Extracting text
            raw_text = mg.read_document(tmp_file.name, file_ext)
        finally:
            tmp_file.close()
            os.unlink(tmp_file.name)

        # Identifier derived from the file name only (not a security hash).
        doc_hash = hashlib.md5(uploaded_file.name.encode()).hexdigest()

        st.markdown("----")
        st.markdown(f"### 📝 Document ID: `{doc_hash}`")
        st.metric("📄 Filename", uploaded_file.name)

        with st.expander("📈 File Details", expanded=True):
            col1, col2, col3 = st.columns(3)
            col1.metric("📦 File Size", f"{len(file_bytes) / 1024:.2f} KB")
            col2.metric("📁 File Type", "Document")
            col3.metric("🧾 Content Type", uploaded_file.type)

        # Nothing to analyse: skip the model calls and charts, which would
        # otherwise fail on empty input.
        if not raw_text.strip():
            st.warning("No text could be extracted from this file.")
            continue

        #Analyzing the extracted text
        metadata = mg.extract_metadata(raw_text)
        stats = mg.get_document_stats(raw_text)

        #Summary and keywords
        with st.expander("📑 Summary & Keywords", expanded=True):
            st.subheader("📝 Summary")
            st.write(metadata["summary"])

            st.subheader("🔑 Keywords")
            st.markdown(
                "".join(
                    f"<span style='display:inline-block;background:#e0f0ff;color:#004080;"
                    f"padding:5px 10px;border-radius:15px;margin:2px;font-size:14px;'>{escape(kw)}</span>"
                    for kw in metadata["keywords"]
                ),
                unsafe_allow_html=True
            )

        #Basic metrics
        with st.expander("📈 Document Metrics", expanded=True):
            col1, col2, col3 = st.columns(3)
            col1.metric("Word Count", stats["word_count"])
            col2.metric("Sentences", stats["sentence_count"])
            col3.metric("Named Entities", stats["entity_count"])

        with st.expander("🧾 Structured Metadata"):
            for title, content in metadata["structured_metadata"].items():
                st.markdown(f"#### {title}")
                st.write(content)

        #Wordcloud & Keyword bar chart
        st.markdown("### 🔎 Keyword Insights")
        col_wc, col_bar = st.columns(2)

        with col_wc:
            st.markdown("#### ☁️ Word Cloud")
            try:
                st.image(mg.create_wordcloud_image(raw_text).resize((700, 400)))
            except ValueError:
                # WordCloud raises ValueError when no words remain to plot.
                st.info("Not enough words to build a word cloud.")

        with col_bar:
            st.markdown("#### 📊 Keyword Relevance")
            if metadata["keyword_scores"]:
                st.image(mg.keyword_score_bar_image(metadata["keyword_scores"]).resize((700, 400)))
            else:
                st.info("No keywords were extracted.")

        with st.expander("🧠 Named Entity Recognition"):
            st.markdown(mg.visualize_named_entities(raw_text), unsafe_allow_html=True)

        #Download JSON metadata of the document
        # Serialise in memory instead of writing a file named after the
        # upload into the working directory.
        json_filename = f"{uploaded_file.name}_metadata.json"
        st.download_button(
            label="⬇️ Download Metadata JSON",
            data=json.dumps(metadata, indent=2),
            file_name=json_filename,
            mime="application/json",
            # Explicit key so each file's button is a distinct widget.
            key=f"download_{file_index}_{doc_hash}"
        )

Overwriting app.py


#Code for Launching Streamlit App and Live Demo via Ngrok

In [None]:
#Import modules
import os
import subprocess
import time
from pyngrok import ngrok

# The ngrok auth token is a credential and must not be committed to the
# notebook; read it from the environment instead.
# NOTE(review): the token that used to be hard-coded here should be revoked.
auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not auth_token:
    raise RuntimeError("Set the NGROK_AUTH_TOKEN environment variable before launching the demo.")
ngrok.set_auth_token(auth_token)

# Best-effort cleanup of a previous run: stop any running Streamlit process
# and close existing ngrok tunnels.
try:
    subprocess.run(["pkill", "streamlit"], check=False)
except OSError:
    # pkill is missing or not executable on this system; nothing to clean up.
    pass
ngrok.kill()

print("Launching Streamlit app")
streamlit_process = subprocess.Popen(["streamlit", "run", "app.py"])
# Give the server a moment to start before exposing it.
time.sleep(5)

# Open ngrok tunnel
print("Creating public tunnel with ngrok")
public_url = ngrok.connect(8501)

print(f"Click on this link to view live app demo: {public_url}")

Launching Streamlit app
Creating public tunnel with ngrok
Click on this link to view live app demo: NgrokTunnel: "https://3519-34-44-183-12.ngrok-free.app" -> "http://localhost:8501"
