In [47]:
!pip install pdfplumber docx langchain requests python-docx




In [3]:
# Detect the file type and extract plain text from PDF (via pdfplumber), DOCX, or TXT files
import os
import pdfplumber
from docx import Document

In [23]:
def extract_text_from_file(file_path):
    """Extract plain text from a PDF, DOCX or TXT file.

    The format is selected from the file extension (case-insensitive).

    Args:
        file_path: Path of the document to read.

    Returns:
        str: For ``.pdf``, the text of every page with a newline appended
        after each page; for ``.docx``, the paragraph texts joined by
        newlines; for ``.txt``, the file content decoded as UTF-8.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with pdfplumber.open(file_path) as pdf:
            # extract_text() can yield None for a page without extractable
            # text (e.g. an image-only page); treat that as an empty page
            # instead of failing on `None + "\n"`.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
        text = "".join(page_texts)
    elif ext == '.docx':
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file type")

    return text

In [24]:
# Extract the raw text of the sample PDF; the hard-coded path is a Colab location.
text=extract_text_from_file('/content/swiggy.pdf')



In [25]:
# Cleans and normalizes extracted text using regex + NLP techniques
import re
import spacy
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stopword corpus needed by stopwords.words(); NLTK reports
# "already up-to-date" and returns True when the corpus is present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Load spaCy's small English pipeline once; preprocess_text() reads this global on every call.
nlp = spacy.load("en_core_web_sm")
# NLTK English stopword set.
# NOTE(review): not referenced by the visible code — preprocess_text() filters
# with spaCy's token.is_stop instead; confirm whether this is still needed.
stop_words = set(stopwords.words("english"))

def preprocess_text(text: str) -> str:
    """Normalise raw extracted text and reduce it to a string of lemmas.

    Steps, in order:
      1. Collapse runs of newlines, runs of spaces/tabs, and newline /
         whitespace / newline sequences.
      2. Turn non-breaking spaces into plain spaces and drop every
         non-ASCII character.
      3. Discard any line that occurs five or more times (treated as a
         repeating header/footer) and join the remaining lines with spaces.
      4. Run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         each token that is alphabetic and is neither a stop word nor
         punctuation.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Whitespace normalisation; the order of the substitutions matters.
    substitutions = (
        (r'\n{2,}', '\n'),
        (r'[ \t]+', ' '),
        (r'\n\s*\n', '\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.replace('\xa0', ' ').encode('ascii', 'ignore').decode()

    # Count how often each distinct line appears, then keep the rare ones.
    all_lines = text.splitlines()
    occurrences = {}
    for entry in all_lines:
        occurrences.setdefault(entry, 0)
        occurrences[entry] += 1
    text = " ".join(filter(lambda entry: occurrences[entry] < 5, all_lines))

    # Lemmatise, skipping stop words, punctuation and non-alphabetic tokens.
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)


In [27]:
# Clean and lemmatise the text extracted from the sample PDF.
final_text=preprocess_text(text)

In [133]:
# Divide the cleaned text into overlapping chunks

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [72]:
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=50):
    """Split ``text`` with LangChain's RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text`` (the chunks).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [30]:
# Preview the chunks produced with the default chunk_size (1000) and chunk_overlap (50).
split_text_into_chunks(final_text)

['Goyal consumer tech platform half decade previously handle business finance vice president senior vice president new role NSoawu rRaeva disin eg xpecte lead initiative delivery partner SAURAV GOYAL SWIGGY SVP delivery previous stint Goyal Ola Consumer Flipkart Tata Communications business finance role lead Business Finance bit energy shift heart Swiggy mission elevate quality life urban consumer offer unparalleled convenience amazing delivery partner backbone reliability reach Goyal share post linkedin connect Swiggy Goyal lead Business Finance replacement appoint month Swiggy appoint Flipkart exec Ankit Jain Senior Vice President Operations previous stint Flipkart Jain responsible platform end end grocery operation large supply chain include design Flipkart quick commerce arm Minutes',
 'operation large supply chain include design Flipkart quick commerce arm Minutes supply chain Goyal appointment come competition quick commerce sector heat player fight consumer wallet share dark sto

In [48]:
# Per-chunk extraction prompt. It is filled with str.format(content_chunk=...),
# so the doubled braces {{ }} around the JSON skeleton are format escapes that
# render as literal { } in the final prompt.
PROMPT_TEMPLATE = """
You are an intelligent assistant designed to understand documents and extract structured information from them.

Your task is to:
1. Extract the following metadata:
   - Title (if mentioned)
   - Author (if available)
   - Date of publication or document creation (if available)
   - Keywords or topics covered
   - Type of document (choose from: research paper, legal notice, resume, report, book chapter, article, business proposal, letter, others)
2. Generate a concise summary of the content (3-5 sentences).

Read the content below and return your answer in this JSON format:
{{
  "title": "",
  "author": "",
  "date": "",
  "keywords": [],
  "document_type": "",
  "summary": ""
}}

Content:
\"\"\"{content_chunk}\"\"\"
"""


In [38]:
# 5. llm_call.py
import os
import requests

In [42]:
# Endpoint of the Mistral chat-completions API used by the LLM helper functions.
os.environ["MISTRAL_API_URL"] = "https://api.mistral.ai/v1/chat/completions"

# Never store the API key in the notebook. Reuse a key that is already in the
# environment; otherwise ask for it interactively without echoing it.
# Surrounding whitespace is stripped so it cannot leak into the Authorization header.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
if not os.environ.get("MISTRAL_API_KEY", "").strip():
    from getpass import getpass
    os.environ["MISTRAL_API_KEY"] = getpass("Mistral API key: ").strip()


In [50]:
# Snapshot of the configuration at definition time. call_llm_on_chunk() reads
# the environment again on every call, so later changes to the variables are
# still picked up.
MISTRAL_API_URL = os.getenv("MISTRAL_API_URL")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

def call_llm_on_chunk(chunk, timeout=60):
    """Ask the Mistral chat-completions API for metadata + summary of one chunk.

    The chunk is inserted into ``PROMPT_TEMPLATE`` and sent as a single user
    message to the ``open-mistral-7b`` model.

    Args:
        chunk: Text to insert into the prompt as ``content_chunk``.
        timeout: Seconds passed to ``requests.post`` as ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        str: The content of the first choice's message, or the sentinel
        string ``"ERROR"`` when the key is missing, the request fails, the
        API answers with a non-200 status, or the response body does not
        have the expected shape. A message is printed in every failure case.
    """
    # Strip the key: stray whitespace would otherwise end up in the header.
    api_key = (os.getenv('MISTRAL_API_KEY') or "").strip()
    if not api_key:
        print("❌ Error: MISTRAL_API_KEY is not set")
        return "ERROR"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "open-mistral-7b",
        "messages": [
            {"role": "user", "content": PROMPT_TEMPLATE.format(content_chunk=chunk)}
        ],
        "temperature": 0.3
    }
    try:
        response = requests.post(
            os.getenv("MISTRAL_API_URL"), headers=headers, json=data, timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection problems and timeouts follow the same best-effort
        # contract as HTTP errors: report and return the sentinel.
        print(f"❌ Request failed: {exc}")
        return "ERROR"

    if response.status_code != 200:
        print(f"❌ Error {response.status_code}: {response.text}")
        return "ERROR"

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # A 200 response without the expected fields (or with a non-JSON body).
        print(f"❌ Unexpected response format: {exc!r}")
        return "ERROR"


In [40]:
def summarize_document_chunks(chunks):
    """Run ``call_llm_on_chunk`` on every chunk, in order.

    Args:
        chunks: Iterable of chunk texts.

    Returns:
        list: One ``call_llm_on_chunk`` result per chunk, in input order.
    """
    return [call_llm_on_chunk(piece) for piece in chunks]

In [129]:
# Run extraction -> cleaning -> chunking on the Instamart PDF (Colab path).
# Note that this rebinds `text`, which previously held the swiggy.pdf content.
text = extract_text_from_file('/content/swiggy_instamart.pdf')
clean_text = preprocess_text(text)
chunks = split_text_into_chunks(clean_text)

In [97]:
!pip install keybert sentence-transformers

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [101]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [130]:
# Summarise every chunk with the LLM, printing progress and each raw response.
print(f"✅ Total Chunks: {len(chunks)}")

# Raw model responses, one per chunk, in chunk order. A failed call contributes
# the sentinel string "ERROR" returned by call_llm_on_chunk().
results = []
for i, chunk in enumerate(chunks):
    print(f"\n--- Generating summary for Chunk {i+1}/{len(chunks)} ---")
    summary = call_llm_on_chunk(chunk)
    print(summary)
    results.append(summary)




✅ Total Chunks: 4

--- Generating summary for Chunk 1/4 ---
{
  "title": "Swiggy Instamart Expands Quick Commerce Services in India",
  "author": "Not specified in the content",
  "date": "Not specified in the content",
  "keywords": ["Swiggy Instamart", "quick commerce", "expansion", "India", "festive season", "new cities", "products", "fmcg", "local favorites"],
  "document_type": "article",
  "summary": "The text discusses Swiggy Instamart's expansion of its quick commerce services in various cities across India, bringing a wide range of products to customers' doorsteps within minutes. The company aims to make everyday needs more accessible and convenient, with Swiggy Instamart being India's pioneer in quick commerce platforms."
}

--- Generating summary for Chunk 2/4 ---
{
  "title": "Swiggy Instamart Expands Quick Commerce Service in Tier Cities",
  "author": "Not specified",
  "date": "Not specified",
  "keywords": ["Swiggy Instamart", "quick commerce", "expansion", "tier cities"

In [131]:
# After summarizing all chunks:
# call_llm_on_chunk() returns the sentinel "ERROR" for a failed call; leave
# those entries out so the sentinel is not passed to the model as a summary.
combined_summaries = "\n\n".join(result for result in results if result != "ERROR")

# Merge prompt. This is an f-string, so {{ }} renders as literal braces and
# {combined_summaries} is substituted immediately.
combine_prompt = f"""
You are a smart assistant. Below are multiple partial summaries of a document, generated from different parts.

Your task is to combine them into a **single metadata + summary JSON**, like this:
{{
  "title": "",
  "author": "",
  "date": "",
  "keywords": [],
  "document_type": "",
  "summary": ""
}}

Summaries:
\"\"\"{combined_summaries}\"\"\"
"""


In [None]:
# def call_llm_merge_summary(prompt):
#     headers = {
#         "Authorization": f"Bearer {os.getenv('MISTRAL_API_KEY')}",
#         "Content-Type": "application/json"
#     }
#     data = {
#         "model": "open-mistral-7b",
#         "messages": [{"role": "user", "content": prompt}],
#         "temperature": 0.3
#     }
#     response = requests.post(os.getenv("MISTRAL_API_URL"), headers=headers, json=data)
#     return response.json()['choices'][0]['message']['content']

# final_output = call_llm_merge_summary(combine_prompt)

# # ============================
# # 🎯 Final Output Handling
# # ============================
# try:
#     parsed = json.loads(final_output)

#     # ✅ Improve keywords using KeyBERT
#     kw_model = KeyBERT(model=SentenceTransformer('all-MiniLM-L6-v2'))
#     kb_keywords = kw_model.extract_keywords(
#         clean_text,
#         keyphrase_ngram_range=(1, 2),
#         stop_words='english',
#         top_n=10,
#         use_maxsum=True,
#         nr_candidates=20
#     )
#     final_keywords = [kw for kw, score in kb_keywords]
#     parsed["keywords"] = final_keywords

#     # ✅ Pretty output
#     print("\n✅ Final Metadata:")
#     print(json.dumps(parsed, indent=2))

#     print("\n✅ Final Summary:")
#     print(textwrap.fill(parsed["summary"], width=100))

# except json.JSONDecodeError:
#     print("⚠️ Could not parse JSON. Showing raw output:")
#     print(final_output)

In [None]:
import json
import textwrap
import re
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer


In [None]:
def call_llm_merge_summary(prompt, timeout=60):
    """Send a ready-made prompt to the Mistral chat-completions API.

    Unlike ``call_llm_on_chunk``, the prompt is sent as-is (no template is
    applied). Failures are handled the same way as in ``call_llm_on_chunk``:
    a message is printed and the sentinel ``"ERROR"`` is returned.

    Args:
        prompt: Complete user message to send.
        timeout: Seconds passed to ``requests.post`` as ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        str: The content of the first choice's message, or ``"ERROR"`` when
        the request fails, the API answers with a non-200 status, or the
        response body does not have the expected shape.
    """
    # Strip the key: stray whitespace would otherwise end up in the header.
    api_key = (os.getenv('MISTRAL_API_KEY') or "").strip()
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "open-mistral-7b",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3
    }
    try:
        response = requests.post(
            os.getenv("MISTRAL_API_URL"), headers=headers, json=data, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return "ERROR"

    # Without this check an error payload would surface as an opaque
    # KeyError on 'choices' below.
    if response.status_code != 200:
        print(f"❌ Error {response.status_code}: {response.text}")
        return "ERROR"

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        print(f"❌ Unexpected response format: {exc!r}")
        return "ERROR"

# Ask the model to merge the per-chunk outputs embedded in combine_prompt into one result.
final_output = call_llm_merge_summary(combine_prompt)

In [132]:
# ============================
# 🎯 Final Output Handling
# ============================
# Parse the merged LLM answer, replace its keywords with KeyBERT keyphrases
# computed from `clean_text`, and print the metadata and the wrapped summary.
try:
    # Prefer a ```json fenced block. The model does not always add the fence
    # (the per-chunk answers are bare JSON), so fall back to the outermost
    # {...} span of the answer.
    json_match = re.search(r'```json\n(.*?)\n```', final_output, re.DOTALL)
    if json_match:
        json_string = json_match.group(1)
    else:
        brace_start = final_output.find('{')
        brace_end = final_output.rfind('}')
        if 0 <= brace_start < brace_end:
            json_string = final_output[brace_start:brace_end + 1]
        else:
            json_string = None

    if json_string is not None:
        parsed = json.loads(json_string)

        # ✅ Improve keywords using KeyBERT
        # Check if clean_text is available before using KeyBERT
        # (at notebook top level, locals() is the global namespace).
        if 'clean_text' in locals():
            kw_model = KeyBERT(model=SentenceTransformer('all-MiniLM-L6-v2'))
            kb_keywords = kw_model.extract_keywords(
                clean_text,
                keyphrase_ngram_range=(1, 2),
                stop_words='english',
                top_n=10,
                use_maxsum=True,
                nr_candidates=20
            )
            # extract_keywords yields (keyphrase, score) pairs; keep the phrases.
            final_keywords = [kw for kw, score in kb_keywords]
            parsed["keywords"] = final_keywords
        else:
            print("Warning: 'clean_text' not available for keyword extraction using KeyBERT.")

        # ✅ Pretty output
        print("\n✅ Final Metadata:")
        print(json.dumps(parsed, indent=2))

        print("\n✅ Final Summary:")
        if "summary" in parsed and parsed["summary"]:
            print(textwrap.fill(parsed["summary"], width=100))
        else:
            print("Summary not available in the parsed output.")

    else:
        print("⚠️ Could not find the JSON object within the final output string.")
        print("Showing raw output:")
        print(final_output)

except json.JSONDecodeError:
    print("⚠️ Could not parse JSON. Showing raw output:")
    print(final_output)
except KeyError as e:
    print(f"⚠️ KeyError: {e} - Check if expected keys are present in the JSON output.")
    print("Showing parsed dictionary (if available):")
    if 'parsed' in locals():
        print(json.dumps(parsed, indent=2))
    else:
        print("Parsed dictionary not available.")


✅ Final Metadata:
{
  "title": "Swiggy Instamart Expands Quick Commerce Services in India",
  "author": "Not specified",
  "date": "Not specified",
  "keywords": [
    "customer india",
    "instamart bring",
    "benefit swiggy",
    "innovation swiggy",
    "approach swiggy",
    "use swiggys",
    "instamart indias",
    "food swiggy",
    "instamarts store",
    "instamart use"
  ],
  "document_type": "Article",
  "summary": "The text discusses Swiggy Instamart's expansion of its quick commerce services, aiming to make everyday needs more accessible and convenient in various cities across India, including tier cities. The company is India's pioneer in quick commerce platforms, continually innovating and integrating new services like Swiggy Dineout and Swiggy Genie. They leverage cutting-edge technology and offer a membership program with benefits for food and quick commerce dine-and-drop services. Swiggy aims to provide a superior consumer experience and has recently launched Swig

In [106]:
# def call_llm_merge_summary(prompt):
#     headers = {
#         "Authorization": f"Bearer {os.getenv('MISTRAL_API_KEY')}",
#         "Content-Type": "application/json"
#      }
#     data = {
#         "model": "open-mistral-7b",
#         "messages": [{"role": "user", "content": prompt}],
#         "temperature": 0.3
#     }
#     response = requests.post(os.getenv("MISTRAL_API_URL"), headers=headers, json=data)
#     return response.json()['choices'][0]['message']['content']

# final_output = call_llm_merge_summary(combine_prompt)
# print("\n✅ Final Combined Metadata & Summary:\n")
# print(final_output)

In [107]:
# import json

# # Let's say this is the response string
# final_output = call_llm_merge_summary(combine_prompt)

# try:
#     parsed = json.loads(final_output)
#     print("✅ Summary:\n", parsed["summary"])
# except json.JSONDecodeError:
#     print("❌ Could not parse JSON. Showing raw output:\n")
#     print(final_output)


In [108]:
# import textwrap

# summary_text = parsed["summary"]
# wrapped_summary = textwrap.fill(summary_text, width=100)
# print("✅ Final Summary:\n")
# print(wrapped_summary)

In [111]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.0-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [112]:
import streamlit as st

In [114]:
# Streamlit UI
# Upload a document, run extraction -> cleaning -> chunking -> per-chunk LLM
# calls -> merge call, then show the merged metadata and summary.
import tempfile  # stdlib; used to hand the in-memory upload to the path-based extractor

st.title("📄 Auto Metadata & Summary Generator")
file = st.file_uploader("Upload PDF, DOCX or TXT file", type=["pdf", "docx", "txt"])

if file:
    with st.spinner("Extracting and processing file..."):
        # extract_text_from_file() dispatches on the extension of a filesystem
        # path, but Streamlit provides an in-memory upload. Persist it to a
        # temporary file that keeps the original extension, and remove the
        # file once the text has been read.
        upload_suffix = os.path.splitext(file.name)[1]
        with tempfile.NamedTemporaryFile(suffix=upload_suffix, delete=False) as tmp_file:
            tmp_file.write(file.getvalue())
            upload_path = tmp_file.name
        try:
            raw_text = extract_text_from_file(upload_path)
        finally:
            os.remove(upload_path)

        clean_text = preprocess_text(raw_text)
        chunks = split_text_into_chunks(clean_text)
        # call_llm_on_chunk() applies PROMPT_TEMPLATE to the chunk itself.
        summaries = [call_llm_on_chunk(chunk) for chunk in chunks]
        combined = "\n\n".join(summaries)

        # Final summary generation
        final_prompt = f"""
You are a smart assistant. Below are multiple partial summaries of a document, generated from different parts.
Your task is to combine them into a **single metadata + summary JSON**, like this:
{{
  "title": "",
  "author": "",
  "date": "",
  "keywords": [],
  "document_type": "",
  "summary": ""
}}

Summaries:
{combined}
"""
        final_output = call_llm_merge_summary(final_prompt)

    try:
        # The model may wrap its JSON in a Markdown code fence; unwrap it if so.
        fenced = re.search(r'```(?:json)?\s*(.*?)```', final_output, re.DOTALL)
        parsed = json.loads(fenced.group(1) if fenced else final_output)

        # Build the KeyBERT model only if an earlier cell has not created it.
        if 'kw_model' not in globals():
            kw_model = KeyBERT(model=SentenceTransformer('all-MiniLM-L6-v2'))
        kb_keywords = kw_model.extract_keywords(
            clean_text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=10,
            use_maxsum=True,
            nr_candidates=20
        )
        final_keywords = [kw for kw, _ in kb_keywords]
        parsed['keywords'] = final_keywords

        st.subheader("📌 Extracted Metadata")
        st.json(parsed)

        st.subheader("📝 Wrapped Summary")
        st.text(textwrap.fill(parsed["summary"], width=100))

        # Download summary
        st.download_button(
            label="💾 Download Summary",
            data=parsed["summary"],
            file_name="summary.txt",
            mime="text/plain"
        )
    except Exception as e:
        st.error(f"Failed to parse output: {e}")

2025-06-20 10:54:29.176 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [115]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.11


In [116]:
# Source of the Streamlit app, written to disk below.
# This must be a raw string: in a normal string the "\n\n" inside the script
# would turn into real newlines when written, splitting the string literal
# across lines and making the generated file a syntax error.
# NOTE(review): the script uses st, json, textwrap, kw_model, call_mistral and
# the helper functions without importing or defining them, so it is not yet
# runnable on its own with `streamlit run`.
code = r'''
# Streamlit UI
st.title("📄 Auto Metadata & Summary Generator")
file = st.file_uploader("Upload PDF, DOCX or TXT file", type=["pdf", "docx", "txt"])

if file:
    with st.spinner("Extracting and processing file..."):
        raw_text = extract_text_from_file(file)
        clean_text = preprocess_text(raw_text)
        chunks = split_text_into_chunks(clean_text)
        summaries = [call_mistral(PROMPT_TEMPLATE.format(content_chunk=chunk)) for chunk in chunks]
        combined = "\n\n".join(summaries)

        # Final summary generation
        final_prompt = f"""
You are a smart assistant. Below are multiple partial summaries of a document, generated from different parts.
Your task is to combine them into a **single metadata + summary JSON**, like this:
{{
  "title": "",
  "author": "",
  "date": "",
  "keywords": [],
  "document_type": "",
  "summary": ""
}}

Summaries:
{combined}
"""
        final_output = call_mistral(final_prompt)

    try:
        parsed = json.loads(final_output)
        kb_keywords = kw_model.extract_keywords(
            clean_text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=10,
            use_maxsum=True,
            nr_candidates=20
        )
        final_keywords = [kw for kw, _ in kb_keywords]
        parsed['keywords'] = final_keywords

        st.subheader("📌 Extracted Metadata")
        st.json(parsed)

        st.subheader("📝 Wrapped Summary")
        st.text(textwrap.fill(parsed["summary"], width=100))

        # Download summary
        st.download_button(
            label="💾 Download Summary",
            data=parsed["summary"],
            file_name="summary.txt",
            mime="text/plain"
        )
    except Exception as e:
        st.error(f"Failed to parse output: {e}")
'''
# The script contains emoji, so write it with an explicit encoding instead of
# relying on the platform default.
with open("streamlit_metadata_app.py", "w", encoding="utf-8") as f:
    f.write(code)

ERROR:pyngrok.process.ngrok:t=2025-06-20T11:05:07+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.