In [3]:
import os
import pytesseract
import cv2
import nltk
import json
from pdfminer.high_level import extract_text as extract_pdf_text
from docx import Document
from langdetect import detect
from transformers import pipeline
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
from transformers import pipeline
from docx import Document

# Fetch the NLTK resources one by one, then confirm the setup cell finished.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
print("Setup done ")

Setup done 


[nltk_data] Downloading package punkt to /Users/pirachi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pirachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def extract_text_from_pdf(path):
    """Return the text content of the PDF file at ``path``.

    Delegates to ``extract_pdf_text``, the alias under which pdfminer's
    ``extract_text`` is imported in this notebook.
    """
    pdf_text = extract_pdf_text(path)
    return pdf_text

def extract_text_from_docx(path):
    """Return the text of every paragraph in a .docx file, one per line.

    Only ``paragraphs`` of the opened document is read; the paragraph texts
    are joined with newline characters.
    """
    paragraph_texts = []
    for paragraph in Document(path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

def extract_text_from_txt(path):
    """Read the file at ``path`` as UTF-8 text and return its full contents."""
    with open(path, mode="r", encoding="utf-8") as text_file:
        contents = text_file.read()
    return contents

def extract_text_from_image(path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        path: Location of the image file, loaded with ``cv2.imread``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If OpenCV cannot load the file (for example because it
            is missing, unreadable, or not a supported image format).
    """
    image = cv2.imread(path)
    # cv2.imread reports failure by returning None rather than raising; fail
    # here with a clear message instead of deep inside pytesseract.
    if image is None:
        raise ValueError(f"Could not read image file: {path}")
    # OpenCV loads pixels in BGR order, whereas pytesseract assumes RGB.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return pytesseract.image_to_string(rgb_image)

def extract_text_auto(path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is taken with ``os.path.splitext`` and compared
    case-insensitively, so a file that merely happens to be *named* ``pdf``
    or ``txt`` (no dot) is not mistaken for that format.

    Args:
        path: Path of the file to read.

    Returns:
        The extracted text, or an empty string when the extension is not one
        of: pdf, docx, txt, jpg, jpeg, png.
    """
    # splitext yields e.g. ".PDF"; normalise to "pdf" for the comparisons.
    ext = os.path.splitext(path)[1].lower().lstrip('.')

    if ext == 'pdf':
        return extract_text_from_pdf(path)
    if ext == 'docx':
        return extract_text_from_docx(path)
    if ext == 'txt':
        return extract_text_from_txt(path)
    if ext in ('jpg', 'jpeg', 'png'):
        return extract_text_from_image(path)
    # Unsupported or missing extension: nothing to extract.
    return ""

In [7]:
# Built once when this cell runs, so the model is not reloaded on each
# summarization call made through the global `summarizer`.
SUMMARIZER_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL_NAME)

def split_into_chunks(text, max_chars=1000):
    """Group the lines of ``text`` into chunks of about ``max_chars`` characters.

    Lines (split on ``'\\n'``) are accumulated into the current chunk while
    the running length stays below ``max_chars``; otherwise the current chunk
    is closed and the line starts a new one. A single line longer than
    ``max_chars`` is kept whole as its own chunk.

    Args:
        text: The text to split.
        max_chars: Soft upper bound on the size of an accumulated chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current = ''

    for para in text.split('\n'):
        if len(current) + len(para) < max_chars:
            current += para + '\n'
            continue
        # The line does not fit: close the running chunk. It can be blank
        # (e.g. when the very first line is already too long), in which case
        # it must not be emitted as an empty chunk.
        finished = current.strip()
        if finished:
            chunks.append(finished)
        current = para + '\n'

    # Flush whatever is left, again skipping a blank remainder.
    remainder = current.strip()
    if remainder:
        chunks.append(remainder)
    return chunks

def summarize_text_limited(text, max_chars=1000, max_chunks=3):
    """Summarize the first few chunks of ``text`` with the global ``summarizer``.

    The text is split with ``split_into_chunks``; at most ``max_chunks``
    non-blank chunks are summarized and the partial summaries are joined
    with newlines.

    Args:
        text: The text to summarize.
        max_chars: Chunk size passed on to ``split_into_chunks``.
        max_chunks: Maximum number of chunks to summarize.

    Returns:
        The chunk summaries joined by ``"\\n"`` (an empty string if no chunk
        could be summarized).
    """
    # Drop blank chunks up front so they cannot use up the max_chunks budget.
    chunks = [c for c in split_into_chunks(text, max_chars=max_chars) if c.strip()]
    summaries = []
    for i, chunk in enumerate(chunks[:max_chunks]):
        try:
            # truncation=True lets the pipeline cut an over-long chunk down to
            # the model's input limit instead of failing on it.
            result = summarizer(
                chunk,
                max_length=100,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Deliberate best-effort: report the failed chunk and keep going.
            print(f"⚠️ Skipping chunk {i}: {e}")
    return "\n".join(summaries)

def extract_keywords(text, top_n=10):
    """Return up to ``top_n`` keyword terms found in ``text``.

    A ``TfidfVectorizer`` with English stop words removed and
    ``max_features=top_n`` is fitted on the single document, and its feature
    names are returned.

    Args:
        text: The document to extract keywords from.
        top_n: Maximum number of terms to keep.

    Returns:
        A numpy array of term strings, as given by
        ``get_feature_names_out()``. The array is empty when the vectorizer
        cannot build a vocabulary (for example, text made only of stop words).
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    try:
        # Only the fitted vocabulary is needed, not the TF-IDF matrix.
        vectorizer.fit([text])
    except ValueError:
        # Raised by scikit-learn when no term survives tokenisation and
        # stop-word removal; report "no keywords" instead of crashing.
        return np.array([], dtype=object)
    return vectorizer.get_feature_names_out()

Device set to use mps:0


In [9]:
def generate_metadata(path):
    """Extract text from a document and write a JSON metadata file next to it.

    The metadata holds the file name, extraction timestamp, detected
    language, a summary, keywords, and character/word counts. It is written
    to ``<path>.metadata.json`` and also returned.

    Args:
        path: Path of the document (string or path-like object).

    Returns:
        The metadata dict, or ``{"error": "Insufficient text extracted."}``
        when fewer than 50 non-whitespace-trimmed characters were extracted
        (in which case no file is written).
    """
    # Raised by detect() when the text has no features it can work with.
    from langdetect import LangDetectException

    # Accept pathlib.Path and similar objects; the code below needs a str.
    path = os.fspath(path)
    text = extract_text_auto(path)

    if not text or len(text.strip()) < 50:
        return {"error": "Insufficient text extracted."}

    try:
        lang = detect(text)
    except LangDetectException:
        # e.g. text consisting only of digits or punctuation.
        lang = "unknown"

    summary = summarize_text_limited(text)
    keywords = extract_keywords(text)

    metadata = {
        # basename handles the platform's path separators, unlike split("/").
        "filename": os.path.basename(path),
        "extracted_on": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "language": lang,
        "summary": summary,
        # numpy arrays are not JSON-serialisable; convert to a plain list.
        "keywords": keywords.tolist() if hasattr(keywords, "tolist") else list(keywords),
        "num_characters": len(text),
        "num_words": len(text.split())
    }

    output_path = path + ".metadata.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

    print("✅ Metadata saved to:", output_path)
    return metadata

In [11]:
# Demo run: build metadata for one sample document. The commented-out calls
# are the same invocation for the other sample files; the returned dict is
# displayed as the cell output.
# generate_metadata("../sample_docs/sample.pdf")
generate_metadata("../sample_docs/sample.docx")
# generate_metadata("../sample_docs/sample.txt")
# generate_metadata("../sample_docs/image.jpg")

✅ Metadata saved to: ../sample_docs/sample.docx.metadata.json


{'filename': 'sample.docx',
 'extracted_on': '2025-06-24 20:26:54',
 'language': 'en',
 'summary': ' Alphabet Inc. registered pursuant to Section 12(b) of the SECURITIES EXCHANGE ACT of 1934 . The Securities and Exchange Commission filed a form of 10-Q at the time of the quarter ended September 30, 2024 .\n Indicate by check mark whether the registrant has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 .\n As of October 22, 2024, there were 5,843 million shares of Alphabet’s Class A stock outstanding . Indicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Exchange Act).',
 'keywords': ['2023',
  '2024',
  '30',
  'billion',
  'class',
  'ended',
  'google',
  'months',
  'revenues',
  'september'],
 'num_characters': 124809,
 'num_words': 18792}

In [None]:
# # Extracting Text
# pdf_path = "../sample_docs/sample.pdf"

# def split_into_chunks(text, max_chars=1000):
#     paragraphs = text.split('\n')
#     chunks, current_chunk = [], ''
#     for para in paragraphs:
#         if len(current_chunk) + len(para) < max_chars:
#             current_chunk += para + '\n'
#         else:
#             chunks.append(current_chunk.strip())
#             current_chunk = para + '\n'
#     if current_chunk:
#         chunks.append(current_chunk.strip())
#     return chunks

In [None]:
# def summarize_text_limited(text, summarizer, max_chars=1000, max_chunks=3):
#     chunks = split_into_chunks(text, max_chars)
#     summaries = []

#     for i, chunk in enumerate(chunks[:max_chunks]):
#         try:
#             summary = summarize_text_limited(text, summarizer, max_chars=1000, max_chunks=3)
#             summaries.append(summary)
#         except Exception as e:
#             print(f"Error summarizing chunk {i}: {e}")
#             continue

#     return "\n".join(summaries)

In [None]:
# # ----------------------------
# # Main API Function
# # ----------------------------

# def generate_metadata(path):
#     print(f"\n📄 Processing: {path}")
#     text = extract_text_auto(path)

#     if not text or len(text.strip()) < 50:
#         return {"error": "Insufficient content extracted."}

#     # Load model once
#     summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

#     lang = detect(text)
#     summary = summarize_text(text, summarizer)
#     keywords = extract_keywords(text, top_n=10)

#     metadata = {
#         "filename": os.path.basename(path),
#         "extracted_on": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
#         "language": lang,
#         "summary": summary,
#         "keywords": keywords.tolist() if hasattr(keywords, 'tolist') else list(keywords),
#         "num_characters": len(text),
#         "num_words": len(text.split())
#     }

#     # Save as JSON
#     out_path = path + ".metadata.json"
#     with open(out_path, "w") as f:
#         json.dump(metadata, f, indent=4)

#     print(f"✅ Metadata saved to: {out_path}")
#     return metadata

In [None]:
# # from metadata_generator import generate_metadata

# # Works with any supported file
# generate_metadata("sample_docs/sample.pdf")
# # generate_metadata("sample_docs/sample.docx")
# generate_metadata("sample_docs/sample.txt")
# # generate_metadata("sample_docs/scan.jpg")