In [18]:
import streamlit as st
import tempfile
import json
from pathlib import Path
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
import pytesseract
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from PIL import Image
import io
import fitz  
from docx.opc.constants import RELATIONSHIP_TYPE as RT

In [19]:
import os
# Load environment variables (such as GROQ_API_KEY, read in the following cells)
# from a .env file, if one is found.
load_dotenv()


True

In [20]:
# Fail fast with a readable message when the key is missing. The previous
# self-assignment (os.environ[k] = os.getenv(k)) was a no-op when the key was
# set and raised an opaque "str expected, not NoneType" TypeError when it was not.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [21]:
# Shared Groq chat model used by generate_metadata() and summarize_ocr_text().
# NOTE(review): confirm that this model id is still served by Groq before relying on it.
llm = ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model="llama3-8b-8192")

# Printing the client shows its configuration; the API key is displayed masked.
print(llm)

client=<groq.resources.chat.completions.Completions object at 0x0000029D7791ACC0> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000029D77919DF0> model_name='llama3-8b-8192' model_kwargs={} groq_api_key=SecretStr('**********')


In [22]:
# Point pytesseract at the default Windows Tesseract install when it exists;
# otherwise leave pytesseract's default command in place so that `tesseract`
# is resolved from PATH (keeps the notebook usable on other machines).
# In a raw string backslashes are literal, so they must not be doubled.
_tesseract_exe = Path(r"C:\Program Files\Tesseract-OCR\tesseract.exe")
if _tesseract_exe.is_file():
    pytesseract.pytesseract.tesseract_cmd = str(_tesseract_exe)

In [23]:
def _ocr_image_bytes(image_data):
    """Decode encoded image bytes and run OCR on the result.

    Returns a ``(PIL.Image, ocr_text)`` tuple, or ``None`` when Pillow cannot
    decode the bytes (a warning is emitted in that case).
    """
    import warnings

    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
    except OSError as exc:  # PIL.UnidentifiedImageError is an OSError subclass
        # One undecodable embedded image should not abort the whole extraction.
        warnings.warn(f"Skipping an embedded image that could not be decoded: {exc}")
        return None
    # OCR errors (e.g. Tesseract not installed) are deliberately not caught here.
    return image, pytesseract.image_to_string(image)


def extract_text_and_inline_images(uploaded_file):
    """Extract body text and OCR text of embedded images from an uploaded file.

    Supported suffixes (case-insensitive) are ``.pdf``, ``.docx`` and ``.txt``.

    Args:
        uploaded_file: A binary file-like object exposing ``.name`` (used only
            to determine the suffix) and ``.read()``.

    Returns:
        dict with the keys
        - ``"text"``: stripped document text; for an unsupported suffix this
          is the message ``"Unsupported file type: <suffix>"``.
        - ``"ocr_text"``: stripped, newline-joined OCR text of all images.
        - ``"images"``: list of decoded RGB ``PIL.Image`` objects.
        - ``"ocr_texts_per_image"``: OCR text per image, index-aligned with
          ``"images"``.
    """
    suffix = Path(uploaded_file.name).suffix.lower()
    text = ""
    raw_images = []  # encoded bytes of every embedded image, in document order

    # The parsers below work on file paths, so spool the upload to disk first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    try:
        if suffix == ".pdf":
            try:
                reader = PdfReader(tmp_file_path)
                text = "\n".join([page.extract_text() or "" for page in reader.pages])
            except Exception:
                # Best effort: a PDF without an extractable text layer can
                # still yield content through OCR of its images below.
                text = ""

            pdf_doc = fitz.open(tmp_file_path)
            try:
                for page in pdf_doc:
                    for img in page.get_images(full=False):
                        xref = img[0]
                        raw_images.append(pdf_doc.extract_image(xref)["image"])
            finally:
                # Release the file handle; otherwise Windows refuses to delete
                # the temp file and it is leaked.
                pdf_doc.close()

        elif suffix == ".docx":
            doc = DocxDocument(tmp_file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            for rel in doc.part.rels.values():
                # Linked (external) images have no target part to read bytes from.
                if rel.reltype == RT.IMAGE and not rel.is_external:
                    raw_images.append(rel.target_part.blob)

        elif suffix == ".txt":
            # utf-8-sig drops a leading BOM (common in Windows-created files);
            # undecodable bytes become U+FFFD instead of aborting the upload.
            with open(tmp_file_path, "r", encoding="utf-8-sig", errors="replace") as f:
                text = f.read()

        else:
            text = f"Unsupported file type: {suffix}"

    finally:
        try:
            os.remove(tmp_file_path)
        except PermissionError:
            # Deliberate best effort: never fail the extraction over cleanup.
            pass

    image_list = []
    ocr_texts_per_image = []
    for image_data in raw_images:
        decoded = _ocr_image_bytes(image_data)
        if decoded is None:
            continue
        image, ocr_img_text = decoded
        image_list.append(image)
        ocr_texts_per_image.append(ocr_img_text)
    ocr_text = "\n".join(ocr_texts_per_image)

    return {
        "text": text.strip(),
        "ocr_text": ocr_text.strip(),
        "images": image_list,
        "ocr_texts_per_image": ocr_texts_per_image
    }

In [24]:
def generate_metadata(text):
    """Ask the LLM for structured metadata describing a document.

    Args:
        text: Full document text. ``None``, empty or whitespace-only input is
            answered locally without calling the model.

    Returns:
        The model's reply as a stripped string. The prompt requests JSON, but
        the reply is returned as-is and is not parsed or validated here, so it
        may include surrounding prose or code fences.
    """
    # Mirror summarize_ocr_text(): do not ask the model to describe nothing.
    if not text or not text.strip():
        return "No document content found to generate metadata."

    prompt = f"""
You are a professional and wonderful metadata assistant.
Analyze the following document,idenitfy and leverage most meaningful sections of document and return structured metadata in JSON format with fields:
- title
- summary (at least 15-20 lines in detail covering all important points of document)
- keywords (comma-separated)
- topics (broad subject categories)
- author (if mentioned)
- document_type
-At end  rewrite the detailed summary(15-20 lines) so that user can see the summary alone.Please dont miss any important sections of document in this detailed summary
Document Content:
{text.strip()}
"""
    # Calling a chat model object directly is deprecated in LangChain;
    # .invoke() is the supported entry point.
    response = llm.invoke([
        SystemMessage(content="You are a metadata extraction assistant."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

In [25]:
def summarize_ocr_text(ocr_text):
    """Summarize OCR-extracted image text as markdown using the LLM.

    Args:
        ocr_text: Text produced by OCR for a single image. ``None``, empty or
            whitespace-only input is answered locally without calling the model.

    Returns:
        The model's markdown summary as a stripped string, or the fixed message
        ``"No OCR content found to summarize."`` when there is nothing to
        summarize.
    """
    if not ocr_text or not ocr_text.strip():
        return "No OCR content found to summarize."

    prompt = (
        "You are a professional assistant. "
        "Start your response with 'The following image ...'. "
        "Summarize the following OCR-extracted content in a clear, well-organized, and visually appealing markdown format in about 5-10 lines "
        "Your summary should include:\n"
        "- A short title or heading for the content\n"
        "- Key points or highlights as a bullet list\n"
        "- Detected names, dates, numbers, or keywords (if any)\n"
        "- A concise paragraph summarizing the main idea or purpose\n"
        "If the content is a graph or chart, explain axes and key trends. "
        "If it's a table, highlight main comparisons or figures. "
        "If it's a scanned paragraph, summarize the main idea. "
        "Avoid assumptions. If content is unclear, mention it.\n\n"
        "If it's none of the above, just describe the content of picture, background, etc.\n"
        f"OCR Text:\n{ocr_text}"
    )

    # Calling a chat model object directly is deprecated in LangChain;
    # .invoke() is the supported entry point.
    response = llm.invoke([
        SystemMessage(content="You summarize OCR-extracted content in structured markdown."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

### Testing DOCX, PDF, and TXT extraction

In [26]:
from pathlib import Path
import io

# Sample DOCX on the author's machine (absolute local path).
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.docx'

# Mimic an uploaded file: an in-memory byte buffer carrying a `.name`, which
# extract_text_and_inline_images() uses to determine the file suffix.
uploaded_file = io.BytesIO(Path(file_path).read_bytes())
uploaded_file.name = Path(file_path).name

results = extract_text_and_inline_images(uploaded_file)
print(results['text'])
print(results['ocr_texts_per_image'])

The mean and standard deviation are two fundamental statistical measures with distinct yet complementary properties. The mean, or average, represents the central tendency of a dataset, providing a single value that summarizes all the numbers in the set. It is sensitive to every value in the data, meaning that even a single extremely large or small value (an outlier) can significantly affect the mean. On the other hand, the standard deviation measures the amount of variation or dispersion in the dataset. A low standard deviation indicates that the data points are closely clustered around the mean, while a high standard deviation suggests that the values are spread out over a wider range. Both the mean and standard deviation are widely used in statistics to describe and compare datasets, and they are especially important in fields such as science, finance, and engineering for understanding patterns, consistency, and reliability in data. 
The z-test and p-value are important concepts in s

In [27]:
# Generate metadata for the body text extracted from the DOCX sample above.
response=generate_metadata(results['text'])
print(response)

After analyzing the document, I have extracted the most meaningful sections and structured the metadata in JSON format with the following fields:

```
{
  "title": "Statistical Measures and Distributions",
  "summary": "This document explores the fundamental concepts of statistical measures, including the mean and standard deviation. It also delves into the z-test and p-value, which are crucial tools for hypothesis testing. Additionally, it discusses the 67-95-99 rule and the log-normal distribution, which is commonly used to model data that cannot be negative and often displays a right-skewed, long-tailed pattern. The document highlights the importance of these concepts in statistics and their applications in various fields.",
  "keywords": "statistical measures, mean, standard deviation, z-test, p-value, 67-95-99 rule, log-normal distribution",
  "topics": ["Statistics", "Data Analysis", "Mathematics", "Probability Theory"],
  "author": "Unknown",
  "document_type": "Technical Docume

In [28]:
# Summarize the OCR text of each image embedded in the DOCX, one LLM call per image.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)

The following image is a mathematical formula or concept:

**Probability Distribution Parameters**

* **Key Points:**
	+ Parameters `a` and `b` are integers with `b` greater than `a`.
	+ `nm` is equal to `b-a+1`.
	+ Support is from `a` to `b-1` and `b`.
	+ Mean is `a+b/2`.
	+ Median is also `a+b/2`.
	+ Mode is not applicable (N/A).

**Detected keywords:** Probability Distribution, PMF, CDF, Mean, Median, Mode

**Summary:** This formula outlines the parameters for a specific probability distribution, including the support range, mean, median, and mode. The parameters `a` and `b` define the range of values, and the mean and median are calculated as the average of `a` and `b`.
The following image is an OCR-extracted text that appears to be a mathematical notation.

**Title:** Lognormal Distribution Notation

**Key Points:**

* Lognormal distribution with parameters 4 and a logarithmic scale
* Support is from 0 to infinity
* PDF (probability density function) is given, but unclear due to O

In [29]:
from pathlib import Path
import io

# Sample PDF on the author's machine (absolute local path).
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.pdf'

# Mimic an uploaded file: an in-memory byte buffer carrying a `.name`, which
# extract_text_and_inline_images() uses to determine the file suffix.
uploaded_file = io.BytesIO(Path(file_path).read_bytes())
uploaded_file.name = Path(file_path).name

results = extract_text_and_inline_images(uploaded_file)
print(results['text'])
print(results['ocr_texts_per_image'])

The tiger ( Panthera tigris ) is the largest member of the cat family and one of the world ’s most iconic 
and powerful predators. Easily recognized by its striking orange coat with black stripes, the tiger is 
native to various parts of Asia, inhabiting forests, grasslands, and mangrove swamps. Tigers are 
solitary and territorial animals, relying on stealth and strength to hunt prey such as deer, wild boar, 
and buffalo. Sadly, their populations have declined sharply due to habitat loss, poaching, and huma n-
wildlife conflict, leaving fewer than 4,000 tigers in the wild today. As an endangered species and a 
keystone predator, the tiger plays a crucial role in maintaining the health of its ecosystem. Revered in 
many cultures as a symbol of strength and courag e, the tiger ’s survival depends on ongoing 
conservation efforts to protect both the species and its natural habitats.  
 
The elephant is the largest living land animal, easily recognized by its massive body, long trunk, lar

In [30]:
# Generate metadata for the body text extracted from the PDF sample above.
response=generate_metadata(results['text'])
print(response)

Here is the extracted metadata in JSON format:

```
{
  "title": "The Tiger, Elephant, Parrot, and Dog: Understanding Their Roles in Conservation",
  "summary": "This document provides an overview of the tiger, elephant, parrot, and dog, highlighting their unique characteristics, habits, and importance in their ecosystems. The tiger is the largest member of the cat family, with a striking orange coat and black stripes, while the elephant is the largest living land animal. The parrot is a vibrant and intelligent bird known for its colorful feathers and ability to mimic sounds. The dog is a domesticated mammal known for its intelligence, loyalty, and companionship. Unfortunately, all four species face threats from habitat loss, poaching, and human-wildlife conflict, making conservation efforts crucial for their survival. The document emphasizes the importance of protecting these species and their habitats to maintain the health of their ecosystems and ensure their survival for future gen

In [31]:
# Summarize the OCR text of each image embedded in the PDF, one LLM call per image.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)

The following image is an OCR-extracted text:

**Summer**

• Key points:
	+ Best
	+ Summer
	+ Ever

**Summary:**
This text appears to be a simple phrase expressing enthusiasm and positivity towards summer. The words "Best", "Summer", and "Ever" convey a sense of excitement and joy, suggesting that the author is looking forward to or is already experiencing a wonderful summer.

Note: The content is short and straightforward, with no additional information or context provided.
The following image is a passage of text.

**Title:** Excerpt from Charles Dickens' "A Tale of Two Cities"

**Key Points:**

* The text describes the contrast between two eras
* It highlights the coexistence of wisdom and foolishness
* The passage is an opening quote from Charles Dickens' novel "A Tale of Two Cities"

**Detected keywords:** wisdom, foolishness, best, worst, times

**Summary:** This passage is an iconic opening quote from Charles Dickens' novel "A Tale of Two Cities". It sets the tone for the rest o

In [32]:
from pathlib import Path
import io

# Sample TXT on the author's machine (absolute local path).
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.txt'

# Mimic an uploaded file: an in-memory byte buffer carrying a `.name`, which
# extract_text_and_inline_images() uses to determine the file suffix.
uploaded_file = io.BytesIO(Path(file_path).read_bytes())
uploaded_file.name = Path(file_path).name

results = extract_text_and_inline_images(uploaded_file)
print(results['text'])
print(results['ocr_texts_per_image'])

nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of size 

In [33]:
# Generate metadata for the text read from the TXT sample above.
response=generate_metadata(results['text'])
print(response)

Here is the extracted metadata in JSON format:

```
{
  "title": "nth Super Ugly Number",
  "summary": "This problem is about finding the nth super ugly number, where a super ugly number is a positive integer whose prime factors are only from the given list `primes`. The approach is to use dynamic programming with multiple moving pointers. The problem starts by initializing a dynamic programming array `dp` of size `n+1` and an index pointer array `a` of size `m` (number of primes). Then, for each `i` from 2 to `n`, it finds the minimum value among `dp[a[j]] * primes[j]` for each prime, sets `dp[i]` to this minimum value, and increments the respective pointer `a[j]` to avoid duplicates. The time complexity is O(n * k), where `k` is the number of primes, and the space complexity is O(n + k).",
  "keywords": "super ugly number, dynamic programming, moving pointers, prime numbers",
  "topics": ["Algorithms", "Dynamic Programming", "Number Theory"],
  "author": "Dean Hujisen",
  "document_t

In [34]:
# Same OCR-summary loop as for the other formats, run on the TXT result.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)
print(results['ocr_texts_per_image'])
# TXT files contain no inline images, so the OCR list is empty and the loop body never runs.

[]


In [39]:
# Create 'large_test.txt', a large plain-text file for testing: 150,000 copies
# of a 36-character line (35 characters plus the newline), i.e. about 5.4 MB.
# NOTE(review): the earlier comment claimed ~8MB; if that size is intended,
# the line count needs to be roughly 220,000 instead.
with open('large_test.txt', 'w', encoding='utf-8') as f:
    f.writelines('This is a test line for large file.\n' for _ in range(150000))
