In [54]:
import streamlit as st
import tempfile
import json
from pathlib import Path
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
import pytesseract
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from PIL import Image
import io
import fitz  
from docx.opc.constants import RELATIONSHIP_TYPE as RT

In [55]:
import os

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with an actionable message. The previous self-assignment
# `os.environ[...] = os.getenv(...)` raised an opaque TypeError when the key
# was missing and was a no-op when it was present.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your environment or to a .env file."
    )

In [56]:
# Chat model client shared by generate_metadata and summarize_ocr_text.
llm = ChatGroq(
    model="llama3-8b-8192",
    # The API key is read from the environment rather than hard-coded.
    api_key=os.getenv("GROQ_API_KEY")
)
# Print the client's repr as a quick check that it was constructed.
print(llm)

client=<groq.resources.chat.completions.Completions object at 0x000001EEC70F3E60> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001EEC70F26C0> model_name='llama3-8b-8192' model_kwargs={} groq_api_key=SecretStr('**********')


In [57]:
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to override it on other machines;
# the default is the usual Windows install location. The default is a raw
# string, so backslashes are written once (not doubled).
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [58]:
def _decode_and_ocr_image(image_data):
    """Decode raw image bytes to an RGB PIL image and run OCR on it.

    Returns:
        A ``(image, ocr_text)`` tuple, or ``None`` when Pillow cannot decode
        the bytes (for example, a format it has no decoder for).
    """
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
    except OSError:
        # PIL.UnidentifiedImageError subclasses OSError. Only decoding is
        # guarded here so that OCR/Tesseract errors still surface to the caller.
        return None
    return image, pytesseract.image_to_string(image)


def extract_text_and_inline_images(uploaded_file):
    """Extract body text and OCR text of embedded images from an uploaded file.

    Supported suffixes (case-insensitive) are ``.pdf``, ``.docx`` and ``.txt``.
    The upload is spooled to a temporary file, parsed, and the temporary file
    is removed afterwards (best effort).

    Args:
        uploaded_file: A binary file-like object exposing ``.name`` (used only
            for its suffix) and ``.read()``.

    Returns:
        dict with keys:
            ``"text"``: stripped body text, or ``"Unsupported file type: <suffix>"``.
            ``"ocr_text"``: stripped OCR text of all images joined by newlines.
            ``"images"``: list of decoded RGB PIL images.
            ``"ocr_texts_per_image"``: OCR text for each entry of ``"images"``.
    """
    suffix = Path(uploaded_file.name).suffix.lower()
    text = ""
    image_list = []
    ocr_texts_per_image = []

    def _collect(image_data):
        # Keep `image_list` and `ocr_texts_per_image` aligned; images that
        # cannot be decoded are skipped instead of aborting the extraction.
        decoded = _decode_and_ocr_image(image_data)
        if decoded is not None:
            image_list.append(decoded[0])
            ocr_texts_per_image.append(decoded[1])

    # delete=False: the parsers below reopen the file by path after this
    # handle is closed; cleanup happens in the `finally` block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    try:
        if suffix == ".pdf":
            try:
                reader = PdfReader(tmp_file_path)
                text = "\n".join(page.extract_text() or "" for page in reader.pages)
            except Exception:
                # Best effort: a PDF without an extractable text layer can
                # still yield OCR text from its embedded images.
                text = ""

            # Close the document deterministically; an open handle keeps the
            # temp file locked on Windows and makes the cleanup below fail.
            with fitz.open(tmp_file_path) as pdf_doc:
                for page in pdf_doc:
                    for img in page.get_images(full=False):
                        xref = img[0]
                        base_image = pdf_doc.extract_image(xref)
                        _collect(base_image["image"])

        elif suffix == ".docx":
            doc = DocxDocument(tmp_file_path)
            text = "\n".join(para.text for para in doc.paragraphs)
            for rel in doc.part.rels.values():
                # Linked (external) images have no target part to read bytes from.
                if rel.reltype == RT.IMAGE and not rel.is_external:
                    _collect(rel.target_part.blob)

        elif suffix == ".txt":
            # utf-8-sig drops a leading BOM; errors="replace" keeps files with
            # stray non-UTF-8 bytes readable instead of raising.
            with open(tmp_file_path, "r", encoding="utf-8-sig", errors="replace") as f:
                text = f.read()

        else:
            text = f"Unsupported file type: {suffix}"

    finally:
        try:
            os.remove(tmp_file_path)
        except PermissionError:
            # Deliberate best effort: leave the temp file behind rather than
            # masking the extraction result or an in-flight exception.
            pass

    ocr_text = "\n".join(ocr_texts_per_image)

    return {
        "text": text.strip(),
        "ocr_text": ocr_text.strip(),
        "images": image_list,
        "ocr_texts_per_image": ocr_texts_per_image
    }

In [59]:
def generate_metadata(text):
    """Ask the LLM for structured metadata describing a document.

    Args:
        text: Document text to analyze.

    Returns:
        The model's stripped response as a string (the prompt requests JSON
        followed by a standalone summary), or a fixed notice when ``text`` is
        empty or whitespace-only, in which case no LLM call is made.
    """
    # Mirror summarize_ocr_text: do not spend an LLM call on empty input.
    if not text or not text.strip():
        return "No document text found to generate metadata."

    prompt = f"""
You are a professional and wonderful metadata assistant.
Analyze the following document,idenitfy and leverage most meaningful sections of document and return structured metadata in JSON format with fields:
- title
- summary (at least 15-20 lines in detail covering all important points of document)
- keywords (comma-separated)
- topics (broad subject categories)
- author (if mentioned)
- document_type
-At end only rewrite the detailed summary(15-20 lines) so that user can see the summary alone.
Document Content:
{text.strip()}
"""
    # `.invoke` is the supported entry point; calling the model object
    # directly (`llm([...])`) is deprecated in LangChain.
    response = llm.invoke([
        SystemMessage(content="You are a metadata extraction assistant."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

In [60]:
def summarize_ocr_text(ocr_text):
    """Summarize OCR-extracted text from one image as markdown via the LLM.

    Args:
        ocr_text: Text produced by OCR for a single image.

    Returns:
        The model's stripped markdown summary, or a fixed notice when
        ``ocr_text`` is ``None``, empty, or whitespace-only (no LLM call is made).
    """
    if not ocr_text or not ocr_text.strip():
        return "No OCR content found to summarize."

    prompt = (
        "You are a professional assistant. "
        "Start your response with 'The following image ...'. "
        "Summarize the following OCR-extracted content in a clear, well-organized, and visually appealing markdown format in about 5-10 lines "
        "Your summary should include:\n"
        "- A short title or heading for the content\n"
        "- Key points or highlights as a bullet list\n"
        "- Detected names, dates, numbers, or keywords (if any)\n"
        "- A concise paragraph summarizing the main idea or purpose\n"
        "If the content is a graph or chart, explain axes and key trends. "
        "If it's a table, highlight main comparisons or figures. "
        "If it's a scanned paragraph, summarize the main idea. "
        "Avoid assumptions. If content is unclear, mention it.\n\n"
        "If it's none of the above, just describe the content of picture, background, etc.\n"
        f"OCR Text:\n{ocr_text}"
    )

    # `.invoke` is the supported entry point; calling the model object
    # directly (`llm([...])`) is deprecated in LangChain.
    response = llm.invoke([
        SystemMessage(content="You summarize OCR-extracted content in structured markdown."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

### Testing for DOCX, PDF, and TXT

In [61]:
from pathlib import Path
import io

# Local DOCX sample used to exercise the extraction function.
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.docx'
with open(file_path, 'rb') as f:
    # Wrap the bytes in an in-memory stream; the extractor calls .read() on it.
    uploaded_file = io.BytesIO(f.read())
    # The extractor selects its parser from the suffix of `.name`, so attach the file name.
    uploaded_file.name = Path(file_path).name  

results = extract_text_and_inline_images(uploaded_file)
# Show the extracted body text, then the list of per-image OCR strings.
print(results['text'])
print(results['ocr_texts_per_image'])

The mean and standard deviation are two fundamental statistical measures with distinct yet complementary properties. The mean, or average, represents the central tendency of a dataset, providing a single value that summarizes all the numbers in the set. It is sensitive to every value in the data, meaning that even a single extremely large or small value (an outlier) can significantly affect the mean. On the other hand, the standard deviation measures the amount of variation or dispersion in the dataset. A low standard deviation indicates that the data points are closely clustered around the mean, while a high standard deviation suggests that the values are spread out over a wider range. Both the mean and standard deviation are widely used in statistics to describe and compare datasets, and they are especially important in fields such as science, finance, and engineering for understanding patterns, consistency, and reliability in data. 
The z-test and p-value are important concepts in s

In [62]:
# Generate metadata from the body text extracted from the DOCX sample.
response=generate_metadata(results['text'])
print(response)

Here is the extracted metadata in JSON format:

```
{
  "title": "Mean, Standard Deviation, and Log-Normal Distribution in Statistics",
  "summary": "This document discusses the fundamental statistical measures of mean and standard deviation, their properties, and applications. It also introduces the z-test and p-value in hypothesis testing, the 67-95-99 rule for normal distribution, and the log-normal distribution. The document highlights the importance of these concepts in statistics, science, finance, and engineering for understanding patterns, consistency, and reliability in data.

The mean and standard deviation are two fundamental statistical measures with distinct yet complementary properties. The mean represents the central tendency of a dataset, while the standard deviation measures the amount of variation or dispersion. Both are widely used in statistics to describe and compare datasets.

The z-test is used to determine whether there is a significant difference between sample

In [63]:
# Summarize each image's OCR text separately and print the summaries in order.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)

The following image is a set of parameters with formulas and descriptions:

**Parameters**

* Key points:
	+ `a` and `b` are integers with `b > a`
	+ `nm` is equal to `b - a + 1`
	+ `Support` is a range from `a` to `b-1` and `b`
	+ `PMF` (Probability Mass Function) is described as `1`
	+ `n` is not defined
	+ `CDF` (Cumulative Distribution Function) is described as `-a+1` and `n`
	+ `Mean` is equal to `(a+b)/2`
	+ `Median` is equal to `(a+b)/2`
	+ `Mode` is not applicable (N/A)

The content appears to be describing a set of parameters for a statistical distribution, including the support range, probability mass function, cumulative distribution function, mean, median, and mode.
The following image is a snippet of OCR-extracted content:

**Lognormal Distribution Notation**

• Lognormal distribution with parameters 4 and " "
• Parameters: 4 € ( — 00,-+00 ) (logarithm of scale), o>0
• Support: z €(0,+00)
• PDF: 1 (Ine —p)? ——— ep(- soir 20?

**Summary:**
The content appears to be a notati

In [64]:
from pathlib import Path
import io

# Local PDF sample used to exercise the extraction function.
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.pdf'
with open(file_path, 'rb') as f:
    # Wrap the bytes in an in-memory stream; the extractor calls .read() on it.
    uploaded_file = io.BytesIO(f.read())
    # The extractor selects its parser from the suffix of `.name`, so attach the file name.
    uploaded_file.name = Path(file_path).name  

results = extract_text_and_inline_images(uploaded_file)
# Show the extracted body text, then the list of per-image OCR strings.
print(results['text'])
print(results['ocr_texts_per_image'])

The tiger ( Panthera tigris ) is the largest member of the cat family and one of the world ’s most iconic 
and powerful predators. Easily recognized by its striking orange coat with black stripes, the tiger is 
native to various parts of Asia, inhabiting forests, grasslands, and mangrove swamps. Tigers are 
solitary and territorial animals, relying on stealth and strength to hunt prey such as deer, wild boar, 
and buffalo. Sadly, their populations have declined sharply due to habitat loss, poaching, and huma n-
wildlife conflict, leaving fewer than 4,000 tigers in the wild today. As an endangered species and a 
keystone predator, the tiger plays a crucial role in maintaining the health of its ecosystem. Revered in 
many cultures as a symbol of strength and courag e, the tiger ’s survival depends on ongoing 
conservation efforts to protect both the species and its natural habitats.  
 
The elephant is the largest living land animal, easily recognized by its massive body, long trunk, lar

In [65]:
# Generate metadata from the body text extracted from the PDF sample.
response=generate_metadata(results['text'])
print(response)

Here is the structured metadata in JSON format:

```
{
  "title": "Wildlife Conservation: Tigers, Elephants, and Parrots",
  "summary": "This document discusses the importance of wildlife conservation, specifically focusing on the tiger, elephant, and parrot. It highlights the unique characteristics, habits, and habitats of each species, as well as the threats they face due to habitat loss, poaching, and human-wildlife conflict. The document also emphasizes the crucial role these species play in maintaining the health of their ecosystems and the need for ongoing conservation efforts to protect them. Finally, it touches on the importance of human-animal relationships, specifically the bond between humans and dogs, and how this relationship has shaped the evolution of dogs as a species.",
  "keywords": "wildlife conservation, tiger, elephant, parrot, habitat loss, poaching, human-wildlife conflict, ecosystem health, conservation efforts",
  "topics": ["Wildlife Conservation", "Biodiversi

In [66]:
# Summarize each image's OCR text separately and print the summaries in order.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)

The following image is an OCR-extracted text:

**Summer**

**Key Points:**

* Best
* Summer
* Ever

**Summary:**
This text is a simple and enthusiastic phrase that appears to be a declaration or expression of excitement about summer. The words "Best" and "Ever" suggest that the author is extremely pleased with the season, but the context is unclear.

**Detected Keywords:** Summer, Best, Ever
The following image is an OCR-extracted text from a literary work.

**Title:** A Quote from Charles Dickens' "A Tale of Two Cities"

**Key Points:**

• A contrast between two states of being
• The coexistence of wisdom and foolishness
• A reflection of the social and economic conditions of the time

**Detected Keywords:** wisdom, foolishness, best, worst, age

**Summary:**
This quote is an opening passage from Charles Dickens' novel "A Tale of Two Cities", which explores the themes of contrasts and dualities. The quote highlights the paradoxical nature of human experience, where wisdom and foolishn

In [67]:
from pathlib import Path
import io

# Local TXT sample used to exercise the extraction function.
file_path = r'C:\Users\mahap\OneDrive\Desktop\C++,JS python codes\.vscode\ML-DL-NLP\Langchain\OpenProject3\test.txt'
with open(file_path, 'rb') as f:
    # Wrap the bytes in an in-memory stream; the extractor calls .read() on it.
    uploaded_file = io.BytesIO(f.read())
    # The extractor selects its parser from the suffix of `.name`, so attach the file name.
    uploaded_file.name = Path(file_path).name  

results = extract_text_and_inline_images(uploaded_file)
# Show the extracted body text, then the list of per-image OCR strings.
print(results['text'])
print(results['ocr_texts_per_image'])

nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of size 

In [68]:
# Generate metadata from the body text extracted from the TXT sample.
response=generate_metadata(results['text'])
print(response)

Here is the extracted metadata in JSON format:

```
{
  "title": "nth Super Ugly Number",
  "summary": "This document describes a problem and solution for finding the nth super ugly number. A super ugly number is a positive integer whose prime factors are only from a given list of prime numbers. The problem is solved using dynamic programming with multiple moving pointers. The algorithm generates the next smallest number whose prime divisors are limited to the provided list of primes. The solution has a time complexity of O(n * k), where k is the number of primes, and a space complexity of O(n + k).",
  "keywords": "super ugly number, dynamic programming, prime numbers, algorithm",
  "topics": ["Algorithms", "Computer Science", "Mathematics"],
  "author": "Dean Hujisen",
  "document_type": "Algorithm Description"
}
```

And here is the detailed summary (15-20 lines):

This document describes a problem and solution for finding the nth super ugly number. A super ugly number is a positive

In [69]:
# Summarize each image's OCR text; for the TXT sample the list is empty, so the loop body never runs.
for text in results['ocr_texts_per_image']:
    response=summarize_ocr_text(text)
    print(response)
print(results['ocr_texts_per_image'])
# The extractor's .txt branch collects no images, so the OCR list printed above is empty.

[]
