In [1]:
import streamlit as st
import tempfile
import json
from pathlib import Path
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
import pytesseract
from dotenv import load_dotenv
from PIL import Image
import io
import fitz
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage

In [2]:
import os
load_dotenv()

True

In [3]:
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Path set in environment variables for tesseract to work 

### Function to extract full text and all inline images from pdf and docx as txt files wont contain images

In [4]:
def extract_text_and_inline_images(uploaded_file):
    suffix = Path(uploaded_file.name).suffix.lower()
    text = ""
    image_list = []
    ocr_texts_per_image = []

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    try:
        if suffix == ".pdf":
            try:
                reader = PdfReader(tmp_file_path)
                text = "\n".join([page.extract_text() or "" for page in reader.pages])
            except Exception:
                text = ""
            pdf_doc = fitz.open(tmp_file_path)
            for page in pdf_doc:
                images = page.get_images(full=False)
                for img in images:
                    xref = img[0] if isinstance(img, tuple) else img
                    base_image = pdf_doc.extract_image(xref)
                    image_data = base_image["image"]
                    image = Image.open(io.BytesIO(image_data)).convert("RGB")
                    image_list.append(image)
                    ocr_img_text = pytesseract.image_to_string(image)
                    ocr_texts_per_image.append(ocr_img_text)
        elif suffix == ".docx":
            doc = DocxDocument(tmp_file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            rels = doc.part._rels
            for rel in rels:
                rel = rels[rel]
                if rel.reltype == RT.IMAGE:
                    image_data = rel.target_part.blob
                    image = Image.open(io.BytesIO(image_data)).convert("RGB")
                    image_list.append(image)
                    ocr_img_text = pytesseract.image_to_string(image)
                    ocr_texts_per_image.append(ocr_img_text)
        elif suffix == ".txt":
            with open(tmp_file_path, "r", encoding="utf-8") as f:
                text = f.read()
        else:
            text = f"Unsupported file type: {suffix}"
    finally:
        try:
            os.remove(tmp_file_path)
        except PermissionError:
            pass

    return {
        "text": text.strip(),
        "images": image_list,
        "ocr_texts_per_image": ocr_texts_per_image
    }

### Function to chunk the large text into chunks of size 2800 assuming 1 token is approximately 4 chars

In [5]:
def chunk_text_by_tokens(text, max_tokens=700):
    chunk_size = 2800
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_chunk.append(word)
        current_length += len(word) + 1
        if current_length >= chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

In [6]:
import io
with open("test.txt", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.txt" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
print(result['images'])
print(result['ocr_texts_per_image'])


1)nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of siz

In [7]:
chunks=chunk_text_by_tokens(result['text'])
for chunk in chunks:
    print(chunk,len(chunk))

1)nth_super_ugly_number: problem: description: > Find the nth super ugly number, where a super ugly number is a positive integer whose prime factors are only from the given list `primes`. input: n: The index (1-based) of the super ugly number to return. primes: A list of prime numbers used to generate super ugly numbers. output: dp[n]: The nth super ugly number. definitions: dp[i]: > The i-th super ugly number, where: - dp[1] is initialized to 1 (the first super ugly number), - dp[i] is built using previously computed dp values and the prime list. a[j]: > The index pointer for each prime number `primes[j]`, initially pointing to dp[1]. - It tracks which multiple of a given prime to consider next. approach: type: Dynamic Programming (with multiple moving pointers) steps: - Initialize `dp` as a vector of size (n+1), with dp[1] = 1. - Initialize an index pointer array `a` of size m (number of primes), all set to 1. - For each i from 2 to n: - Set val = infinity. - For each prime: - Comput

### Summarising with Groq model

In [8]:
def summarize_with_groq(text, api_key, model="llama-3.3-70b-versatile", max_tokens=700):
    llm = ChatGroq(model=model, api_key=api_key, max_tokens=max_tokens)
    prompt = f"Summarize the following text in detail,extract the meaningful sections of document like author names,important keywords etc, but keep the summary under {max_tokens} tokens:\n\n{text}"
    response = llm([
        SystemMessage(content="You are a professional summarization assistant."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

Hierarchical summarization is a technique designed to handle very large documents that exceed the token limits of language models like Groq’s Llama 3.3 70B Versatile. First, the document is divided into manageable chunks, and each chunk is individually summarized. Then, all these chunk summaries are combined and, if necessary, summarized again to produce a concise final summary within the model’s token limit. This approach ensures that important information from the entire document is retained and condensed efficiently, making it possible to generate high-quality summaries even for massive documents that would otherwise overwhelm the model’s input size. The main advantages are scalability, comprehensive coverage, and improved summary quality.

In [9]:
def summarize_pdf_text(full_text):
    chunks = chunk_text_by_tokens(full_text, max_tokens=700)
    summaries = []
    for chunk in chunks:
        summary = summarize_with_groq(chunk, api_key=GROQ_API_KEY, model="llama-3.3-70b-versatile", max_tokens=700)
        summaries.append(summary)
    combined_summary = "\n".join(summaries)
    if len(summaries) > 1 or len(combined_summary.split()) > 700:
        combined_summary = summarize_with_groq(combined_summary, api_key=GROQ_API_KEY, model="llama-3.3-70b-versatile", max_tokens=700)
    return combined_summary.strip()

### Function to generate Metadata using the summarised text by groq model

In [10]:
def generate_metadata(summarized_text):
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY, max_tokens=700)
    prompt = f"""
You are a professional and wonderful metadata assistant.

Analyze the following document summary and return structured metadata in JSON format with fields: 
- title
- summary (at least 15-20 lines in detail covering all important points)
- keywords (comma-separated)
- topics (broad subject categories)
- author (if mentioned)
- document_type
Extract and leverage the important sections of summary
Document Summary:
{summarized_text}
"""
    try:
        response = llm([
            SystemMessage(content="You are a metadata extraction assistant."),
            HumanMessage(content=prompt)
        ])
        return response.content.strip()
    except Exception as e:
        st.error(f"Metadata extraction failed: {e}")
        return '{"error": "Metadata extraction failed."}'

In [11]:
def summarize_ocr_text(ocr_text):
    if not ocr_text.strip():
        return "No OCR content found to summarize."
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY)
    prompt = (
        "You are a professional assistant. "
        "Start your response with 'The following image ...'. "
        "Summarize the following OCR-extracted content in a clear, well-organized, and visually appealing markdown format. "
        "Your summary should include:\n"
        "- A short title or heading for the content\n"
        "- Key points or highlights as a bullet list\n"
        "- Detected names, dates, numbers, or keywords (if any)\n"
        "- A concise paragraph summarizing the main idea or purpose\n"
        "If the content is a graph or chart, explain axes and key trends. "
        "If it's a table, highlight main comparisons or figures. "
        "If it's a scanned paragraph, summarize the main idea. "
        "Avoid assumptions. If content is unclear, mention it.\n\n"
        f"OCR Text:\n{ocr_text}"
    )
    try:
        response = llm([
            SystemMessage(content="You summarize OCR-extracted content in structured markdown."),
            HumanMessage(content=prompt)
        ])
        return response.content.strip()
    except Exception as e:
        return f"OCR summarization failed: {e}"

### Testing each fuctions

### 1)TXT File

In [12]:
import io
with open("test.txt", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.txt" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
print(result['images'])
print(result['ocr_texts_per_image'])

1)nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of siz

In [13]:
response=summarize_pdf_text(result['text'])
print(response)

  response = llm([


**Summary**

The provided text discusses several problems related to dynamic programming, including finding the nth super ugly number, counting the number of Longest Increasing Subsequences (LIS) in an array, and calculating the Longest Common Subsequence (LCS) between two strings. 

**Key Points**

1. **nth Super Ugly Number Problem**: The problem involves finding the nth super ugly number using dynamic programming with multiple moving pointers. The time complexity is O(n * k), where k is the number of primes, and the space complexity is O(n + k).
2. **Counting Longest Increasing Subsequences (LIS) Problem**: The problem involves computing the number of LIS in a given array using dynamic programming. The approach tracks both the length of the LIS ending at each element and the number of such LIS.
3. **Longest Common Subsequence (LCS) Problem**: The problem finds the length of the longest common subsequence between two strings using dynamic programming with a time complexity of O(n*m) 

In [14]:
metadata=generate_metadata(response)
print(metadata)

Here is the extracted metadata in JSON format:

```json
{
  "title": "Dynamic Programming Problems",
  "summary": "The document discusses several dynamic programming problems, including finding the nth super ugly number, counting the number of Longest Increasing Subsequences (LIS) in an array, and calculating the Longest Common Subsequence (LCS) between two strings. The problems are solved using dynamic programming techniques with varying time and space complexities. The nth super ugly number problem uses multiple moving pointers with a time complexity of O(n * k) and space complexity of O(n + k). The LIS problem computes the number of LIS in a given array by tracking the length of the LIS ending at each element and the number of such LIS. The LCS problem finds the length of the longest common subsequence between two strings using dynamic programming with a time complexity of O(n*m) and space complexity of O(n*m). The Longest Palindromic Subsequence (LPS) problem finds the length of th

### 2)PDF

In [15]:
import io
with open("test.pdf", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.pdf" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
# print(result['images'])
print(result['ocr_texts_per_image'])

The tiger ( Panthera tigris ) is the largest member of the cat family and one of the world ’s most iconic 
and powerful predators. Easily recognized by its striking orange coat with black stripes, the tiger is 
native to various parts of Asia, inhabiting forests, grasslands, and mangrove swamps. Tigers are 
solitary and territorial animals, relying on stealth and strength to hunt prey such as deer, wild boar, 
and buffalo. Sadly, their populations have declined sharply due to habitat loss, poaching, and huma n-
wildlife conflict, leaving fewer than 4,000 tigers in the wild today. As an endangered species and a 
keystone predator, the tiger plays a crucial role in maintaining the health of its ecosystem. Revered in 
many cultures as a symbol of strength and courag e, the tiger ’s survival depends on ongoing 
conservation efforts to protect both the species and its natural habitats.  
 
The elephant is the largest living land animal, easily recognized by its massive body, long trunk, lar

In [16]:
response=summarize_pdf_text(result['text'])
print(response)

Here is a detailed summary of the provided text, extracting meaningful sections, author names, and important keywords, all within the 700-token limit:

The text describes three iconic species: the tiger, elephant, and parrot, highlighting their unique characteristics and the threats they face. The tiger, a powerful predator, is native to Asia and recognized by its distinctive orange coat with black stripes. However, its population has declined due to habitat loss, poaching, and human-wildlife conflict. The elephant, the largest living land animal, is highly intelligent and social, playing a crucial role in shaping its ecosystem. Unfortunately, elephant populations have also declined due to habitat loss, poaching, and human-wildlife conflict. The parrot, a vibrant and intelligent bird, is known for its colorful feathers and ability to mimic sounds.

Important keywords extracted from the text include habitat loss, poaching, human-wildlife conflict, conservation, keystone species, ecosyst

In [17]:
for text in result['ocr_texts_per_image']:
    print(summarize_ocr_text(text))

The following image appears to be a scanned paragraph or text excerpt with a short and enthusiastic message. 
### Summer Enthusiasm
* The text expresses excitement about summer
* The tone is positive and upbeat
Detected keywords: Summer
The main idea of this text is to convey that the speaker believes the current summer is the best one ever, expressing a high level of enthusiasm and excitement. However, the content is quite brief and lacks specific details, making it unclear what makes this summer stand out.
The following image contains a scanned paragraph of text. 
### Introduction to a Timeless Tale
* The passage describes a time of contrasting experiences
* It highlights the coexistence of wisdom and foolishness
* The text is a famous opening to a classic novel
Detected keywords: wisdom, foolishness, best of times, worst of times. 
The main idea of this passage is to introduce a historical period marked by extremes, where good and bad, wisdom and foolishness, all existed simultaneou

In [18]:
metadata=generate_metadata(response)
print(metadata)

Here is the extracted metadata in JSON format:

```json
{
  "title": "Iconic Species and Conservation Efforts",
  "summary": "The text describes three iconic species: the tiger, elephant, and parrot, highlighting their unique characteristics and the threats they face. The tiger, a powerful predator, is native to Asia and recognized by its distinctive orange coat with black stripes. However, its population has declined due to habitat loss, poaching, and human-wildlife conflict. The elephant, the largest living land animal, is highly intelligent and social, playing a crucial role in shaping its ecosystem. Unfortunately, elephant populations have also declined due to habitat loss, poaching, and human-wildlife conflict. The parrot, a vibrant and intelligent bird, is known for its colorful feathers and ability to mimic sounds. \n\nThe conservation status of the mentioned species is as follows: the tiger is endangered with fewer than 4,000 individuals in the wild, the elephant has a declinin

### Docx

In [19]:
import io
with open("test.docx", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.docx" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
# print(result['images'])
print(result['ocr_texts_per_image'])

The mean and standard deviation are two fundamental statistical measures with distinct yet complementary properties. The mean, or average, represents the central tendency of a dataset, providing a single value that summarizes all the numbers in the set. It is sensitive to every value in the data, meaning that even a single extremely large or small value (an outlier) can significantly affect the mean. On the other hand, the standard deviation measures the amount of variation or dispersion in the dataset. A low standard deviation indicates that the data points are closely clustered around the mean, while a high standard deviation suggests that the values are spread out over a wider range. Both the mean and standard deviation are widely used in statistics to describe and compare datasets, and they are especially important in fields such as science, finance, and engineering for understanding patterns, consistency, and reliability in data. 
The z-test and p-value are important concepts in s

In [20]:
response=summarize_pdf_text(result['text'])
print(response)

Here is a detailed summary of the provided text, extracting meaningful sections, keywords, and concepts, all within the 700-token limit:

The text discusses fundamental statistical concepts, including the mean, standard deviation, z-test, and p-value. These measures are crucial in statistics, particularly in fields like science, finance, and engineering. The mean represents the central tendency of a dataset, while the standard deviation measures the variation or dispersion. The z-test determines significant differences between sample and population means, and the p-value quantifies the strength of evidence against the null hypothesis.

The text also introduces the 67-95-99 rule, also known as the empirical rule, which describes the distribution of data in a normal (Gaussian) distribution. This rule states that about 68% of data falls within one standard deviation of the mean, 95% within two standard deviations, and 99.7% within three standard deviations.

In addition to these concepts,

In [21]:
for txt in result['ocr_texts_per_image']:
    print(summarize_ocr_text(txt))

The following image appears to be a scanned paragraph describing parameters of a discrete uniform distribution. 
### Discrete Uniform Distribution Parameters
* The parameters are defined by two integers `a` and `b` where `b > a`.
* The number of possible values `n` is calculated as `n = b - a + 1`.
* The support of the distribution is the set of integers from `a` to `b`.
* Key statistics include:
  + Mean: `(a + b) / 2`
  + Median: `(a + b) / 2`
  + Mode: Not applicable (N/A)
* Detected keywords: discrete uniform distribution, parameters, mean, median, mode.
The main idea of this content is to outline the parameters and key statistics of a discrete uniform distribution, which is a probability distribution where every possible outcome has an equal likelihood of occurring. The distribution is defined by two integers `a` and `b`, and various statistics such as mean, median, and mode are calculated based on these parameters. The mode is noted as not applicable, indicating that the distribu

In [22]:
metadata=generate_metadata(response)
print(metadata)

Here is the extracted metadata in JSON format:

```json
{
  "title": "Fundamental Statistical Concepts",
  "summary": "The document discusses fundamental statistical concepts, including the mean, standard deviation, z-test, and p-value. These measures are crucial in statistics, particularly in fields like science, finance, and engineering. The mean represents the central tendency of a dataset, while the standard deviation measures the variation or dispersion. The z-test determines significant differences between sample and population means, and the p-value quantifies the strength of evidence against the null hypothesis.

The text also introduces the 67-95-99 rule, also known as the empirical rule, which describes the distribution of data in a normal (Gaussian) distribution. This rule states that about 68% of data falls within one standard deviation of the mean, 95% within two standard deviations, and 99.7% within three standard deviations.

In addition to these concepts, the text explo