In [88]:
import streamlit as st
import tempfile
import json
from pathlib import Path
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
import pytesseract
from dotenv import load_dotenv
from PIL import Image
import io
import fitz
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage

In [89]:
import os
load_dotenv()

True

In [90]:
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Path set in environment variables for tesseract to work 

### Function to extract full text and all inline images from pdf and docx as txt files wont contain images

In [91]:
def extract_text_and_inline_images(uploaded_file):
    suffix = Path(uploaded_file.name).suffix.lower()
    text = ""
    image_list = []
    ocr_texts_per_image = []

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    try:
        if suffix == ".pdf":
            try:
                reader = PdfReader(tmp_file_path)
                text = "\n".join([page.extract_text() or "" for page in reader.pages])
            except Exception:
                text = ""
            pdf_doc = fitz.open(tmp_file_path)
            for page in pdf_doc:
                images = page.get_images(full=False)
                for img in images:
                    xref = img[0] if isinstance(img, tuple) else img
                    base_image = pdf_doc.extract_image(xref)
                    image_data = base_image["image"]
                    image = Image.open(io.BytesIO(image_data)).convert("RGB")
                    image_list.append(image)
                    ocr_img_text = pytesseract.image_to_string(image)
                    ocr_texts_per_image.append(ocr_img_text)
        elif suffix == ".docx":
            doc = DocxDocument(tmp_file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            rels = doc.part._rels
            for rel in rels:
                rel = rels[rel]
                if rel.reltype == RT.IMAGE:
                    image_data = rel.target_part.blob
                    image = Image.open(io.BytesIO(image_data)).convert("RGB")
                    image_list.append(image)
                    ocr_img_text = pytesseract.image_to_string(image)
                    ocr_texts_per_image.append(ocr_img_text)
        elif suffix == ".txt":
            with open(tmp_file_path, "r", encoding="utf-8") as f:
                text = f.read()
        else:
            text = f"Unsupported file type: {suffix}"
    finally:
        try:
            os.remove(tmp_file_path)
        except PermissionError:
            pass

    return {
        "text": text.strip(),
        "images": image_list,
        "ocr_texts_per_image": ocr_texts_per_image
    }

### Function to chunk the large text into chunks of size 2800 assuming 1 token is approximately 4 chars

In [92]:
def chunk_text_by_tokens(text, max_tokens=700):
    chunk_size = 2800
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_chunk.append(word)
        current_length += len(word) + 1
        if current_length >= chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

In [93]:
import io
with open("test.txt", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.txt" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
print(result['images'])
print(result['ocr_texts_per_image'])


1)nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of siz

In [94]:
chunks=chunk_text_by_tokens(result['text'])
for chunk in chunks:
    print(chunk,len(chunk))

1)nth_super_ugly_number: problem: description: > Find the nth super ugly number, where a super ugly number is a positive integer whose prime factors are only from the given list `primes`. input: n: The index (1-based) of the super ugly number to return. primes: A list of prime numbers used to generate super ugly numbers. output: dp[n]: The nth super ugly number. definitions: dp[i]: > The i-th super ugly number, where: - dp[1] is initialized to 1 (the first super ugly number), - dp[i] is built using previously computed dp values and the prime list. a[j]: > The index pointer for each prime number `primes[j]`, initially pointing to dp[1]. - It tracks which multiple of a given prime to consider next. approach: type: Dynamic Programming (with multiple moving pointers) steps: - Initialize `dp` as a vector of size (n+1), with dp[1] = 1. - Initialize an index pointer array `a` of size m (number of primes), all set to 1. - For each i from 2 to n: - Set val = infinity. - For each prime: - Comput

### Summarising with Groq model

In [95]:
def summarize_with_groq(text, api_key, model="llama-3.3-70b-versatile", max_tokens=700):
    llm = ChatGroq(model=model, api_key=api_key, max_tokens=max_tokens)
    prompt = f"Summarize the following text in detail,extract the meaningful sections of document like author names,important keywords etc, but keep the summary under {max_tokens} tokens:\n\n{text}"
    response = llm([
        SystemMessage(content="You are a professional summarization assistant."),
        HumanMessage(content=prompt)
    ])
    return response.content.strip()

Hierarchical summarization is a technique designed to handle very large documents that exceed the token limits of language models like Groq’s Llama 3.3 70B Versatile. First, the document is divided into manageable chunks, and each chunk is individually summarized. Then, all these chunk summaries are combined and, if necessary, summarized again to produce a concise final summary within the model’s token limit. This approach ensures that important information from the entire document is retained and condensed efficiently, making it possible to generate high-quality summaries even for massive documents that would otherwise overwhelm the model’s input size. The main advantages are scalability, comprehensive coverage, and improved summary quality.

In [96]:
def summarize_pdf_text(full_text):
    chunks = chunk_text_by_tokens(full_text, max_tokens=700)
    summaries = []
    for chunk in chunks:
        summary = summarize_with_groq(chunk, api_key=GROQ_API_KEY, model="llama-3.3-70b-versatile", max_tokens=700)
        summaries.append(summary)
    combined_summary = "\n".join(summaries)
    if len(summaries) > 1 or len(combined_summary.split()) > 700:
        combined_summary = summarize_with_groq(combined_summary, api_key=GROQ_API_KEY, model="llama-3.3-70b-versatile", max_tokens=700)
    return combined_summary.strip()

### Function to generate Metadata using the summarised text by groq model

In [97]:
def generate_metadata(summarized_text):
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY, max_tokens=700)
    prompt = f"""
You are a professional and wonderful metadata assistant.

Analyze the following document summary and return structured metadata in JSON format with fields: 
- title
- summary (at least 15-20 lines in detail covering all important points)
- keywords (comma-separated)
- topics (broad subject categories)
- author (if mentioned)
- document_type
Extract and leverage the important sections of summary
Document Summary:
{summarized_text}
"""
    try:
        response = llm([
            SystemMessage(content="You are a metadata extraction assistant."),
            HumanMessage(content=prompt)
        ])
        return response.content.strip()
    except Exception as e:
        st.error(f"Metadata extraction failed: {e}")
        return '{"error": "Metadata extraction failed."}'

In [98]:
def summarize_ocr_text(ocr_text):
    if not ocr_text.strip():
        return "No OCR content found to summarize."
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY)
    prompt = (
        "You are a professional assistant. "
        "Start your response with 'The following image ...'. "
        "Summarize the following OCR-extracted content in a clear, well-organized, and visually appealing markdown format. "
        "Your summary should include:\n"
        "- A short title or heading for the content\n"
        "- Key points or highlights as a bullet list\n"
        "- Detected names, dates, numbers, or keywords (if any)\n"
        "- A concise paragraph summarizing the main idea or purpose\n"
        "If the content is a graph or chart, explain axes and key trends. "
        "If it's a table, highlight main comparisons or figures. "
        "If it's a scanned paragraph, summarize the main idea. "
        "Avoid assumptions. If content is unclear, mention it.\n\n"
        f"OCR Text:\n{ocr_text}"
    )
    try:
        response = llm([
            SystemMessage(content="You summarize OCR-extracted content in structured markdown."),
            HumanMessage(content=prompt)
        ])
        return response.content.strip()
    except Exception as e:
        return f"OCR summarization failed: {e}"

### Testing each fuctions

### 1)TXT File

In [99]:
import io
with open("test.txt", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.txt" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
print(result['images'])
print(result['ocr_texts_per_image'])

1)nth_super_ugly_number:
  problem:
    description: >
      Find the nth super ugly number, where a super ugly number is a positive integer 
      whose prime factors are only from the given list `primes`.

    input:
      n: The index (1-based) of the super ugly number to return.
      primes: A list of prime numbers used to generate super ugly numbers.

    output:
      dp[n]: The nth super ugly number.

  definitions:
    dp[i]: >
      The i-th super ugly number, where:
      - dp[1] is initialized to 1 (the first super ugly number),
      - dp[i] is built using previously computed dp values and the prime list.

    a[j]: >
      The index pointer for each prime number `primes[j]`, initially pointing to dp[1].
      - It tracks which multiple of a given prime to consider next.

  approach:
    type: Dynamic Programming (with multiple moving pointers)
    steps:
      - Initialize `dp` as a vector of size (n+1), with dp[1] = 1.
      - Initialize an index pointer array `a` of siz

In [100]:
response=summarize_pdf_text(result['text'])
print(response)

**Summary of the Document:**

The document discusses various problems related to dynamic programming, including finding the nth super ugly number, counting the number of Longest Increasing Subsequences (LIS) in an array, and determining the number of insertions required to make a string palindromic. The problems are solved using dynamic programming techniques, with a focus on time and space complexity analysis.

**Extracted Information:**

* **Author Names:** 
  + Not mentioned in most sections
  + Samyak Mahapatra (mentioned in one section)
* **Important Keywords:** 
  + Dynamic Programming
  + Prime Factors
  + Super Ugly Number
  + Longest Increasing Subsequences
  + Array
  + Longest Common Subsequence (LCS)
  + Longest Palindromic Subsequence (LPS)
  + Minimum Insertions to Make a String Palindromic
* **Problem Statements:**
  + Finding the nth super ugly number
  + Counting the number of Longest Increasing Subsequences (LIS) in an array
  + Determining the number of insertions re

In [101]:
metadata=generate_metadata(response)
print(metadata)

Here is the extracted metadata in JSON format:

```json
{
  "title": "Dynamic Programming Techniques for Solving Complex Problems",
  "summary": "The document discusses various problems related to dynamic programming, including finding the nth super ugly number, counting the number of Longest Increasing Subsequences (LIS) in an array, and determining the number of insertions required to make a string palindromic. The problems are solved using dynamic programming techniques, with a focus on time and space complexity analysis. The document covers various algorithms, including dynamic programming with multiple moving pointers, and provides formulas for calculating the Longest Common Subsequence (LCS) and the number of insertions required to make a string palindromic. The time complexities for the approaches are provided, including O(n * k) for finding the nth super ugly number, O(n^2) for counting the number of Longest Increasing Subsequences (LIS) in an array, and O(n * m) for determinin

### 2)PDF

In [104]:
import io
with open("test.pdf", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.pdf" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
# print(result['images'])
print(result['ocr_texts_per_image'])

The tiger ( Panthera tigris ) is the largest member of the cat family and one of the world ’s most iconic 
and powerful predators. Easily recognized by its striking orange coat with black stripes, the tiger is 
native to various parts of Asia, inhabiting forests, grasslands, and mangrove swamps. Tigers are 
solitary and territorial animals, relying on stealth and strength to hunt prey such as deer, wild boar, 
and buffalo. Sadly, their populations have declined sharply due to habitat loss, poaching, and huma n-
wildlife conflict, leaving fewer than 4,000 tigers in the wild today. As an endangered species and a 
keystone predator, the tiger plays a crucial role in maintaining the health of its ecosystem. Revered in 
many cultures as a symbol of strength and courag e, the tiger ’s survival depends on ongoing 
conservation efforts to protect both the species and its natural habitats.  
 
The elephant is the largest living land animal, easily recognized by its massive body, long trunk, lar

In [105]:
response=summarize_pdf_text(result['text'])
print(response)

**Detailed Summary:**

The provided text discusses two separate topics: the first part focuses on three iconic species (tiger, elephant, and parrot), while the second part talks about dogs and conservation efforts for extraordinary birds. 

The first part describes the tiger, a powerful predator native to Asia, recognized by its distinctive orange coat with black stripes. Unfortunately, tiger populations have declined due to habitat loss, poaching, and human-wildlife conflict. The elephant, the largest living land animal, is highly intelligent and social, with three species existing today. Elephants are herbivores and play a crucial role in shaping their ecosystems, but their populations are declining due to similar threats. The parrot, a vibrant and intelligent bird, is known for its colorful feathers and ability to mimic sounds, but many species face threats from habitat loss and human activities.

In the second part, the text shifts to discussing dogs, highlighting their intelligenc

In [106]:
for text in result['ocr_texts_per_image']:
    print(summarize_ocr_text(text))

The following image contains a short and concise OCR-extracted content. 
### Summer Message
* The content consists of three short words 
Key points: 
- Best
- Summer
- Ever
Detected keywords: Summer.
The main idea of this content appears to be a brief expression of enthusiasm or excitement about the summer season, with the words "Best. Summer. Ever." likely indicating a positive anticipation or experience. However, without more context, the purpose or origin of this message is unclear.
The following image appears to be a scanned paragraph of text. 
### Introduction to a Historical Narrative
* Contrasting periods in time
* Coexistence of wisdom and foolishness
* Historical context not explicitly stated
Detected keywords: wisdom, foolishness, best of times, worst of times. 
The main idea of this paragraph, which seems to be the opening of a historical narrative, possibly from the novel "A Tale of Two Cities" by Charles Dickens, is to set a tone that highlights the contradictions and comp

In [107]:
metadata=generate_metadata(response)
print(metadata)

Here's the extracted metadata in JSON format:

```json
{
  "title": "Iconic Species and Conservation Efforts",
  "summary": "The document discusses two main topics: the first part focuses on three iconic species - the tiger, elephant, and parrot - and their declining populations due to habitat loss, poaching, and human-wildlife conflict. The tiger, a powerful predator native to Asia, is recognized by its distinctive orange coat with black stripes. The elephant, the largest living land animal, is highly intelligent and social, with three species existing today. Elephants are herbivores and play a crucial role in shaping their ecosystems. The parrot, a vibrant and intelligent bird, is known for its colorful feathers and ability to mimic sounds, but many species face threats from habitat loss and human activities. \nIn the second part, the text shifts to discussing dogs, highlighting their intelligence, loyalty, and companionship. Dogs come in various breeds, each with distinct physical t

### Docx

In [108]:
import io
with open("test.docx", "rb") as f:
    file_bytes = f.read()

uploaded_file = io.BytesIO(file_bytes)
uploaded_file.name = "test.docx" 

result = extract_text_and_inline_images(uploaded_file)

print(result['text'])
# print(result['images'])
print(result['ocr_texts_per_image'])

The mean and standard deviation are two fundamental statistical measures with distinct yet complementary properties. The mean, or average, represents the central tendency of a dataset, providing a single value that summarizes all the numbers in the set. It is sensitive to every value in the data, meaning that even a single extremely large or small value (an outlier) can significantly affect the mean. On the other hand, the standard deviation measures the amount of variation or dispersion in the dataset. A low standard deviation indicates that the data points are closely clustered around the mean, while a high standard deviation suggests that the values are spread out over a wider range. Both the mean and standard deviation are widely used in statistics to describe and compare datasets, and they are especially important in fields such as science, finance, and engineering for understanding patterns, consistency, and reliability in data. 
The z-test and p-value are important concepts in s

In [109]:
response=summarize_pdf_text(result['text'])
print(response)

**Summary:**

The provided text discusses fundamental statistical concepts, including measures of central tendency and variation, hypothesis testing, and data distribution. The mean and standard deviation are introduced as essential measures, with the mean representing the central tendency of a dataset and the standard deviation measuring variation or dispersion. The z-test and p-value are explained as crucial tools in statistical hypothesis testing, determining significant differences between sample and population means and quantifying evidence against the null hypothesis. The 67-95-99 rule, or empirical rule, describes the distribution of data in a normal distribution, with specific percentages of data falling within one, two, or three standard deviations of the mean.

The text also explores the log-normal distribution, a continuous probability distribution where the logarithm of the random variable is normally distributed. This distribution is useful for modeling data that cannot be

In [110]:
for txt in result['ocr_texts_per_image']:
    print(summarize_ocr_text(txt))

The following image appears to be a scanned paragraph describing parameters of a discrete uniform distribution. 
### Discrete Uniform Distribution Parameters
* Parameters: 
  * a and b are integers where b > a
  * n = b - a + 1 (number of possible values)
  * Support: {a, a+1, ..., b-1, b}
  * PMF (Probability Mass Function): 1/n
  * CDF (Cumulative Distribution Function): |k - a + 1|/n
* Detected keywords: discrete uniform distribution, parameters, PMF, CDF, mean, median, mode
* Detected numbers: none specific, but variables a, b, and n are defined

This content describes the parameters of a discrete uniform distribution, describing the support, probability mass function, cumulative distribution function, mean, median, and mode. The mean and median are both calculated as (a + b)/2, while the mode is not applicable (N/A) for this distribution type. The parameters a and b define the range of the distribution, with n representing the total number of possible values. The PMF is uniformly 

In [111]:
metadata=generate_metadata(response)
print(metadata)

Here is the extracted metadata in JSON format:

```json
{
  "title": "Fundamental Statistical Concepts",
  "summary": "The provided text discusses fundamental statistical concepts, including measures of central tendency and variation, hypothesis testing, and data distribution. The mean and standard deviation are introduced as essential measures, with the mean representing the central tendency of a dataset and the standard deviation measuring variation or dispersion. The z-test and p-value are explained as crucial tools in statistical hypothesis testing, determining significant differences between sample and population means and quantifying evidence against the null hypothesis. The 67-95-99 rule, or empirical rule, describes the distribution of data in a normal distribution, with specific percentages of data falling within one, two, or three standard deviations of the mean.

The text also explores the log-normal distribution, a continuous probability distribution where the logarithm of 