In [2]:
import os
import json
import re
import pytesseract
import google.generativeai as genai
from pdf2image import convert_from_path
from PIL import Image
from tqdm import tqdm
from IPython.display import Image, display
import io

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configure Gemini API
# The key is read from the environment so it never ends up in the notebook
# source or its saved outputs. Export GEMINI_API_KEY (or GOOGLE_API_KEY) before
# starting the kernel. Any key that was ever pasted into this cell should be
# treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this notebook."
    )
genai.configure(api_key=_gemini_api_key)
vision_model = genai.GenerativeModel("gemini-1.5-flash")

In [4]:
import fitz  # PyMuPDF 
from math import ceil 
from IPython.display import Image, display

class PdfChunker:
    """Split a PDF into fixed-size groups of pages and render them as PNG data.

    The document opened in __init__ stays open for the lifetime of the object.
    Call close(), or use the instance in a ``with`` block, to release it.
    """

    def __init__(self, pdf_path, chunk_size=3):
        """
        Initialize the PDF chunker

        Args:
            pdf_path (str): Path to the PDF file
            chunk_size (int, optional): Number of pages per chunk. Defaults to 3.

        Raises:
            ValueError: If chunk_size is smaller than 1.
        """
        # Validate before opening the file so a bad argument cannot leave an
        # open document behind (and cannot cause a division by zero below).
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        self.doc = fitz.open(pdf_path)
        self.total_pages = len(self.doc)
        self.chunk_size = chunk_size
        self.chunks = ceil(self.total_pages / chunk_size)  # Calculate total number of chunks

    def get_chunk(self, chunk_number):
        """
        Get a specific chunk of pages as images

        Args:
            chunk_number (int): The chunk number (0-based index)

        Returns:
            list: List of PNG image data for the requested chunk

        Raises:
            ValueError: If chunk_number is negative or not smaller than the
                number of chunks.
        """
        if self.chunks == 0:
            raise ValueError("Document has no pages, so there are no chunks")
        # Negative numbers are rejected explicitly: they would otherwise index
        # pages from the end of the document instead of raising.
        if not 0 <= chunk_number < self.chunks:
            raise ValueError(f"Chunk number should be between 0 and {self.chunks-1}")

        start_page = chunk_number * self.chunk_size
        # The last chunk may be shorter than chunk_size
        end_page = min(start_page + self.chunk_size, self.total_pages)

        chunk_images = []
        for page_num in range(start_page, end_page):
            pix = self.doc[page_num].get_pixmap()
            chunk_images.append(pix.tobytes("png"))

        return chunk_images

    def close(self):
        """Close the underlying document. Calling it again is a no-op."""
        if not self.doc.is_closed:
            self.doc.close()

    def __enter__(self):
        """Return the chunker itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the document on leaving the ``with`` block; exceptions propagate."""
        self.close()
        return False
     
def pdf_to_images(pdf_path, pages_per_chunk=3):
    """Convert PDF into images and store them in memory as chunks.

    Args:
        pdf_path (str): Path to the PDF file.
        pages_per_chunk (int, optional): Number of pages rendered into each
            chunk. Defaults to 3.

    Returns:
        list: One entry per chunk, in document order. Each entry is the list
        returned by PdfChunker.get_chunk for that chunk (PNG data per page).
    """
    chunker_obj = PdfChunker(pdf_path, pages_per_chunk)  # Pass chunk_size
    try:
        return [chunker_obj.get_chunk(i) for i in range(chunker_obj.chunks)]
    finally:
        # Every page has been rendered to bytes by now, so the open document
        # is no longer needed; close it even if rendering failed part-way.
        chunker_obj.doc.close()

# # Example usage with display
# def display_pdf_images(pdf_path, pages_per_chunk=3):
#     """Display PDF images chunk by chunk"""
#     image_chunks = pdf_to_images(pdf_path, pages_per_chunk)
    
#     # Display each chunk of images
#     for chunk_index, chunk in enumerate(image_chunks):
#         print(f"Chunk {chunk_index + 1}:")
#         for page_index, img_data in enumerate(chunk):
#             display(Image(data=img_data))

# # Uncomment and replace with your PDF path
# display_pdf_images("test_pdf.pdf")

In [6]:
def analyze_images(image_paths, global_insights, chapter_insights):
    """Analyze images while referring to global and chapter-level insights.

    Args:
        image_paths (list): Encoded image data (bytes), one item per page.
            Despite the name these are not file paths: each item is wrapped
            in io.BytesIO and opened with PIL.
        global_insights (list): Insights collected so far. The whole list is
            embedded in the prompt.
        chapter_insights (list): Insights for the current chapter. Only the
            last five entries are embedded in the prompt.

    Returns:
        str: The model's response text, or "" when the model returned nothing
        usable (no response, or a response without a text part).
    """
    # Imported locally on purpose: at notebook level `Image` is rebound by
    # `from IPython.display import Image`, which has no `open`.
    from PIL import Image
    import io
    import warnings

    # Convert byte image data to PIL Image objects
    image_data = [Image.open(io.BytesIO(img_bytes)) for img_bytes in image_paths]

    # Contextual prompt for Gemini with explicit chapter detection
    chapter_detection_prompt = f"""
    Carefully examine these images and perform the following tasks:

    1. Chapter Detection:
    - Identify any clear chapter headings or titles in the images
    - Look for page headers, section markers, or explicit chapter indicators
    - Note the exact chapter name or number as it appears in the image
    - If multiple chapters are present, list them in order of appearance
    - If no chapter is present in these images, don't output any chapter

    2. Comprehensive Image Analysis:
    - Provide a thorough analysis of the key insights present in these images
    - The analysis must be deep and cover all available information
    - Separate different themes into distinct paragraphs
    - Avoid unnecessary introductory phrases
    - DO NOT use unnecessary phrases like: ## Analysis of Text Chunk: Sections 2.9 and 2.10\n\nThis text chunk,**Comprehensive Image Analysis:**,..


    Previous Context:
    - Global insights so far: {global_insights[:]}
    - Current chapter insights: {chapter_insights[-5:]}

    Output Format:
    First, list the detected chapter (if any)
    Only list one detected chapter per one set.
    Then, provide a detailed analysis of the image contents
    """

    # Generate response with chapter detection and image analysis
    response = vision_model.generate_content([chapter_detection_prompt] + image_data)
    if not response:
        return ""
    try:
        return response.text
    except ValueError:
        # The `.text` accessor raises ValueError when the response carries no
        # usable text part (for example when the candidate was blocked).
        # Report it and fall back to the same "" used for a missing response,
        # so one refused page group does not abort the whole run.
        warnings.warn("Gemini returned no usable text for this image group; treating it as empty.")
        return ""

def detect_chapters(text):
    """Detect chapters based on the chapter detection part of the image analysis.

    Only the first non-blank line of ``text`` is inspected. If that line
    contains "Chapter", "CHAPTER" or "CHAP" followed by whitespace and a
    number, the match (running to the end of the line) is returned. Otherwise
    the whole line is returned as the chapter name.

    Args:
        text (str): Output of the image analysis.

    Returns:
        list: The detected chapter names, or an empty list when ``text`` is
        empty or contains only whitespace.
    """
    # Skip leading blank lines: a response that starts with a newline would
    # otherwise produce the chapter name "".
    first_line = next(
        (line.strip() for line in (text or "").split('\n') if line.strip()),
        "",
    )
    if not first_line:
        return []

    # Look for chapter patterns
    chapter_pattern = r"\b(?:Chapter|CHAPTER|CHAP)\s+\d+.*"
    detected_chapters = re.findall(chapter_pattern, first_line)

    return detected_chapters if detected_chapters else [first_line]

def split_text_into_chunks(text, chunk_size=50):
    """Break ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace acts
    as a separator and each piece is re-joined with single spaces. The last
    piece may hold fewer than ``chunk_size`` words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

def summarize_text(text_chunk, previous_chapter_insights, current_chapter, global_context):
    """
    Generate a comprehensive summary of a text chunk considering contextual insights.

    Args:
        text_chunk (str): The current 50-word chunk to be analyzed
        previous_chapter_insights (list): Insights from previous chunks in the current chapter
        current_chapter (str): Name of the current chapter
        global_context (list): Global insights collected so far; only the
            last three entries are included in the prompt

    Returns:
        str: Comprehensive analysis of the text chunk, or "" when the model
        returned nothing usable (no response, or a response without a text
        part)
    """
    import warnings

    # Prepare previous chapter insights as context
    if previous_chapter_insights:
        previous_insights_context = " ".join(previous_chapter_insights)
    else:
        previous_insights_context = "No previous insights in this chapter."

    # Construct a detailed prompt that incorporates multiple layers of context
    prompt = f"""
    Analyze the following text chunk with extreme thoroughness. Your analysis should:
    1. Extract key insights specific to this text chunk
    2. Contextualize the chunk within the chapter's previous insights
    3. Highlight connections to previous insights
    4. Do NOT use unnecessary phrases like: ## Analysis of Text Chunk: Sections 2.9 and 2.10\n\nThis text chunk,**Comprehensive Image Analysis:**,..

    Current Chapter: {current_chapter}

    Previous Chapter Insights:
    {previous_insights_context}

    Global Context Hints:
    {' '.join(global_context[-3:])}

    Text Chunk to Analyze:
    {text_chunk}

    Provide a comprehensive, structured analysis that:
    - Identifies core themes and concepts
    - Explains how this chunk relates to previous insights in the chapter
    - Highlights any new or unique information
    - Connects insights across different parts of the text
    """

    # Use vision model to generate a comprehensive analysis
    response = vision_model.generate_content(prompt)
    if not response:
        return ""
    try:
        return response.text
    except ValueError:
        # The `.text` accessor raises ValueError when the response carries no
        # usable text part (for example when the candidate was blocked).
        # Report it and fall back to the same "" used for a missing response.
        warnings.warn("Gemini returned no usable text for this chunk; treating it as empty.")
        return ""

def process_pdf(pdf_path, output_json="insights.json", summary_json="chunk_summaries.json"):
    """Process PDF into subparts, analyze, detect chapters, store insights, and generate summaries.

    Each group of page images is analyzed, the analysis is split into 50-word
    chunks, and every chunk is summarized with the chapter's earlier
    summaries as context.

    Args:
        pdf_path (str): Path to the PDF file.
        output_json (str, optional): File that receives the insights, a dict
            with the keys "global" (one analysis per page group) and
            "chapters" (chapter name -> list of 50-word chunks).
        summary_json (str, optional): File that receives the chunk summaries,
            keyed "<chapter>_Chunk_<n>". ``n`` counts chunks within a chapter
            starting at 1 and keeps counting across page groups.

    Returns:
        None. Results are written to the two JSON files.
    """
    image_chunks = pdf_to_images(pdf_path)  # No output folder, just a list of image lists
    insights = {"global": [], "chapters": {}}
    chunk_summaries = {}
    current_chapter = "Introduction"  # Default if no chapter is found
    chapter_chunk_insights = []  # Tracks insights for the current chapter

    for image_group in tqdm(image_chunks, desc="Processing chunks"):
        # Analyze image while considering global and chapter-level insights
        sub_local_insight = analyze_images(
            image_group,
            insights["global"],
            insights["chapters"].get(current_chapter, [])
        )

        # Detect chapters directly from the image analysis output
        detected_chapters = detect_chapters(sub_local_insight)
        new_chapter = detected_chapters[-1] if detected_chapters else ""  # Use last detected chapter name
        # Switch only when the chapter really changes. Resetting on every
        # group would throw away the chapter context while still inside the
        # same chapter. An empty name is ignored.
        if new_chapter and new_chapter != current_chapter:
            current_chapter = new_chapter
            chapter_chunk_insights = []  # Reset insights for the new chapter

        # Chunk text into 50-word segments
        text_chunks = split_text_into_chunks(sub_local_insight, chunk_size=50)

        # Store insights by chapter
        chapter_store = insights["chapters"].setdefault(current_chapter, [])
        # Continue numbering after the chunks already stored for this chapter,
        # so a chapter spanning several page groups does not overwrite its
        # earlier "<chapter>_Chunk_<n>" summaries.
        first_chunk_number = len(chapter_store) + 1
        chapter_store.extend(text_chunks)

        # Generate comprehensive summary for each chunk
        for chunk_number, chunk in enumerate(text_chunks, first_chunk_number):
            # Pass previous chapter chunk insights before the current chunk
            chunk_summary = summarize_text(
                chunk,
                list(chapter_chunk_insights),
                current_chapter,
                insights["global"]
            )
            chunk_summaries[f"{current_chapter}_Chunk_{chunk_number}"] = chunk_summary

            # Add current chunk's insight to chapter-specific insights
            chapter_chunk_insights.append(chunk_summary)

        # Update global insights
        insights["global"].append(sub_local_insight)

    # Save insights JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(insights, f, indent=4, ensure_ascii=False)

    # Save chunk summaries JSON
    with open(summary_json, "w", encoding="utf-8") as f:
        json.dump(chunk_summaries, f, indent=4, ensure_ascii=False)

    print(f"Insights saved to {output_json}")
    print(f"Chunk summaries saved to {summary_json}")

# Example Usage:
# Runs the whole pipeline on one PDF. The path is relative, so the file is
# looked up in the kernel's working directory. With process_pdf's default
# arguments the results are written to insights.json and chunk_summaries.json.
pdf_file = "david_ch12.pdf"
process_pdf(pdf_file)

Processing chunks: 100%|██████████| 7/7 [05:09<00:00, 44.20s/it]

Insights saved to insights.json
Chunk summaries saved to chunk_summaries.json



