In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.doc_utils     import open_exam
from pdf_scraper.doc_utils     import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.doc_utils     import get_images, filter_images, assign_in_image_captions, identify_vertical_captions
from pdf_scraper.doc_utils     import enrich_doc_df_with_images, identify_page_clusters, get_lines_in_image_clusters, preproc_images
from pdf_scraper.clustering.cluster_utils import get_vert_neigh_dist, split_cluster, hdbscan, find_y0_dL, correct_eps_y_scale,get_eps_x, get_eps_y
from pdf_scraper.general_utils import df_bbox_dist, df_bbox_next_row_dist
from pdf_scraper.image_utils   import (get_bboxed_page_image, show_all_imgs, show_image, 
                                       filter_horizontal_strips, filter_point_images, filter_low_res_doubles,
                                       reconstitute_strips, sort_images)

# Notebook-wide pandas display options: floats are rendered with two decimals
# and column contents are shown up to 200 characters wide.
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_colwidth", 200)

In [None]:
def check_year_images(year, page, x_scale, y_scale):
    """
    Cluster the lines of one page of an exam and display the clustering result.

    Opens the exam for `year`, extracts and filters its images, selects the
    rows of the line dataframe that belong to `page`, clusters them with
    `hdbscan` using the distances `eps_x` / `eps_y`, and displays the last of
    the page images drawn with the cluster boxes.

    FIXME(review): as written this function cannot run. `df` is a local name
    (it is assigned further down) but it is read by `assign_in_image_captions`
    and `clean_line_df` before anything has been assigned to it, which raises
    UnboundLocalError. The call that builds the line dataframe from `doc` is
    missing; it is not visible in this notebook, so it has not been guessed.

    FIXME(review): the `x_scale` and `y_scale` arguments are overwritten with
    fixed values before they are used, so whatever the caller passes is ignored.

    Args:
        year:    passed to `open_exam` to choose the exam.
        page:    page whose lines are clustered and drawn.
        x_scale: currently ignored (see FIXME above).
        y_scale: currently ignored (see FIXME above).

    Returns:
        The pair `(rectangs, labia)` exactly as returned by `hdbscan`.
    """

    doc    = open_exam(year, "english", "al",1)
    images = get_images(doc)
    images = filter_images(images)
    # FIXME(review): `df` is not defined yet at this point (UnboundLocalError).
    assign_in_image_captions(df,images)
    
    df = clean_line_df(df)
    df = enrich_doc_df_with_images(df,images)

    
    # Work on a copy of the rows of the requested page, restricted to the
    # columns listed here.
    page_df = df.loc[df.page==page, ["text", 'x0', 'y0', 'x1', 'y1', "page","w","category"]].copy()
    
    # FIXME(review): this discards the x_scale / y_scale arguments.
    x_scale, y_scale = 2.0/3.0 , 1.15
    eps_y = get_eps_y(page_df, page, y_scale)
    eps_x = get_eps_x(page_df, page, x_scale)
    
    print(f"eps_x: {eps_x:<8.2f} eps_y: {eps_y:<8.2f} eps_y scale:{y_scale:4.2f}")
    
    # `hdbscan` returns two parallel sequences (boxes and labels); one page
    # image is rendered per entry and only the last one is displayed.
    # NOTE(review): the meaning of the literal 100 and of the trailing False is
    # defined by `hdbscan` and is not visible here — confirm.
    rectangs, labia = hdbscan(page_df, 100, eps_x, eps_y, df_bbox_dist,False)
    imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]
    display(imgs[-1])
    return rectangs, labia

# Split images

This will be similar to the horizontal-strip images issue we have already dealt with.

In [None]:
# Load the 2005 exam, run its images through preproc_images and display all of them.
# NOTE(review): the open_exam arguments are presumably (year, subject, level, paper) — confirm.
doc    = open_exam(2005, "english", "al",1)
images = get_images(doc)
images = preproc_images(images)
# NOTE(review): 3 and 5 are presumably the grid dimensions of the overview — confirm.
show_all_imgs(3,5,images)

In [None]:
def find_contiguous_image_pairs(images, tol) -> list[list[dict]]:
    """
    Find every pair of images that sit directly on top of each other.

    Two images form a pair when they are on the same page, have exactly the
    same left and right edges (x0 and x1), and the bottom edge of one lies
    within `tol` of the top edge of the other.

    Args:
        images: sequence of image dicts with a "page" entry and a "bbox" entry
                of the form (x0, y0, x1, y1).
        tol:    maximum allowed vertical gap/overlap between the touching edges.

    Returns:
        A list of two-element lists [earlier_image, later_image], where
        "earlier" and "later" refer to the position in `images`.
    """
    def _edges_meet(bottom_edge, top_edge):
        # True when bottom_edge lies inside [top_edge - tol, top_edge + tol].
        return top_edge - tol <= bottom_edge <= top_edge + tol

    pairs = []
    for position, first in enumerate(images):
        for second in images[position + 1:]:
            ax0, ay0, ax1, ay1 = first["bbox"]
            bx0, by0, bx1, by1 = second["bbox"]

            if not first["page"] == second["page"]:
                continue
            if not (ax0 == bx0 and ax1 == bx1):
                continue

            first_above_second = _edges_meet(ay1, by0)
            second_above_first = _edges_meet(by1, ay0)
            if first_above_second or second_above_first:
                pairs.append([first, second])
    return pairs

def merge_contiguous_pair_lists(contiguous_image_pairs):
    """
    Merge contiguous image pairs (like [1,2], [2,3]) into full groups ([1,2,3]).

    Images are identified by (page, number), so images that share a number but
    live on different pages are never confused with each other. Pairs are
    merged transitively with a union-find structure: a pair that links two
    groups built earlier (e.g. [0,3], [1,2], then [2,3]) joins them into one
    group instead of leaving an image in two groups.

    Args:
        contiguous_image_pairs: iterable of two-item sequences of image dicts;
            every dict needs a "page" and a "number" entry.

    Returns:
        A list of groups. Each group is the result of `sort_images` applied to
        its images (pre-ordered by number). Groups appear in the order in which
        their first member shows up in `contiguous_image_pairs`. An empty input
        gives an empty list.
    """
    parent = {}        # union-find forest: image key -> parent key
    image_lookup = {}  # image key -> image dict

    def image_key(im):
        return (im["page"], im["number"])

    def find_root(key):
        # Walk up to the representative, shortening the path on the way.
        while parent[key] != key:
            parent[key] = parent[parent[key]]
            key = parent[key]
        return key

    for im_a, im_b in contiguous_image_pairs:
        for im in (im_a, im_b):
            key = image_key(im)
            parent.setdefault(key, key)
            image_lookup[key] = im
        root_a, root_b = find_root(image_key(im_a)), find_root(image_key(im_b))
        if root_a != root_b:
            # Attach the later root under the earlier one.
            parent[root_b] = root_a

    # dicts keep insertion order, so groups come out ordered by the first
    # appearance of any of their members.
    grouped_keys = {}
    for key in parent:
        grouped_keys.setdefault(find_root(key), []).append(key)

    contiguous_image_groups = [
        sort_images([image_lookup[key] for key in sorted(keys)])
        for keys in grouped_keys.values()
    ]

    return contiguous_image_groups

def identify_contiguous_images(images, tol=0.01):
    """
    Group images that are stacked directly on top of each other.

    Pairs of touching images are found with `find_contiguous_image_pairs` and
    then merged into groups with `merge_contiguous_pair_lists`.

    Args:
        images: sequence of image dicts, as expected by
                `find_contiguous_image_pairs`.
        tol:    vertical tolerance handed to `find_contiguous_image_pairs`.
                Defaults to 0.01, the value that used to be hard-coded here.

    Returns:
        Whatever `merge_contiguous_pair_lists` returns for the pairs found:
        a list of image groups.
    """
    contiguous_image_pairs = find_contiguous_image_pairs(images, tol)
    return merge_contiguous_pair_lists(contiguous_image_pairs)


# Group the current `images` into runs of vertically contiguous blocks and
# print the block numbers of every run.
merged= identify_contiguous_images(images)
print([[im["number"] for im in im_list] for im_list in merged] )

# Run the grouping again on the first group only and print its block numbers
# (presumably a sanity check that a group is found again as a whole).
# Both `[0]` lookups raise IndexError when no contiguous images were found.
fart = identify_contiguous_images(merged[0])[0]
print([im["number"] for im in fart] )


In [None]:
from pdf_scraper.image_utils import is_horizontal_strip
import io 
def stitch_strips(image_blocks: list[dict]) -> dict:
    """
    Stitch a list of image pieces (already sorted top-to-bottom) into a single image.

    The pieces to stitch are chosen as follows:
      1. every block for which `is_horizontal_strip` is true;
      2. if there are none, the first group returned by
         `identify_contiguous_images(image_blocks)`.
    If neither yields anything, `image_blocks` is returned unchanged (note:
    that is the input list, not a single block dictionary).

    The pieces are pasted left-aligned, one below the other and in list order,
    onto a white RGB canvas as wide as the widest piece; the canvas is then
    encoded as PNG.

    Args:
        image_blocks: image block dicts with at least the entries "image"
            (encoded image bytes), "number" and "bbox" (x0, y0, x1, y1).

    Returns:
        A shallow copy of the first stitched block whose "number", "bbox",
        "width", "height", "size" and "image" entries describe the stitched
        image. "number" is the smallest number and "bbox" the union of the
        bounding boxes of the stitched pieces.
    """
    # Prefer genuine horizontal strips; otherwise fall back to contiguous images.
    strip_blocks = [strip for strip in image_blocks if is_horizontal_strip(strip)]
    if not strip_blocks:
        contiguous_groups = identify_contiguous_images(image_blocks)
        # Indexing [0] directly raised IndexError when there was no contiguous
        # group, so the "nothing to stitch" return below could not be reached.
        strip_blocks = contiguous_groups[0] if contiguous_groups else []
    if not strip_blocks:
        return image_blocks

    images = [Image.open(io.BytesIO(block["image"])) for block in strip_blocks]
    try:
        total_height = sum(img.height for img in images)
        max_width    = max(img.width for img in images)

        stitched = Image.new("RGB", (max_width, total_height), (255, 255, 255))
        offset = 0
        for img in images:
            stitched.paste(img, (0, offset))
            offset += img.height
    finally:
        # The pieces are no longer needed once they have been pasted.
        for img in images:
            img.close()

    with io.BytesIO() as img_byte_arr:
        stitched.save(img_byte_arr, format='PNG')
        img_bytes = img_byte_arr.getvalue()
    stitched.close()

    # Describe the stitched image using the blocks that were actually stitched,
    # so the metadata always matches the pixel data.
    min_number = min(block["number"]  for block in strip_blocks)
    min_x0     = min(block["bbox"][0] for block in strip_blocks)
    min_y0     = min(block["bbox"][1] for block in strip_blocks)
    max_x1     = max(block["bbox"][2] for block in strip_blocks)
    max_y1     = max(block["bbox"][3] for block in strip_blocks)
    bbox = (min_x0, min_y0, max_x1, max_y1)

    img_block = strip_blocks[0].copy()
    img_block["number"]=min_number
    img_block["bbox"]=bbox
    # Canvas dimensions; `stitched` is closed by now, so it is not queried.
    img_block['width']= max_width
    img_block['height']= total_height
    img_block['size']= len(img_bytes)
    img_block['image']= img_bytes
    #'transform': ref_block.get('transform', (1.0, 0.0, 0.0, 1.0, min_x0, min_y0)),

    return img_block

def reconstitute_split_images(image_blocks: list[dict]) -> list[dict]:
    """
    Replace every group of vertically contiguous images by one stitched image.

    Groups are found with `identify_contiguous_images`, each group is turned
    into a single block with `stitch_strips`, and the members of the groups are
    removed from the result. Members are matched on (page, number), so an image
    on another page that happens to carry the same number is kept.

    Args:
        image_blocks: image block dicts with "page", "number" and "bbox" entries.

    Returns:
        A new list with the untouched blocks plus the stitched blocks, sorted
        by page and then by the top edge of the bounding box. The input list
        itself is not modified.
    """
    split_images = identify_contiguous_images(image_blocks)
    split_keys = {(img["page"], img["number"]) for im_group in split_images for img in im_group}

    stitched = [stitch_strips(group) for group in split_images]
    filtered_blocks = [img for img in image_blocks if (img["page"], img["number"]) not in split_keys]
    filtered_blocks.extend(stitched)
    filtered_blocks.sort(key=lambda x: (x["page"], x["bbox"][1]))
    return filtered_blocks

In [None]:
# Reload the 2005 exam and display its images after preproc_images followed by
# reconstitute_split_images, i.e. with contiguous images stitched together.
doc    = open_exam(2005, "english", "al",1)
images = get_images(doc)
images = preproc_images(images)
images = reconstitute_split_images(images)
show_all_imgs(1,5,images)

In [None]:
# Before/after comparison for the 2005 exam: show the images as extracted, then
# the images that remain after filter_images.
doc    = open_exam(2005, "english", "al",1)
images = get_images(doc)
print("before")
show_all_imgs(3,5,images)
# NOTE(review): the two commented-out lines are a disabled variant that also ran
# reconstitute_split_images in this cell; delete them once they are no longer needed.
#images = filter_images(images)
#images = reconstitute_split_images(images)
images = filter_images(images)
print("after")
show_all_imgs(1,5,images)

# Low resolution image doubles

In [None]:
def old_filter_images(images):
    """
    Legacy image filter, applied in two stages that only run on large lists.

    If there are more than 100 images, `filter_point_images` is applied. If the
    result still has more than 100 images, `reconstitute_strips` is applied as
    well. Lists of 100 images or fewer are returned untouched.

    Args:
        images: list of image dicts.

    Returns:
        The list of images after zero, one or both filter stages.
    """
    threshold = 100

    if len(images) <= threshold:
        return images

    images = filter_point_images(images)
    if len(images) <= threshold:
        return images

    return reconstitute_strips(images)



In [None]:
# Load the 2011 exam and apply only the legacy filter (old_filter_images) before
# displaying the images — presumably so the duplicated images are still visible.
doc    = open_exam(2011, "english", "al",1)
images = get_images(doc)
images = old_filter_images(images)
show_all_imgs(3,3,images)

In [None]:
# Compare the bounding box and the "size" entry of the first two image blocks.
print(images[0]["bbox"], images[0]["size"])
print(images[1]["bbox"], images[1]["size"])

In [None]:
# Inspect the first image block. Byte payloads (such as the "image" entry) are
# summarised by their length so the raw bytes are not dumped into the output.
{key: (f"<{len(value)} bytes>" if isinstance(value, (bytes, bytearray)) else value)
 for key, value in images[0].items()}

In [None]:
def filter_low_res_doubles(images):
    """
    Remove one image out of every pair of images that occupy the same place.

    Two images count as doubles when they are on the same page and have equal
    bounding boxes. Of such a pair, the image with the larger "size" entry is
    dropped; when the sizes are equal, the one that comes later in `images` is
    dropped. Images are dropped by their position in `images`, so an unrelated
    image that merely shares a "number" with a dropped one is kept.

    Note: this definition shadows the `filter_low_res_doubles` imported from
    `pdf_scraper.image_utils` at the top of the notebook.

    Args:
        images: list of image dicts with "page", "bbox" and "size" entries.

    Returns:
        A new list with the remaining images in their original order.
    """
    indices_to_drop = set()
    for i, im1 in enumerate(images):
        for j in range(i + 1, len(images)):
            im2 = images[j]
            # Equal boxes on different pages are separate images, not doubles.
            if im1["page"] != im2["page"] or im1["bbox"] != im2["bbox"]:
                continue
            # NOTE(review): this drops the image with the LARGER "size". If
            # "size" is the byte size and the goal is to discard the
            # low-resolution copy, the comparison looks inverted — confirm
            # against the data before relying on it.
            indices_to_drop.add(i if im1["size"] > im2["size"] else j)
    return [im for idx, im in enumerate(images) if idx not in indices_to_drop]
# Apply the filter to the current `images`; the result is stored under a new
# name, so `images` itself is left as it was.
filtered_images = filter_low_res_doubles(images)

In [None]:
# Display the images that remain after filter_low_res_doubles.
show_all_imgs(3,3,filtered_images)