In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect
from PIL import Image
import sys, re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.doc_utils   import open_exam, get_doc_line_df, identify_section_headers, identify_text_headers, get_path_from_doc
from pdf_scraper.doc_utils   import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.line_utils  import clean_line_df, get_df_bbox
from pdf_scraper.doc_utils   import get_images, filter_images, get_raw_lines, assign_in_image_captions, identify_vertical_captions
from pdf_scraper.clustering.cluster_utils import find_y0_dL
from pdf_scraper.image_utils import show_image, show_all_imgs

# Notebook-wide pandas display settings: render floats with two decimals and
# allow up to 200 characters per column before the displayed text is truncated.
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_colwidth", 200)

In [None]:
# get cluster bboxes
def get_cluster_boxes(df, labels):
    """Return one bounding ``Rect`` per cluster label, skipping the noise label -1.

    Parameters
    ----------
    df : DataFrame with a ``cluster`` column; rows are selected with
        ``df.cluster == label``.
    labels : array-like of cluster labels (list, ndarray or Series).

    Returns
    -------
    list of ``Rect``, in ascending label order, each built from
    ``get_df_bbox`` of the rows belonging to that label.
    """
    # Filter -1 by VALUE. `-1 in labels` would test the index (not the values)
    # when `labels` is a pandas Series, so the noise label could slip through.
    clust_labes = [label for label in np.unique(labels) if label != -1]
    return [Rect(get_df_bbox(df[df.cluster == label])) for label in clust_labes]

def get_category_boxes(df, cat):
    """Return one bounding ``Rect`` per distinct value of ``df[cat]``, skipping -1.

    Parameters
    ----------
    df : DataFrame holding the column ``cat``.
    cat : name of the column whose distinct values define the groups.

    Returns
    -------
    list of ``Rect``, in the sorted order produced by ``np.unique(df[cat])``
    (minus the noise label -1), each built from ``get_df_bbox`` of the rows
    carrying that value.
    """
    # Filter -1 by VALUE. The previous `-1 in df[cat]` was a pandas Series
    # membership test, which looks at the index labels rather than the values,
    # so a -1 noise group was never actually removed.
    values = [value for value in np.unique(df[cat]) if value != -1]
    return [Rect(get_df_bbox(df[df[cat] == value])) for value in values]

def enrich_doc_df_with_images(df, images):
    """Append one row per image to a line dataframe and re-sort by position.

    Parameters
    ----------
    df : DataFrame of lines; it is sorted on its ``page`` and ``y0`` columns.
    images : list of dicts, each with a ``"bbox"`` entry indexable as
        (x0, y0, x1, y1) and a ``"page"`` entry.

    Returns
    -------
    A new DataFrame (fresh 0..n-1 index) containing the rows of ``df`` plus one
    row per image. Image rows carry ``image == 1``; columns that exist only on
    one side of the concat are filled with NaN on the other side.
    """
    coords = ("x0", "y0", "x1", "y1")
    img_dict = {coord: [img["bbox"][i] for img in images] for i, coord in enumerate(coords)}
    img_dict["page"] = [img["page"] for img in images]
    img_dict["image"] = [1] * len(images)
    img_df = pd.DataFrame(img_dict)

    rich_df = pd.concat([df, img_df], ignore_index=True).sort_values(by=["page", "y0"], ignore_index=True)
    return rich_df

def get_bboxed_page_image(doc,  page_number: int, rects: list[Rect],  color: tuple[float]=(0,0,0.0), labels: list[int] = [], ) -> Image:
    """Render one page of ``doc`` with rectangles (and optional labels) drawn on it.

    The page is copied into a scratch document first, so ``doc`` itself is not
    drawn on.

    Parameters
    ----------
    doc : source document, passed to ``insert_pdf``.
    page_number : 1-based page number; converted to a 0-based index internally.
    rects : rectangles to outline on the page (line width 3).
    color : stroke colour passed to ``draw_rect``.
    labels : optional labels; when non-empty, ``labels[i]`` is written in red
        just above the top-left corner of ``rects[i]``, so it must be at least
        as long as ``rects``. The default list is never mutated.

    Returns
    -------
    A PIL image of the annotated page.
    """
    i_p = int(page_number - 1)

    out_doc = fitz.open()
    try:
        out_doc.insert_pdf(doc, from_page=i_p, to_page=i_p)
        page = out_doc[0]

        for i, rect in enumerate(rects):
            page.draw_rect(rect, color=color, width=3)
            if len(labels) > 0:
                label_text = str(labels[i])
                pos = fitz.Point(rect.x0, rect.y0 - 2)  # nudge the label 2 units above the box
                page.insert_text(pos, label_text, fontsize=8, color=(1, 0, 0))

        # Matrix(1, 1) renders at the page's native scale; use Matrix(2, 2) for higher resolution.
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1))
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Always release the scratch document, even if drawing or rendering raised.
        out_doc.close()

    return img



In [None]:
def df_bbox_next_row_dist(y0, y1, y0_next, y1_next):
    """Vectorised end-to-end gap between paired 1-D intervals.

    Each interval ``[y0, y1]`` is compared element-wise with ``[y0_next, y1_next]``.
    Where the two intervals overlap or touch the result is 0.0; otherwise it is
    the distance between their nearest endpoints. Positions where the
    comparison involves NaN come out as NaN.
    """
    latest_start = np.maximum(y0, y0_next)
    earliest_end = np.minimum(y1, y1_next)
    gap_after = np.abs(y1 - y0_next)
    gap_before = np.abs(y1_next - y0)
    nearest_gap = np.minimum(gap_after, gap_before)
    return np.where(latest_start <= earliest_end, 0.0, nearest_gap)

def df_bbox_dist(row1, row2):
    """
    End-to-end distance between two 1-D intervals, each given as a (start, end) pair.

    Usable as a custom metric to build a distance matrix over all rows of a
    dataframe of lines:

    df_bbox_dist(row1[["y0","y1"]], row2[["y0","y1"]]) = vertical   end to end bbox distance
    df_bbox_dist(row1[["x0","x1"]], row2[["x0","x1"]]) = horizontal end to end bbox distance

    Returns 0.0 when the intervals overlap or touch, otherwise the distance
    between their closest endpoints.
    """
    start_a, end_a = row1
    start_b, end_b = row2

    intervals_meet = max(start_a, start_b) <= min(end_a, end_b)
    return 0.0 if intervals_meet else min(abs(end_a - start_b), abs(end_b - start_a))



def split_cluster(df: pd.DataFrame, i_clust: int,  metric, eps, dir, verbose=False):
    """Try to split one existing cluster of ``df`` with DBSCAN; relabel in place.

    Parameters
    ----------
    df : DataFrame with a ``cluster`` column; modified in place when a split occurs.
    i_clust : id of the cluster to examine.
    metric : metric handed to ``pairwise_distances`` (name or callable).
    eps : DBSCAN neighbourhood radius applied to the precomputed distances.
    dir : list of column names of ``df`` used to compute the distances.
    verbose : print progress messages.

    Returns
    -------
    ndarray of the cluster ids now occupied by the rows that were in
    ``i_clust``. If no split happened this is ``[i_clust]`` and ``df`` is left
    untouched (previously the raw DBSCAN label ``[0]`` was returned here, which
    was inconsistent with the split case).
    """
    if verbose: print(f"scanning cluster {i_clust}")
    last_id  = df.cluster.max()
    clust_df = df[df.cluster==i_clust]

    X      = pairwise_distances(clust_df[dir], metric=metric)
    scan   = DBSCAN(eps=eps, min_samples=1, metric="precomputed")
    labels = scan.fit_predict(X)

    n_labels = len(np.unique(labels))
    if n_labels == 1:
        if verbose: print("No split")
        return np.array([i_clust])

    # Sub-cluster 0 keeps the original id; every other sub-cluster k gets the
    # fresh id last_id + k, which cannot collide with any id already in df.
    # Order matters: shift the non-zero labels first so the remaining zeros are
    # exactly the original sub-cluster 0.
    labels[labels!=0] += last_id
    labels[labels==0] += i_clust

    df.loc[clust_df.index, "cluster"] = labels
    unique_labels = np.unique(labels)

    if verbose: print(f"Cluster {i_clust} split {dir} with eps = {round(eps)} into {n_labels} clusters: {unique_labels}")

    return unique_labels

def hdbscan(df: pd.DataFrame, max_iter: int, eps_x: float, eps_y: float, metric, verbose=False):
    """Alternately split clusters along y and x until the partition stops changing.

    Starting from a single cluster (``df["cluster"]`` is overwritten with 0),
    every current cluster is passed to ``split_cluster``, first on the y
    columns with ``eps_y``, then on the x columns with ``eps_x``, and so on.
    The loop ends after ``max_iter`` passes or once four consecutive passes
    leave the number of clusters unchanged.

    Parameters
    ----------
    df : DataFrame of boxes; gains / has overwritten a ``cluster`` column (in place).
    max_iter : maximum number of single-direction passes.
    eps_x, eps_y : DBSCAN radii for the x and y passes.
    metric : ``"euclidean"`` clusters on ``y0`` / ``x0`` only; any other metric
        is given the column pairs ``(y0, y1)`` / ``(x0, x1)``.
    verbose : print progress messages.

    Returns
    -------
    (rectangies, labia): two parallel lists with one entry per pass that
    changed the number of clusters -- the boxes from
    ``get_category_boxes(df, 'cluster')`` and the matching sorted cluster ids.
    If no pass ever changed the partition, the lists contain a single entry
    describing the final (unsplit) state, so ``result[-1]`` is always valid.
    """
    y_cols = ["y0"] if metric=="euclidean" else ["y0","y1"]
    x_cols = ["x0"] if metric=="euclidean" else ["x0","x1"]
    passes = ((y_cols, eps_y), (x_cols, eps_x))

    df["cluster"] = 0
    i_pass, n_fail, n_known = 0, 0, 1
    rectangies, labia = [], []

    for n_loop in range(max_iter):
        # columns and radius used for this pass
        cols, eps = passes[i_pass]

        if verbose: print(f"Full Scan {n_loop} in {cols} with eps={eps:<6.0f}")
        if n_fail >= 4:
            break

        # Try to break up every cluster that exists at the start of the pass.
        for i_clust in np.unique(df.cluster):
            split_cluster(df, i_clust, metric, eps, cols, verbose=verbose)

        labelos = np.unique(df.cluster)
        n_clusters = len(labelos)
        i_pass = 1 - i_pass  # alternate y <-> x

        if n_clusters == n_known:
            n_fail += 1
            continue
        n_fail = 0
        n_known = n_clusters

        rectangies.append(get_category_boxes(df, 'cluster'))
        labia.append(labelos)
        if verbose: print(f"Total {n_clusters} clusters: {labelos}")

    if not rectangies:
        # Nothing was ever split: still report the final state so callers that
        # take the last snapshot do not hit an IndexError.
        rectangies.append(get_category_boxes(df, 'cluster'))
        labia.append(np.unique(df.cluster))

    return ( rectangies, labia )

#rectangs, labia = hdbscan(page_df, 100, eps_x0, eps_y0, "euclidean",False)
#imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]


In [None]:
# Load one exam paper and build the per-line dataframe that the rest of the
# notebook works on. The identify_* / assign_* helpers below are called only
# for their effect on `df` (their return values are discarded).
year=2019
#year=2002
doc    = open_exam(year, "english", "al",1)
df     = get_doc_line_df(doc)

# Collect the document's images, filter them, and let the helper mark lines
# that sit inside an image. NOTE(review): exact columns written by
# assign_in_image_captions are not visible here -- confirm in pdf_scraper.
images = get_images(doc)
images = filter_images(images)
assign_in_image_captions(df,images)

# Page geometry / typography summaries, taken from the first page and from the
# raw (pre-clean) line dataframe.
# NOTE(review): `middle`, `standard_font` and `median_font` are not read by any
# of the cells shown in this notebook.
doc_width     = doc[0].rect.width
middle        = doc_width/2
standard_font = df.mode_font.mode()[0]
median_font   = df.font_size.median()


# clean_line_df returns the dataframe used from here on; the identify_* calls
# then run on that cleaned frame, in this order.
df = clean_line_df(df)
identify_footers(df)
identify_instructions(df)
identify_section_headers(df)
identify_text_headers(df, doc_width)
identify_subtitles(df, doc_width)
identify_subsubtitles(df,doc_width)


# Vertical-caption detection is only attempted for images on pages 2..8
# (hard-coded range); images on other pages are skipped.
for image in images:
    if image["page"] <2 or image["page"] >8:
        continue
    identify_vertical_captions(df, image)

In [None]:
# Work on the lowest-numbered page that has at least one line with
# caption2 == 1 (np.unique sorts, so [0] is the smallest page number).
# Raises IndexError if no line in the document is flagged caption2.
page = np.unique(df[df.caption2==1].page)[0]
# Keep only the positional columns for that page, then add one row per image
# on the page so images take part in the clustering alongside text lines.
page_df = df.loc[df.page==page, ["text", 'x0', 'y0', 'x1', 'y1', "page","w"]].copy()
page_images = [image for image in images if image["page"]==page]
page_df = enrich_doc_df_with_images(page_df,page_images)
# dL: difference in y0 between each row and the previous one (rows are ordered
# by page, then y0, by enrich_doc_df_with_images); NaN for the first row.
page_df["dL"] = page_df.y0.diff()

# Preview of the raw page. `page` is 1-based, document indexing is 0-based.
doc_page = doc[int(page-1)]
pix = doc_page.get_pixmap(matrix=fitz.Matrix(0.8, 0.5))  # scales x by 0.8 and y by 0.5 (squashed thumbnail); use Matrix(2, 2) for higher resolution
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
print(f"page: {page}")
display(img)

## Separate y0 and x0 scans 

In [None]:
# --- y scan: 1-D DBSCAN on y0 only -------------------------------------------
# eps is 1.15 x the value returned by find_y0_dL (presumably a typical line
# spacing -- confirm in cluster_utils). With min_samples=1 every row is a core
# point, so no row receives the noise label -1.
eps_y0 = dL_y0 = find_y0_dL(page_df)*1.15
scan_y = DBSCAN(eps=dL_y0, min_samples=1)
page_df["y_cluster"]= scan_y.fit_predict(page_df[["y0"]])


# --- x scan: 1-D DBSCAN on x0 only -------------------------------------------
# eps is half the median of the rounded `w` values; dropna() excludes rows with
# no `w` (the image rows added by enrich_doc_df_with_images have none).
median_w = page_df.w.dropna().map(round).median()
eps_x0    = median_w*0.5
scan_x = DBSCAN(eps=eps_x0, min_samples=1)
page_df["x_cluster"]= scan_x.fit_predict(page_df[["x0"]])

# One box per y-cluster. The labels passed for drawing are np.unique(...) of the
# same column, i.e. the same order in which get_category_boxes builds its boxes.
rectangies_y = get_category_boxes(page_df, 'y_cluster')
y_img = get_bboxed_page_image(doc, page, rectangies_y, color=(0,0,0.5), labels=np.unique(page_df.y_cluster) )


# One box per x-cluster; scan_x.labels_ holds the same labels that were stored
# in page_df.x_cluster above.
rectangies_x = get_category_boxes(page_df, 'x_cluster')
x_img = get_bboxed_page_image(doc, page, rectangies_x,color=(0.5,0,0.0),labels = np.unique(scan_x.labels_))

# Join the cluster labels: each distinct (x_cluster, y_cluster) pair gets its own id.
page_df["xy_cluster"] = page_df.groupby(["x_cluster", "y_cluster"]).ngroup()
rectangies_xy = get_category_boxes(page_df, 'xy_cluster')
xy_img = get_bboxed_page_image(doc, page, rectangies_xy,color=(0.0,0,0.0),labels=np.unique(page_df.xy_cluster))

# Show the three results side by side: y-only, x-only, and the combined labelling.
fig, axes = plt.subplots(1,3,figsize=(16,8))

axes[0].imshow(y_img); axes[0].axis("off"); axes[0].set_title("y0-scan");
axes[1].imshow(x_img); axes[1].axis("off"); axes[1].set_title("x0-scan")
axes[2].imshow(xy_img); axes[2].axis("off"); axes[2].set_title("x0y0-scan")
plt.subplots_adjust(wspace=0.0)


- This is an improved clustering, but we notice that not all paragraphs are separated in y.
  - Do another y clustering within each group.
  - The issue is that, when looking only at y, the existence of "A library in the middle" to the right of the paragraph ending in "Eames lamp" gives a false
    impression of contiguity.

# Hierarchical alternating scan till stability

In [None]:
# Radii for the alternating scan: 1.15 x the find_y0_dL value in y, and half
# the median `w` in x.
# NOTE(review): here the median is taken over the raw `w` values, whereas the
# separate-scan cell above rounds them first -- the two eps_x0 values can differ.
median_dy0 = find_y0_dL(page_df)
median_w   = page_df.w.median()
eps_y0     = median_dy0*1.15
eps_x0     = median_w*0.5

# Run the alternating y/x split on y0 / x0 only ("euclidean"), render one page
# image per recorded snapshot, and show the last one.
# hdbscan overwrites page_df["cluster"] in place.
rectangs, labia = hdbscan(page_df, 100, eps_x0, eps_y0, "euclidean",False)
imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]
display(imgs[-1])  # IndexError if no snapshot was returned

# Scan using bbox distances

# Loop over years

In [None]:
# NOTE(review): rebinds the notebook-level `year` and `page` used by earlier
# cells; check_year_page below takes its own year/page parameters and does not
# read these globals.
year=2001; page = 2 
def check_year_page(year, page, x_scale, y_scale, fallback_eps_x=10):
    """Cluster one page of one exam with bbox end-to-end distances and display it.

    Parameters
    ----------
    year : exam year passed to ``open_exam``.
    page : 1-based page number to analyse.
    x_scale : multiplier applied to the smallest horizontal gap found between a
        left-half box and a right-half box, giving ``eps_x``.
    y_scale : multiplier applied to the median non-zero vertical gap between
        consecutive rows, giving ``eps_y``.
    fallback_eps_x : ``eps_x`` used when the gap estimate is 0 or cannot be
        computed. The default of 10 was found by trial and error.

    Displays the page with the final cluster boxes drawn on it; returns None.
    """
    doc    = open_exam(year, "english", "al",1)
    df     = get_doc_line_df(doc)

    images = get_images(doc)
    images = filter_images(images)
    assign_in_image_captions(df,images)

    df = clean_line_df(df)
    for image in images:
        # vertical captions are only looked for on pages 2..8
        if image["page"] <2 or image["page"] >8:
            continue
        identify_vertical_captions(df, image)
    page_images = [image for image in images if image["page"]==page]

    page_df = df.loc[df.page==page, ["text", 'x0', 'y0', 'x1', 'y1', "page","w"]].copy()
    page_df = enrich_doc_df_with_images(page_df,page_images)

    # Vertical gap from each row to the next one (NaN for the last row);
    # eps_y is the median of the non-zero gaps, scaled.
    page_df["dL_e2e"] = df_bbox_next_row_dist(page_df.y0, page_df.y1, page_df.y0.shift(-1), page_df.y1.shift(-1) )
    dLs = page_df.dL_e2e.dropna()
    dL_median = dLs[dLs!=0].median()
    eps_y = dL_median * y_scale

    # Boxes lying entirely in the left / right half of the occupied width
    # (5 units of tolerance around the middle).
    middle = (page_df.x0.min() + page_df.x1.max())/2
    left  = page_df[page_df.x1 < middle +5 ]
    right = page_df[page_df.x0 > middle -5 ]

    if len(left) > 0 and len(right) > 0:
        left_right_dist = pairwise_distances(left[["x0","x1"]], right[["x0","x1"]], metric=df_bbox_dist)
        eps_x = x_scale * left_right_dist.min()
    else:
        # No box on one of the two sides (e.g. a single-column page):
        # pairwise_distances cannot be called on an empty set.
        eps_x = 0
    # A minimum gap of 0 happens when page-wide lines share the page with the
    # two columns; in both cases fall back to the empirical default.
    eps_x = fallback_eps_x if eps_x == 0 else eps_x

    print(f"eps_x: {eps_x} eps_y: {eps_y}")

    rectangs, labia = hdbscan(page_df, 100, eps_x, eps_y, df_bbox_dist,False)
    if len(rectangs) > 0:
        final_rects, final_labels = rectangs[-1], labia[-1]
    else:
        # hdbscan recorded no snapshot; draw the clustering it left in page_df["cluster"].
        final_rects  = get_category_boxes(page_df, 'cluster')
        final_labels = np.unique(page_df.cluster)
    # Only the final state is shown, so only that image is rendered.
    final_img = get_bboxed_page_image(doc, page, final_rects, color=(0.0,0,0.0), labels=final_labels)
    display(final_img)
#check_year_page(2023, 6, (2.0/3.0), 1.10)

In [None]:
# Run the bbox-distance clustering on a hand-picked list of (year, page) cases,
# all with x_scale = 2/3 and y_scale = 1.1.
# NOTE(review): the loop variables rebind the notebook-level `year` and `page`,
# so earlier cells re-run afterwards will see the last pair (2024, 3).
for year, page in [(2001, 2),(2002, 6),(2010, 7),(2011, 2),(2011, 3),(2012, 4),(2012, 5),
                   (2013, 4),(2013, 6),(2013, 7),(2014, 3),(2023, 6),(2024, 3)]:
    print(year, page)
    check_year_page(year, page, 2/3, 1.1)