In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from time import sleep

import fitz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fitz import Rect
from PIL import Image
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.doc_utils   import open_exam, get_doc_line_df, identify_section_headers, identify_text_headers
from pdf_scraper.doc_utils   import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.line_utils  import clean_line_df, get_df_bbox
from pdf_scraper.doc_utils   import get_images, filter_images,  assign_in_image_captions, identify_vertical_captions
from pdf_scraper.clustering.cluster_utils import find_y0_dL
from pdf_scraper.image_utils import show_image, show_all_imgs
from pdf_scraper.data_sci_utils import examine_value_counts

pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_colwidth", 200)

In [None]:
def get_category_boxes(df, cat):
    """Return one bounding ``Rect`` per cluster label found in column ``cat``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the rows to box; it is passed, one label at a time,
        to ``get_df_bbox``.
    cat : str
        Name of the column holding the integer cluster labels.

    Returns
    -------
    list[Rect]
        One rectangle per label, in ascending label order (the order of
        ``np.unique``). Rows labelled ``-1`` (the DBSCAN noise label) are
        skipped.
    """
    # ``-1 in df[cat]`` would test the Series *index*, not its values, so the
    # noise label is filtered on the values explicitly.
    labels = np.unique(df[cat])
    labels = labels[labels != -1]
    return [Rect(get_df_bbox(df[df[cat] == label])) for label in labels]

def enrich_doc_df_with_images(df, images):
    """Append one row per image to a line dataframe and re-sort it.

    Parameters
    ----------
    df : pd.DataFrame
        Line dataframe; must contain the ``page`` and ``y0`` columns used
        for sorting.
    images : list[dict]
        Each dict must provide ``"bbox"`` (indexable as x0, y0, x1, y1) and
        ``"page"``.

    Returns
    -------
    pd.DataFrame
        A new frame with the image rows added, sorted by ``page`` then
        ``y0`` and re-indexed from 0. Image rows carry ``image == 1``;
        columns missing on either side are filled with NaN by the concat.
    """
    coords = ("x0", "y0", "x1", "y1")
    img_dict = {coord: [img["bbox"][i] for img in images] for i, coord in enumerate(coords)}
    img_dict["page"]  = [img["page"] for img in images]
    img_dict["image"] = [1] * len(images)
    img_df = pd.DataFrame(img_dict)

    rich_df = pd.concat([df, img_df], ignore_index=True)
    return rich_df.sort_values(by=["page", "y0"], ignore_index=True)

def get_bboxed_page_image(doc,  page_number: int, rects: list[Rect],  color: tuple[float]=(0,0,0.0), labels: list[int] = None, ) -> Image.Image:
    """Render one page of ``doc`` with rectangles (and optional labels) drawn on it.

    The page is copied into a scratch document first, so ``doc`` itself is
    never modified.

    Parameters
    ----------
    doc : fitz document
        Source document.
    page_number : int
        1-based page number (converted to a 0-based index internally).
    rects : list[Rect]
        Rectangles to outline.
    color : tuple[float]
        Stroke colour passed to ``draw_rect``.
    labels : sequence, optional
        If given and non-empty, ``labels[i]`` is written in red just above
        the top-left corner of ``rects[i]``; it must therefore be at least
        as long as ``rects``.

    Returns
    -------
    PIL.Image.Image
        RGB rendering of the annotated page.
    """
    i_p = int(page_number-1)
    has_labels = labels is not None and len(labels) > 0

    out_doc = fitz.open()
    try:
        out_doc.insert_pdf(doc, from_page=i_p, to_page=i_p)
        page = out_doc[0]

        for i, rect in enumerate(rects):
            page.draw_rect(rect, color=color, width=3)
            if has_labels:
                label_text = str(labels[i])
                pos = fitz.Point(rect.x0, rect.y0 - 2)  # 2 units above the box for spacing
                page.insert_text(pos, label_text, fontsize=8, color=(1,0,0))

        # Identity matrix: rendered at native scale. Raise both factors for higher resolution.
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1))
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Close the scratch document even when drawing or rendering fails.
        out_doc.close()

    return img



In [None]:
def df_bbox_next_row_dist(y0, y1, y0_next, y1_next):
    """Vectorised end-to-end gap between pairs of 1-D intervals.

    ``(y0, y1)`` and ``(y0_next, y1_next)`` are the start/end coordinates of
    two intervals along one axis (element-wise for array-like input).
    Where the intervals overlap or touch, the gap is 0; otherwise it is the
    distance between their closest endpoints.
    """
    gap_after    = np.abs(y1 - y0_next)
    gap_before   = np.abs(y1_next - y0)
    latest_start = np.maximum(y0, y0_next)
    earliest_end = np.minimum(y1, y1_next)
    return np.where(latest_start <= earliest_end, 0.0, np.minimum(gap_after, gap_before))

def df_bbox_dist(row1, row2):
    """End-to-end distance between two 1-D intervals, each given as (start, end).

    Suitable as a callable ``metric`` for building a distance matrix over
    the rows of a line dataframe:

    df_bbox_dist(row1[["y0","y1"]], row2[["y0","y1"]]) = vertical   end-to-end bbox distance
    df_bbox_dist(row1[["x0","x1"]], row2[["x0","x1"]]) = horizontal end-to-end bbox distance

    Returns 0.0 when the intervals overlap or touch, otherwise the distance
    between their closest endpoints.
    """
    start_a, end_a = row1
    start_b, end_b = row2

    intervals_meet = max(start_a, start_b) <= min(end_a, end_b)
    return 0.0 if intervals_meet else min(abs(end_a - start_b), abs(end_b - start_a))

In [None]:
def split_cluster(df: pd.DataFrame, i_clust: int,  metric, eps, dir, verbose=False):
    """Try to split one cluster of ``df`` in place with a DBSCAN pass.

    Parameters
    ----------
    df : pd.DataFrame
        Must have an integer ``cluster`` column; it is updated in place when
        the cluster splits.
    i_clust : int
        Id of the cluster to scan.
    metric : str or callable
        Metric forwarded to ``pairwise_distances``.
    eps : float
        DBSCAN neighbourhood radius.
    dir : list[str]
        Columns fed to the metric (e.g. ``["y0"]`` or ``["x0", "x1"]``).
    verbose : bool
        Print progress when True.

    Returns
    -------
    np.ndarray
        The cluster ids now covering the scanned rows: ``[i_clust]`` when
        nothing was split, an empty array when ``i_clust`` has no rows,
        otherwise ``i_clust`` plus the newly created ids. New ids are
        allocated above the current maximum of ``df.cluster``.
    """
    if verbose: print(f"scanning cluster {i_clust}")
    last_id  = df.cluster.max()
    clust_df = df[df.cluster==i_clust]

    # pairwise_distances rejects an empty input, so bail out before calling it.
    if clust_df.empty:
        if verbose: print(f"Cluster {i_clust} is empty")
        return np.array([], dtype=int)

    X      = pairwise_distances(clust_df[dir],metric=metric)
    scan   = DBSCAN(eps=eps, min_samples=1,metric="precomputed")
    labels = scan.fit_predict(X)

    n_labels = len( np.unique(labels) )
    if n_labels ==1:
        if verbose: print("No split")
        # Report the real cluster id, not the raw DBSCAN label 0.
        return np.array([i_clust])

    # DBSCAN label 0 keeps the original id; labels 1..k-1 get fresh ids above last_id.
    new_ids = np.where(labels==0, i_clust, labels + last_id)

    df.loc[clust_df.index, "cluster"] = new_ids
    unique_labels = np.unique(new_ids)

    if verbose: print(f"Cluster {i_clust} split {dir} with eps = {round(eps)} into {n_labels} clusters: {unique_labels}")

    return unique_labels

def hdbscan(df: pd.DataFrame, max_iter: int, eps_x: float, eps_y: float, metric, verbose=False, max_fail: int = 4):
    """Hierarchical DBSCAN: alternately split every cluster in y, then in x.

    Not related to the ``hdbscan`` library; this is the notebook's own
    alternating-direction scheme built on ``split_cluster``.

    Parameters
    ----------
    df : pd.DataFrame
        Line dataframe. Its ``cluster`` column is (re)initialised to 0 and
        then updated in place.
    max_iter : int
        Maximum number of full scans.
    eps_x, eps_y : float
        DBSCAN radius for the x and y scans respectively.
    metric : str or callable
        ``"euclidean"`` scans on ``y0`` / ``x0`` only; any other metric is
        given the ``[y0, y1]`` / ``[x0, x1]`` column pairs.
    verbose : bool
        Forwarded to ``split_cluster``.
    max_fail : int
        Stop after this many consecutive scans that create no new cluster.

    Returns
    -------
    tuple[list, list]
        ``(rects, labels)``: for every scan that changed the number of
        clusters, the list of cluster bounding boxes and the matching array
        of cluster ids.
    """
    y_cols = ["y0"] if metric=="euclidean" else ["y0","y1"]
    x_cols = ["x0"] if metric=="euclidean" else ["x0","x1"]
    passes = ((y_cols, eps_y), (x_cols, eps_x))

    df["cluster"] = 0
    i_dir, n_fail, n_prev = 0, 0, 1
    rect_history, label_history = [], []

    for n_loop in range(max_iter):
        # Check convergence first so no scan is announced that never runs.
        if n_fail >= max_fail:
            break

        cols, eps = passes[i_dir]
        print(f"Full Scan {n_loop} in {cols} with eps={eps:<6.0f}")

        # Try to break up every current cluster along this direction.
        for i_clust in np.unique(df.cluster):
            split_cluster(df, i_clust, metric, eps, cols, verbose=verbose)

        cluster_ids = np.unique(df.cluster)
        n_clusters  = len(cluster_ids)
        i_dir = 1 - i_dir  # alternate y <-> x

        if n_clusters == n_prev:
            n_fail += 1
            continue
        n_fail = 0
        n_prev = n_clusters

        rect_history.append(get_category_boxes(df, 'cluster'))
        label_history.append(cluster_ids)
        print(f"Total {n_clusters} clusters: {cluster_ids}")

    return ( rect_history, label_history )

#rectangs, labia = hdbscan(page_df, 100, eps_x0, eps_y0, "euclidean",False)
#imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]


In [None]:
# Exam selection: `year` is the only knob in this cell.
year=2019
#year=2002
doc    = open_exam(year, "english", "al",1)
df     = get_doc_line_df(doc)

# Images are extracted and filtered before the line dataframe is cleaned.
# NOTE(review): assign_in_image_captions is called only for its effect on df
# (return value unused) — presumably it mutates df in place; confirm.
images = get_images(doc)
images = filter_images(images)
assign_in_image_captions(df,images)

# Page geometry and font statistics, taken from the first page and the raw (uncleaned) df.
doc_width     = doc[0].rect.width
middle        = doc_width/2
standard_font = df.mode_font.mode()[0]
median_font   = df.font_size.median()


# The identify_* helpers are called for their side effects on df (return values unused).
# NOTE(review): the call order may matter if later helpers read flags set by earlier ones — confirm in doc_utils.
df = clean_line_df(df)
identify_footers(df)
identify_instructions(df)
identify_section_headers(df)
identify_text_headers(df, doc_width)
identify_subtitles(df, doc_width)
identify_subsubtitles(df,doc_width)


# Vertical captions are only looked for on pages 2..8 (hard-coded range).
for image in images:
    if image["page"] <2 or image["page"] >8:
        continue
    identify_vertical_captions(df, image)

In [None]:
# Work on the first page that has at least one line flagged caption2.
caption_pages = np.unique(df[df.caption2==1].page)
assert len(caption_pages) > 0, "no page with caption2 lines found"
page = caption_pages[0]
doc_page = doc[int(page-1)]

# Non-uniform down-scaling (0.8 horizontally, 0.5 vertically) for a compact preview.
pix = doc_page.get_pixmap(matrix=fitz.Matrix(0.8, 0.5))
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
print(f"page: {page}")
display(img)

# Enrich doc_df with image information

In [None]:
# Independent copy, so later column assignments never write into a view of df.
page_df     = df[df.page==page].copy()
page_images = [image for image in images if image["page"]==page]
assert page_images, f"no images found on page {page}"
show_image(page_images[0])

In [None]:
# Rebuild from df rather than from page_df, so re-running this cell does not
# append the image rows a second time.
page_df = enrich_doc_df_with_images(df[df.page==page], page_images)
page_df["dL"] = page_df.y0.diff()
page_df[['x0', 'y0', 'x1', 'y1', 'mode_font',  'text', 'font_size', 'image', 'page', 'counts']].head(8)

# Basic dbscan on X0 and Y0

In [None]:
fig, axes = plt.subplots(1,2,figsize=(16,4))

sns.histplot(page_df.y0,ax=axes[0]);
axes[0].set_title("y0 of lines and images on the page");
sns.histplot(page_df.x0,ax=axes[1]);
axes[1].set_title("x0 of lines and images on the page");

## y0 scan

In [None]:
# eps for the y0 scan: the value returned by find_y0_dL(page_df), scaled by 1.15.
# NOTE(review): presumably find_y0_dL returns the typical line-to-line y0 step — confirm in cluster_utils.
eps_y0 = dL_y0 = find_y0_dL(page_df)*1.15
X = pairwise_distances(page_df[["y0"]],metric="euclidean")

# min_samples=1 makes every row a core point, so DBSCAN assigns no noise label (-1).
scan = DBSCAN(eps=eps_y0, min_samples=1, metric="precomputed")
page_df["y_cluster"]= scan.fit_predict(X)

# One bounding box per y-cluster, drawn on the page with the cluster id as label.
rectangies = get_category_boxes(page_df, 'y_cluster')
y_img = get_bboxed_page_image(doc, page, rectangies,color=(0,0,0.5), labels= np.unique(page_df.y_cluster))
display(y_img)


- Because the closeness is only measured with y0, we can see that cluster 2 contains the image and the lines down to Wolverhampton.
- The bbox extracted from this goes to the end of the image, but actually only contains the images and the first paragraph.

## X0 scan

For the x0 dbscan, we will base eps on the median line width. <br>
If a line is separated from another by more than a column width, it is not in its epsilon neighbourhood

In [None]:
# eps for the x0 scan: half of the median of the rounded `w` values.
# NOTE(review): relies on a `w` column in page_df, presumably the line width — confirm.
median_w = page_df.w.dropna().map(round).median()
eps_x0   = median_w*0.5

# Same DBSCAN setup as the y0 scan, applied to the x0 coordinate only.
X    = pairwise_distances(page_df[["x0"]],metric="euclidean")
scan = DBSCAN(eps=eps_x0, min_samples=1, metric="precomputed")
page_df["x_cluster"] = scan.fit_predict(X)

In [None]:
rectangies = get_category_boxes(page_df, 'x_cluster')
# Labels come from the stored column, not from the global `scan`, which later cells rebind.
x_img = get_bboxed_page_image(doc, page, rectangies,color=(0.5,0,0.0),labels = np.unique(page_df.x_cluster))
fig, axes = plt.subplots(1,2,figsize=(16,8))

axes[0].imshow(y_img); axes[0].axis("off"); axes[0].set_title("y0-scan");
axes[1].imshow(x_img); axes[1].axis("off"); axes[1].set_title("x0-scan")
plt.subplots_adjust(wspace=0.0)


- Cluster 0 contains every line that starts within w from the left hand side, including the title and subtitle.
  - The box around them then is a box to the end of the title and subtitle.
- Cluster 1 is all the lines or images that start more than x from the very left - right column, figure, and caption.

## Non-hierarchical Double scan

In [None]:
# Combined id: one group number per distinct (x_cluster, y_cluster) pair.
page_df["cluster_id"] = page_df.groupby(["x_cluster", "y_cluster"]).ngroup()

# Draw one labelled bounding box per combined cluster.
rectangies = get_category_boxes(page_df, 'cluster_id')
xy_img = get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0),labels=np.unique(page_df.cluster_id))
display(xy_img)

- This is an improved clustering. But we notice that not all paragraphs are separated in y
  - Do another y clustering within each group.
  - The issue is that, when looking only at y, the existence of "A library in the middle" to the right of the paragraph ending in "Eames lamp" gives a false
    impression of contiguity.

# HDBSCAN with x0 and y0

In [None]:
#median_dy0 = find_y0_dL(page_df)
#median_w   = page_df.w.median()
#eps_y0     = median_dy0*1.15
#eps_x0     = median_w*0.5
#
#dirs = ((["y0"],eps_y0), (["x0"],eps_x0))
#i_dir = 0
#page_df["cluster"]=0
#N_clusters=1
#n_fail = 0
#
#for n_loop in range(60):
#    # assign the direction and cluster numbers for this round of scanning
#    dir = dirs[i_dir][0] ; eps = dirs[i_dir][1]
#    last_id = len(np.unique(page_df.cluster))
#
#    print(f"Scan {n_loop} in {dir}")
#    if n_fail >=4:
#        break
#    # Loop over all current clusters and break up in dir
#    for i_clust in np.unique(page_df.cluster):
#        #split_cluster(page_df, i_clust, "euclidean", eps, dir, True)
#
#        clust_df = page_df[page_df.cluster==i_clust].copy()
#
#        scan = DBSCAN(eps=eps, min_samples=1)
#        scan.fit(clust_df[[dir]])
#
#        labels = scan.labels_
#        n_labels = len(np.unique(labels) )
#
#        if n_labels ==1:
#            continue
#        
#        labels[labels==0] = i_clust
#        for i in range(1, n_labels):
#            labels[labels==i]=last_id
#            last_id +=1
#        clust_df.cluster = labels
#        page_df.loc[clust_df.index, "cluster"] = clust_df.cluster
#
#        print(f"Cluster {i_clust} split {dir} into {n_labels} clusters: {np.unique(labels)}")
#            
#
#    labelos = np.unique(page_df.cluster)
#    n_clusters = len(labelos)
#    i_dir = 1 if i_dir==0 else 0
#
#    if n_clusters == N_clusters:
#        n_fail +=1
#        continue
#    else:
#        n_fail = 0
#        N_clusters = n_clusters
#
#    rectangies = get_category_boxes(page_df, 'cluster')
#    img = get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos)
#    print(f"Total {len(labelos)} clusters: {labelos}")
#    display(img)
#    sleep(5)

In [None]:
median_dy0 = find_y0_dL(page_df)
median_w   = page_df.w.median()
eps_y0     = median_dy0*1.15
eps_x0     = median_w*0.5

rectangs, labia = hdbscan(page_df, 100, eps_x0, eps_y0, "euclidean",False)
imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]
assert imgs, "hdbscan recorded no split for this page"

# One panel per recorded scan; squeeze=False keeps `axes` an array even for a single panel.
fig, axes = plt.subplots(1,len(imgs),figsize=(16,8),squeeze=False)
axes = axes.flatten()

# Titles assume the recorded scans alternate y0, x0, ... starting with y0.
scan_dir="y0"
for i, ax in enumerate(axes):
    ax.imshow(imgs[i]); ax.axis("off"); ax.set_title(f"Scan {i+1} in {scan_dir}");
    scan_dir = "x0" if scan_dir=="y0" else "y0"

# Scan using bbox distances

In [None]:
# Independent copy of the columns needed for the bbox-distance experiments; new columns added to dff do not touch page_df.
dff = page_df[['x0', 'y0', 'x1', 'y1',  'text', 'font_size', 'image', 'counts']].copy()

## y-clustering bbox end-to-end 

### find eps

In [None]:
# dL : end-to-end vertical gap between a row and the next row (0 when they overlap in y).
# dy0: difference between the next row's y0 and this row's y0.
# The last row has no successor, so shift(-1) yields NaN there.
dff["dL"] = df_bbox_next_row_dist(dff.y0, dff.y1, dff.y0.shift(-1), dff.y1.shift(-1) )
dff["dy0"] = dff.y0.shift(-1) - dff.y0
dff.head()

In [None]:
# Medians are taken over the non-zero steps only, so coincident rows (dy0 == 0)
# and overlapping rows (dL == 0) do not drag the characteristic spacing to 0.
dy0s = dff.dy0.dropna()
dy0_median =  dy0s[dy0s !=0].median()
dLs = dff.dL.dropna()
dL_median = dLs[dLs!=0].median()

print(f"median dy0 {dy0_median:6.2f}                          median dL {dL_median:6.2f}")
examine_value_counts(pd.concat([dy0s.round(1),dLs.round(1)],axis=1),["dy0","dL"])

fig, axes = plt.subplots(1,2,figsize=(16,6))
sns.histplot(dy0s, ax=axes[0]);
axes[0].set_title("dy0: y0 step to the next row");
sns.histplot(dLs.round(2), ax=axes[1]);
axes[1].set_title("dL: end-to-end y gap to the next row");


In [None]:

# Full pairwise end-to-end y distance between all rows.
dist_matrix = pairwise_distances(dff[["y0", "y1"]].values, metric=df_bbox_dist)
# Drop the diagonal (distance of each row to itself) before plotting.
mask = ~np.eye(dist_matrix.shape[0], dtype=bool)
vals = dist_matrix[mask]  
sns.histplot(vals.round(2).flatten());

### dbscan Split in y

In [None]:
# DBSCAN on the precomputed end-to-end y distances; eps is the median non-zero row gap scaled by 1.15.
scan = DBSCAN(eps=dL_median*1.15, min_samples=1,metric="precomputed")
X = pairwise_distances(dff[["y0","y1"]],metric=df_bbox_dist)
dff["y_cluster"] = scan.fit_predict(X)

# Draw the bounding box of each y-cluster (no labels).
rectangies = get_category_boxes(dff, 'y_cluster')
y_img = get_bboxed_page_image(doc, page, rectangies,color=(0.5,0,0.0))
display(y_img)

- So because we are only considering one direction, the whole dual-column part is counted as one y-block. This is because when there
  is a gap in one column, there is text in the next.
  - Look for example at the gap after Wolverhampton. This would count as a split if it were not for the fact that to the right we have the image covering this whole distance. So there is a clear connection path of bbox-y-ends from one paragraph to the next.

## x-clustering bbox end to end 

### Find eps

In [None]:
# Same rows as dff, re-ordered by x0; the y-specific neighbour columns are dropped.
dfx = dff.sort_values(by="x0").drop(columns=["dL","dy0"]).copy()

# dX : end-to-end horizontal gap to the next row in x0 order (0 when they overlap in x).
# dx0: difference between the next row's x0 and this row's x0.
dfx["dX"] = df_bbox_next_row_dist(dfx.x0, dfx.x1, dfx.x0.shift(-1), dfx.x1.shift(-1) )
dfx["dx0"] = dfx.x0.shift(-1) - dfx.x0

dfx.head(6)

 - It is not really informative to look at row by row differences in x, this does not reflect normal reading order or document layout.

 - Let's look at the distributions of the full neighbour distances
 - Find the characteristic dX to use in the eps scan
 - do the eps scan

 - return to hierarchical clustering code.

 - clean all code and implement on captions.

In [None]:
# Pairwise end-to-end x distances between all rows, diagonal (self-distance) removed.
dist_matrix = pairwise_distances(dfx[["x0", "x1"]].values, metric=df_bbox_dist)
mask = ~np.eye(dist_matrix.shape[0], dtype=bool)
vals = dist_matrix[mask]   

# Pairwise plain |x0 - x0| distances for comparison, diagonal removed.
dist_matrix2 = pairwise_distances(dfx[["x0"]].values, metric="euclidean")
mask2 = ~np.eye(dist_matrix2.shape[0], dtype=bool)
vals2 = dist_matrix2[mask2]   

# Top row: full distributions. Bottom row: the same with exact zeros excluded.
fig, axes = plt.subplots(2, 2 , figsize=(16,8))
axes = axes.flatten()
sns.histplot(vals.round(2).flatten(),ax=axes[0]);
axes[0].set_title("end to end");
sns.histplot(vals2.round(2).flatten(),ax=axes[1]);
axes[1].set_title("x0");

sns.histplot(vals[vals!=0].round(2).flatten(),ax=axes[2]);
axes[2].set_title("end to end - 0 excluded");
sns.histplot(vals2[vals2!=0].round(2).flatten(),ax=axes[3]);
axes[3].set_title("x0 - 0 excluded");

 - e2e has more 0s because now lines which are overlapping over their bboxs are given 0 distance, not just lines which start at the same x0
 - e2e has outside of 0 a wider spread of values because for many lines with the same x0, there are different x1s. 
   - Therefore (x0_1 - x1_0) has a bigger variety than (x0_1 - x0_0)
 - for the e2e eps_x, we could use the peak of the 0-excluded distribution on the left.

 - Let's first see if we can pick out a good candidate by measuring the distance between the dual-column columns

In [None]:
# Hand-tuned box around the gutter between the two columns:
# 16 units left and 9 units right of the page centre line.
x0 = middle-16
x1 = middle+9

# Vertically the box runs from the bottom edge (bbox[3]) of the first page image
# to 50 units above the lowest y1 in dff.
y0 = page_images[0]["bbox"][3]
y1 = dff.y1.max() -50

rectangies = [Rect(x0,y0,x1,y1)]
img = get_bboxed_page_image(doc, page, rectangies,color=(0.5,0,0.0))
display(img)
#plt.imshow(img)

# The width of this box is the candidate eps for the x scans.
eps_x = eps_x_mid_gap =  x1-x0
print(f"Box end to end is {eps_x}")

In [None]:
# Rows lying entirely left / entirely right of the page centre line.
left = dfx[dfx.x1 < middle]
right = dfx[dfx.x0 > middle]

# Rectangular distance matrix: rows index `left`, columns index `right`.
left_right_dist  = pairwise_distances(left[["x0","x1"]], right[["x0","x1"]], metric=df_bbox_dist)
# argmin/argmax return positions (not index labels), matching the matrix axes.
i_leftmost_right = right.x0.argmin()
i_rightmost_left = left.x1.argmax()
print(f"{left_right_dist[i_rightmost_left,i_leftmost_right]} should equal {left_right_dist.min()}")
print(f"Compare to earlier epx_x_mid_gap: {eps_x_mid_gap}")

### dbscan split in x

In [None]:
# The distance matrix does not depend on eps, so it is computed once outside
# the loop (the Python-callable metric makes it the expensive step).
X = pairwise_distances(dff[["x0","x1"]],metric=df_bbox_dist)

fig, axes = plt.subplots(1,3, figsize=(16,4))
for i, eps_scale in enumerate([0.2, 0.5, 1]):
    scan = DBSCAN(eps=eps_x*eps_scale, min_samples=1,metric="precomputed")
    dff["x_cluster"] = scan.fit_predict(X)
    
    rectangies = get_category_boxes(dff, 'x_cluster')
    x_img = get_bboxed_page_image(doc, page, rectangies,color=(0.5,0,0.0))
    axes[i].imshow(x_img); axes[i].axis("off"); axes[i].set_title(f"eps = {eps_x*eps_scale:.1f}");

- This is because the bbox of, for example, the subtitle covers the entire figure, and therefore overlaps in X with every line.
  - you need to partition by y first.

# HDBSCAN with end-to-end distances

- We are going to do first end-to-end y distance, with end-to-end x
- then we will do end-to-end y distance with regular x0
- then we will do end-to-end y distance with overlap-and-x0 x.

In [None]:
# eps values for the hierarchical end-to-end scan below.
eps_y      = dL_median*1.05
eps_x      = 10 # hand-picked value; earlier attempt was eps_x_mid_gap*0.01

# dirs = ((["y0","y1"], eps_y), (["x0","x1"],eps_x))
# i_dir = 0
# page_df["cluster"]=0
# N_clusters=1
# n_fail = 0
# 
# for n_loop in range(50):
#     # assign the direction and cluster numbers for this round of scanning
#     dir = dirs[i_dir][0] ; eps = dirs[i_dir][1]
#     last_id = len(np.unique(page_df.cluster))
# 
#     print(f"Full Scan {n_loop} in {dir} with eps={eps:<6.0f}")
#     if n_fail >=4:
#         break
#     # Loop over all current clusters and break up in dir
#     for i_clust in np.unique(page_df.cluster):
#         print(f"scanning cluster {i_clust}")
#         clust_df = page_df[page_df.cluster==i_clust].copy()
#         X = pairwise_distances(clust_df[dir],metric=df_bbox_dist)
# 
#         scan = DBSCAN(eps=eps, min_samples=1,metric="precomputed")
#         scan.fit(X)
# 
#         labels = scan.labels_
#         n_labels = len(np.unique(labels) )
# 
#         if n_labels ==1:
#             print("No split")
#             continue
#         
#         print(f"Cluster {i_clust} split {dir[0][0]} into {n_labels} clusters: {np.unique(labels)}")
#         print("Reassigning labels")
#         for i in range(1, n_labels):
#             labels[labels==i]=last_id
#             last_id +=1
#         labels[labels==0] = i_clust
#         print(np.unique(labels))
#         clust_df.cluster = labels
#         page_df.loc[clust_df.index, "cluster"] = clust_df.cluster
# 
#         print(f"Cluster {i_clust} split {dir} into {n_labels} clusters: {np.unique(labels)}")
#             
# 
#     labelos = np.unique(page_df.cluster)
#     n_clusters = len(labelos)
#     i_dir = 1 if i_dir==0 else 0
# 
#     if n_clusters == N_clusters:
#         n_fail +=1
#         continue
#     else:
#         n_fail = 0
#         N_clusters = n_clusters
# 
#     rectangies = get_category_boxes(page_df, 'cluster')
#     img = get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos)
#     print(f"Total {len(labelos)} clusters: {labelos}")
#     display(img)
#     sleep(5)

In [None]:
rectangs, labia = hdbscan(page_df, 100, eps_x, eps_y, df_bbox_dist,False)
imgs = [get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=labelos) for rectangies, labelos in zip(rectangs, labia)]
assert imgs, "hdbscan recorded no split for this page"

# One panel per recorded scan; squeeze=False keeps `axes` an array even for a single panel.
fig, axes = plt.subplots(1,len(imgs),figsize=(16,8),squeeze=False)
axes = axes.flatten()

# Titles assume the recorded scans alternate y0, x0, ... starting with y0.
scan_dir="y0"
for i, ax in enumerate(axes):
    ax.imshow(imgs[i]); ax.axis("off"); ax.set_title(f"{scan_dir}");
    scan_dir = "x0" if scan_dir=="y0" else "y0"

## Test split_cluster

Here we just manually split the document by looking at the clusters that need splitting after each split.

We used mixed metrics

In [None]:
# Manual walk-through: each step splits one hand-picked cluster along one direction.
# The cluster ids (0, 3, 5, 2) were read off the image produced by the previous step,
# so they are specific to this page and these eps values.
split_steps = [
    # (cluster id, metric,       eps,    columns)
    (0,            "euclidean",  eps_y0, ["y0"]),
    (3,            df_bbox_dist, 10,     ["x0","x1"]),
    (5,            "euclidean",  eps_y0, ["y0"]),
    (2,            df_bbox_dist, 10,     ["x0","x1"]),
]

page_df["cluster"] = 0
dirs, imgs = [], []
for i_clust, metric, eps, cols in split_steps:
    split_cluster(page_df, i_clust, metric, eps, cols)
    print(np.unique(page_df.cluster))

    rectangies = get_category_boxes(page_df, 'cluster')
    imgs.append(get_bboxed_page_image(doc, page, rectangies,color=(0.0,0,0.0), labels=np.unique(page_df.cluster)))
    dirs.append(cols)


fig, axes = plt.subplots(2,2,figsize=(16,16))
axes = axes.flatten()

for i in range(len(axes)):
    axes[i].imshow(imgs[i]); axes[i].axis("off"); axes[i].set_title(f"{dirs[i]}");
plt.subplots_adjust(wspace=0.0)

# Further Ideas

Instead of doing a fully end-to-end [x0, x1] distance, keep taking the distances between x0s, but also set
those lines overlapping in x to have 0 distance.

I don't know actually if this makes too much sense. What it would do is not separate for example an image with a caption which
is very far underneath it to the right.

It has the advantage that the distances in x0s are more uniform than e2e distances. A short line on the left will have the
same x0 distance from any kind of line on the right, where as the e2e distance depends on the size of the line.

It also has the advantage that it would be more robust in the case of lines within a column being split into several
lines for formatting. For example: <The         big       house> can happen sometimes in a text, and sometimes this can
be split into 3 separate lines by PyMuPDF. If the gaps between the words there, and so between the sublines composing
the line, are big enough, then the e2e distances may not include them in the eps-neighbourhood of the surrounding lines.

Actually it will because of the overlap.

As we have seen, the eps_x of 10 that we need to separate in x is very small.