In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.doc_utils     import open_exam, get_doc_line_df, identify_section_headers, identify_text_headers, get_path_from_doc
from pdf_scraper.doc_utils     import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.doc_utils     import get_images, preproc_images, assign_in_image_captions, identify_vertical_captions
from pdf_scraper.doc_utils     import enrich_doc_df_with_images, identify_all_page_clusters
from pdf_scraper.clustering.cluster_utils import get_vert_neigh_dist, split_cluster, hdbscan, find_y0_dL, correct_eps_y_scale,get_eps_x, get_eps_y
from pdf_scraper.general_utils import df_bbox_dist, df_bbox_next_row_dist
from pdf_scraper.image_utils   import get_bboxed_page_image

# Notebook-wide pandas display settings: show floats with two decimals and allow
# up to 200 characters per column so that long text cells are not cut short.
for option_name, option_value in {
    "display.float_format": "{:.2f}".format,
    "display.max_colwidth": 200,
}.items():
    pd.set_option(option_name, option_value)

This notebook will contain the methodology used to test the clustering algorithms. Many of the problems highlighted here have been
solved by excluding images from the clustering (setting text_only to True), but the methods used to analyse the problems remain valid and so we store them 
here.

# Year 2011, page 6

## Check default spatial clustering

In [None]:
year, page = 2011, 6
def check_page_clustering(year, page, eps_x_factor=2.0/3.0, eps_y_factor=1.15, text_only=False):
    """Run the extraction/clustering pipeline for one exam and display one page with its cluster boxes.

    The exam is opened with ``open_exam(year, "english", "al", 1)``; only the year
    is configurable here. The whole document is processed (lines, images,
    clustering, category identification) and the rows belonging to ``page`` are
    then selected, boxed per ``cluster`` value and displayed.

    Parameters
    ----------
    year : int
        Exam year, forwarded to ``open_exam``.
    page : int
        Page number; compared against the ``page`` column of the line dataframe
        and forwarded to ``get_bboxed_page_image``.
    eps_x_factor, eps_y_factor : float
        Forwarded positionally as the second and third arguments of
        ``identify_all_page_clusters``. Presumably the horizontal/vertical eps
        scale factors (the same constants are passed to ``get_eps_x`` /
        ``get_eps_y`` elsewhere in this notebook) - TODO confirm.
    text_only : bool
        Forwarded to ``identify_all_page_clusters``. The default ``False`` keeps
        the behaviour this notebook was written to analyse.

    Returns
    -------
    tuple
        ``(doc, page_df)``: the opened document and an independent copy of the
        dataframe rows for ``page``.
    """
    doc    = open_exam(year, "english", "al",1)
    doc_width     = doc[0].rect.width
    df     = get_doc_line_df(doc)

    images = get_images(doc)
    images = preproc_images(images)
    assign_in_image_captions(df,images)

    df = enrich_doc_df_with_images(df,images)
    df = clean_line_df(df)
    identify_all_page_clusters(df, eps_x_factor, eps_y_factor, text_only=text_only)

    identify_footers(df)
    identify_instructions(df)
    identify_section_headers(df)
    identify_text_headers(df, doc_width)
    identify_subtitles(df, doc_width)
    identify_subsubtitles(df, doc_width)

    # Return an explicit copy: later cells add columns to page_df, and writing
    # into a boolean-filtered slice of df raises SettingWithCopyWarning.
    page_df = df[df.page==page].copy()

    rectangs = get_category_boxes(page_df, 'cluster')
    labelos  = np.unique(page_df.cluster)
    boxed_page = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

    display(boxed_page)
    return doc, page_df

doc, page_df = check_page_clustering(year,page)

## Check all line boxes and check category boxes

Checking all line boxes will allow us to see if any of the lines have been misboxed. This would in turn lead to them being misclustered.

Previously there was a situation where a line had a space at the end of it included in its box, which impeded the correct clustering.
To fix this issue the clean_lines function was modified so that any leading or trailing space is removed (it was previously set to
remove only leading or trailing runs of at least 6 spaces).

This can also show us if there are invisible lines in the document that are interfering with the clustering.

Checking the category boxes will just allow us to see how each of the categories have been boxed as well.

In [None]:
# Give every line its own id so that boxes can be grouped per line.
# assign() returns a new frame instead of inserting a column into page_df in place:
# page_df may be a filtered slice of the full document frame, and writing into such
# a slice raises SettingWithCopyWarning. It also keeps this cell safe to re-run.
page_df = page_df.assign(line_id=page_df.index.values)

# Left panel: boxes grouped by line_id, drawn with an empty label list.
rectangs = get_category_boxes(page_df, 'line_id')
labelos = []
line_box_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

# Right panel: boxes grouped by the 'category' column, labelled with its unique values.
rectangs     = get_category_boxes(page_df, 'category')
labelos      = np.unique(page_df.category)
category_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Same formatting for both panels: show the image, set a title, hide the axes.
for ax, img, title in zip(axes, (line_box_img, category_img), ("Line Boxes", "Category Boxes")):
    ax.imshow(img)
    ax.set_title(title)
    ax.axis("off")

plt.tight_layout()
plt.show()

## Examine different combinations of clustering

This will allow us to see which text group/image is causing the problems with the clustering. We can choose
the problematic cluster and see if we can split it by excluding a given category.

In [None]:
# Restrict the analysis to cluster 0 of the current page.
cluster_df = page_df[page_df.cluster==0]
cluster_categories = cluster_df.category

# Boolean masks selecting different combinations of line categories.
title_or_image             = cluster_categories.isin(["title", "image"])
subtitle_or_image          = cluster_categories.isin(["subtitle", "image"])
title_or_subtitle_or_image = cluster_categories.isin(["title", "subtitle", "image"])
uncategorised              = cluster_categories == 'uncategorised'
image                      = cluster_categories == "image"

# Attempt a split along the x0/x1 columns for each category combination;
# verbose=True is passed so that split_cluster reports on each attempt.
for subset_mask in (title_or_image, subtitle_or_image, title_or_subtitle_or_image):
    split_cluster(cluster_df.loc[subset_mask], 0, df_bbox_dist, 5, ["x0",'x1'], verbose=True)

# Kept as the final bare expression so that its return value is shown as the cell output.
split_cluster(cluster_df.loc[uncategorised | title_or_image], 0, df_bbox_dist, 5, ["x0",'x1'], verbose=True)

A horizontal split of cluster 0 cannot be achieved because the title overlaps 
with both right and left columns and the right column overlaps with the image.

A vertical split cannot be achieved because the image overlaps vertically with everything in the cluster,
and the end of the first paragraph also overlaps vertically with the start of the first paragraph in 
the column on the right.


Possible solution: remove the image and re-attempt the clustering.

# Year 2024 page 4

In [None]:
# Later cells in this section read the global `page`, so the call uses the same
# variables rather than repeating the literals; the two can then never drift apart.
year, page = 2024, 4
doc, page_df = check_page_clustering(year, page)

In [None]:
# Give every line its own id so that boxes can be grouped per line.
# assign() returns a new frame instead of inserting a column into page_df in place:
# page_df may be a filtered slice of the full document frame, and writing into such
# a slice raises SettingWithCopyWarning. It also keeps this cell safe to re-run.
page_df = page_df.assign(line_id=page_df.index.values)

# Left panel: boxes grouped by line_id, drawn with an empty label list.
rectangs = get_category_boxes(page_df, 'line_id')
labelos = []
line_box_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

# Right panel: boxes grouped by the 'category' column, labelled with its unique values.
rectangs     = get_category_boxes(page_df, 'category')
labelos      = np.unique(page_df.category)
category_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Same formatting for both panels: show the image, set a title, hide the axes.
for ax, img, title in zip(axes, (line_box_img, category_img), ("Line Boxes", "Category Boxes")):
    ax.imshow(img)
    ax.set_title(title)
    ax.axis("off")

plt.tight_layout()
plt.show()

In this case it is clear that the vertical overlap of the image with the subtitle, which overlaps with all lines, 
is what prevents the page from being split up.

Removing the image and re-doing the clustering may be a good idea.

# Year 2025, page 6

In [None]:
# Exam year and page examined in this section; later cells read both globals.
year = 2025
page = 6
doc, page_df = check_page_clustering(year, page)

Here we have the unfortunate situation that the subtitle overlaps horizontally with all lines,
and then that the two columns are shifted with respect to each other such that there is always a line on the 
right where there is a break on the left. 

However, why is the subtitle not split vertically from the column text?

Here again we can see the disruptive effect of, for example, just the title and subtitle.

The title and subtitle could just always be given their own boxes. This would cause much less disruption in subsequent blocking.

In [None]:
# Cluster under investigation on the current page.
clust_id = 1
cluster_df = page_df[page_df.cluster==clust_id]
cluster_categories = cluster_df.category

# eps values derived from the page, using the same 2/3 and 1.15 constants
# that check_page_clustering passes to identify_all_page_clusters.
eps_x = get_eps_x(page_df, page, 2.0/3.0)
eps_y = get_eps_y(page_df, page, 1.15)

# Boolean masks selecting different combinations of line categories.
title_or_image             = cluster_categories.isin(["title", "image"])
subtitle_or_image          = cluster_categories.isin(["subtitle", "image"])
title_or_subtitle_or_image = cluster_categories.isin(["title", "subtitle", "image"])
uncategorised              = cluster_categories == 'uncategorised'
image                      = cluster_categories == "image"


# Attempt a split of the whole cluster along the y0/y1 columns with eps_y.
# The commented-out calls are alternative experiments along x0/x1 on category subsets.
split_cluster(cluster_df, clust_id, df_bbox_dist, eps_y, ["y0",'y1'], verbose=True)
#split_cluster(cluster_df.loc[subtitle_or_image]         , clust_id, df_bbox_dist, eps_x, ["x0",'x1'],verbose=True)
#split_cluster(cluster_df.loc[title_or_subtitle_or_image], clust_id, df_bbox_dist, eps_x, ["x0",'x1'],verbose=True)
#split_cluster(cluster_df.loc[uncategorised | title_or_image], 0, df_bbox_dist, eps_x, ["x0",'x1'],verbose=True)

## Box categories

In [None]:
# Re-open the exam and draw the boxes grouped by the 'category' column,
# labelled with the unique category values found on the page.
doc = open_exam(year, "english", "al", 1)
boxed_page = get_bboxed_page_image(
    doc,
    page,
    get_category_boxes(page_df, 'category'),
    color=(0.0,0,0.0),
    labels=np.unique(page_df.category),
)

display(boxed_page)

In [None]:
# Text of every line on this page that was categorised as a subtitle.
page_df.loc[page_df.category == 'subtitle', 'text']

Note the secret repetition of the text (340, 342, 343) which is not rendered in the pdf. This is an issue we have had before. The solution is to find
a way to remove the text that is not being rendered.

# Year 2001, page 6

In [None]:
# Exam year and page examined in this section; later cells read both globals.
year = 2001
page = 6
doc, page_df = check_page_clustering(year, page)

In [None]:
# Give every line its own id so that boxes can be grouped per line.
# assign() returns a new frame instead of inserting a column into page_df in place:
# page_df may be a filtered slice of the full document frame, and writing into such
# a slice raises SettingWithCopyWarning. It also keeps this cell safe to re-run.
page_df = page_df.assign(line_id=page_df.index.values)

# Left panel: boxes grouped by line_id, drawn with an empty label list.
rectangs = get_category_boxes(page_df, 'line_id')
labelos = []
line_box_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

# Right panel: boxes grouped by the 'category' column, labelled with its unique values.
rectangs     = get_category_boxes(page_df, 'category')
labelos      = np.unique(page_df.category)
category_img = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Same formatting for both panels: show the image, set a title, hide the axes.
for ax, img, title in zip(axes, (line_box_img, category_img), ("Line Boxes", "Category Boxes")):
    ax.imshow(img)
    ax.set_title(title)
    ax.axis("off")

plt.tight_layout()
plt.show()

The problem with this clustering is most likely that a much too small eps_y was extracted for this page.
This is an argument for document wide determination of eps_y. Or just a font-based determination, which 
would probably be the wisest.