In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.clustering.cluster_utils import get_vert_neigh_dist, split_cluster, hdbscan, find_y0_dL, correct_eps_y_scale,get_eps_x, get_eps_y
from pdf_scraper.doc_utils     import open_exam, get_doc_line_df, identify_section_headers, identify_text_headers, get_path_from_doc
from pdf_scraper.doc_utils     import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.doc_utils     import get_images, preproc_images, assign_in_image_captions, identify_vertical_captions
from pdf_scraper.doc_utils     import enrich_doc_df_with_images, identify_all_page_clusters, get_lines_in_image_clusters
from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.general_utils import df_bbox_dist, df_bbox_next_row_dist
from pdf_scraper.image_utils   import get_bboxed_page_image

from pdf_scraper.general_utils import bbox_horiz_dist, bbox_vert_dist

pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_colwidth", 200)

In [None]:
def check_year_page(year, page, x_scale, y_scale, text_only_clusters=False):
    """
    Render one page of an exam with a box drawn around every cluster.

    The document is opened with open_exam(year, "english", "al", 1), turned into
    a line dataframe, cleaned, enriched with its images and clustered with
    identify_all_page_clusters(x_scale, y_scale, text_only_clusters). The rows of
    `page` are then boxed per cluster and drawn on the page.

    Returns whatever get_bboxed_page_image produces for that page.
    """
    columns = ["text", 'x0', 'y0', 'x1', 'y1', "page","w","category","cluster"]

    doc     = open_exam(year, "english", "al",1)
    line_df = get_doc_line_df(doc)

    # In-image captions are assigned on the raw line dataframe, before cleaning.
    doc_images = preproc_images(get_images(doc))
    assign_in_image_captions(line_df, doc_images)

    line_df = clean_line_df(line_df)
    line_df = enrich_doc_df_with_images(line_df, doc_images)
    line_df = identify_all_page_clusters(line_df, x_scale, y_scale, text_only_clusters)

    on_page = line_df.page==page
    page_df = line_df.loc[on_page, columns].copy()

    cluster_boxes  = get_category_boxes(page_df, 'cluster')
    cluster_labels = np.unique(page_df.cluster)

    return get_bboxed_page_image(doc, page, cluster_boxes,color=(0.0,0,0.0), labels=cluster_labels)

In [None]:
# Box the clusters on page 6 of the 2011 paper:
# x_scale=2/3, y_scale=1.15, text_only_clusters=True.
img = check_year_page(2011,6, 2.0/3.0,1.15, True)
display(img)


In [None]:
# Rebuild the line dataframe for the 2011 paper in the notebook namespace so the
# intermediate state can be inspected (check_year_page only returns the image).
year, page = 2011, 6
doc    = open_exam(year, "english", "al",1)
df     = get_doc_line_df(doc)

images = get_images(doc)
images = preproc_images(images)
assign_in_image_captions(df,images)

# Document-level layout/font statistics. Only doc_width is used further down in
# this cell; the other three are not referenced again in the visible cells.
doc_width     = doc[0].rect.width
middle        = doc_width/2
standard_font = df.mode_font.mode()[0]
median_font   = df.font_size.median()


# NOTE(review): images are merged before clean_line_df here, whereas
# check_year_page cleans first and enriches afterwards - confirm which order is intended.
df = enrich_doc_df_with_images(df,images)
df = clean_line_df(df)
# The return values of the identify_* calls are ignored; presumably they label
# df in place (later code reads df.cluster and df.category) - confirm.
identify_all_page_clusters(df,2.0/3.0,1.15, True)
identify_footers(df)
identify_instructions(df)
identify_section_headers(df)
identify_text_headers(df, doc_width)
identify_subtitles(df, doc_width)
identify_subsubtitles(df,doc_width)

# `indices` is not used again in the visible cells, and `page_df` is rebuilt
# with fewer columns at the top of the next cell.
indices = get_lines_in_image_clusters(df)
page_df = df.loc[df.page==page, ["text", 'x0', 'y0', 'x1', 'y1', "page","w","category","cluster"]].copy()

## Filter non-caption lines in the same DBSCAN cluster as an image

In [None]:
# Keep only the text and geometry columns for the current page (this replaces
# the page_df built at the end of the previous cell).
page_df = df.loc[df.page==page, ["text", 'x0', 'y0', 'x1', 'y1', "page","w"]].copy()
page_images = [image for image in images if image["page"]==page]
# NOTE(review): df already went through enrich_doc_df_with_images above; confirm
# that enriching the page slice again does not duplicate the image rows.
page_df = enrich_doc_df_with_images(page_df,page_images)

# Index with page-1: df page numbers appear to be 1-based while the document is indexed from 0.
doc_page = doc[int(page-1)]
pix = doc_page.get_pixmap(matrix=fitz.Matrix(0.8, 0.5))  # renders at 0.8x horizontal and 0.5x vertical scale (downscaled, aspect ratio not preserved)
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
#print(f"page: {page}")
#display(img)

In [None]:
# check_year_page returns the single boxed page image produced by
# get_bboxed_page_image (it is displayed directly in the earlier call cell),
# so the result is shown as-is instead of being indexed like a list.
boxed_page = check_year_page(2011,2, 2/3, 1.15)
display(boxed_page)

In [None]:
# These are years where there are captions outside of images. We would like to use hdbscan to differentiate 
# these caption lines from other lines.
#for year, page in [(2001, 2),(2002, 6),(2010, 7),(2011, 2),(2011, 3),(2012, 4),(2012, 5),
#                   (2013, 4),(2013, 6),(2013, 7),(2014, 3),(2019,2) ,(2020,3), (2023, 6),(2024, 3)]:
#    print(year, page)
#    check_year_page(year, page, 2/3, 1.15)

# Develop vertical caption identification using clusters

In [None]:
# Same pipeline as above, now for page 2 of the 2011 paper, with text-only clusters.
year, page = 2011, 2
doc    = open_exam(year, "english", "al",1)
doc_width     = doc[0].rect.width
df     = get_doc_line_df(doc)

images = get_images(doc)
images = preproc_images(images)
assign_in_image_captions(df,images)

df = enrich_doc_df_with_images(df,images)
df = clean_line_df(df)
# Return value ignored; presumably df gets its `cluster` column in place - confirm.
identify_all_page_clusters(df,2.0/3.0, 1.15, text_only=True)

# Category labelling; return values are ignored (presumably in place as well).
identify_footers(df)
identify_instructions(df)
identify_section_headers(df)
identify_text_headers(df, doc_width)
identify_subtitles(df, doc_width)
identify_subsubtitles(df, doc_width)

# Boolean indexing without .copy(): writing into page_df later will not reach df.
page_df = df[df.page==page]

# Draw one box per cluster on the page, labelled with the cluster id.
#for page in range(1,9):
rectangs = get_category_boxes(page_df, 'cluster')
labelos  = np.unique(page_df.cluster)
boxed_page = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

display(boxed_page)

In [None]:
def get_cluster_image_overlaps(df, pages=range(2,9)) -> list[int]:
    """
    Return the index labels of uncategorised lines that share a cluster with an image.

    Matching is done within each page: for every page in `pages`, the cluster
    ids of the rows with category "image" are collected, and every row on that
    page which has category "uncategorised", is not an image and carries one of
    those cluster ids is reported.

    The previous version referenced three names that were never defined
    (`lines_near_images`, `clusters_with_images`, `is_image`) and therefore
    raised NameError on every call; the image clusters are now derived directly
    from the page rows.

    Parameters
    ----------
    df : dataframe with at least the columns `page`, `cluster` and `category`.
    pages : iterable of page numbers to inspect (defaults to pages 2-8, the
        range that used to be hard-coded).

    Raises
    ------
    RuntimeError if df has no cluster labels or contains no image rows.
    """
    if "cluster" not in df.columns or df["cluster"].dropna().empty:
        raise RuntimeError("Identify text and image clusters before searching for captions.")
    if not (df.category=="image").any():
        raise RuntimeError("Enrich dataframe with images before searching for vertical captions.")

    indices = []
    for page in pages:
        page_df = df[df.page==page]

        is_image             = page_df.category=="image"
        clusters_with_images = page_df.loc[is_image, "cluster"].unique()

        in_image_cluster     = page_df.cluster.isin(clusters_with_images)
        uncategorised        = page_df.category=="uncategorised"
        mask                 = uncategorised & in_image_cluster & ~is_image

        indices.extend( page_df[mask].index.to_list() )

    return indices

In [None]:
def line_clusters_near_images(page_df, eps_x, eps_y):
    """
    This function identifies line clusters which are within eps_x and eps_y of
    any images.

    A line cluster only qualifies for an image cluster when the top of its
    bounding box (y0) lies below the top of the image cluster's bounding box,
    its horizontal bbox distance to the image is <= eps_x and its vertical bbox
    distance is <= eps_y.

    The function returns a list of tuples, each tuple containing the image cluster index,
    and the line cluster index for line clusters which are close enough. The list
    is empty when the page has no image clusters or no line clusters.
    """
    is_image       = page_df.category=="image"
    image_clusters = np.unique(page_df[is_image].cluster)
    line_clusters  = np.unique(page_df[~is_image].cluster)
    if len(image_clusters)==0 or len(line_clusters)==0:
        return []

    def cluster_bbox(clust_id):
        return get_df_bbox(page_df[page_df.cluster==clust_id])

    # Bounding boxes do not depend on the pairing, so compute each one once
    # instead of once per (image, line) combination.
    line_bboxes = [(j_clust, cluster_bbox(j_clust)) for j_clust in line_clusters]

    olaps = []
    for i_clust in image_clusters:
        bbox_i = cluster_bbox(i_clust)
        for j_clust, bbox_j in line_bboxes:
            lines_below = bbox_j[1] > bbox_i[1]

            dx = bbox_horiz_dist(bbox_j, bbox_i)
            dy = bbox_vert_dist(bbox_j, bbox_i)
            if lines_below and dx<=eps_x and dy<=eps_y:
                olaps.append((i_clust, j_clust))
    return olaps

# (image cluster, line cluster) pairs on the current page with horizontal bbox
# distance <= 0 and vertical bbox distance <= 1.
line_clusters_near_images(page_df, 0,1)

In [None]:
i_clust_image = 16  # example image cluster id used when trying the function out below
def get_nearest_line_cluster(page_df, i_clust_image):
    """
    Find the line cluster that lies closest below an image cluster.

    Candidate rows are the non-image rows whose y0 is at or below the bottom of
    the image cluster's bounding box and whose horizontal bbox distance to that
    bounding box is 0. Candidates are grouped by cluster and the vertical
    distance (df_bbox_dist on the y0/y1 extents) between the image rows and each
    candidate cluster is measured.

    This function will not find line clusters which have a y0 within the image bbox.

    Returns
    -------
    (min_dist, nearest_line_clust_id). When no candidate exists below the image
    the result is (np.inf, None) instead of an exception.
    """
    df_image = page_df[page_df.cluster==i_clust_image]
    bbox_i= get_df_bbox(df_image)

    below_image = (page_df.y0 >= bbox_i[3])
    same_side   = page_df.apply(lambda row: bbox_horiz_dist( bbox_i, (row["x0"],row["y0"],row["x1"],row["y1"])) == 0 ,axis=1)

    candidates = below_image & same_side
    if "category" in page_df.columns:
        # Another image further down the page is not a caption candidate.
        candidates = candidates & (page_df.category != "image")

    df_lines = page_df[candidates]
    if df_lines.empty:
        # e.g. an image at the bottom of the page: nothing to measure against.
        return (np.inf, None)

    line_cluster_dfs = df_lines.groupby("cluster").agg({"x0":"min", "x1":"max", "y0":"min", "y1":"max"})
    y_cols = ["y0","y1"]
    im_line_clust_dists = pairwise_distances(df_image[y_cols], line_cluster_dfs[y_cols], metric=df_bbox_dist)

    # Rows are image rows, columns are candidate clusters. Reduce over the rows
    # first so the argmin is a column (cluster) position even when the image
    # cluster consists of several rows.
    dist_per_cluster      = im_line_clust_dists.min(axis=0)
    min_dist_clust        = int(dist_per_cluster.argmin())
    min_dist              = dist_per_cluster[min_dist_clust]
    nearest_line_clust_id = line_cluster_dfs.index[min_dist_clust]

    return (min_dist, nearest_line_clust_id)
# Distance to, and id of, the nearest line cluster below the example image
# cluster. The result is not stored in `id`, which would shadow the builtin.
dy, nearest_clust_id = get_nearest_line_cluster(page_df, i_clust_image)

dy, nearest_clust_id

In [None]:
def line_closer_to_image_than_nn_lines(page_df, image_Cid, line_Cid):
    """
    Tell whether a line cluster sits closer to an image cluster than to the
    lines that follow it.

    This function assumes that the lines being looked at are beneath the image.

    The vertical bbox distance between the line cluster and the image cluster is
    compared with the smallest distance (df_bbox_dist on y0/y1) from the rows of
    the line cluster to any cluster that starts at or below the bottom of the
    line cluster and has horizontal bbox distance 0 to it.

    Returns True when the image is strictly closer. When nothing lies below the
    line cluster (e.g. a caption at the bottom of the page) there is no
    competing neighbour, so True is returned as well.
    """
    # calculate dist to image
    line_df    = page_df[page_df.cluster==line_Cid]
    line_bbox  = get_df_bbox(line_df)

    image_df   = page_df[page_df.cluster==image_Cid]
    image_bbox = get_df_bbox(image_df)

    dy_li = bbox_vert_dist(line_bbox, image_bbox)

    # calculate dist to next lines
    below_line = (page_df.y0 >= line_bbox[3])
    same_side  = page_df.apply(lambda row: bbox_horiz_dist( line_bbox, (row["x0"],row["y0"],row["x1"],row["y1"])) == 0 ,axis=1)
    rows_below = page_df[below_line & same_side]
    if rows_below.empty:
        # pairwise_distances cannot be evaluated against zero clusters.
        return True

    other_line_clusters = rows_below.groupby("cluster").agg({"x0":"min", "x1":"max", "y0":"min", "y1":"max"})

    dy_ll = pairwise_distances(line_df[["y0","y1"]], other_line_clusters[["y0","y1"]], metric=df_bbox_dist).min()

    return bool(dy_li < dy_ll)

# Try the check on one hand-picked (image cluster, line cluster) pair and print
# the candidate caption text when it passes.
image_Cid = 16
line_Cid  = 14
if line_closer_to_image_than_nn_lines(page_df,image_Cid, line_Cid):
    caption = "\n".join(page_df[page_df.cluster==line_Cid].text.values)
    # Report the cluster id actually tested rather than a hard-coded literal.
    print(f"image: {image_Cid}")
    print(f"caption: {caption}")

In [None]:
def identify_outside_captions_page(page_df:pd.DataFrame, tol ) -> pd.DataFrame:
    """
    Label line clusters that act as captions placed outside (below) an image.

    For every cluster containing an image row, the nearest line cluster below it
    is looked up with get_nearest_line_cluster. If that cluster is at most `tol`
    away and line_closer_to_image_than_nn_lines confirms it is closer to the
    image than to the lines that follow, all rows of the line cluster get the
    category "caption2".

    The labelling is done on a copy: `page_df` is typically a boolean-indexed
    slice of the document dataframe, and writing into such a slice triggers
    pandas' SettingWithCopyWarning without ever updating the document dataframe.
    Callers must use the returned dataframe.

    Parameters
    ----------
    page_df : rows of a single page with `category` and `cluster` columns.
    tol : maximum image-to-line-cluster distance for a caption candidate.

    Returns
    -------
    A copy of `page_df` with the `category` of caption rows set to "caption2".
    """
    labelled = page_df.copy()

    images = labelled[labelled.category=="image"]
    if len(images) == 0:
        return labelled
    image_cIds = np.unique(images.cluster)

    for im_id in image_cIds:
        dy, line_cId = get_nearest_line_cluster(labelled, im_id)
        # Defensive: tolerate a lookup that reports "no candidate" as None,
        # and skip candidates that are further away than tol.
        if line_cId is None or dy > tol:
            continue
        closer_to_im = line_closer_to_image_than_nn_lines(labelled, im_id, line_cId)
        if closer_to_im:
            labelled.loc[labelled.cluster==line_cId, "category"] = "caption2"
    return labelled

In [None]:
# Label outside captions on the current page with tol=2 and show the rows that were labelled.
page_df = identify_outside_captions_page(page_df, 2)
page_df[page_df.category=="caption2"]

In [None]:
def lines_near_image(page_df, eps_y, i_clust_image):
    """
    List the line clusters that sit directly below one image cluster.

    Candidate clusters are taken from the non-image rows whose y0 is at or below
    the bottom of the image cluster's bounding box and whose horizontal bbox
    distance to it is 0. A candidate is kept when its cluster bounding box has
    horizontal distance 0 and vertical distance <= eps_y to the image bbox.

    Returns a list of (i_clust_image, line cluster id) tuples.

    The previous version read `is_image`, `i_clust` and `j_clust`, none of which
    were defined inside the function.
    """
    df_image = page_df[page_df.cluster==i_clust_image]
    bbox_i   = get_df_bbox(df_image)

    is_image    = page_df.category=="image"
    below_image = (page_df.y0 >= bbox_i[3])
    same_side   = page_df.apply(lambda row: bbox_horiz_dist( bbox_i, (row["x0"],row["y0"],row["x1"],row["y1"])) == 0,axis=1 )

    olaps = []
    line_clusters_below = np.unique(page_df[(~is_image) & below_image & same_side ].cluster)
    for clust_id in line_clusters_below:
        df_line = page_df[page_df.cluster==clust_id]
        bbox_j  = get_df_bbox(df_line)
        dx = bbox_horiz_dist(bbox_j, bbox_i)
        dy = bbox_vert_dist(bbox_j, bbox_i)
        if dx==0 and dy<=eps_y:
            olaps.append((i_clust_image, clust_id))
    return olaps

In [None]:
from pdf_scraper.general_utils import shared_centre
def check_image_test_cluster_pair(page_df: pd.DataFrame, pair: tuple[int,int]):
    """
    Unfinished check of whether a text cluster is a caption of an image cluster.

    `pair` is (image cluster id, text cluster id). The geometric relations
    between the two cluster bounding boxes are computed, but the decision rule
    that combines them was never written: the original body ended in two bare,
    undefined names (`dy_image`, `dy_nearest_text_cluster`) and so failed with
    NameError on every call. The unfinished state is now reported explicitly.

    Raises
    ------
    NotImplementedError : always, until the decision rule is implemented.
    """
    im_clust, text_clust = pair
    tol=1

    text_df  = page_df[page_df.cluster==text_clust]
    image_df = page_df[page_df.cluster==im_clust]

    x0_t,y0_t, x1_t,y1_t = text_box  = get_df_bbox(text_df)
    x0_i,y0_i, x1_i,y1_i = image_box = get_df_bbox(image_df)

    # Relations intended as inputs of the (missing) decision rule.
    above_top    = y0_i > y1_t        # text ends above the top of the image
    below_bottom = y1_i < y0_t        # text starts below the bottom of the image
    centred      = shared_centre(text_box,image_box,tol)
    within_image_frame = (x0_t >= x0_i-tol) & (x1_t <= x1_i+tol)
    # at least half of the text rows have no category yet
    uncategorised    = (text_df.category=="uncategorised").sum()/len(text_df) >= 0.5

    # TODO: compute dy_image (distance from the text cluster to the image) and
    # dy_nearest_text_cluster (distance to the nearest other text cluster), then
    # combine them with the flags above into a return value.
    raise NotImplementedError(
        "check_image_test_cluster_pair is unfinished: dy_image and "
        "dy_nearest_text_cluster are not computed yet."
    )

In [None]:
# Rebuild the fully labelled 2011 dataframe (text-only clusters) and show page 1;
# the following cells label outside captions on this df.
year, page = 2011, 1
doc    = open_exam(year, "english", "al",1)
doc_width     = doc[0].rect.width
df     = get_doc_line_df(doc)

images = get_images(doc)
images = preproc_images(images)
assign_in_image_captions(df,images)

df = enrich_doc_df_with_images(df,images)
df = clean_line_df(df)
# Return value ignored; presumably df gets its `cluster` column in place - confirm.
identify_all_page_clusters(df,2.0/3.0, 1.15, text_only=True)

# Category labelling; return values are ignored (presumably in place as well).
identify_footers(df)
identify_instructions(df)
identify_section_headers(df)
identify_text_headers(df, doc_width)
identify_subtitles(df, doc_width)
identify_subsubtitles(df, doc_width)

# Boolean indexing without .copy(): writing into page_df later will not reach df.
page_df = df[df.page==page]

# Draw one box per cluster on the page, labelled with the cluster id.
#for page in range(1,9):
rectangs = get_category_boxes(page_df, 'cluster')
labelos  = np.unique(page_df.cluster)
boxed_page = get_bboxed_page_image(doc, page, rectangs,color=(0.0,0,0.0), labels=labelos)

display(boxed_page)

In [None]:
# Label outside captions page by page and write the labels back into df.
# df[df.page==page] is a copy, so labelling that slice alone never reaches df
# and the summary below would always come out empty.
for page in range(2,9):
    on_page = df.page==page
    page_df = identify_outside_captions_page(df[on_page].copy(), 1)
    # Row order is unchanged by the labelling, so assign positionally.
    df.loc[on_page, "category"] = page_df["category"].to_numpy()

df[df.category=="caption2"].head(20)

In [None]:
# Label page 2 on its own copy (avoids writing into a slice of df) and show the
# captions found. The labelling is run once; the earlier duplicate call
# discarded its result.
page_df = df[df.page==2].copy()
out_df = identify_outside_captions_page(page_df, 1)
out_df[out_df.category=="caption2"]