In [1]:
from pdf_scraper.doc_utils   import (open_exam, get_doc_line_df, identify_section_headers,
                                     identify_text_headers,identify_footers, identify_instructions,
                                     identify_subtitles, identify_subsubtitles,get_images,preproc_images,
                                     assign_in_image_captions, identify_vertical_captions,
                                     identify_all_page_clusters, enrich_doc_df_with_images)

from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.image_utils   import get_bboxed_page_image,show_image, show_all_imgs
from pdf_scraper.general_utils import bbox_vert_dist, bbox_horiz_dist
from time import sleep
import numpy as np
import pandas as pd

import os, io, re

from IPython.display import display, clear_output, Image
import matplotlib.pyplot as plt

In [None]:
def get_parsed_df(doc):
    """
    Build the line dataframe for an opened exam document and run the labelling passes on it.

    Steps, in order: extract the lines, assign in-image captions against the
    preprocessed images, clean the frame, enrich it with the images, then run the
    cluster, footer, instruction, section-header, text-header, subtitle and
    subsubtitle identification passes.

    Parameters
    ----------
    doc : the opened document; `doc[0].rect.width` is read as the page width.

    Returns
    -------
    The dataframe after all passes have been applied.
    """
    page_width = doc[0].rect.width
    line_df    = get_doc_line_df(doc)

    page_images = preproc_images(get_images(doc))
    assign_in_image_captions(line_df, page_images)

    line_df = clean_line_df(line_df)
    line_df = enrich_doc_df_with_images(line_df, page_images)

    # The return values of the identify_* passes are not used here, so they are
    # presumably applied for their effect on the frame; their order is kept as is.
    identify_all_page_clusters(line_df, 2.0/3.0, 1.15, text_only=True)
    for frame_only_pass in (identify_footers, identify_instructions, identify_section_headers):
        frame_only_pass(line_df)
    for width_aware_pass in (identify_text_headers, identify_subtitles, identify_subsubtitles):
        width_aware_pass(line_df, page_width)

    return line_df

def view_year_page(year,page):
    """
    Return the image built by get_bboxed_page_image for one page of an exam.

    The exam for `year` is opened and parsed with get_parsed_df, the rows of
    `page` are selected, and the boxes returned by get_category_boxes for the
    "cluster" column are passed on together with the unique cluster values as labels.

    Parameters
    ----------
    year : exam year, passed to open_exam.
    page : page value to select in the parsed dataframe's `page` column.
    """
    doc = open_exam(year)
    # get_parsed_df extracts and preprocesses the images itself, so they are not
    # extracted a second time here (the result was never used).
    df  = get_parsed_df(doc)

    page_df = df[df.page == page]
    rects   = get_category_boxes(page_df, "cluster")
    labels  = np.unique(page_df.cluster)

    return get_bboxed_page_image(doc, page, rects, labels = labels)


# Check and clean dataframe

In [None]:
# Load the labelled lines, discard exact duplicate rows and order them by exam
# year and page with a fresh 0..n-1 index.
df = (
    pd.read_csv("captioned_dfs.csv")
      .drop_duplicates()
      .sort_values(by=["year", "page"], ignore_index=True)
)

# Show the lines currently labelled "caption2".
is_caption_line = df.category == "caption2"
df.loc[is_caption_line, ["text", "page", "year"]]

The check below identifies cases where two labelling run-throughs were made and the same lines were assigned
different categories in each. The code below lets you identify, and then remove, the incorrectly categorised
lines.

In [None]:
# Pair every line with the "caption2" lines that share its text, year and page.
# The merge renumbers rows, so each line's label in `df` is carried through as
# `df_index`; that is the value needed to drop a conflicting line afterwards.
capt_df = df.loc[ df.category=="caption2", ["text","category","year","page"]].copy()
check = (
    df[["text","category","year","page"]]
      .rename_axis("df_index")
      .reset_index()
      .merge(capt_df, how="inner", on=["text","year","page"])
)

# category_x is the line's own label, category_y is always "caption2"; a mismatch
# means the same line was labelled differently in another run-through.
conflicts = check[check.category_x != check.category_y]
assert conflicts.empty, f"{len(conflicts)} conflicting labels:\n{conflicts.to_string()}"


# Once the assertion above has reported conflicting lines, drop the incorrectly categorised ones using their
# `df_index` values, as below.
# NOTE(review): to_csv writes the index as an extra column by default; pass index=False if that is not wanted.
# df.drop([12453,12454],inplace=True)
# df.sort_values(by=["year","page"],ignore_index=True,inplace=True)
# df.to_csv("captioned_dfs.csv")

# Preprocessing

## Drop pages without images and unwanted columns

In [None]:
# Mark, exam by exam, every line that sits on a page with no image; only images
# whose page value is greater than 1 count. The marked rows are removed in one
# step after the loop (row labels are unique after the ignore_index sort above).
kept_columns = ["x0","y0","x1","y1", "mode_font","w", "h", "text","font_size","category","page","cluster","year"]
drop_mask    = pd.Series(False, index=df.index)

for year in range(2001, 2026):
    print(year, end=", ")
    doc         = open_exam(year)
    images      = preproc_images(get_images(doc))
    image_pages = {img["page"] for img in images if img["page"]>1}
    drop_mask  |= (df.year == year) & ~df.page.isin(image_pages)

df = df.loc[~drop_mask, kept_columns]
df.head()

## Drop already categorised lines

In [None]:
# Keep only uncategorised lines, caption lines and image rows; report the row
# count before and after the filter.
kept_categories = ("uncategorised","caption2","image")
n_lines_before  = len(df)
mask = df.category.isin(kept_categories)
df = df[mask]
print(n_lines_before)
print(len(df))

## Convert line_df to block_df

In [None]:
def text_func(text_series):
    """
    Returns True if any text entry in the series likely refers to an image or figure caption.
    Matches, case-insensitively and as whole words only:
      - 'image', 'images', 'image 1', 'image2', 'Image:', etc.
      - 'figure', 'figures', 'fig', 'fig.', 'fig 2', etc.
      - 'above', 'below', with or without colons or punctuation.
    Words that merely begin with a keyword ('fight', 'figment', 'imagery') do not match.

    Entries are converted with str() first, so non-string values are accepted.
    An empty series gives False.
    """
    pattern = re.compile(
        r'\b(?:'
        r'(?:image|fig(?:ure)?)s?|'    # image(s), fig(s), figure(s)
        r'above|below'
        r')'
        # The keyword must not run on into further letters; digits, whitespace and
        # punctuation may follow, so 'image2', 'fig. 1' and 'above:' still match.
        r'(?![^\W\d_])',
        re.IGNORECASE
    )
    return bool(text_series.astype(str).apply(lambda t: bool(pattern.search(t))).any())

def _first_mode(values):
    """First of the modes returned by Series.mode for a group, or None if there is none."""
    modes = values.mode()
    return modes.iat[0] if not modes.empty else None

# How each line-level column is combined into one row per block (cluster):
# the bounding box is the union of the line boxes, sizes are medians, the text
# is the lines joined by newlines, and font/category are the most frequent value.
agg_dict = {
    "x0": "min",
    "y0": "min",
    "x1": "max",
    "y1": "max",
    "mode_font": _first_mode,
    "w": "median",
    "h": "median",
    "text": lambda x: "\n".join(x.astype(str)),
    #"text": text_func,
    "font_size": "median",
    "category": _first_mode
}

# Extra columns computed per block: number of lines (non-null x0 values).
new_col_aggs = {
    "n_lines": ("x0", "count")
}

# Both aggregations share the same group index, so they are joined on it rather
# than attached by position.
block_groups = df.groupby(["year","page","cluster"])
block_df     = block_groups.agg(agg_dict).join(block_groups.agg(**new_col_aggs)).reset_index()

# For image blocks the size is the extent of the bounding box, not the median of
# the line sizes. This is set before the split below so that image_block_df
# carries the same w/h as the image rows of block_df.
image_rows = block_df.category == "image"
block_df.loc[image_rows, "h"] = block_df.loc[image_rows, "y1"] - block_df.loc[image_rows, "y0"]
block_df.loc[image_rows, "w"] = block_df.loc[image_rows, "x1"] - block_df.loc[image_rows, "x0"]

text_block_df  = block_df[block_df.category !="image"].copy()
image_block_df = block_df[block_df.category =="image"].copy()

In [None]:
# For image blocks, overwrite the aggregated w and h with the width and height
# of the block's bounding box, then preview a few of them.
is_image_block = block_df.category == "image"
image_boxes    = block_df.loc[is_image_block]

block_df.loc[is_image_block, "h"] = image_boxes.y1 - image_boxes.y0
block_df.loc[is_image_block, "w"] = image_boxes.x1 - image_boxes.x0
block_df.loc[is_image_block].head(4)

In [None]:
# Caption blocks, and their share of all non-image blocks.
is_caption_block = block_df.category == "caption2"
capt_block_df    = block_df[is_caption_block].copy()

n_text_blocks = len(block_df[block_df.category != "image"])
caption_pct   = 100*len(capt_block_df)/n_text_blocks
print(f'Data is {caption_pct:0.2f}% caption.')
capt_block_df

## Find nearest image to block

In [None]:
# For each block, check the distance to every image on the same page and year,
# and choose the image which is closest. dx has to be 0 for the block to be a vertical caption.
# - keep only the images with dx == 0
# - amongst those, return the one with the lowest dy.

### Via images dictionary

In [None]:
def images_overlapping_with_bbox(bbox, page, images):
    """
    Return the images on `page` whose horizontal distance to `bbox` is zero.

    Each image is a dict; its "page", "bbox" and "number" entries are read.
    A line is printed for every image that is returned.

    Note: These images must be from the same year as the bbox.
    """
    ov_ims = []
    for img in images:
        if img["page"] != page:
            continue
        if bbox_horiz_dist(bbox, img["bbox"]) == 0:
            # Single quotes inside the f-string: reusing the outer quote character
            # is a SyntaxError before Python 3.12.
            print(f"overlaps with image: {img['number']}")
            ov_ims.append(img)
    return ov_ims

def nearest_overlapping_image(bbox, imgs):
    """
    Pick, from images that already overlap `bbox` in the x direction, the one
    with the smallest vertical distance to it.

    Returns a tuple (image, distance). Only distances below 1000.0 are
    considered; if no image qualifies the result is (None, 1000.0). On equal
    distances the earlier image in `imgs` is kept.
    """
    best = (None, 1000.0)
    for candidate in imgs:
        gap = bbox_vert_dist(bbox, candidate["bbox"])
        if gap < best[1]:
            best = (candidate, gap)
    return best


In [None]:
# Worked example: which images does one caption block overlap horizontally?
example_idx = 177
example_row = capt_block_df.loc[example_idx]

# The images must come from the same exam as the block, so the year is read from
# the block itself rather than typed in separately.
year = int(example_row.year)
doc = open_exam(year)
images = preproc_images(get_images(doc))
capt_bbox = get_df_bbox( capt_block_df.loc[[example_idx]])
page = example_row.page
imgs = images_overlapping_with_bbox(capt_bbox, page, images)

show_all_imgs(1,3, imgs )

In [None]:
# Continues the example from the previous cell: capt_bbox and imgs are reused
# rather than re-opening the exam and recomputing the same values.
nearest_img, dy = nearest_overlapping_image(capt_bbox, imgs)

#show_image(nearest_img)

### Via image-enriched dataframe

In [None]:
def get_overlapping_image_indices(row, block_df):
    """
    Return the index labels of the image blocks that horizontally overlap a block.

    Only rows of `block_df` with category "image" and the same year and page as
    `row` are considered. An image overlaps when bbox_horiz_dist between the two
    bounding boxes is 0.

    Parameters
    ----------
    row      : a block; its year, page, x0, y0, x1 and y1 are read.
    block_df : dataframe of blocks with columns year, page, category, x0, y0, x1, y1.

    Returns
    -------
    pandas Index of the matching rows of `block_df`; empty when the page has no
    image blocks or none of them overlap.
    """
    same_page_images = (
        (block_df.category == "image")
        & (block_df.page == row.page)
        & (block_df.year == row.year)
    )
    img_df = block_df[same_page_images]
    bbox   = row.x0, row.y0, row.x1, row.y1

    # Built row by row instead of with DataFrame.apply(axis=1): on an empty frame
    # apply returns a DataFrame, not a Series, which cannot be stored as a column.
    overlaps = np.array(
        [bbox_horiz_dist(bbox, (im.x0, im.y0, im.x1, im.y1)) == 0 for im in img_df.itertuples()],
        dtype=bool,
    )
    return img_df.index[overlaps]

In [None]:
# Step-by-step version of get_overlapping_image_indices for one block:
# count the image blocks on its page that overlap it horizontally.
row  = block_df.loc[227]
year = row.year
page = row.page
bbox = row.x0, row.y0, row.x1, row.y1

on_same_page = (block_df.year==year) & (block_df.page==page)
img_df       = block_df[(block_df.category=="image") & on_same_page].copy()

img_df["dx"] = img_df.apply( lambda im: bbox_horiz_dist(bbox, (im.x0, im.y0, im.x1, im.y1) ), axis=1)
indices = img_df.index[img_df.dx==0]
len(indices)

In [None]:
# Find the overlapping image blocks once per text block and derive both columns
# from that single result:
#   im_ovs - number of image blocks the text block overlaps horizontally
#   img_ov - True when there is at least one
all_ovs = text_block_df.apply(lambda x: get_overlapping_image_indices(x, block_df), axis=1)

text_block_df["im_ovs"] = all_ovs.apply(lambda idx: len(idx))
text_block_df["img_ov"] = text_block_df["im_ovs"] > 0
text_block_df.head(10)

In [None]:
# Share of caption blocks among the text blocks that overlap at least one image.
overlapping_text_blocks = text_block_df[text_block_df.img_ov]
n_overlapping           = len(overlapping_text_blocks)

print(n_overlapping)
print(len(text_block_df))
print(f'Data is {100*len(capt_block_df)/n_overlapping:0.2f}% caption.')

In [None]:
def get_nearest_image_index(row, block_df):
    """
    Index label in `block_df` of the horizontally overlapping image block with the
    smallest bbox_vert_dist to `row`, or np.nan when no image block overlaps it.
    """
    candidates = get_overlapping_image_indices(row, block_df)
    if len(candidates) == 0:
        return np.nan

    own_bbox = row.x0, row.y0, row.x1, row.y1
    vertical_gaps = block_df.loc[candidates].apply(
        lambda im: bbox_vert_dist(own_bbox, (im.x0, im.y0, im.x1, im.y1)),
        axis=1,
    )
    return vertical_gaps.idxmin()

def get_nearest_image_distance(row , block_df):
    """
    bbox_vert_dist between `row` and its nearest overlapping image block, as chosen
    by get_nearest_image_index; np.nan when there is no such image block.
    """
    own_bbox = row.x0, row.y0, row.x1, row.y1

    nearest_label = get_nearest_image_index(row, block_df)
    if np.isnan(nearest_label):
        return np.nan

    nearest = block_df.loc[nearest_label]
    return bbox_vert_dist(own_bbox, (nearest.x0, nearest.y0, nearest.x1, nearest.y1))

In [None]:
# Example: look up the nearest overlapping image block for the caption block labelled 227
# and show that image block's row.
# NOTE(review): .loc raises KeyError if 227 is not a caption block, or if no image
# overlaps it (im_idx is then np.nan) - confirm 227 is still valid for the current data.
row = capt_block_df.loc[227]
im_idx = get_nearest_image_index(row, block_df)
image_block_df.loc[im_idx]

In [None]:
# Vertical distance from the example block (`row`, set in the previous cell) to its nearest overlapping image.
get_nearest_image_distance(row, block_df)

In [None]:
# The same distance for every text block; np.nan where a block overlaps no image.
text_block_df.apply(lambda x: get_nearest_image_distance(x, block_df), axis=1)

## Define block properties relative to nearest image

In [None]:
def get_nearest_image_relative_width(row , block_df):
    """
    Ratio of the block's width (`row.w`) to the width of its nearest overlapping
    image block; np.nan when get_nearest_image_index finds no image block.
    """
    nearest_label = get_nearest_image_index(row, block_df)
    if np.isnan(nearest_label):
        return np.nan

    return row.w / block_df.loc[nearest_label].w

# Relative width for every text block.
text_block_df.apply(get_nearest_image_relative_width, axis=1, args=(block_df,))

In [None]:
def get_nearest_image_centre_offset(row , block_df):
    """
    Absolute horizontal distance between the centre of the block and the centre of
    its nearest overlapping image block, where a centre is (x0 + x1)/2; np.nan
    when get_nearest_image_index finds no image block.
    """
    def x_centre(block):
        return (block.x0 + block.x1)/2

    own_centre = x_centre(row)

    nearest_label = get_nearest_image_index(row, block_df)
    if np.isnan(nearest_label):
        return np.nan

    return abs(x_centre(block_df.loc[nearest_label]) - own_centre)

# Centre offset for every text block.
text_block_df.apply(get_nearest_image_centre_offset, axis=1, args=(block_df,))

## Distance to nearest text block

In [None]:
def get_distance_to_nearest_text_block(row, block_df):
    """
    Smallest bbox_vert_dist from a block to another non-image block on the same
    year and page that overlaps it horizontally (bbox_horiz_dist == 0).

    The block itself is excluded by its index label (`row.name`). Returns np.nan
    when there is no such block.
    """
    own_bbox = row.x0, row.y0, row.x1, row.y1

    def as_bbox(block):
        return block.x0, block.y0, block.x1, block.y1

    is_other_text_block = (
        (block_df.category != "image")
        & (block_df.year == row.year)
        & (block_df.page == row.page)
        & (block_df.index != row.name)
    )
    text_df = block_df.loc[is_other_text_block]

    overlapping = text_df.apply(lambda blk: bbox_horiz_dist(own_bbox, as_bbox(blk)) == 0, axis=1)
    dy = text_df[overlapping].apply(lambda blk: bbox_vert_dist(own_bbox, as_bbox(blk)), axis=1)

    return dy.min() if len(dy) > 0 else np.nan

# Distance to the nearest text block for every text block.
text_block_df.apply(get_distance_to_nearest_text_block, axis=1, args=(block_df,))