In [1]:
from pdf_scraper.doc_utils   import (open_exam, get_doc_line_df, identify_section_headers,
                                     identify_text_headers,identify_footers, identify_instructions,
                                     identify_subtitles, identify_subsubtitles,get_images,preproc_images,
                                     assign_in_image_captions, identify_vertical_captions,
                                     identify_all_page_clusters, enrich_doc_df_with_images)

from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.image_utils   import get_bboxed_page_image,show_image, show_all_imgs
from pdf_scraper.general_utils import bbox_vert_dist, bbox_horiz_dist
from time import sleep
import numpy as np
import pandas as pd

import os, io, re

from IPython.display import display, clear_output, Image
import matplotlib.pyplot as plt

In [2]:
def get_parsed_df(doc):
    """
    Build the parsed line dataframe for an opened exam document.

    The line dataframe and the preprocessed images are extracted from `doc`, in-image captions are
    assigned, the lines are cleaned and the images are merged into the dataframe. The pages are then
    clustered and the identify_* passes are run in a fixed order. The return values of those passes
    are not captured, so they are expected to annotate the dataframe in place.

    Returns the resulting dataframe.
    """
    line_df    = get_doc_line_df(doc)
    page_width = doc[0].rect.width
    exam_imgs  = preproc_images(get_images(doc))

    assign_in_image_captions(line_df, exam_imgs)
    parsed = enrich_doc_df_with_images(clean_line_df(line_df), exam_imgs)

    identify_all_page_clusters(parsed, 2.0/3.0, 1.15, text_only=True)

    # Passes which only take the dataframe, in the order in which they must run.
    for tag_lines in (identify_footers, identify_instructions, identify_section_headers):
        tag_lines(parsed)
    # Passes which additionally take the width of the first page.
    for tag_lines in (identify_text_headers, identify_subtitles, identify_subsubtitles):
        tag_lines(parsed, page_width)

    return parsed

In [3]:
def get_caption_input(df, page):
    """
    Interactively mark lines of `page` as captions.

    Repeatedly prompts for a piece of caption text and searches the lines of `page` for it
    (case-insensitive, literal substring). A unique match gets the category "caption2". When several
    lines match they are displayed and the user selects one of them by its dataframe index.
    Entering an empty caption text ends the loop.

    Parameters
    ----------
    df   : DataFrame with "page", "text" and "category" columns. It is modified in place.
    page : page value whose lines are searched.

    Returns
    -------
    None
    """
    while True:
        caption_input = input("Caption text: ").strip().lower()
        if caption_input == "":
            break

        # regex=False: the typed text is matched literally, so characters such as "(" or "?" neither
        # raise nor act as regex operators. na=False: lines with missing text never match.
        found   = df.text.str.lower().str.contains(caption_input, regex=False, na=False)
        matches = df[(df.page == page) & found]
        if matches.empty:
            print("No match found. Retype, perhaps with more of line.")
            continue
        elif len(matches) > 1:
            print(f"{len(matches)} matches:")
            display(matches.head(10))
            index_input = input("Select index: ")
            try:
                index_input = int(index_input)
            except ValueError:
                # A typo here must not abort the whole labelling session.
                print("Not a valid index. Retype the caption text.")
                continue
            if index_input not in matches.index:
                # Guards against marking an unrelated line (possibly on another page).
                print("Index is not one of the matches. Retype the caption text.")
                continue
            matches = df.loc[[index_input]]

        df.loc[matches.index, "category"] = "caption2"
        print(f"Marked: {matches.text.values[0]} as caption2.")

In [4]:
def update_image(image_handle, img):
    """
    Render `img` as a PNG and push it to an existing display handle.

    The image is drawn without axes on a 16x10 figure, saved to an in-memory buffer and the figure
    is closed again. `image_handle.update` is then called with the PNG and the handle is returned.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.imshow(img)
    ax.axis("off")
    fig.tight_layout()

    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format="png")
    plt.close(fig)

    image_handle.update(Image(data=png_buffer.getvalue()))
    return image_handle


In [52]:
def view_year_page(year,page):
    """
    Return the image of `page` of the `year` exam with the cluster boxes of that page drawn on it.

    The exam is opened and parsed with get_parsed_df. The boxes come from
    get_category_boxes(page_df, "cluster") and are labelled with the unique cluster values of the page.
    """
    doc = open_exam(year)
    # get_parsed_df extracts and preprocesses the images itself, so they are not loaded a second time here.
    df  = get_parsed_df(doc)

    page_df = df[df.page == page]
    rects   = get_category_boxes(page_df, "cluster")
    img     = get_bboxed_page_image(doc, page, rects, labels = np.unique(page_df.cluster))

    return img

In [None]:
dfs = []
output_csv = "captioned_dfs.csv"


def _append_to_csv(year_df, csv_path):
    """Append `year_df` to `csv_path`; the header row is only written when the file does not exist yet."""
    year_df.to_csv(csv_path, mode="a", header=not os.path.exists(csv_path), index=False)


# Interactive labelling: for every page with images the user types in the caption lines, which are
# then marked with the category "caption2". Each year is appended to `output_csv` when it is done.
for year in range(2020, 2021):
    clear_output(wait=True)
    print(f"\n----- Year: {year} -----\n")
    doc         = open_exam(year)
    images      = preproc_images(get_images(doc))
    df          = get_parsed_df(doc)
    df["year"]  = year

    # Years 2001-2010 are written out as parsed, without the interactive caption pass.
    if year in range(2001, 2011):
        _append_to_csv(df, output_csv)
        dfs.append(df)
        continue

    image_handle = display(None, display_id=True)
    image_pages  = {img["page"] for img in images if img["page"] > 1}
    for page in sorted(image_pages):
        on_page = (df.page == page)
        page_df = df[on_page]
        rects   = get_category_boxes(page_df, "cluster")
        img     = get_bboxed_page_image(doc, page, rects, labels = np.unique(page_df.cluster))

        image_handle = update_image(image_handle, img)
        print(f"\n{year} --- Page {page} ---")

        has_captions = input("Are there captions on this page? (y/n): ").strip().lower()
        if has_captions not in ("y", "yes"):
            print("Skipping this page.\n")
            continue

        # Snapshot of the parsed categories: a rejected labelling attempt is undone by restoring
        # them, so wrongly marked lines get their original category back.
        categories_before = df.loc[on_page, "category"].copy()
        get_caption_input(df, page)

        # Keep correcting until the user confirms the captions of this page.
        while True:
            display(df.loc[(df.category=="caption2") & on_page, ["text","page"]].head())
            good_page = input("Are these captions correct? (y/n): ").strip().lower()
            if good_page in ("y", "yes"):
                break
            df.loc[on_page, "category"] = categories_before
            get_caption_input(df, page)
        print(f"Done with page {page}.\n")

    _append_to_csv(df, output_csv)
    dfs.append(df)

big_df = pd.concat(dfs, ignore_index=True)
print("\nFinal dataframe shape:", big_df.shape)

# Postprocessing

## Check and clean dataframe

In [5]:
# Load the labelled lines, remove exact duplicate rows (the labelling loop appends to the CSV)
# and order them by year and page with a fresh index.
df = (
    pd.read_csv("captioned_dfs.csv")
      .drop_duplicates()
      .sort_values(by=["year", "page"], ignore_index=True)
)
df.loc[df.category == "caption2", ["text", "page", "year"]]

Unnamed: 0,text,page,year
4141,Juliet Manet with Cat by Renoir,2,2011
4184,Summer: Cat on a balustrade,3,2011
4186,by T.A. Steinlen,3,2011
4726,"Mary Robinson with Nadhifa Ibrahim Mohamed,",4,2012
4728,a health-worker in Somalia,4,2012
4767,"Detail from Famine Memorial, Dublin",4,2012
4790,FACT: THE CRISIS IN THE HORN OF AFRICA IS UNLIKE,5,2012
4791,"ANY OTHER – DISPLACING, STARVING, KILLING OVER",5,2012
4792,13 MILLION PEOPLE,5,2012
4795,2004 Indonesia 2010 Haiti ...,5,2012


The check below identifies cases where two labelling run-throughs were made and different categories
were assigned to the same lines in the different run-throughs. If the assertion fails, inspect the conflicting
rows to identify the incorrectly categorised lines, which can then be removed.

In [6]:
# Every line marked "caption2" is matched (on text, year and page) against all lines of `df`.
# A match with a different category means the same line was labelled differently in two run-throughs.
label_cols = ["text", "category", "year", "page"]
capt_df = df.loc[df.category == "caption2", label_cols].copy()

# The row labels of `df` are kept in a "df_index" column: a merge discards the index, and the
# labels are what is needed to drop the wrongly categorised lines afterwards.
check = (
    df[label_cols]
    .rename_axis("df_index")
    .reset_index()
    .merge(capt_df, how="inner", on=["text", "year", "page"])
)
conflicts = check[check.category_x != check.category_y]
assert conflicts.empty, (
    f"{len(conflicts)} lines have conflicting categories; inspect `conflicts`. "
    f"Row labels in df: {conflicts.df_index.tolist()}"
)

In [None]:
# Once you have identified incorrectly categorised lines using the code above, you can drop them using their
# indices as below.
# df.drop([12453,12454],inplace=True)
# df.sort_values(by=["year","page"],ignore_index=True,inplace=True)
# df.to_csv("captioned_dfs.csv", index=False)

## Enrich for caption detection

### Drop pages without images and unwanted columns

In [7]:
# For every exam year, remove the rows of `df` which lie on pages without images.
# Only images on pages after the first page count.
for year in range(2001, 2026):
    print(year, end=", ")
    doc         = open_exam(year)
    images      = preproc_images(get_images(doc))
    image_pages = {im["page"] for im in images if im["page"] > 1}

    imageless_rows = df.index[(df.year == year) & ~df.page.isin(image_pages)]
    df.drop(index=imageless_rows, inplace=True)

# Columns kept for caption detection.
kept_columns = ["x0","y0","x1","y1", "mode_font","w", "h", "text","font_size","category","page","cluster","year"]
df = df[kept_columns]
df.head()

2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 

Unnamed: 0,x0,y0,x1,y1,mode_font,w,h,text,font_size,category,page,cluster,year
31,251.158096,29.532829,344.11377,54.46283,Times-Bold,92.955673,24.93,SECTION I,18.0,section,2,0,2001
32,165.719299,51.132805,429.556122,76.062805,Times-Bold,263.836823,24.93,COMPREHENDING (100 marks),18.0,section,2,0,2001
33,266.317688,86.132011,328.961334,111.062012,Times-Bold,62.643646,24.93,TEXT 1,18.0,title,2,1,2001
34,240.658691,104.132011,354.61673,129.062012,Times-Bold,113.958038,24.93,BEING IRISH,18.0,title,2,1,2001
35,50.188999,143.396805,545.094421,160.0168,Times-Bold,494.905422,16.619995,The following extracts are adapted from the bo...,12.0,subtitle,2,2,2001


### Drop already categorised lines

In [8]:
# Only uncategorised lines, labelled captions and images are kept; the line counts before and
# after the filter are printed.
kept_categories = ["uncategorised", "caption2", "image"]
mask = df.category.isin(kept_categories)
print(len(df))
df = df.loc[mask]
print(len(df))

5779
5196


### Convert line_df to block_df

In [29]:
# Compiled once at definition time instead of on every call.
# The trailing \b makes the keywords match as whole words only: "fight", "imagery" or "aboveboard"
# are not caption hints, while "image2", "fig. 1", "Figures 3" and "above:" still are.
_CAPTION_HINT_PATTERN = re.compile(
    r'\b(?:'
    r'(?:image|fig(?:ure)?)s?\s*\d*|'  # image, images, image 1, image2, fig, fig. 1, figure 2, ...
    r'above|'                          # above, above:
    r'below'                           # below, below:
    r')\b',
    re.IGNORECASE
)

def text_func(text_series):
    """
    Returns True if any text entry in the series likely refers to an image or figure caption.
    Matches whole words (case-insensitive) like:
      - 'image', 'images', 'image 1', 'Image:', 'image2', etc.
      - 'figure', 'figure 1', 'fig.', 'fig 2', etc.
      - 'above', 'below', with or without colons or punctuation.
    Words which merely start with one of the keywords ('fight', 'imagery') do not match.

    Entries are converted with str() before matching. An empty series gives False.
    """
    return any(_CAPTION_HINT_PATTERN.search(text) for text in text_series.astype(str))

def _first_mode(values):
    """Return the first entry of `values.mode()`, or None when the mode is empty."""
    modes = values.mode()
    return modes.iat[0] if not modes.empty else None

# How the lines of one cluster are combined into a single block row.
agg_dict = {
    "x0": "min",
    "y0": "min",
    "x1": "max",
    "y1": "max",
    "mode_font": _first_mode,
    "w": "median",
    "h": "median",
    "text": lambda x: "\n".join(x.astype(str)),
    #"text": text_func,
    "font_size": "median",
    "category": _first_mode
}

# Additional per-block columns as named aggregations: new column -> (source column, aggregation).
new_col_aggs = {
    "n_lines": ("x0", "count")
}

# One groupby serves both aggregations. The extra columns are joined on the group keys, so the
# line counts are attached to their block by key rather than by row position.
line_groups = df.groupby(["year","page","cluster"])
block_df    = line_groups.agg(agg_dict).join(line_groups.agg(**new_col_aggs)).reset_index()


text_block_df  = block_df[block_df.category !="image"].copy()
image_block_df = block_df[block_df.category =="image"].copy()

In [30]:
# Image blocks get their height and width from their bounding box.
is_image_block = block_df.category == "image"
image_rows     = block_df.loc[is_image_block]

block_df.loc[is_image_block, "h"] = image_rows.y1 - image_rows.y0
block_df.loc[is_image_block, "w"] = image_rows.x1 - image_rows.x0
block_df[is_image_block].head(4)

Unnamed: 0,year,page,cluster,x0,y0,x1,y1,mode_font,w,h,text,font_size,category,n_lines
4,2001,2,8,46.619999,221.170013,142.619995,359.170013,,95.999996,138.0,<image>,,image,1
5,2001,2,9,310.112793,221.675201,407.238892,358.720001,,97.126099,137.0448,<image>,,image,1
6,2001,2,10,46.73,517.570435,168.786896,649.460022,,122.056896,131.889587,<image>,,image,1
7,2001,2,11,310.175995,520.043945,419.066986,648.779907,,108.890991,128.735962,<image>,,image,1


In [31]:
# Caption blocks, and their share amongst all non-image blocks.
is_caption    = block_df.category == "caption2"
n_text_blocks = len(block_df[block_df.category != "image"])

capt_block_df = block_df[is_caption].copy()
print(f'Data is {100*len(capt_block_df)/n_text_blocks:0.2f}% caption.')
capt_block_df

Data is 2.16% caption.


Unnamed: 0,year,page,cluster,x0,y0,x1,y1,mode_font,w,h,text,font_size,category,n_lines
227,2011,2,14,360.779999,667.547668,522.480042,680.831665,TimesNewRomanPS-BoldItal,182.700043,13.283997,Juliet Manet with Cat by Renoir,12.0,caption2,1
240,2011,3,14,362.160004,359.087708,508.155273,386.171692,TimesNewRomanPS-BoldItal,183.090134,13.283997,Summer: Cat on a balustrade\nby T.A. Steinlen,12.0,caption2,2
278,2012,4,8,81.419998,787.016479,250.958523,799.171387,TimesNewRomanPS-ItalicMT,172.282974,12.154907,"Detail from Famine Memorial, Dublin",10.977799,caption2,1
279,2012,4,10,327.480011,388.796539,543.179313,413.550934,TimesNewRomanPS-ItalicMT,172.289093,12.154846,"Mary Robinson with Nadhifa Ibrahim Mohamed,\na...",10.977799,caption2,2
286,2012,5,1,143.399994,204.179672,462.702971,245.111649,Arial-BoldMT,322.013031,13.391998,FACT: THE CRISIS IN THE HORN OF AFRICA IS UNLI...,12.0,caption2,3
287,2012,5,3,137.399994,416.331604,407.803612,438.973785,Arial-BoldMT,263.789902,11.182327,2004 Indonesia 2010 Haiti ...,10.02,caption2,2
343,2013,4,8,399.600006,497.087708,475.274323,510.371704,TimesNewRomanPS-BoldItal,78.674316,13.283997,William Trevor,12.0,caption2,1
354,2013,6,6,340.440002,541.307617,536.157593,582.191711,TimesNewRomanPS-BoldItal,177.338501,13.283997,Image 1\nJohn Collier’s iconic photograph of t...,12.0,caption2,3
371,2013,7,14,331.139984,418.247711,530.839233,445.271698,TimesNewRomanPS-BoldItal,121.816788,13.283997,Image 2\nThe concourse of Grand Central Station,12.0,caption2,2
384,2014,3,14,349.140015,348.011871,512.487248,359.104004,TimesNewRomanPS-BoldMT,165.852234,11.092133,Canada by Richard Ford – book cover,10.02,caption2,1


### Find nearest image to block

In [12]:
# For each block, check the distance to every image on the same page and year,
# and choose the image which is closest. dx will have to be 0 for it to be a vertical caption.
# - keep only the images with dx == 0
# - amongst those, return the one with the lowest dy.

In [32]:
def images_overlapping_with_bbox(bbox, page, images):
    """
    Return the images on `page` whose horizontal distance to `bbox` is zero.

    Each image is a dict with (at least) the keys "page", "bbox" and "number". A message with the
    image number is printed for every overlapping image. The images are returned in input order.

    Note: These images must be from the same year as the bbox.
    """
    ov_ims = []
    for img in images:
        if img["page"] != page:
            continue
        if bbox_horiz_dist(bbox, img["bbox"]) == 0:
            # Single quotes inside the f-string: reusing the outer double quotes is a SyntaxError
            # on Python versions before 3.12.
            print(f"overlaps with image: {img['number']}")
            ov_ims.append(img)
    return ov_ims


In [None]:
# Worked example for a single block: how many images on its page overlap with it horizontally?
row        = block_df.loc[227]
year, page = row.year, row.page
bbox       = (row.x0, row.y0, row.x1, row.y1)

same_page_images = (block_df.category == "image") & (block_df.year == year) & (block_df.page == page)
img_df = block_df[same_page_images].copy()
img_df["dx"] = img_df.apply(lambda im: bbox_horiz_dist(bbox, (im.x0, im.y0, im.x1, im.y1)), axis=1)

indices = img_df.loc[img_df.dx == 0].index
indices.shape[0]

In [None]:
def get_overlapping_image_indices(row, block_df):
    """
    Return the index labels of the image blocks which overlap horizontally with `row`.

    Only image blocks (category "image") of the same year and page as `row` are considered. An image
    overlaps when bbox_horiz_dist between the bounding box of `row` and that of the image is 0.

    Parameters
    ----------
    row      : block row with x0, y0, x1, y1, year and page.
    block_df : DataFrame of blocks with the columns x0, y0, x1, y1, year, page and category.

    Returns
    -------
    pandas Index with the labels of the overlapping image blocks, in the order of `block_df`.
    It is empty when no image overlaps or when the page has no image blocks.
    """
    same_page_images = (
        (block_df.category == "image") & (block_df.year == row.year) & (block_df.page == row.page)
    )
    img_df = block_df.loc[same_page_images, ["x0", "y0", "x1", "y1"]]
    bbox   = row.x0, row.y0, row.x1, row.y1

    # Built as an explicit boolean array: DataFrame.apply(axis=1) on an empty frame returns a
    # DataFrame rather than a Series, which cannot be relied on for pages without image blocks.
    overlaps = np.array(
        [bbox_horiz_dist(bbox, im_bbox) == 0 for im_bbox in img_df.itertuples(index=False, name=None)],
        dtype=bool,
    )
    return img_df.index[overlaps]

In [46]:
# The overlapping image indices are searched once per text block; both columns are derived from
# that single result:
#   im_ovs - number of images which overlap horizontally with the block
#   img_ov - whether there is at least one such image
all_ovs = text_block_df.apply(lambda x: get_overlapping_image_indices(x, block_df), axis=1)
text_block_df["im_ovs"] = all_ovs.apply(len)
text_block_df["img_ov"] = text_block_df["im_ovs"] > 0
text_block_df.head(10)

Unnamed: 0,year,page,cluster,x0,y0,x1,y1,mode_font,w,h,text,font_size,category,n_lines,img_ov,im_ovs
0,2001,2,3,48.188599,218.503723,283.469727,488.539734,Times-Roman,136.996651,16.007996,"Jennifer Johnston, is a\nwriter and was born i...",11.696153,uncategorised,18,True,2
1,2001,2,4,48.188583,518.503723,283.468658,728.539734,Times-Roman,112.85141,16.007996,"Seán\nMcCague,\nis\nPresident of the Gaelic\nA...",11.696153,uncategorised,18,True,2
2,2001,2,6,311.810181,218.503723,547.106323,503.539734,Times-Roman,135.739746,16.007996,"Polly Devlin, is a writer,\nbroadcaster and co...",11.696153,uncategorised,19,True,2
3,2001,2,7,311.810181,518.503723,547.085693,743.539734,Times-Roman,124.12085,16.007996,"Brian Kennedy, is a singer\nfrom Belfast.\nSon...",11.696153,uncategorised,15,True,2
8,2001,3,0,312.085876,55.187927,547.36145,239.218369,Times-Roman,139.208252,16.007996,Martin Mansergh is special\nadviser to the Tao...,11.696153,uncategorised,14,True,1
9,2001,3,3,62.637798,425.432922,119.941399,442.052917,Times-Bold,57.3036,16.619995,Question A,12.0,uncategorised,1,True,1
10,2001,3,4,69.971001,455.324921,488.537262,471.332916,Times-Roman,201.601933,16.007996,(i)\nWhat aspects of Irishness emerge most str...,12.0,uncategorised,2,True,2
11,2001,3,5,66.637398,485.324921,533.180786,516.332947,Times-Roman,188.552483,16.007996,"(ii)\nIn your opinion, which one of the writer...",12.0,uncategorised,4,True,2
12,2001,3,6,63.303799,530.324951,533.173523,561.332947,Times-Roman,186.507057,16.007996,(iii)\nChoose one of the people in the above t...,12.0,uncategorised,4,True,2
13,2001,3,7,62.638988,590.432922,119.940186,607.052917,Times-Bold,57.301197,16.619995,Question B,12.0,uncategorised,1,True,1


In [None]:
def get_nearest_image_index(row, block_df):
    """
    Return the index label of the horizontally-overlapping image block with the smallest
    bbox_vert_dist to `row`, or NaN when no image block overlaps.
    """
    candidates = block_df.loc[get_overlapping_image_indices(row, block_df)]
    if len(candidates) == 0:
        return np.nan

    text_bbox     = (row.x0, row.y0, row.x1, row.y1)
    vertical_gaps = candidates.apply(
        lambda im: bbox_vert_dist(text_bbox, (im.x0, im.y0, im.x1, im.y1)), axis=1
    )
    return vertical_gaps.idxmin()

def get_nearest_image_distance(row , block_df):
    """
    Return bbox_vert_dist between `row` and its nearest horizontally-overlapping image block,
    or NaN when there is no such image.
    """
    nearest = get_nearest_image_index(row, block_df)
    if np.isnan(nearest):
        return np.nan

    image = block_df.loc[nearest]
    return bbox_vert_dist(
        (row.x0, row.y0, row.x1, row.y1),
        (image.x0, image.y0, image.x1, image.y1),
    )

In [1]:
# Example: look up the image block nearest to caption block 227.
# Requires capt_block_df, block_df and image_block_df from the cells above. The NameError recorded
# as this cell's output comes from a run in which capt_block_df had not been defined.
row = capt_block_df.loc[227]
im_idx = get_nearest_image_index(row, block_df)
image_block_df.loc[im_idx]

NameError: name 'capt_block_df' is not defined

In [78]:
# Vertical distance from the example `row` to its nearest horizontally-overlapping image (NaN if none).
get_nearest_image_distance(row, block_df)

nan

In [77]:
# Distance to the nearest horizontally-overlapping image for every text block (NaN when no image overlaps).
text_block_df.apply(lambda x: get_nearest_image_distance(x, block_df), axis=1)

0        0.000000
1        0.000000
2        0.000000
3        0.000000
8        0.000000
          ...    
755           NaN
756    120.539703
757    320.699707
759      0.000000
760           NaN
Length: 649, dtype: float64

In [79]:
def get_nearest_image_relative_width(row , block_df):
    """
    Return the width `w` of `row` divided by the width `w` of its nearest horizontally-overlapping
    image block, or NaN when there is no such image.
    """
    nearest = get_nearest_image_index(row, block_df)
    if np.isnan(nearest):
        return np.nan

    return row.w / block_df.loc[nearest, "w"]

In [80]:
# Width of every text block relative to the width of its nearest overlapping image (NaN when no image overlaps).
text_block_df.apply(lambda x: get_nearest_image_relative_width(x, block_df), axis=1)

0      1.427049
1      0.924580
2      1.397562
3      1.139863
8      1.407551
         ...   
755         NaN
756    0.981770
757    0.959969
759    1.006635
760         NaN
Length: 649, dtype: float64

In [81]:
def get_nearest_image_centre_offset(row , block_df):
    """
    Return the absolute difference between the horizontal centre of `row` and the horizontal centre
    of its nearest horizontally-overlapping image block, or NaN when there is no such image.
    """
    nearest = get_nearest_image_index(row, block_df)
    if np.isnan(nearest):
        return np.nan

    image      = block_df.loc[nearest]
    text_mid_x = (row.x0 + row.x1)/2
    img_mid_x  = (image.x0 + image.x1)/2

    return abs(img_mid_x - text_mid_x)

In [82]:
# Horizontal centre offset between every text block and its nearest overlapping image (NaN when no image overlaps).
text_block_df.apply(lambda x: get_nearest_image_centre_offset(x, block_df), axis=1)

0       71.209166
1       58.070173
2       70.782410
3       64.826447
8       73.333160
          ...    
755           NaN
756      2.241302
757      3.972472
759    124.903872
760           NaN
Length: 649, dtype: float64

In [88]:
def get_distance_to_nearest_text_block(row, block_df):
    """
    Return the smallest bbox_vert_dist between `row` and the other non-image blocks on the same
    year and page which overlap with it horizontally (bbox_horiz_dist == 0).

    Parameters
    ----------
    row      : block row. `row.name` must be its label in `block_df`, so that the block itself
               can be excluded.
    block_df : DataFrame of blocks with the columns x0, y0, x1, y1, year, page and category.

    Returns
    -------
    The smallest vertical distance, or NaN when no other text block overlaps horizontally.
    """
    bbox1 = row.x0, row.y0, row.x1, row.y1

    candidates = (
        (block_df.index != row.name)
        & (block_df.category != "image")
        & (block_df.year == row.year)
        & (block_df.page == row.page)
    )
    text_df = block_df.loc[candidates, ["x0", "y0", "x1", "y1"]]

    # Plain iteration instead of DataFrame.apply(axis=1): apply returns a DataFrame rather than a
    # Series for an empty frame, which happens when the block is the only text block on its page.
    gaps = [
        bbox_vert_dist(bbox1, other)
        for other in text_df.itertuples(index=False, name=None)
        if bbox_horiz_dist(bbox1, other) == 0
    ]
    # Series.min() gives NaN for an empty list.
    return pd.Series(gaps, dtype=float).min()

In [89]:
# Vertical distance from every text block to the nearest other text block which overlaps with it horizontally.
text_block_df.apply(lambda x: get_distance_to_nearest_text_block(x, block_df), axis=1)

0       29.963989
1       29.963989
2       14.963989
3       14.963989
8      216.106552
          ...    
755     12.419922
756     12.420013
757     12.420044
759     14.820129
760     14.820129
Length: 649, dtype: float64

In [None]:
# Example: show the images which overlap horizontally with one caption block.
# Block 227 is used because it is the first caption block in the table displayed above;
# label 177 does not appear amongst the caption blocks there, so .loc[177] raises a KeyError.
# The year is taken from the block itself so that the exam always matches the caption.
example_idx = 227
year      = int(capt_block_df.loc[example_idx].year)
doc       = open_exam(year)
images    = preproc_images(get_images(doc))
capt_bbox = get_df_bbox( capt_block_df.loc[[example_idx]])
page      = capt_block_df.loc[example_idx].page
imgs      = images_overlapping_with_bbox(capt_bbox, page, images)

show_all_imgs(1,3, imgs )


In [None]:
def nearest_overlapping_image(bbox, imgs, max_dist=1000.0):
    """
    Of those images which overlap in the x direction, which is the nearest vertically?

    Parameters
    ----------
    bbox     : bounding box which is compared against the images.
    imgs     : iterable of image dicts with a "bbox" entry.
    max_dist : images are only considered when their bbox_vert_dist to `bbox` is strictly smaller
               than this value. The default of 1000.0 is the cut-off which used to be hard-coded.

    Returns
    -------
    (nearest_image, dy_min): the nearest image and its vertical distance. When no image is closer
    than `max_dist` (or `imgs` is empty) this is (None, max_dist). Of several equally near images
    the first one is returned.
    """
    nearest_image, dy_min = None, max_dist
    for im in imgs:
        dy = bbox_vert_dist(bbox, im["bbox"])
        if dy < dy_min:
            nearest_image, dy_min = im, dy
    return (nearest_image, dy_min)

In [None]:
# Example: find the image which is vertically nearest to one caption block.
# Block 227 is used because it is the first caption block in the table displayed above;
# label 177 does not appear amongst the caption blocks there, so .loc[177] raises a KeyError.
# The year is taken from the block itself so that the exam always matches the caption.
example_idx = 227
year      = int(capt_block_df.loc[example_idx].year)
doc       = open_exam(year)
images    = preproc_images(get_images(doc))
capt_bbox = get_df_bbox( capt_block_df.loc[[example_idx]])
page      = capt_block_df.loc[example_idx].page
imgs      = images_overlapping_with_bbox(capt_bbox, page, images)
nearest_img, dy = nearest_overlapping_image(capt_bbox, imgs)

#show_image(img )

In [None]:
# Block 227 is the first caption block in the table displayed above; label 177 does not appear
# amongst the caption blocks there, so .loc[177] raises a KeyError.
row = capt_block_df.loc[227]
row.year

In [None]:
def dy_to_nearest_image(row):
    """
    Unfinished stub: it only reads `row.year` into a local variable and returns None.

    TODO: implement, or remove this function.
    """
    year = row.year

### Is block aligned in x with image