In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from fitz import Rect
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO

import numpy as np


from pdf_scraper.doc_utils import open_exam, get_images
from pdf_scraper.doc_utils import get_doc_line_df
from pdf_scraper.doc_utils import get_captions

from pdf_scraper.doc_utils import filter_images, identify_instructions, identify_footers, identify_section_headers

from pdf_scraper.line_utils import closest_image,closest_line, closest_vertical_line
from pdf_scraper.general_utils import bbox_vert_dist, bbox_distance
from pdf_scraper.general_utils import bbox_horiz_dist, shared_centre

In [None]:
def show_image(image):
    """Show a single extracted image inline in the notebook.

    `image` is a mapping whose "image" entry holds the encoded image bytes.
    The bytes are wrapped in an in-memory stream, decoded with PIL and handed
    to the notebook's `display`.
    """
    display(Image.open(BytesIO(image["image"])))

def show_all_imgs(nrows,ncols, imgs):
    """Plot images on an nrows x ncols grid, each titled with its page and caption.

    Parameters
    ----------
    nrows, ncols : int
        Grid shape handed to `plt.subplots`.
    imgs : sequence of dict
        Each entry supplies "image" (encoded bytes readable by PIL) and
        "page"; a "caption" entry is used in the title when present.

    Only the first nrows*ncols images are drawn. Grid cells without an image
    are left blank.
    """
    # squeeze=False keeps `axes` a 2-D array for every grid shape; without it
    # a 1x1 grid yields a bare Axes object that has no `.flat`.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 5), squeeze=False)
    for i, ax in enumerate(axes.flat):
        if i < len(imgs):  # Only show the available imgs
            entry = imgs[i]
            ax.imshow(Image.open(BytesIO(entry["image"])))
            # A missing or None caption is shown as an empty string instead of
            # breaking the title construction.
            caption = entry.get("caption") or ""
            ax.set_title(f"Page: {entry['page']}; {caption}")
        # Hide ticks and frame on every cell, including the empty ones.
        ax.axis('off')

    plt.tight_layout()
    plt.show()

Heuristics for spotting an image caption:

- If there is a line which is nearer to the image than any other line.
- If the line is above or below the image and centred on the image's centre.
- If the line has a font or a font size that differs from the most common font or the median font size for that page.

In [None]:
def closest_line_closest_thing(image,doc_df, max_dist=30):
    """
    Find the line on the image's page that is most plausibly its caption.

    Candidate lines are those on the same page as the image which
      * have zero horizontal distance to the image (`bbox_horiz_dist(...) == 0`),
      * satisfy `shared_centre` with the image, and
      * still have category "uncategorised", so lines already identified as
        something else are excluded.

    The candidate returned by `closest_vertical_line` for the image is then
    accepted as a caption when either
      * it is the only candidate and its distance is at most `max_dist`, or
      * it is closer to the image than to the nearest horizontally
        overlapping line on the page.

    Parameters
    ----------
    image : dict
        Must provide "page" and "bbox".
    doc_df : DataFrame
        Line table with columns page, x0, y0, x1, y1 and category.
    max_dist : number, default 30
        Largest accepted image-to-line distance when there is a single
        candidate line.

    Returns
    -------
    The index returned by `closest_vertical_line` for the accepted line, or
    None when no line qualifies.
    """
    n_page   = image["page"]
    img_bbox = image["bbox"]
    page_df = doc_df[doc_df.page==n_page]
    # A page without text lines cannot hold a caption. Returning here also
    # avoids the row-wise apply calls below on an empty frame, where apply
    # does not produce boolean masks.
    if page_df.empty:
        return None

    def row_bbox(row):
        # Bounding box of one line as an (x0, y0, x1, y1) tuple.
        return (row["x0"],row["y0"],row["x1"],row["y1"])

    # I think just centred would be better to have here. There can be a caption
    # which is wider than the image.
    overlap = page_df.apply(
        lambda row: bbox_horiz_dist(row_bbox(row), img_bbox)==0,
        axis=1)
    # NOTE(review): the third argument (1) is presumably a tolerance — confirm
    # against shared_centre.
    centred = page_df.apply(
        lambda row: shared_centre(row_bbox(row), img_bbox, 1),
        axis=1)

    unidentified = (page_df.category=="uncategorised")

    overlap_df = page_df[overlap]
    centred_df = page_df[overlap & centred & unidentified]
    if len(centred_df) ==0:
        return None
    idx, dist = closest_vertical_line(img_bbox, centred_df, n_page)
    line_bbox = tuple(centred_df.loc[idx][["x0","y0","x1","y1"]])

    if (len(centred_df)==1):
        if dist > max_dist:
            return None
        print(f"There is one line near image and no other lines on page {n_page}.")
        return idx

    # NOTE(review): overlap_df still contains the candidate line itself; this
    # relies on closest_vertical_line not reporting a line as its own
    # neighbour — confirm.
    idx1, dist1 = closest_vertical_line(line_bbox, overlap_df, n_page)

    if dist < dist1:
        print(f"The line closest image on page {n_page} is closer to the image than it's nearest line.")
        return idx
    return None

1. Get all images.
   - Filter image artifacts and stitch together images that were split into strips.
2. Get all text lines.
3. Identify and remove captions from text lines.
4. Identify and re-sort dual-column text.
5. Identify and remove page headers and footers.

In [None]:
# Per-year notes on how well caption detection works (exam year -> observation).
# 2001 - bad  (text line near enough to image: shows we need to use expanded doc_df: also lines which share x0)
# 2002 - good (no captions) non-caption close to image counted as caption. ==> to count as inside we say it must be > 0.2 inside.
# 2003 - good (no captions)
# 2004 - good (no captions)
# 2005 - 2 captions, captured but pictures are not all together, they are partitioned in boxes.
# 2006 - good (text above figure captured: could exclude above captions) -> point artifacts filtered
# 2007 - good (no captions)
# 2008 - good (no captions)
# 2009 - good (no captions)
# 2010 - good (no captions)
# 2011 - page 3 multi-line caption.
# 2012 - page 4 right col multi-line caption. p5 disaster image.
# 2013 - p2 img cut into slices - fixed; page 4 caption below image missed; p6 captions missed; p7 multi-line caption.
# 2014 - good (captions inside page 3,4,5,6,7; caption also below p3)
# 2015 - 2018 good
# 2019 - good (page 6 caption under page)
# 2020 - (maybe not good: captions decentred on img page 3)
# 2021 - good (no captions)
# 2022 - good (1 in image page 6)
# 2023 - good (2 lines in image page 3)
# 2024 - good (no captions)
# 2025 - good (1 in image page 2)
# For caption prediction, we should have distance to nearest image
# distance to nearest text.
year=2003
doc = open_exam(year,"english","al",1)
doc_df = get_doc_line_df(doc)
median_font = doc_df.font_size.median()

# NOTE(review): the return values are discarded here, whereas later cells
# write `doc_df = identify_...(doc_df)`. This only categorises lines if the
# helpers also modify doc_df in place — confirm and align the two styles.
identify_instructions(doc_df)
identify_footers(doc_df)
identify_section_headers(doc_df)

images = get_images(doc)
print(f"number of raw images               : {len(images):10}")
images = filter_images(images)
print(f"number of images after filter      : {len(images):10}","\n\n")
images = get_captions(doc_df, images)

# Restrict the search to images on pages 2 to 8.
imgs = [img for img in images if 1 < img["page"] < 9]
#show_all_imgs(3,4,imgs)

for image in imgs:
    idx = closest_line_closest_thing(image,doc_df)
    # Compare against None explicitly: an index label of 0 is a valid result
    # but is falsy, so a plain `if idx:` would silently drop it.
    if idx is not None:
        print(doc_df.loc[idx].text)

In [None]:
# Inspect uncategorised lines outside page 1 whose font size exceeds the
# document median and whose horizontal midpoint lies strictly within 30
# coordinate units of half the first page's width.
middle = doc[0].rect.width/2
line_centre = (doc_df.x0 + doc_df.x1)/2

large_font = doc_df.font_size > median_font
# bold_font is computed for ad-hoc use; it is not part of the filter below.
bold_font = doc_df.mode_font.str.contains("Bold")
not_p1 = doc_df.page != 1
centred = (line_centre > middle -30) & (line_centre < middle +30)
uncategorised = doc_df.category=="uncategorised"

shown_columns = ["text","font_size","page","mode_font","common_font"]
doc_df.loc[large_font & not_p1 & uncategorised & centred, shown_columns]

In [None]:
# Lookup table keyed by exam year (2001-2025); the cell displays the dict.
# NOTE(review): presumably the expected number of footer lines per paper, as
# printed by the per-year footer-count loop in this notebook — confirm.
{2001: 10,2002: 8,2003: 8,2004: 8,2005: 8,2006: 8,
2007: 8,2008: 8,2009: 8,2010: 12,2011: 12,2012: 12,
2013: 12,2014: 12,2015: 12,2016: 12,2017: 12,2018: 21,
2019: 21,2020: 21,2021: 21,2022: 21,2023: 21,2024: 21,
2025: 21}

# Sketch of the same values as range-based checks (pseudocode, not runnable).
# NOTE(review): the final branch says 12, but the table above gives 21 for
# 2018 onwards — confirm which is correct before turning this into asserts.
# if year ==2001:
#     assert len ==10
# elif year >2001 and year <= 2009:
#     assert len 8
# elif year >2009 and year <= 2017:
#     assert len 12
# elif year >2017:
#     assert len 12

In [None]:
# Print, for every exam year from 2001 to 2025, how many lines end up with
# category "footer". Note that the loop rebinds the notebook-level names
# `year`, `doc` and `doc_df`; after it finishes they refer to the 2025 paper.
for year in range(2001,2026):
    doc = open_exam(year,"english","al",1)
    doc_df = get_doc_line_df(doc)
    doc_df = identify_instructions(doc_df)
    doc_df = identify_footers(doc_df)
    # Count outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    n_footers = len(doc_df[doc_df.category=="footer"])
    print(f"{year} {n_footers}")

In [None]:
# Load the 2025 paper, run the three categorisation passes in order, then
# report how many lines are tagged "section" and show the first 40 of them.
year=2025
doc = open_exam(year,"english","al",1)
doc_df = get_doc_line_df(doc)
for identify in (identify_instructions, identify_footers, identify_section_headers):
    doc_df = identify(doc_df)

section_lines = doc_df[doc_df.category=="section"]
print(len(section_lines))
section_lines.text.head(40)

In [None]:
# Summarise the typical typography of page 6, then list up to 30 lines whose
# font size differs from the page median.
page_6 = doc_df[doc_df.page==6]
median_size = page_6.font_size.median()
# Most frequent value of common_font on the page; .mode() may return several
# values on a tie, the first one is used.
mode_font = page_6.common_font.mode().values[0]
# The second label reads "mode font" because the value is a mode, not a median.
print(f"median font size: {median_size}\nmode font: {mode_font}")
page_6[page_6.font_size!=median_size ].head(30)


# Manually assigning captions

In [None]:
# Attach a caption by hand: look up the line with the given text on page 6,
# find the image closest to that line's bounding box and store the text as
# the image's caption.
caption_text = 'Warstones\xa0Library\xa0'
page_df =page_6
caption_rows = page_df.loc[page_df.text==caption_text, ["x0","y0","x1","y1"]]
display(caption_rows)

# Fail with a clear message instead of an opaque IndexError from `.values[0]`
# when the text is not present on the page.
if caption_rows.empty:
    raise ValueError(f"No line with text {caption_text!r} found on the selected page.")

bbox = tuple(caption_rows.values[0])
img = closest_image(bbox,images)
img["caption"]=caption_text