In [None]:
from pdf_scraper.doc_utils   import (open_exam, get_doc_line_df, identify_section_headers,
                                     identify_text_headers,identify_footers, identify_instructions,
                                     identify_subtitles, identify_subsubtitles,get_images,preproc_images,
                                     assign_in_image_captions, identify_vertical_captions,
                                     identify_all_page_clusters, enrich_doc_df_with_images)

from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.image_utils   import get_bboxed_page_image,show_image, show_all_imgs
from pdf_scraper.general_utils import bbox_vert_dist, bbox_horiz_dist
from time import sleep
import numpy as np
import pandas as pd

import os, io, re

from IPython.display import display, clear_output, Image
import matplotlib.pyplot as plt

In [None]:
def get_parsed_df(doc):
    """Build the line-level DataFrame for ``doc`` and run every labelling pass on it.

    Parameters
    ----------
    doc
        Opened exam document (as returned by ``open_exam``). It must support
        ``doc[0].rect.width``; that width is forwarded to the text-header,
        subtitle and sub-subtitle passes.

    Returns
    -------
    The frame returned by ``enrich_doc_df_with_images`` after the clustering
    and ``identify_*`` passes have been called on it. Their return values are
    discarded here, so they presumably annotate the frame in place --
    confirm against ``pdf_scraper.doc_utils``.
    """
    raw_lines        = get_doc_line_df(doc)
    first_page_width = doc[0].rect.width

    page_images = preproc_images(get_images(doc))
    # Runs on the raw frame, before cleaning; its return value is not used.
    assign_in_image_captions(raw_lines, page_images)

    parsed = enrich_doc_df_with_images(clean_line_df(raw_lines), page_images)

    # The two numeric arguments are passed positionally; their meaning is
    # defined by identify_all_page_clusters itself.
    identify_all_page_clusters(parsed, 2.0/3.0, 1.15, text_only=True)

    # Passes that take only the frame.
    for label_pass in (identify_footers, identify_instructions, identify_section_headers):
        label_pass(parsed)

    # Passes that additionally take the width of the first page.
    for label_pass in (identify_text_headers, identify_subtitles, identify_subsubtitles):
        label_pass(parsed, first_page_width)

    return parsed

In [None]:
def get_caption_input(df, page):
    """Interactively label lines on ``page`` as captions.

    Repeatedly prompts for a fragment of caption text. The fragment is matched
    as a literal, case-insensitive substring against ``df.text`` for rows whose
    ``page`` equals ``page``. A unique match is labelled straight away; when
    several rows match, up to ten are displayed and the user picks one by its
    index label. An empty entry at the caption prompt ends the session.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``page``, ``text`` and ``category`` columns. Modified in
        place: each chosen row gets ``category = "caption2"``.
    page
        Page value compared against ``df.page``.

    Returns
    -------
    None
    """
    while True:
        caption_input = input("Caption text: ").strip().lower()
        if caption_input == "":
            break

        # regex=False: the typed text is literal, so "(a)", "what?" or "[1]"
        # are not interpreted as regular expressions.
        # na=False: rows with missing text never match and the mask stays boolean.
        on_page  = df.page == page
        has_text = df.text.str.strip().str.lower().str.contains(
            caption_input, regex=False, na=False
        )
        matches = df[on_page & has_text]

        if matches.empty:
            print("No match found. Retype, perhaps with more of line.")
            continue
        elif len(matches) > 1:
            print(f"{len(matches)} matches:")
            display(matches.head(10))

            # Keep asking until the user names one of the matching rows; a typo
            # must not abort the whole labelling session, and a row outside
            # the matches (e.g. on another page) must not be labelled.
            selected = None
            while selected is None:
                index_input = input("Select index: ").strip()
                if index_input == "":
                    break  # blank entry abandons this search
                try:
                    candidate = int(index_input)
                except ValueError:
                    print("Not a number. Enter one of the index values of the matches.")
                    continue
                if candidate not in matches.index:
                    print(f"{candidate} is not one of the matching rows.")
                    continue
                selected = candidate

            if selected is None:
                print("Selection cancelled.")
                continue
            matches = df.loc[[selected]]

        df.loc[matches.index, "category"] = "caption2"
        print(f"Marked: {matches.text.values[0]} as caption2.")

In [None]:
def update_image(image_handle, img):
    """Render ``img`` to a PNG and push it into an existing display handle.

    The image is drawn with ``imshow`` on a 16x10 inch figure with the axes
    switched off, saved to an in-memory buffer, and passed to
    ``image_handle.update`` wrapped in an ``IPython.display.Image``.

    Parameters
    ----------
    image_handle
        Object exposing ``update(...)``; in this notebook it is the handle
        returned by ``display(None, display_id=True)``.
    img
        Anything ``imshow`` accepts.

    Returns
    -------
    The same ``image_handle`` that was passed in.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.imshow(img)
    ax.axis("off")
    fig.tight_layout()

    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format="png")
    # Close explicitly so figures do not pile up while paging through an exam.
    plt.close(fig)

    image_handle.update(Image(data=png_buffer.getvalue()))
    return image_handle


In [None]:
def view_year_page(year,page):
    """Return the image of one exam page with its cluster boxes drawn on it.

    Opens the exam for ``year``, parses it with ``get_parsed_df``, keeps the
    rows whose ``page`` equals ``page`` and hands their "cluster" boxes to
    ``get_bboxed_page_image``, labelled with the unique cluster values found
    on that page.

    Parameters
    ----------
    year
        Passed to ``open_exam``.
    page
        Page value compared against ``df.page`` and passed on to
        ``get_bboxed_page_image``.

    Returns
    -------
    Whatever ``get_bboxed_page_image`` returns for that page.
    """
    doc = open_exam(year)
    # get_parsed_df extracts and preprocesses the images itself, so no separate
    # get_images/preproc_images call is needed here (the result was unused).
    df  = get_parsed_df(doc)

    page_df = df[df.page == page]
    rects   = get_category_boxes(page_df, "cluster")
    return get_bboxed_page_image(doc, page, rects, labels=np.unique(page_df.cluster))

In [None]:
dfs = []
output_csv = "captioned_dfs.csv"

# Interactive labelling pass. For each exam year: parse the document, step
# through the pages that contain images, show each page with its cluster boxes
# and let the user tag caption lines. Every year's frame is appended to
# `output_csv` as soon as it is finished and is also collected in `dfs`.
# NOTE: the CSV is opened with mode="a", so re-running this cell appends to
# whatever an earlier run already wrote.
for year in range(2020, 2021):
    clear_output(wait=True)
    print(f"\n----- Year: {year} -----\n")

    doc        = open_exam(year)
    images     = preproc_images(get_images(doc))
    df         = get_parsed_df(doc)
    df["year"] = year

    # One display handle per year; update_image() pushes each page into it.
    image_handle = display(None, display_id=True)

    # Years 2001-2010 are saved as parsed, without any prompting.
    if year not in range(2001, 2011):
        # Only pages after page 1 that contain at least one image are reviewed.
        image_pages = {img["page"] for img in images if img["page"] > 1}

        for page in sorted(image_pages):
            page_df = df[df.page == page]
            rects   = get_category_boxes(page_df, "cluster")
            img     = get_bboxed_page_image(
                doc, page, rects, labels=np.unique(page_df.cluster)
            )

            image_handle = update_image(image_handle, img)
            print(f"\n{year} --- Page {page} ---")

            has_captions = input("Are there captions on this page? (y/n): ").strip().lower()
            if has_captions in ("y", "yes"):
                get_caption_input(df, page)

                # Show what was just tagged and give one chance to redo the page.
                page_captions = (df.category == "caption2") & (df.page == page)
                display(df.loc[page_captions, ["text", "page"]].head())

                good_page = input("Are these captions correct? (y/n): ").strip().lower()
                if good_page not in ("y", "yes"):
                    df.loc[page_captions, "category"] = "uncategorised"
                    get_caption_input(df, page)
                print(f"Done with page {page}.\n")
            else:
                print("Skipping this page.\n")

    # Write the header only when the file is being created.
    file_exists = os.path.exists(output_csv)
    df.to_csv(output_csv, mode="a", header=not file_exists, index=False)

    dfs.append(df)

big_df = pd.concat(dfs, ignore_index=True)
print("\nFinal dataframe shape:", big_df.shape)