In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from pdf_scraper.doc_utils     import open_exam, get_images
from pdf_scraper.image_utils   import (show_all_imgs, show_image, filter_point_images, filter_low_res_doubles,
                                       find_contiguous_image_pairs, merge_contiguous_pair_lists, stitch_strips,
                                       reconstitute_split_images, sort_and_rename_images)


# Display options: render floats with two decimal places and allow up to
# 200 characters per column before pandas truncates the cell text.
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_colwidth", 200)

# Stripped and split Images

When we extract images from the PDF, some of them come out split into vertical bands. We would like
to join these bands back together, because their existence interferes with code further down the
pipeline (in particular the captioning code), and because it is simply messy. If it is one image, let us store it as one
image with one id.

## Find Stripped Images

To find which years have stripped images, let us get the images for many years, filter out the point images, and see whether
any years still have an unusually large number of images.

In [None]:

# Survey table: for every exam year print the raw image count, the count left
# after the three clean-up steps, and the number of merged contiguous groups.
print(f"{'year':<8} {'n_raw_images':<18} {'n_images':<18} {'n_split_images':<18}")
for year in range(2001, 2026):
    doc = open_exam(year, "english", "al", 1)
    images = get_images(doc)
    n_raw_images = len(images)  # recorded before any filtering

    # Clean-up steps, applied in this order.
    images = filter_point_images(images)
    images = filter_low_res_doubles(images)
    images = sort_and_rename_images(images)

    # 0.01 is the second argument of find_contiguous_image_pairs
    # (presumably a contiguity tolerance; confirm against image_utils).
    contig_pairs = find_contiguous_image_pairs(images, 0.01)
    split_images = merge_contiguous_pair_lists(contig_pairs)
    print(f"{year:<8} {n_raw_images:<18} {len(images):<18} {len(split_images):<18}")

In [None]:
# Here we can see 2007 and 2013 have stripped images.
# 2005, 2006, and 2007 have these point-image artifacts which must be removed.
# 2009 is a special case as it has two split images on the same page. It can be used to test robustness of code.

In the per-year examinations below, uncomment the commented-out code to see the problems being solved.

## Examine 2007
This has an image on page 3 split into a good number of strips.

In [None]:
# Load the 2007 exam and apply the three clean-up steps.
year = 2007
doc = open_exam(year, "english", "al", 1)
images = get_images(doc)
images = filter_point_images(images)
images = filter_low_res_doubles(images)
images = sort_and_rename_images(images)
print(f"{year}: {len(images)}")  # image count before the strips are rejoined

# Uncomment to inspect the intermediate steps one by one.
#show_all_imgs(6, 4,images[2:])
#contig_pairs = find_contiguous_image_pairs(images, 0.01)
#print(len(contig_pairs))

#split_images = merge_contiguous_pair_lists(contig_pairs)
#print(len(split_images))

#stitched = [stitch_strips(group) for group in split_images]

# reconstitute_split_images is already imported in the notebook's import cell,
# so no additional import is needed here.
images = reconstitute_split_images(images)

print(f"{year}: {len(images)}")  # image count after reconstitution
show_all_imgs(2, 3, images)

## Examine 2013

In [None]:
# Load the 2013 exam and apply the three clean-up steps.
year = 2013
doc = open_exam(year, "english", "al", 1)
images = get_images(doc)
images = filter_point_images(images)
images = filter_low_res_doubles(images)
images = sort_and_rename_images(images)
print(f"{year}: {len(images)}")  # image count before the strips are rejoined

# Uncomment to inspect the intermediate steps one by one.
#show_all_imgs(3, 4,images[2:])

#contig_pairs = find_contiguous_image_pairs(images, 0.01)
#print(len(contig_pairs))

#split_images = merge_contiguous_pair_lists(contig_pairs)
#print(len(split_images))

#stitched = [stitch_strips(group) for group in split_images]
#show_image(stitched[0])

images = reconstitute_split_images(images)
print(f"{year}: {len(images)}")  # image count after reconstitution
show_all_imgs(2, 3, images)

## Examine 2009

In [None]:
# Load the 2009 exam and apply the three clean-up steps.
year = 2009
doc = open_exam(year, "english", "al", 1)
images = get_images(doc)
images = filter_point_images(images)
images = filter_low_res_doubles(images)
images = sort_and_rename_images(images)
print(f"{year}: {len(images)}")  # image count before the strips are rejoined

# Uncomment to inspect the intermediate steps one by one.
#show_all_imgs(3, 4,images[2:])
#
#contig_pairs = find_contiguous_image_pairs(images, 0.01)
#print(len(contig_pairs))
#
#split_images = merge_contiguous_pair_lists(contig_pairs)
#print(len(split_images))
#
#
#stitched = [stitch_strips(group) for group in split_images]
#show_all_imgs(1,3,stitched)

images = reconstitute_split_images(images)
# Report the count after reconstitution as well, matching the 2007 and 2013 cells.
print(f"{year}: {len(images)}")
show_all_imgs(2, 3, images)

# Low resolution image doubles

In [None]:
def old_filter_images(images):
    """Apply the earlier, count-gated clean-up to a list of images.

    The list is passed through ``filter_point_images`` only when it holds more
    than 100 entries. The result of that step is then passed through
    ``reconstitute_split_images`` only when it still holds more than 10 entries.
    Lists of 10 or fewer entries are returned untouched.

    Args:
        images: sequence of image records as produced by ``get_images``.

    Returns:
        The (possibly filtered and reconstituted) images.
    """
    without_points = filter_point_images(images) if len(images) > 100 else images
    # The second threshold is checked against the length *after* the first step.
    if len(without_points) > 10:
        return reconstitute_split_images(without_points)
    return without_points

In [None]:
# Load the 2011 exam and run it through old_filter_images only, then display the result.
doc = open_exam(2011, "english", "al", 1)
images = old_filter_images(get_images(doc))
show_all_imgs(3, 3, images)

In [None]:
# Print the bounding box and "size" of the first two images so they can be compared.
print(images[0]["bbox"], images[0]["size"])
print(images[1]["bbox"], images[1]["size"])

In [None]:
# Show the complete record of the first image as the cell output.
images[0]

In [None]:
def filter_low_res_doubles(images):
    """Remove one image from every pair of images that share the same bounding box.

    Every unordered pair of images is compared. When the two ``"bbox"`` values
    are equal, the ``"number"`` of one of them is marked for removal: the
    earlier image if its ``"size"`` is strictly greater, otherwise the later
    one (so on a tie the later image is removed). All images whose ``"number"``
    was marked are left out of the result; the order of the rest is preserved.

    Note: this notebook-level definition rebinds the name
    ``filter_low_res_doubles`` that the import cell brings in from
    ``pdf_scraper.image_utils``.

    NOTE(review): the image with the LARGER ``"size"`` is the one dropped.
    Confirm that this matches the intent of discarding the low-resolution copy.

    Args:
        images: list of dict-like image records with ``"bbox"``, ``"size"``
            and ``"number"`` keys.

    Returns:
        A new list containing the images that were not marked for removal.
    """
    dropped_numbers = []
    for idx, first in enumerate(images):
        for second in images[idx + 1:]:
            if second["bbox"] == first["bbox"]:
                loser = first if first["size"] > second["size"] else second
                dropped_numbers.append(loser["number"])

    survivors = []
    for image in images:
        if image["number"] not in dropped_numbers:
            survivors.append(image)
    return survivors
# Run the filter defined in this cell on the current `images` list and keep the
# result under a separate name, leaving `images` itself unchanged.
filtered_images = filter_low_res_doubles(images)

In [None]:
show_all_imgs(3,3,filtered_images)