In [None]:
import json
from operator import itemgetter

import numpy as np
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import PageObject
from PIL import Image, ImageDraw
from pdf2image import convert_from_path

# Directory handed to pdf2image as ``poppler_path`` whenever a page is rasterised.
POPPLER_PATH = './poppler-20.11.0/bin/'

# Unit conversions. "USU" presumably means PDF user-space units (72 per inch) — TODO confirm naming.
USU_PER_IN = 72
IN_TO_MM = 25.4
USU_PER_MM = USU_PER_IN / IN_TO_MM

# Page numbers of interest in INPUT_PDF.
FRONT_COVER_PAGE = 1
BACK_COVER_PAGE = 266
CONTENT_RANGE = (4, 262)
# Both ends of CONTENT_RANGE are counted, hence the +1.
N_PAGES = CONTENT_RANGE[1]-CONTENT_RANGE[0]+1

# Paper definitions, indexed as PAPER_SIZES[name]['trim' | 'bleed']['x' | 'y'] by the helpers below.
with open('./paper_sizes.json') as f:
    PAPER_SIZES = json.load(f)

# Gutter lookup table. JSON object keys are always strings, so they are converted
# to int to make them comparable with a page count.
with open('./gutter_sizes.json') as f:
    GUTTER_SIZES = {int(kk): vv for kk, vv in json.load(f).items()}

INPUT_PDF = './go-in-action.pdf'

In [None]:
def generate_document_coverage_array(pdf_path, page_range, sample_period=1, dpi=72, poppler_path=None):
    """Overlay the non-white pixels of several PDF pages into one boolean mask.

    Every sampled page is rasterised on its own with ``convert_from_path`` and
    each pixel that is not pure white is OR-ed into the running mask.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF, forwarded to ``convert_from_path``.
    page_range : sequence
        ``(start, stop)`` unpacked into ``range``; ``stop`` is therefore
        exclusive. The values are passed on as ``first_page``/``last_page``.
    sample_period : int
        Step of the ``range`` — only every ``sample_period``-th page is read.
    dpi : int
        Rasterisation resolution, forwarded to ``convert_from_path``.
    poppler_path : str or None
        Forwarded to ``convert_from_path``.

    Returns
    -------
    numpy.ndarray or None
        2-D boolean array (image rows x image columns), ``True`` wherever at
        least one sampled page has a non-white pixel. ``None`` when the range
        selects no page at all.

    Notes
    -----
    All sampled pages must rasterise to the same pixel size, otherwise the
    in-place OR fails on the shape mismatch.
    """
    arr_overall = None
    for pp in range(*page_range, sample_period):
        print(f'\r{pp/page_range[1]*100:.0f}%...', end='')
        im = convert_from_path(pdf_path, dpi=dpi, first_page=pp, last_page=pp, poppler_path=poppler_path)[0]
        im_arr = np.array(im)
        if arr_overall is None:
            arr_overall = np.zeros(im_arr.shape[:2], dtype=bool)
        # A pixel carries content as soon as ANY channel departs from 255.
        # Requiring all channels to differ would drop saturated colours such
        # as pure red (255, 0, 0) from the mask.
        arr_overall |= (im_arr != 255).any(axis=2)
    print('done')

    return arr_overall

def find_margins_px(array, pad_l=0, pad_r=0, pad_t=0, pad_b=0):
    """Measure the empty border around the content of a 2-D coverage mask.

    For each side the result is the number of completely empty columns/rows
    before the first one holding data, minus one, plus the matching ``pad_*``.
    The "minus one" means a side whose content touches the edge yields ``-1``
    (before padding); a mask with no data at all yields ``-1`` on every side
    because ``argmax`` of an all-False vector is 0.

    Returns ``(margin_l, margin_r, margin_t, margin_b)``.
    """
    def _leading_and_trailing(has_data):
        # argmax on a boolean vector gives the index of its first True entry.
        leading = has_data.argmax() - 1
        trailing = has_data[::-1].argmax() - 1
        return leading, trailing

    left, right = _leading_and_trailing(array.sum(axis=0).astype(bool))
    top, bottom = _leading_and_trailing(array.sum(axis=1).astype(bool))

    return left + pad_l, right + pad_r, top + pad_t, bottom + pad_b

def get_page_size_mm(pdf_page):
    """Return the ``(width, height)`` of a page's media box in millimetres.

    ``pdf_page`` must expose ``mediaBox.getWidth()`` and
    ``mediaBox.getHeight()``. Their results are coerced to ``float`` first:
    PyPDF2 1.x can hand back ``Decimal``-based numbers, and a ``Decimal``
    cannot be multiplied by the float ``IN_TO_MM``.
    """
    w_usu = float(pdf_page.mediaBox.getWidth())
    h_usu = float(pdf_page.mediaBox.getHeight())
    w_in, h_in = w_usu / USU_PER_IN, h_usu / USU_PER_IN
    w_mm, h_mm = w_in * IN_TO_MM, h_in * IN_TO_MM
    return w_mm, h_mm

def find_nearest_paper_by_aspect(w, h, size_type='trim', subtract_margins=None, subtract_gutter=None):
    """Pick the entry of ``PAPER_SIZES`` whose aspect ratio is closest to ``w / h``.

    Parameters
    ----------
    w, h : number
        Width and height of the content to fit; only their ratio matters.
    size_type : {'trim', 'bleed'}
        Which set of paper dimensions to read from each ``PAPER_SIZES`` entry.
    subtract_margins : sequence of 4 numbers, optional
        ``(left, right, top, bottom)`` removed from the paper size before the
        ratio is taken. Must be in the same unit as the paper sizes.
    subtract_gutter : number, optional
        Extra width removed from the paper's x dimension.

    Returns
    -------
    tuple
        ``(paper_name, aspect_delta)`` where ``aspect_delta`` is the absolute
        difference between the target ratio and the paper's ratio. On ties the
        earliest entry of ``PAPER_SIZES`` wins.

    Raises
    ------
    ValueError
        If ``size_type`` is neither ``'trim'`` nor ``'bleed'``, or if
        ``PAPER_SIZES`` is empty.
    """
    if size_type not in ('trim', 'bleed'):
        raise ValueError(f"size_type must be 'trim' or 'bleed', got {size_type!r}")

    target_aspect_ratio = w / h

    aspect_deltas = {}
    for paper_name, paper_sizes in PAPER_SIZES.items():
        paper_x, paper_y = paper_sizes[size_type]['x'], paper_sizes[size_type]['y']
        if subtract_margins:
            margin_l, margin_r, margin_t, margin_b = subtract_margins
            paper_x -= (margin_l + margin_r)
            paper_y -= (margin_t + margin_b)

        if subtract_gutter:
            paper_x -= subtract_gutter

        aspect_deltas[paper_name] = abs(target_aspect_ratio - paper_x / paper_y)

    # Only the best match is needed, so take the minimum instead of sorting everything.
    nearest_paper, aspect_delta = min(aspect_deltas.items(), key=itemgetter(1))
    return nearest_paper, aspect_delta

def find_gutter_size_mm_from_pages(n_pages):
    """Look up the gutter for a book of ``n_pages`` pages in ``GUTTER_SIZES``.

    ``GUTTER_SIZES`` maps a maximum page count to a gutter value; the entry
    with the smallest maximum that still accommodates ``n_pages`` is returned.
    Raises ``Exception`` when ``n_pages`` exceeds every key.
    """
    eligible_limits = [limit for limit in GUTTER_SIZES if n_pages <= limit]
    if not eligible_limits:
        raise Exception(f'Too many pages: {n_pages}. Is this actually a book?')
    return GUTTER_SIZES[min(eligible_limits)]
    
def midpoint(xy1, xy2, round_coords=False):
    """Return the point halfway between two ``(x, y)`` pairs.

    With ``round_coords=True`` each coordinate is passed through the built-in
    ``round`` before being returned; otherwise the raw averages are returned.
    """
    (x1, y1), (x2, y2) = xy1, xy2
    mid_x, mid_y = (x1+x2)/2, (y1+y2)/2

    if not round_coords:
        return mid_x, mid_y
    return round(mid_x), round(mid_y)

def draw_cross(draw_obj, centre, length, thickness, colour):
    """Draw a '+' marker centred on ``centre`` and return ``draw_obj``.

    Two strokes are issued through ``draw_obj.line``: first the horizontal one,
    then the vertical one. Each arm reaches ``length`` away from the centre, so
    every stroke spans ``2 * length``.
    """
    cx, cy = centre
    horizontal = ((cx-length, cy), (cx+length, cy))
    vertical = ((cx, cy-length), (cx, cy+length))

    for stroke in (horizontal, vertical):
        draw_obj.line(stroke, colour, thickness)

    return draw_obj

def annotate_found_margins(page_array, margin_l, margin_r, margin_t, margin_b):
    """Render ``page_array`` as an RGB image with the detected margins drawn on it.

    The annotation consists of:
    * a red rectangle inset from the image edges by the four margins,
    * a red cross at the centre of that rectangle,
    * a blue cross at the centre of the whole image.

    Returns the annotated copy; the image built from ``page_array`` is not
    drawn on.
    """
    source_im = Image.fromarray(page_array)
    page_w, page_h = source_im.size

    annotated = source_im.copy().convert('RGB')
    pen = ImageDraw.Draw(annotated)

    # Content box corners in image coordinates (origin at the top-left).
    upper_left = (margin_l, margin_t)
    lower_right = (page_w-margin_r, page_h-margin_b)
    pen.rectangle((upper_left, lower_right), outline='red')

    content_centre = midpoint(lower_right, upper_left, round_coords=True)
    page_centre = midpoint((0,0), source_im.size, round_coords=True)
    for centre, colour in ((content_centre, 'red'), (page_centre, 'blue')):
        draw_cross(pen, centre, 5, 2, colour)

    return annotated

In [None]:
# Page used for the single-page experiments in the cells below.
TEST_PAGE = 26

# Build the coverage mask from TEST_PAGE only: the range [TEST_PAGE, TEST_PAGE+1)
# holds exactly one page, so the sample period has no effect here.
# Rasterising at USU_PER_IN dpi makes one pixel correspond to one PDF unit.
# (To sample the whole book instead, pass CONTENT_RANGE as page_range.)
arr_overall = generate_document_coverage_array(
    INPUT_PDF,
    [TEST_PAGE, TEST_PAGE+1],
    sample_period=10,
    dpi=USU_PER_IN,
    poppler_path=POPPLER_PATH,
)

# NOTE(review): pad_l=33 is a hand-tuned extra left margin; its rationale is not
# recorded in this notebook. Drop the argument to get the raw detected margins.
margin_l, margin_r, margin_t, margin_b = find_margins_px(arr_overall, pad_l=33)

print(f'L: {margin_l}, R: {margin_r}, T: {margin_t}, B: {margin_b}')

# Last expression of the cell: shows the mask with the margin box and centre marks.
annotate_found_margins(arr_overall, margin_l, margin_r, margin_t, margin_b)

In [None]:
# Open the source PDF and fetch the page under test.
# getPage receives TEST_PAGE-1, i.e. TEST_PAGE is treated as a 1-based page number.
pdf_reader = PdfFileReader(INPUT_PDF)
test_page = pdf_reader.getPage(TEST_PAGE-1)

# Disabled experiment: crop the page by shrinking its mediaBox by the detected
# margins, write the cropped page to disk and rasterise it for inspection.
# Note that it would modify test_page's mediaBox in place, which the later
# cells read.
# test_page.mediaBox.lowerLeft  = (test_page.mediaBox.lowerLeft[0]+margin_l, test_page.mediaBox.lowerLeft[1]+margin_b)
# test_page.mediaBox.upperRight = (test_page.mediaBox.upperRight[0]-margin_r, test_page.mediaBox.upperRight[1]-margin_t)
# content_centroid_pdf = midpoint(test_page.mediaBox.lowerLeft, test_page.mediaBox.upperRight, round_coords=True)
# print('LL:', test_page.mediaBox.lowerLeft)
# print('UR:', test_page.mediaBox.upperRight)
# print('Content centre:', content_centroid_pdf)

# out = PdfFileWriter()
# out.addPage(test_page)
# test_out_path = f'./test_crop_{TEST_PAGE:03d}.pdf'
# with open(test_out_path, 'wb') as f: out.write(f)
    
# convert_from_path(test_out_path, dpi=USU_PER_IN, poppler_path=POPPLER_PATH)[0]

In [None]:
# Sanity check: total number of pages PyPDF2 reports for INPUT_PDF.
pdf_reader.getNumPages()

In [None]:
# Content box of the test page in PDF coordinates (origin at the lower-left),
# obtained by insetting the media box by the margins measured on the raster.
# The raster was produced at USU_PER_IN dpi, so pixel margins can be used as PDF units directly.
content_ll = (test_page.mediaBox.lowerLeft[0]+margin_l, test_page.mediaBox.lowerLeft[1]+margin_b)
content_ur = (test_page.mediaBox.upperRight[0]-margin_r, test_page.mediaBox.upperRight[1]-margin_t)
content_width, content_height = content_ur[0]-content_ll[0], content_ur[1]-content_ll[1]
content_centroid = midpoint(content_ll, content_ur)

# Centre of the page itself. Use the media box's real lower-left corner rather
# than assuming (0, 0), so the result stays consistent with content_ll above
# for PDFs whose media box does not start at the origin.
page_centroid = midpoint(test_page.mediaBox.lowerLeft, test_page.mediaBox.upperRight)

# Translation (whole units) that moves the content centre onto the page centre.
content_shift = round(page_centroid[0]-content_centroid[0]), round(page_centroid[1]-content_centroid[1])
print('LL:', content_ll)
print('UR:', content_ur)
print('Content size:', (content_width, content_height))
print('Content centre:', content_centroid)
print('Page centre', page_centroid)
print('Content shift:', content_shift)

In [None]:
# Margins (mm) reserved around the content on the target paper.
MARGIN_X_MM = 13
MARGIN_Y_MM = 13

# Gutter (binding allowance), looked up from the number of content pages.
GUTTER_MM = find_gutter_size_mm_from_pages(N_PAGES)

# The target paper is pinned by hand. An automatic choice would be
# find_nearest_paper_by_aspect(content_width, content_height,
#     subtract_margins=(MARGIN_X_MM, MARGIN_X_MM, MARGIN_Y_MM, MARGIN_Y_MM),
#     subtract_gutter=GUTTER_MM)[0]
target_paper_type = 'us-letter'
print(target_paper_type)

In [None]:
# Trim size of the chosen paper, in mm.
target_paper_size_mm = (
    PAPER_SIZES[target_paper_type]['trim']['x'],
    PAPER_SIZES[target_paper_type]['trim']['y'],
)
# Area left for content once both margins (and, horizontally, the gutter) are removed.
target_content_size_mm = (
    target_paper_size_mm[0]-2*MARGIN_X_MM-GUTTER_MM,
    target_paper_size_mm[1]-2*MARGIN_Y_MM,
)

# Current content size converted from PDF units to mm.
content_size_mm = (content_width / USU_PER_MM, content_height / USU_PER_MM)

# Per-axis growth needed to fill the target area, expressed as a scale factor.
content_delta_x = target_content_size_mm[0] - content_size_mm[0]
content_delta_y = target_content_size_mm[1] - content_size_mm[1]
content_scale_x = 1+content_delta_x/content_size_mm[0]
content_scale_y = 1+content_delta_y/content_size_mm[1]
# The smaller factor is the largest uniform scale that fits on both axes.
content_scale_factor = min(content_scale_x, content_scale_y)

print(f'Target sizes, paper: {target_paper_size_mm}, content: {target_content_size_mm}')
print(f'Content size: {content_size_mm}')
print(content_delta_x, content_delta_y)
print(content_scale_x, content_scale_y)
print(content_scale_factor)

In [None]:
# Step 1: on a blank page of the original size, place the test page shifted by
# content_shift (centring its content), then scale the result uniformly.
test_page_trans_scale = PageObject.createBlankPage(
    width=test_page.mediaBox.getWidth(),
    height=test_page.mediaBox.getHeight(),
)

test_page_trans_scale.mergeTranslatedPage(test_page, tx=content_shift[0], ty=content_shift[1])
test_page_trans_scale.scaleBy(content_scale_factor)

# Write the intermediate page to disk so that it can be rasterised for inspection.
out = PdfFileWriter()
out.addPage(test_page_trans_scale)
test_trans_scale_path = f'./test_trans_scale_{TEST_PAGE:03d}.pdf'
with open(test_trans_scale_path, 'wb') as f:
    out.write(f)

# Preview with a blue outline marking the page boundary.
# NOTE(review): the -0.1 offsets presumably keep the outline inside the image — confirm.
im_trans_scale = convert_from_path(test_trans_scale_path, dpi=USU_PER_IN, poppler_path=POPPLER_PATH)[0]
im_trans_scale_annot = im_trans_scale.copy()
draw = ImageDraw.Draw(im_trans_scale_annot)
draw.rectangle(((0,0), (im_trans_scale.size[0]-0.1, im_trans_scale.size[1]-0.1)), outline='blue')
im_trans_scale_annot

In [None]:
# Step 2: drop the scaled page onto a blank page of the target paper size and
# measure where its content ended up.
# Despite the "_px" suffix these are PDF units (mm * USU_PER_MM); they coincide
# with pixels only when rasterising at USU_PER_IN dpi.
target_paper_size_px = (round(target_paper_size_mm[0]*USU_PER_MM), round(target_paper_size_mm[1]*USU_PER_MM))
test_page_trans_scale_merge = PageObject.createBlankPage(width=target_paper_size_px[0], height=target_paper_size_px[1])

test_page_trans_scale_merge.mergePage(test_page_trans_scale)

out = PdfFileWriter()
out.addPage(test_page_trans_scale_merge)
test_trans_scale_merge_path = f'./test_trans_scale_merge_{TEST_PAGE:03d}.pdf'
with open(test_trans_scale_merge_path, 'wb') as f: out.write(f)

# The file just written holds a single page. pdf2image numbers pages from 1,
# so request page 1 via the half-open range (1, 2) rather than page 0.
test_trans_scale_merge_coverage_arr = generate_document_coverage_array(test_trans_scale_merge_path, (1, 2), poppler_path=POPPLER_PATH)
# NOTE(review): the left padding of 34 (scaled) is hand-tuned; an earlier cell uses 33 — confirm which is intended.
margin_2_l, margin_2_r, margin_2_t, margin_2_b = find_margins_px(test_trans_scale_merge_coverage_arr, pad_l=round(34*content_scale_factor))

# Same content-box / centroid / shift computation as for the original page,
# now on the target-size page.
content_2_ll = (test_page_trans_scale_merge.mediaBox.lowerLeft[0]+margin_2_l, test_page_trans_scale_merge.mediaBox.lowerLeft[1]+margin_2_b)
content_2_ur = (test_page_trans_scale_merge.mediaBox.upperRight[0]-margin_2_r, test_page_trans_scale_merge.mediaBox.upperRight[1]-margin_2_t)
content_2_width, content_2_height = content_2_ur[0]-content_2_ll[0], content_2_ur[1]-content_2_ll[1]
content_2_centroid = midpoint(content_2_ll, content_2_ur)
page_2_centroid = midpoint((0,0), test_page_trans_scale_merge.mediaBox.upperRight)
content_2_shift = round(page_2_centroid[0]-content_2_centroid[0]), round(page_2_centroid[1]-content_2_centroid[1])
print('LL:', content_2_ll)
print('UR:', content_2_ur)
print('Content size:', (content_2_width, content_2_height))
print('Content centre:', content_2_centroid)
print('Page centre', page_2_centroid)
print('Content shift:', content_2_shift)

display(annotate_found_margins(test_trans_scale_merge_coverage_arr, margin_2_l, margin_2_r, margin_2_t, margin_2_b))

In [None]:
# Step 3: re-centre the scaled content on a fresh page of the target paper size.
# target_paper_size_px is recomputed here with the same expression as in the
# previous cell, so this cell does not depend on that assignment.
target_paper_size_px = (round(target_paper_size_mm[0]*USU_PER_MM), round(target_paper_size_mm[1]*USU_PER_MM))
test_page_refit = PageObject.createBlankPage(width=target_paper_size_px[0], height=target_paper_size_px[1])

# Gutter width converted from mm to PDF units.
# NOTE(review): gutter_tx is computed but never used below — the translation
# applies content_2_shift only. Confirm whether the gutter offset should be added to tx.
gutter_tx = GUTTER_MM*USU_PER_MM

# Shift the merged page so that its content centre lands on the page centre.
test_page_refit.mergeTranslatedPage(test_page_trans_scale_merge, tx=content_2_shift[0], ty=content_2_shift[1])

# Write the refitted page to disk so that it can be rasterised for inspection.
out = PdfFileWriter()
out.addPage(test_page_refit)
test_refit_path = f'./test_refit_{TEST_PAGE:03d}.pdf'
with open(test_refit_path, 'wb') as f: out.write(f)
    
# Preview with a blue outline marking the page boundary.
im_refit = convert_from_path(test_refit_path, dpi=USU_PER_IN, poppler_path=POPPLER_PATH)[0]
im_refit_annot = im_refit.copy()
draw = ImageDraw.Draw(im_refit_annot)
draw.rectangle(((0,0), (im_refit.size[0]-0.1, im_refit.size[1]-0.1)), outline='blue')
im_refit_annot