# PDF-Reduction

This notebook explores a pipeline to identify and extract the most relevant pages of an application for the BHT Data Science Master's program using Machine Learning.

# TODO: 

## Setup and Configuration

In [None]:
# -- imports --
import glob
import logging
import os
import re

import numpy as np
import pymupdf
import torch
from autogluon.multimodal import MultiModalPredictor

logging.getLogger("autogluon").setLevel(logging.ERROR)

# -- data paths --
# NOTE: glob.glob returns matches in arbitrary (OS-dependent) order, so the
# list below is NOT guaranteed to be in page order.
example_imgs = glob.glob('../data/imgs_labeled/color/76_*.jpg')  # 144 Pages
# example_imgs = glob.glob('../data/imgs_labeled/color/1_*.jpg')  # 31 Pages
output_example = '../data/reduced_pdfs/example.pdf'

# -- config --
torch.set_float32_matmul_precision('high')

# Location of the trained page classifier, taken from the environment and
# resolved relative to '..'. Fail early with a clear message instead of
# silently trying to load '../None' when the variable is missing.
PRED_PATH = os.getenv("CLASSIFICATION_MODEL")
if not PRED_PATH:
    raise RuntimeError(
        "Environment variable CLASSIFICATION_MODEL is not set; "
        "it must contain the path of the classification model to load."
    )
pred = MultiModalPredictor.load(f'../{PRED_PATH}')

## Classify relevant pages 
Each page is classified, and the pages recognised as "Stammdaten", "Zeugnisse" or "Kursübersichten" are grouped by category. The grouped pages are then saved to a single PDF with one cover page per category and a table of contents.

In [None]:
def _natural_sort_key(path):
    """Sort key ordering file names by their embedded numbers (e.g. '76_2' before '76_10')."""
    parts = re.split(r'([0-9]+)', os.path.basename(path))
    # re.split with a capturing group alternates text / digit runs,
    # so every odd position is guaranteed to be a pure digit string.
    return [int(part) if i % 2 else part.lower() for i, part in enumerate(parts)]


def reduce_pdf(images, output_path=None, sort_pages=True):
    """Classify page images and write the relevant ones to a sectioned PDF.

    Every image is classified with the module-level predictor ``pred``. Pages
    predicted as 'Stammdaten', 'Kursübersicht' or 'Zeugnis' are collected per
    category. For each category a cover page is created, followed by the
    matching page images; the cover pages are registered in the PDF's table
    of contents.

    Args:
        images: Sequence of image file paths, one per page.
        output_path: Destination of the PDF. Defaults to the module-level
            ``output_example``.
        sort_pages: If True (default), pages are put into natural file-name
            order first, so the result does not depend on the (arbitrary)
            order in which the paths were collected. Pass False to keep the
            given order.

    Raises:
        ValueError: If ``images`` is empty.
    """
    pages = list(images)
    if not pages:
        raise ValueError("reduce_pdf() received no page images - check the input file pattern.")
    if sort_pages:
        pages.sort(key=_natural_sort_key)
    if output_path is None:
        output_path = output_example

    # predict and get idx; np.asarray makes the indices positional regardless
    # of whether the predictor returns a numpy array or a pandas Series
    y_pred = np.asarray(pred.predict(pages))

    # section title in the PDF -> class label produced by the predictor
    sections = {
        "Stammdaten": 'Stammdaten',
        "Kursübersichten": 'Kursübersicht',
        "Zeugnisse": 'Zeugnis',
    }
    idx_dict = {
        title: np.where(y_pred == label)[0].tolist()
        for title, label in sections.items()
    }

    # create pdf
    ## misc stuff needed for doc creation
    p = pymupdf.Point(50, 842/2)  # text anchor: left margin, vertical centre of an 842 pt high page
    line_start = pymupdf.Point(p.x-10, p.y-25)  # vertical accent line left of the title
    line_end   = pymupdf.Point(p.x-10, p.y+5)
    toc = []  # level, title, page

    doc = pymupdf.open()
    try:
        for name, idxs in idx_dict.items():
            # create cover page
            textpage = doc.new_page()
            textpage.draw_line(line_start, line_end, width=3.5)
            textpage.insert_text(
                point=p,
                text=name,
                fontsize=30,
            )
            # the cover page was just appended, so page_count is its 1-based number
            toc.append([1, name, doc.page_count])
            for idx in idxs:
                imgpage = doc.new_page()
                imgpage.insert_image(
                    rect=imgpage.rect,
                    filename=pages[idx],
                    keep_proportion=False # because images are always A4*200DPI
                )

        doc.set_toc(toc=toc)

        # make sure the target directory exists before saving
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        doc.save(output_path)
    finally:
        # release the document even if classification output or saving fails
        doc.close()

   
# Run the reduction pipeline on the example application's page images.
reduce_pdf(images=example_imgs)