# PDF-Preprocessing

This notebook explores a pipeline that normalizes a PDF's pages (A4 scale, no rotation) and turns them into colored and grayscale .jpg images. Afterwards, it splits the data using sklearn's train_test_split.

The preprocessing as well as the split into training and test data has already been done. This notebook is kept mainly for archival purposes and for possible later use in a production piece of software.

## Setup and Configuration

In [None]:
# -- imports --
import csv
import glob
import os
import pymupdf
import shutil
import tabulate
import tempfile
import warnings
import numpy as np
from collections import Counter
from deskew import determine_skew
from pdf2image import convert_from_path  #TODO: look into convert_from_bytes
from skimage import io
from skimage.color import rgb2gray
from sklearn.model_selection import train_test_split
from skimage.transform import rotate

# -- data paths --
# note: the used data is not part of the repository due to privacy reasons
label_paths = '../data/label/DSA_label.csv'
# sorted, because glob returns files in an unspecified order; without it,
# index-based selections of pdf_all could pick different files between runs/machines
pdf_all = sorted(glob.glob('../data/pdf_labeled/*.pdf'))
# do not use already existing paths, we do not want to overwrite data with this notebook!
img_out_color = '../data/img_c_dummy'
img_out_gray = '../data/img_g_dummy'
# for testing purposes
actual_color = '../data/imgs_labeled/color'

# -- config --
# None if the environment variable is not set; handed to pdf2image as poppler_path
POPPLER_PATH = os.getenv("POPPLER_PATH")
# notebook display of the PDF files that were found
pdf_all

## PDF Preprocessing

In [None]:
# TODO: look into adding page numbers as metadata, easier way of sorting than with names :) -> REF: https://famondir.github.io/Thesis/
def preprocess_pdf(pdf_path, dpi=200):
    """
    Render every page of a PDF as a colored and a grayscale .jpg file.

    Steps:
    - reset the rotation flag of each page
    - normalize each page to A4 portrait (scaled proportionally and centered,
      so pages with a different aspect ratio get padding)
    - rasterize the normalized PDF with pdf2image
    - deskew each page image and derive a grayscale version
    - pages for which no skew angle can be determined (e.g. blank pages) are
      saved without rotation; they are NOT removed

    The images are written to the module-level ``img_out_color`` and
    ``img_out_gray`` directories (created if missing) as
    ``<pdf name>_<page number>.jpg``, page numbers starting at 1.

    Args:
        pdf_path: path of the PDF file to process.
        dpi: resolution used to rasterize the pages (default: 200).
    """
    A4_W, A4_H = 595, 842  # A4 size in PDF points (72 per inch)
    # create output locations
    os.makedirs(img_out_color, exist_ok=True)
    os.makedirs(img_out_gray, exist_ok=True)

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # the context managers remove the temporary directories even if a step fails
    with tempfile.TemporaryDirectory() as tmpdir_pdf, \
            tempfile.TemporaryDirectory() as tmpdir_img:
        tmp_pdf_path = os.path.join(tmpdir_pdf, f'{pdf_name}_tmp.pdf')

        # both documents are closed on exit, also when an exception is raised
        with pymupdf.open(pdf_path) as doc, pymupdf.open() as new_doc:
            for page_idx in range(len(doc)):
                # remove rotation
                page = doc.load_page(page_idx)
                if page.rotation != 0:
                    page.set_rotation(0)
                # place the original page onto a new, empty A4 page;
                # keep_proportion=True scales it uniformly to fit the target
                # rect and centers it, so no hand-built matrix is needed
                new_page = new_doc.new_page(width=A4_W, height=A4_H)
                new_page.show_pdf_page(new_page.rect, doc, page_idx, keep_proportion=True)
            new_doc.save(tmp_pdf_path)

        # convert to imgs
        images = convert_from_path(
            pdf_path=tmp_pdf_path,
            dpi=dpi,
            fmt="jpg",
            poppler_path=POPPLER_PATH
        )
        # temporarily save images (look for method to directly transfer to next part)
        tmp_image_paths = []
        for page_number, image in enumerate(images, start=1):  # naming starts at 1
            tmp_image_path = os.path.join(tmpdir_img, f'{pdf_name}_{page_number}.jpg')
            image.save(tmp_image_path, "JPEG")
            tmp_image_paths.append(tmp_image_path)

        # deskew and create grayscale
        # UserWarnings are silenced only inside this block instead of changing
        # the global warning filter for the rest of the session
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            for img_path in tmp_image_paths:
                img_name = os.path.splitext(os.path.basename(img_path))[0]
                image = io.imread(img_path)
                grayscale = rgb2gray(image)
                angle = determine_skew(grayscale)  # blank pages return None
                # the images are larger than 595x842 px because the pages are
                # rasterized at `dpi` instead of 72 DPI
                if angle is not None:
                    # rotate() returns float images, scale back to 8 bit
                    color_out = (rotate(image, angle, resize=False) * 255).astype(np.uint8)
                    gray_out = (rotate(grayscale, angle, resize=False) * 255).astype(np.uint8)
                else:
                    color_out = image
                    gray_out = (grayscale * 255).astype(np.uint8)
                io.imsave(os.path.join(img_out_color, f"{img_name}.jpg"), color_out)
                io.imsave(os.path.join(img_out_gray, f"{img_name}.jpg"), gray_out)
    
    
# Demo run on a single PDF only (index 15 of pdf_all), presumably to keep the run short.
# The slice yields an empty list, so nothing is processed, if fewer than 16 PDFs were found.
for pdf_path in pdf_all[15:16]:
    preprocess_pdf(pdf_path)

## Split data

In [None]:
def split_data(labels, seed=42):
    """
    Split the labeled images into a stratified 80/20 training and test set.

    The image names are taken from the .jpg files in the module-level
    ``actual_color`` directory (file name without extension); their labels are
    looked up in the label CSV. A summary table with the label counts of both
    sets is printed.

    Args:
        labels: path of a CSV file with the columns ``image`` and ``label``.
        seed: random state passed to train_test_split (default: 42).

    Returns:
        x_train, x_test, y_train, y_test: image names and labels of both sets.

    Raises:
        KeyError: if an image has no entry in the label CSV.
    """
    # create dict with filenames and labels
    with open(labels, encoding='utf-8', newline='') as label_csv:
        label_dict = {row['image']: row['label'] for row in csv.DictReader(label_csv)}

    # sorted, because glob returns files in an unspecified order and the seeded
    # split is only reproducible if the input order is the same on every run
    image_names = sorted(
        os.path.splitext(os.path.basename(file_path))[0]
        for file_path in glob.glob(f'{actual_color}/*.jpg')
    )
    missing = [image_name for image_name in image_names if image_name not in label_dict]
    if missing:
        raise KeyError(f"no label found in {labels!r} for image(s): {', '.join(missing)}")
    image_labels = [label_dict[image_name] for image_name in image_names]

    x_train, x_test, y_train, y_test = train_test_split(
        image_names,
        image_labels,
        test_size=0.2,
        train_size=0.8,
        random_state=seed,
        stratify=image_labels
    )

    # pretty print
    # count once instead of rebuilding a Counter for every table row
    train_counts = Counter(y_train)
    test_counts = Counter(y_test)
    # both sections list the labels in the same order: by training frequency, descending
    label_order = [label for label, _ in train_counts.most_common()]
    rows = [["Training", len(x_train), "", ""]]
    rows += [["", "", label, train_counts[label]] for label in label_order]
    rows.append(["Test", len(x_test), "", ""])
    rows += [["", "", label, test_counts[label]] for label in label_order]
    print(tabulate.tabulate(
        rows,
        headers=["Dataset", "Total Files", "Label", "Count"],
        tablefmt="grid"
    ))

    return x_train, x_test, y_train, y_test

# run the split for the label CSV configured in label_paths and keep the four result lists as notebook globals
x_train, x_test, y_train, y_test = split_data(label_paths)