In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect

from pdf_scraper.block_utils import identify_dual_column, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block, clean_blocks, print_block_table, get_block_table
from pdf_scraper.draw_utils  import get_pink_boundary
from pdf_scraper.doc_utils   import open_exam
from pdf_scraper.image_utils import get_bboxed_page_image

In [2]:
# Open the exam and pull the layout data for one page; later cells reuse these names.
doc              = open_exam(2024)
# page_number is 1-based; the document itself is indexed with page_number-1.
page_number      = 7
page             = doc[page_number-1]
# NOTE(review): textPage is not referenced again in the visible notebook.
textPage         = page.get_textpage()
# "dict" output: the "width" and "blocks" entries are the ones read below.
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
page_width       = text_dict["width"]   # Presumably identical for every page of the document, so it need not be recomputed per page - TODO confirm.
# Blocks that survive the is_empty_block filter.
blocks           = [block for block in text_dict["blocks"] if not is_empty_block(block)]
# Bounding boxes as Rect objects: every block vs. only the non-empty ones.
raw_bbox_rects   = [Rect(block["bbox"]) for block in text_dict["blocks"]]
bbox_rects       = [Rect(block["bbox"]) for block in blocks ]


# page.get_text() without blocking or resorting

In [None]:
print(page.get_text())

The text has come out in a very haphazard order. Questions are above the text, and the columns of the text are not in the correct order.

We will use blocks

In [None]:
# Collect the block-number labels for both block sets, then render one annotated page image per set.
raw_block_numbers = [blk["number"] for blk in text_dict["blocks"]]
block_numbers = [blk["number"] for blk in blocks]

img1 = get_bboxed_page_image(doc, page_number, raw_bbox_rects, labels=raw_block_numbers)
img2 = get_bboxed_page_image(doc, page_number, bbox_rects, labels=block_numbers)

# Side-by-side comparison: every block on the left, non-empty blocks on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 16))
for ax, image, title in zip(axes, (img1, img2), ("Raw Blocks", "Processed Blocks")):
    ax.imshow(image)
    ax.set_title(title)
    ax.axis("off")
plt.show()


# Sorting Blocks

The built-in sorting of `get_text` works well in most cases, but can have issues when:
- there is two-column text.
- column text and header text are not correctly blocked.

Therefore we need to write our own functions which can

- Identify and resort dual column text.
- Identify and split incorrectly blocked lines of text. 


# Identifying and sorting dual column text

In [None]:
# Fill colour(s) handed to get_pink_boundary; presumably the RGB fill of the
# pink panel on the page - TODO confirm against draw_utils.
pink_fill = [(1.0, 0.8980000019073486, 0.9490000009536743)]
king_pink = get_pink_boundary(page_drawings, pink_fill)

# Keep only the blocks that identify_dual_column accepts for this boundary,
# then print their table before and after sort_dual_column_blocks.
dual_blocks = [block for block in blocks if identify_dual_column(block, king_pink)]
table = get_block_table(dual_blocks )
print(table)
sorted_blocks = sort_dual_column_blocks(dual_blocks)
sorted_table  = get_block_table(sorted_blocks)
print(sorted_table)

# Splitting blocks

In [None]:
from pdf_scraper.line_utils  import line_is_empty
from pdf_scraper.line_utils  import  print_line_table
from pdf_scraper.line_utils  import get_mode_font, get_common_font

In [None]:
# Switch to page 4 (1-based) and take its blocks straight from the text dict;
# unlike the first setup cell, no empty-block filtering is applied here.
# NOTE(review): king_pink was derived from page 7's drawings in an earlier cell
# and is not recomputed for this page, yet later cells pass it together with
# these blocks - confirm that is intended.
page_number =4
page        = doc[page_number-1]
page_dict   = page.get_text("dict",sort=True)
blocks      = page_dict["blocks"]
# The block examined in the following cells.
block       = blocks[6]

# Draw every block's bounding box on the page image, labelled with its block number.
bbox_rects    = [Rect(block["bbox"]) for block in blocks ]
block_numbers = [block["number"]     for block in blocks ]
img1 = get_bboxed_page_image(doc, page_number, bbox_rects, labels =block_numbers )
display(img1)

In [None]:
print(get_block_text(block))

In [None]:
lines = block['lines']
lines = [line for line in lines if not line_is_empty(line)]
print_line_table(lines)

## Get Mode and Common font

In [None]:
from pdf_scraper.line_utils import get_line_df

pd.set_option("display.float_format", "{:.2f}".format)
df = get_line_df(lines)
#df.head(22)
    

# Block split function

- So to make a new block, we need to assign a number, a type, and a bbox. We already have the lines. 
1. The number is just a label, so we will keep the same label for the two blocks; this will furthermore help to identify a split block.  
2. The type will be the same.
3. bbox: write a function which infers a bbox from the lines. Check it on known bboxes for blocks.
   - the function takes x0 = min(x0) over all lines, x1 = max(x1), y0 = min(y0), and y1 = max(y1)
4. The lines are the lines put into each block according to the clustering labels. 

In [None]:
from pdf_scraper.clustering.customCluster import reblock_lines
from pdf_scraper.block_utils import split_block

block0, block1 = split_block(block)
print_line_table(block0["lines"])
print("\n\n")
print_line_table(block1["lines"])

- Also, perhaps bboxes calculated without the empty lines are more informative. The new bboxes will not include empty lines.

## Identify badly blocked blocks

So far, the only type of bad blocking we have seen that interferes with block ordering is when the title above a dual column is joined
to one of the columns. To identify this, we may use the following characteristics:

- Two font distributions
- Two width distributions (excluding low word lines and empty lines)
- Discontinuity in dL (excluding empty lines)
- In the pink

In [None]:
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks
def line_space_discont(lines):
    """
    Report whether one inter-line gap clearly dominates all the others.

    Empty lines are dropped, a line table is built with ``get_line_df`` and its
    ``dL`` column is read with the final entry left out. A gap counts as a
    discontinuity when it is more than 1.6 times every other remaining gap.

    Parameters
    ----------
    lines : list
        Lines of a single block, in the format accepted by ``line_is_empty``
        and ``get_line_df``.

    Returns
    -------
    bool
        True if such a dominant gap exists, otherwise False. False is also
        returned when fewer than two gaps are available, because a single gap
        has nothing to be compared against.
    """
    lines = [line for line in lines if not line_is_empty(line)]
    df = get_line_df(lines)
    dLs = np.array(df.dL[:-1])

    # Without this guard a lone gap is compared against an empty array and
    # all([]) is vacuously True, so every two-line block would be flagged.
    if dLs.size < 2:
        return False

    for i, val in enumerate(dLs):
        others = np.delete(dLs, i, 0)
        if all(val > others*1.6):
            return True
    return False

def find_width_peaks(lines):
    """
    Find the peaks of the line-width distribution of a set of lines.

    Only lines with more than four words (``n_words > 4`` in the table returned
    by ``get_line_df``) contribute their width ``w``. For more than two such
    lines with differing widths, a Gaussian KDE (Silverman bandwidth) is
    evaluated on a 1000-point grid reaching 50 units beyond the observed
    widths, and its peaks are located with ``scipy.signal.find_peaks``.

    Parameters
    ----------
    lines : list
        Lines in the format accepted by ``get_line_df``.

    Returns
    -------
    list or numpy.ndarray
        ``[]`` when no line qualifies; a one-element list holding the mean
        width when there are at most two qualifying lines or all widths are
        equal; otherwise the array returned by ``find_peaks``.

    Notes
    -----
    The KDE branch returns positions in the evaluation grid, not width values,
    whereas the short-circuit branches return a width. ``detect_bad_block`` in
    this notebook only uses the number of entries.
    """
    df = get_line_df(lines)
    df = df[df.n_words > 4]
    w  = np.array(df.w)
    if len(w)==0:
        return []
    # gaussian_kde needs a non-singular covariance: identical widths have zero
    # variance and make it fail, so treat them as one mode like the
    # small-sample case.
    if len(w) <=2 or w.max() == w.min():
        return [w.mean()]
    x_grid = np.linspace(w.min()-50, w.max()+50,1000)
    kde=gaussian_kde(w,bw_method='silverman')
    kde_vals = kde(x_grid)
    peaks, _ = find_peaks(kde_vals, prominence = 0.0001)
    return peaks



In [None]:
from pdf_scraper.block_utils import in_the_pink, clean_blocks
from pdf_scraper.line_utils  import find_width_peaks, line_space_discont
def detect_bad_block(block,king_pink):
    '''
    Decide whether a block should be treated as badly blocked.

    Three indicators are computed from the block's non-empty lines:
      - at least two distinct values in the ``common_font`` column of the
        line table,
      - at least two entries returned by ``find_width_peaks``,
      - ``line_space_discont`` reporting a discontinuity.

    The block is flagged when ``in_the_pink(block, king_pink)`` is truthy and
    at least two of the three indicators hold.

    Returns True or False.
    '''
    nonempty = [ln for ln in block["lines"] if not line_is_empty(ln)]
    line_table = get_line_df(nonempty)
    inside_pink = in_the_pink(block, king_pink)

    votes = (
        line_table.common_font.nunique() >= 2,
        len(find_width_peaks(nonempty)) >= 2,
        line_space_discont(nonempty),
    )
    return bool(inside_pink and sum(votes) >= 2)

detect_bad_block(blocks[9],king_pink)

In [None]:
print_block_table(blocks)

# Clean Blocks

In [None]:
# Clean the current page's blocks, then split every block that detect_bad_block flags.
# NOTE(review): this rebinds `blocks`, so re-running the cell feeds already
# cleaned blocks back into clean_blocks.
blocks = clean_blocks(blocks)
new_blocks = []
for i, block in enumerate(blocks):
    # Blocks with a truthy "type" are passed through untouched
    # (presumably non-text blocks such as images - TODO confirm).
    if block["type"]:
        new_blocks.append(block)
        continue
    # A block with at most one line cannot be split.
    if len(block["lines"]) <=1:
        new_blocks.append(block)
        continue
    # Flagged blocks are replaced by the pieces returned by split_block.
    if detect_bad_block(block,king_pink):
        two_blocks = split_block(block)
        new_blocks.extend(two_blocks)
        continue
    new_blocks.append(block)

print_block_table(new_blocks)

In [None]:
def preproc_blocks(blocks: list[dict]):
    """
    Clean a list of blocks and split the ones flagged by ``detect_bad_block``.

    Blocks with a truthy ``"type"`` and blocks with at most one line are kept
    as they are. Every other block is tested with ``detect_bad_block``; a
    flagged block is replaced by the blocks returned from ``split_block``.

    Note: ``king_pink`` is not a parameter here; it is read from the
    notebook's global namespace.

    Returns the new list of blocks.
    """
    result = []
    for candidate in clean_blocks(blocks):
        needs_split = (
            not candidate["type"]
            and len(candidate["lines"]) > 1
            and detect_bad_block(candidate, king_pink)
        )
        if needs_split:
            result.extend(split_block(candidate))
        else:
            result.append(candidate)
    return result


In [None]:
from pdf_scraper.block_utils import clean_blocks

In [None]:
# Move to the page at 0-based index 5 and rebuild its cleaned block list.
# NOTE(review): page_drawings is refreshed here but not used again in the
# visible notebook, and king_pink still holds the boundary derived from page 7.
# Later cells pass that stale king_pink together with these blocks - confirm
# whether it should be recomputed with get_pink_boundary(page_drawings, pink_fill).
page             = doc[5]
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
blocks           =  clean_blocks(text_dict["blocks"]) 

In [None]:
print_block_table(blocks)

In [None]:
block = blocks[3]
lines = block["lines"]
line_df = get_line_df(lines)
line_df.head(50)

# Simple Splitting Function

## Get indices of vertical space discontinuity

In [None]:
# Typical gap: median of dL with the final entry left out.
median = np.median(line_df.dL[:-1])
# Positions whose gap exceeds 1.45 times that median.
indices = [pos for pos, gap in enumerate(line_df.dL) if gap > 1.45*median]
print(indices)

In [None]:
# NOTE(review): in the visible notebook `split_lines` is first assigned in the
# following cell, so this cell raises NameError under Restart & Run All and
# only works after out-of-order execution.
split_lines

In [None]:
# Cut positions: one past each discontinuity row, so that row stays in the chunk it closes.
indices_pp = [i+1 for i in indices]
# Plain slicing instead of np.split: np.split routes a DataFrame through the
# deprecated DataFrame.swapaxes and turns the list of line dicts into numpy
# object arrays. Slicing keeps DataFrames and lists as they are.
# The trailing None means "up to the end" for both .iloc and list slices.
_bounds = [0, *indices_pp, None]
np_dfs = [line_df.iloc[lo:hi] for lo, hi in zip(_bounds, _bounds[1:])]
split_lines = [lines[lo:hi] for lo, hi in zip(_bounds, _bounds[1:])]

In [None]:
# Row labels of the first chunk as a plain list. The original used square
# brackets, which subscript the `list` type instead of calling it.
list(np_dfs[0].index)

In [None]:
print(indices)
# Label every row of line_df with the number of the segment it belongs to.
# A new segment starts on the row after each discontinuity, so the
# discontinuity row itself stays in the segment it closes. The original last
# assignment started at indices[-1] and overwrote that row's label, and it
# raised IndexError when `indices` was empty.
mask = np.zeros(len(line_df))
segment_starts = [0] + [idx + 1 for idx in indices]
for n_df, start in enumerate(segment_starts):
    # Later segments overwrite the tail written by earlier ones.
    mask[start:] = n_df
mask

In [None]:
# Recompute the discontinuity positions (same rule as the earlier cell:
# dL greater than 1.45 times the median of dL without its final entry).
median = np.median(line_df.dL[:-1])
indices = []
for i, dL in enumerate(line_df.dL):
    if dL > 1.45*median:
        indices.append(i)
#print(indices, len(indices), indices[len(indices)-1])
print(indices)

# Slice line_df into consecutive pieces, each ending on (and including) a
# discontinuity row; the printed pairs are the slice bounds being used.
# NOTE(review): indices[0] and indices[-1] raise IndexError when no
# discontinuity was found.
dfs = [line_df[0:indices[0]+1]]
print(0, indices[0]+1)
for i, val in enumerate(indices[:-1]):
    print(val+1, indices[i+1]+1)
    dfs.append(line_df[val+1:indices[i+1]+1])
# Everything after the last discontinuity forms the final piece.
dfs.append(line_df[indices[-1]+1:])
print(indices[-1]+1, ":")

#dfs.append(line_df[indices[len(indices)-1]+1:indices[val+1]])

In [None]:
dfs[4].head(50)

In [None]:
dfs[1].head()

In [None]:
# Boundary list: a leading 0, the position of every oversized gap, and a
# trailing -1 sentinel standing for "to the end".
indices = [0]
for i, dL in enumerate(line_df.dL):
    if dL > 1.45*median:
        indices.append(i)
indices.append(-1)
print(indices)

# Cut line_df after each discontinuity row. Only the real discontinuity
# positions (indices[1:-1]) are used as cut points. The original loop also
# sliced with the -1 sentinel, which produced an empty frame followed by a
# copy of the whole frame, and then read indices[i+1] past the end of the list.
dfs = []
start = 0
for stop in indices[1:-1]:
    dfs.append(line_df[start:stop+1])
    print(f"{start} :{stop+1}")
    start = stop+1
# Whatever follows the last discontinuity is the final piece.
dfs.append(line_df[start:])
print(f"{start} :")


In [None]:
dfs[1].head()

In [None]:
fart= list(range(40))
print(indices[0],indices[1]+1)
print(fart[indices[0]:indices[1]+1])
print(indices[1]+1,indices[2]+1)
print(fart[indices[1]+1:indices[2]+1])

In [None]:
dfs[0]

In [None]:
line_df[indices[0]:indices[1]]

In [None]:
dfs[1].head()

In [None]:
from pdf_scraper.line_utils import count_vert_space_discont

In [None]:
def line_space_discont(lines, factor=1.45):
    """
    Report whether any inter-line gap is large compared with the median gap.

    Empty lines are dropped, a line table is built with ``get_line_df`` and its
    ``dL`` column is read with the final entry left out. A discontinuity is
    reported when at least one of those gaps exceeds ``factor`` times their
    median.

    Parameters
    ----------
    lines : list
        Lines of a single block, in the format accepted by ``line_is_empty``
        and ``get_line_df``.
    factor : float, optional
        Multiple of the median gap above which a gap counts as a
        discontinuity. Defaults to 1.45.

    Returns
    -------
    bool
        True if such a gap exists, otherwise False (also when there is no gap
        to inspect).
    """
    lines = [line for line in lines if not line_is_empty(line)]
    df = get_line_df(lines)

    dLs = np.array(df.dL[:-1])
    if dLs.size == 0:
        return False

    # The median must come from the lines passed in. The original read the
    # notebook-level `line_df`, so the result depended on whichever block had
    # last been inspected.
    median = np.median(dLs)
    return any(val > factor*median for val in dLs)
line_space_discont(lines)

In [None]:
count_vert_space_discont(lines)

In [None]:
print(get_block_text(block))

In [None]:
from pdf_scraper.block_utils import get_bbox
def simple_multi_split(block: dict, factor: float = 1.45):
    """
    Split a block into several blocks at every large vertical gap.

    Empty lines are dropped first. The gaps are the ``dL`` column of
    ``get_line_df`` with its final entry left out; a cut is placed after every
    line whose gap exceeds ``factor`` times the median gap.

    Parameters
    ----------
    block : dict
        Block with ``"number"``, ``"type"`` and ``"lines"`` entries.
    factor : float, optional
        Multiple of the median gap above which a gap triggers a split.
        Defaults to 1.45.

    Returns
    -------
    list[dict]
        One block per run of lines. Each keeps the original ``number`` and
        ``type``, holds its lines as a plain list and gets its ``bbox`` from
        ``get_bbox``. If the block has no non-empty line, the original block
        is returned unchanged as the only element.
    """
    lines = [line for line in block["lines"] if not line_is_empty(line)]
    if not lines:
        return [block]

    # Leave the final dL entry out of both the median and the cut search:
    # a cut after the last line could only produce an empty trailing block.
    gaps = get_line_df(lines).dL.to_numpy()[:-1]
    cut_points = []
    if gaps.size:
        limit = factor*np.median(gaps)
        cut_points = [pos+1 for pos, gap in enumerate(gaps) if gap > limit]

    # Slice the Python list directly. np.split would return numpy object
    # arrays, unlike the plain lists found in the original blocks.
    bounds = [0, *cut_points, len(lines)]
    block_number = block["number"]
    block_type = block["type"]
    return [
        {'number': block_number, 'type': block_type, 'bbox': get_bbox(chunk), 'lines': chunk}
        for chunk in (lines[lo:hi] for lo, hi in zip(bounds, bounds[1:]))
    ]



def preproc_blocks(blocks: list[dict], king_pink):
    """
    Clean a list of blocks and split the ones flagged by ``detect_bad_block``.

    The blocks are first passed through ``clean_blocks``. When ``king_pink``
    is falsy the cleaned blocks are returned as they are. Otherwise blocks
    with a truthy ``"type"`` and blocks with at most one line are kept, and
    every remaining block flagged by ``detect_bad_block`` is replaced by the
    blocks returned from ``simple_multi_split``.

    Returns the new list of blocks.
    """
    cleaned = clean_blocks(blocks)
    if not king_pink:
        return cleaned

    processed = []
    for candidate in cleaned:
        should_split = (
            not candidate["type"]
            and len(candidate["lines"]) > 1
            and detect_bad_block(candidate, king_pink)
        )
        if should_split:
            processed.extend(simple_multi_split(candidate))
        else:
            processed.append(candidate)
    return processed

In [None]:
simple_multi_split(block)

In [None]:
new_blocks = preproc_blocks(blocks,king_pink)
print_block_table(blocks)
print_block_table(new_blocks)


In [None]:

detect_bad_block(blocks[3],king_pink)

print_line_table(blocks[3]["lines"])