In [None]:
from pathlib import Path

import fitz
import numpy as np
import pandas as pd
from fitz import Rect
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde

from pdf_scraper.block_utils import identify_dual_column, get_block_table, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block
from pdf_scraper.draw_utils  import get_pink_boundary

In [None]:
# Sample PDF, resolved relative to the parent of the current working directory.
pdf_file         = Path.cwd().parent / "test_pdfs" / "LC002ALP100EV_2024.pdf"
doc              = fitz.open(pdf_file)
# Work on the page at index 6 for the dual-column example below.
page             = doc[6]
# NOTE(review): textPage is not referenced by any of the cells visible here.
textPage         = page.get_textpage()
# "dict" output with sort=True; its "blocks" and "width" entries are read below.
text_dict        = page.get_text("dict",sort=True)
# Drawings of the page; passed to get_pink_boundary further down.
page_drawings    = page.get_drawings()
page_width       = text_dict["width"]   # Presumably the same for every page of the document, so it need not be re-read per page - TODO confirm.
# Drop the blocks that is_empty_block flags as empty.
blocks           = [block for block in text_dict["blocks"] if not is_empty_block(block)]

# Sorting Blocks

The built-in sorting of `get_text` works well in most cases, but can have issues when:
- there is two-column text;
- column text and header text are not correctly separated into blocks.

Therefore we need to write our own functions which can:

- Identify and re-sort dual-column text.
- Identify and split incorrectly blocked lines of text.


# Identifying and sorting dual column text

In [None]:

# RGB fill colour handed to get_pink_boundary; presumably the fill of the pink region on the page - TODO confirm.
pink_fill = (1.0, 0.8980000019073486, 0.9490000009536743)
king_pink = get_pink_boundary(page_drawings, pink_fill)

# Run the dual-column identification on the page blocks and print the block table before sorting ...
dual_blocks = identify_dual_column(blocks,page_width, king_pink)
table = get_block_table(dual_blocks )
print(table)
# ... then sort the dual-column blocks and print the table again, separated by blank lines, for comparison.
sorted_blocks = sort_dual_column_blocks(dual_blocks)
sorted_table  = get_block_table(sorted_blocks)
print("\n"*3)
print(sorted_table)

# Splitting blocks

In [None]:
from pdf_scraper.line_utils  import line_is_empty
from pdf_scraper.line_utils import get_line_table, print_line_table
from pdf_scraper.line_utils import get_mode_font, get_common_font

In [None]:
# Switch to the page at index 3. NOTE: this rebinds `page` and `blocks` from the setup cell,
# and, unlike there, empty blocks are not filtered out of `blocks` here.
page = doc[3]
page_dict= page.get_text("dict",sort=True)
blocks = page_dict["blocks"]
# The block at position 6 is the example inspected in the following cells.
block = blocks[6]

In [None]:
# Print what get_block_text returns for the selected example block.
print(get_block_text(block))

In [None]:
# Keep only the non-empty lines of the example block and tabulate them.
lines = [ln for ln in block['lines'] if not line_is_empty(ln)]
print_line_table(lines)

## Get Mode and Common font

In [None]:
from pdf_scraper.line_utils import get_line_df

# Render floats in displayed frames with two decimals.
pd.set_option("display.float_format", "{:.2f}".format)
# Line-level data frame of the example block; later cells read its dL, n_words, w and common_font columns.
df = get_line_df(lines)
# Show at most the first 22 rows.
df.head(22)
    

**Idea:** If a row has NaNs in a particular dimension, simply leave that dimension out of the clustering for that row and use the others.
For example, the last row has no "distance to next row" element, so that dimension would not be used for it.

### Ideas for clustering

There is no obvious reason for the centroids to be able to take values other than the few defined categorical values
(although there may be one).

Nevertheless, consider a custom clustering algorithm where the centroid categorical values can only have fixed values.

Consider ignoring the width for lines which have early endings (few words, last word has full stop)

Consider squaring the y distance.

### Alternative line splitting

Look at the `dL` column of the data frame above. A simpler alternative would be to loop through the vertically sorted lines <br>
and split the block as soon as the distance between one line and the next exceeds the previous distance.

# Block split function

- So to make a new block, we need to assign a number, a type, and a bbox. We already have the lines. 
1. The number is just a label, so we will keep the same label for the two blocks; this will furthermore help to identify a split block.  
2. The type will be the same.
3. bbox: write a function which infers a bbox from the lines. Check it against the known bboxes of existing blocks.
   - the function takes x0 = min(x0), y0 = min(y0), x1 = max(x1), and y1 = max(y1) over all lines
4. The lines are the lines put into each block according to the clustering labels. 

In [None]:
from pdf_scraper.clustering.customCluster import reblock_lines
from pdf_scraper.block_utils import split_block

# Split the example block into two blocks and tabulate the lines of the first one.
block0, block1 = split_block(block)
print_line_table(block0["lines"])

- Also, perhaps the bboxes calculated without the empty lines are more informative. The new bboxes will not have empty lines in them.

## Identify badly blocked blocks

So far, the only type of bad blocking we have seen which interferes with block ordering is when the title above a dual column is joined
to one of the columns. To identify this, we may use the following characteristics:

- Two font distributions
- Two width distributions (excluding low word lines and empty lines)
- Discontinuity in dL (excluding empty lines)
- In the pink

In [None]:
def line_space_discont(lines, ratio=1.6):
    """
    Report whether one line-to-line gap clearly stands out from all the others.

    Empty lines are discarded first. The gaps are read from the ``dL`` column
    of ``get_line_df``; its final entry is dropped (presumably because the last
    line has no following line to measure a distance to - TODO confirm).

    Parameters
    ----------
    lines : list
        Lines of a single block.
    ratio : float, optional
        A gap counts as a discontinuity when it is larger than ``ratio`` times
        every other gap. Defaults to 1.6.

    Returns
    -------
    bool
        True if such a gap exists. Always False when there are fewer than two
        gaps, because a single gap has nothing to be compared against.
    """
    lines = [line for line in lines if not line_is_empty(line)]
    # Two gaps require at least three lines; bail out before building the frame.
    if len(lines) < 3:
        return False

    df = get_line_df(lines)
    gaps = np.asarray(df.dL.iloc[:-1], dtype=float)
    # Without this guard a lone gap would be compared against an empty array,
    # which is vacuously "larger than all others".
    if gaps.size < 2:
        return False

    for i, gap in enumerate(gaps):
        others = np.delete(gaps, i)
        if np.all(gap > others * ratio):
            return True
    return False

def find_width_peaks(lines, min_words=4, pad=50, n_grid=1000, prominence=0.0001):
    """
    Locate the modes of the line-width distribution of ``lines``.

    Widths are taken from the ``w`` column of ``get_line_df``; only lines whose
    ``n_words`` exceeds ``min_words`` are considered. With more than two such
    lines a Gaussian kernel density estimate (Silverman bandwidth) is evaluated
    on a regular grid and its local maxima are returned.

    Parameters
    ----------
    lines : list
        Lines of a single block.
    min_words : int, optional
        Lines with ``n_words <= min_words`` are ignored.
    pad : float, optional
        Margin added on both sides of the observed width range for the grid.
    n_grid : int, optional
        Number of grid points at which the density is evaluated.
    prominence : float, optional
        Minimum prominence passed to ``scipy.signal.find_peaks``. It is an
        absolute density value, so it depends on the scale of the widths.

    Returns
    -------
    list or numpy.ndarray
        Widths at which the distribution peaks: ``[]`` when no line qualifies,
        ``[mean width]`` when there are at most two qualifying lines or all
        widths are identical, otherwise an array of peak positions. Callers
        that only need the number of modes can take ``len()`` of the result.
    """
    df = get_line_df(lines)
    widths = df.loc[df.n_words > min_words, "w"].to_numpy(dtype=float)
    widths = widths[np.isfinite(widths)]

    if widths.size == 0:
        return []
    # Too few samples for a density estimate, or zero variance (which makes
    # gaussian_kde fail on a singular covariance): report a single mode.
    if widths.size <= 2 or np.ptp(widths) == 0:
        return [widths.mean()]

    x_grid = np.linspace(widths.min() - pad, widths.max() + pad, n_grid)
    density = gaussian_kde(widths, bw_method='silverman')(x_grid)
    peak_idx, _ = find_peaks(density, prominence=prominence)
    # find_peaks yields indices into the grid; map them back to widths so the
    # result is in the same units as the small-sample branch above.
    return x_grid[peak_idx]



In [None]:
from pdf_scraper.block_utils import in_the_pink, clean_blocks
def detect_bad_block(block,king_pink):
    '''
    Decide whether a block looks like two logical blocks fused into one.

    A block is flagged when ``in_the_pink(block, king_pink)`` holds and at
    least two of the following three indicators are present among its
    non-empty lines:

    - two or more distinct values in the ``common_font`` column of the line
      data frame,
    - two or more peaks in the line-width distribution (``find_width_peaks``),
    - a discontinuity in the line spacing (``line_space_discont``).

    Parameters
    ----------
    block : dict
        Block with a "lines" entry.
    king_pink
        Pink boundary, forwarded unchanged to ``in_the_pink``.

    Returns
    -------
    bool
        True if the block should be treated as badly blocked.
    '''
    # The pink test is a hard requirement, so check it before doing any of
    # the more expensive per-line analysis.
    if not in_the_pink(block, king_pink):
        return False

    lines = [line for line in block["lines"] if not line_is_empty(line)]
    # With fewer than two non-empty lines none of the indicators can fire.
    if len(lines) < 2:
        return False

    df = get_line_df(lines)
    has_two_fonts  = len(df.common_font.value_counts()) >= 2
    has_two_widths = len(find_width_peaks(lines)) >= 2
    has_gap_jump   = line_space_discont(lines)

    votes = sum([has_two_fonts, has_two_widths, has_gap_jump])
    return votes >= 2

# NOTE(review): when the cells run top to bottom, `blocks` here holds the unfiltered blocks of
# page index 3 while `king_pink` was derived from the drawings of page index 6 - confirm this mix is intended.
detect_bad_block(blocks[9],king_pink)

In [None]:
# Block table for the current contents of `blocks`.
get_block_table(blocks)

In [None]:
# Keep the blocks whose "type" is falsy (type 0 denotes text in PyMuPDF's dict output) and clean them.
text_blocks = list(filter(lambda blk: not blk["type"], blocks))
clean_text_blocks = clean_blocks(text_blocks)

# Report position and block number of every multi-line block flagged as badly blocked.
for i, block in enumerate(clean_text_blocks):
    if len(block["lines"]) > 1 and detect_bad_block(block, king_pink):
        print(i, block["number"])


In [None]:
# Block table of the cleaned text blocks (clean_blocks is applied to text_blocks again here).
get_block_table(clean_blocks(text_blocks))