In [None]:
import math
from utils import *
import pandas as pd
import numpy as np

In [None]:
# Earlier approach, kept for reference: pick the first PDF found in the working directory.
#pdfs = [i for i in os.listdir() if i.endswith(".pdf")]
#print(pdfs)
#pdf_file = pdfs[0] 
# Path of the PDF that every later cell in this notebook reads from.
pdf_file = "test_pdfs/LC002ALP100EV_2024.pdf"

In [None]:
import fitz
from fitz import Rect
# Open the PDF and select the page at index 6 (called "page 7" elsewhere in this notebook).
doc              = fitz.open(pdf_file)
page             = doc[6]
# Extract the same page in several representations so they can be compared below.
textPage         = page.get_textpage()
text_dict        = page.get_text("dict")
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)


# Dictionary output

Here we will explore the format of the Page.get_text("dict") output.

In [None]:
# Top-level keys of the "dict" output, followed by the page size in points.
print(f"Text dict: {text_dict.keys()}")
page_width, page_height = text_dict['width'], text_dict['height']
print(f"width: {page_width}pts height: {page_height}pts")

# Same size in millimetres, using the 0.3528 mm-per-point factor.
width_mm  = page_width * 0.3528
height_mm = page_height * 0.3528
print(f"width: {width_mm:5.2f}mms height: {height_mm:5.2f}mms")

# Number of blocks reported by each extraction mode.
dict_blocks = text_dict['blocks']
print(f"There are {len(dict_blocks)} dict blocks")
print(f"There are {len(text_blocks)} text block elements")

In [None]:
# Walk down the nesting of the first block: block -> line -> span, printing the keys at each level.
print(f"Block : {text_dict['blocks'][0].keys()}")
print(f'bbox  : {text_dict["blocks"][0]["bbox"]}')
print(f"line  : {text_dict['blocks'][0]['lines'][0].keys()}")
print(f'span  : {text_dict["blocks"][0]["lines"][0]["spans"][0].keys()}')
# Text of the first two spans of the first line.
# NOTE(review): assumes that line has at least two spans; otherwise the second lookup raises IndexError.
print(text_dict["blocks"][0]["lines"][0]["spans"][0]["text"])
print(text_dict["blocks"][0]["lines"][0]["spans"][1]["text"])

The dict output has blocks which come in a list of dictionaries: 
```dict_keys(['number', 'type', 'bbox', 'lines'])```
- number: just label for block
- type: 0 for txt 1 for img
- bbox: 4 bounding box coords as tuple
- lines: the content of the box separated into lines, which are separated into spans

The lines part of this dictionary has again: `dict_keys(['spans', 'wmode', 'dir', 'bbox'])`

- A span is a continuous part of text in a line all with the same formatting. 
  - Different parts of the same line may have different formatting, so one `line["spans"]` is a list of spans

In [None]:
import json

# Save the first block of the page to a JSON file so its full structure can be inspected offline.
with open('page_7_block_0_dict.json', 'w') as f:
    json.dump(text_dict["blocks"][0], f, indent=4)

## Lines and spans example

In [None]:
# Print every span of every line in the first block, one line per paragraph of output.
first_block_lines = text_dict["blocks"][0]["lines"]
print(len(first_block_lines))
for line_no, line in enumerate(first_block_lines, start=1):
    print(f"Line : {line_no}")
    for span_no, span in enumerate(line["spans"]):
        print(f'span {span_no}: {span["text"]}', end = "\t")
    print("\n")

In [None]:
def get_dict_block_text(block_dict: dict ):
    '''
    Return the text of one block from Page.get_text("dict")["blocks"] as a single string.

    The spans of each line are joined with a space and the lines are joined with a
    newline. Lines that contain no visible text (empty or whitespace only) are dropped.

    :param block_dict: one block dictionary; blocks without a "lines" entry yield "".
    :return: one string with newline separated lines and space separated spans.
    '''
    # Blocks without a "lines" key (e.g. image blocks) simply have no text.
    block_lines = block_dict.get("lines", [])
    line_texts = (" ".join(span["text"] for span in line["spans"]) for line in block_lines)
    # str.isspace() is False for "", so strip() is used to also discard truly empty lines.
    return "\n".join(text for text in line_texts if text.strip())

# Show the helper's output for the first block of the page.
print(get_dict_block_text(text_dict["blocks"][0]))


Images have block type 1, and text has block type 0.

In [None]:
# Print the text of every text block (type 0), skipping image blocks.
for block in text_dict["blocks"]:
    if block["type"] != 0:
        continue
    print(get_dict_block_text(block))
    print("\n")

# Blocks output

This is a list of 7-element tuples. Taking a block as `block = page.get_text("blocks")[0]`
- block[0] = x0 of bbox
- block[1] = y0 of bbox
- block[2] = x1 of bbox
- block[3] = y1 of bbox
- block[4] = all lines of the block joined together.
- block[5] = block number
- block[6] = block type

In [None]:
# Re-extract the page in "blocks" mode and display the first tuple.
text_blocks      = page.get_text("blocks")
text_blocks[0]

- page.get_text("blocks") outputs a list of tuples

```(x0, y0, x1, y1, "lines in the block", block_no, block_type)```

### Check order of text segments (for page 7)

In [None]:
# Summarise the first "blocks" tuple: its bounding box, its newline count and its text.
first_block = text_blocks[0]
bx0, by0, bx1, by1, block_text = first_block[:5]
print(f"x0: {bx0:5.2f}, x1: {bx1:5.2f}, y0: {by0:5.2f}, y1: {by1:5.2f}")
n_lines=block_text.count('\n')
print(f"There are {n_lines} lines")
print("-"*80)
print(block_text)

In [None]:
# Collect every block tuple once, then keep those whose block type (index 6) is 0.
all_blocks       = list(page.get_text("blocks"))
only_text_blocks = [blk for blk in all_blocks if blk[6]==0]
print(f"Total text blocks in this page: {len(only_text_blocks)}")
print(f"Total blocks in this page: {len(all_blocks)}")
print("First block:")
only_text_blocks[0]

## Compare dictionary and blocks

In [None]:
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)

print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'num':5}")

# Each tuple is (x0, y0, x1, y1, "lines in the block", block_no, block_type).
# Iterate over the list extracted in this cell so the cell does not depend on another
# cell's variable, and avoid rebinding the builtin `type` in the notebook namespace.
for x0, y0, x1, y1, _block_text, num, typ in text_blocks:
    kind = "img" if typ else "txt"
    print(f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {kind:5} {num:<5}")

In [None]:
# Same table as for the "blocks" output, built from the unsorted "dict" output.
text_dict        = page.get_text("dict",sort=False)
print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7}")
for block in text_dict["blocks"]:
    # `kind` rather than `type`, so the builtin is not shadowed for later cells.
    kind = "img" if block["type"] else "txt"
    x0, y0, x1, y1 = block['bbox']
    row=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {kind:5} {block['number']:<7}"
    print(row)

In [None]:
def get_block_table(blocks: dict):
    """
    Build a plain-text table with one row per block.

    Each row holds the bbox coordinates, width, height, kind ("img"/"txt"), block number
    and, for text blocks, the first characters of the block text ("--" for images).

    :param blocks: iterable of block dictionaries with "type", "bbox" and "number" keys.
    :return: the table as one newline-joined string (header, separator, rows).
    """
    header = f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7} {'first_word':10}"
    rows = [header, "--"*40]
    for blk in blocks:
        is_image = bool(blk["type"])
        kind = "img" if is_image else "txt"
        left, top, right, bottom = blk['bbox']
        if is_image:
            beginning = "--"
        else:
            beginning = get_dict_block_text(blk)[:11]
        rows.append(
            f"{left:<8.2f} {right:<8.2f} {top:<8.2f} {bottom:<8.2f} {right-left:<8.2f} {bottom-top:<8.2f} {kind:5} {blk['number']:<7} {beginning:<10}"
        )
    return "\n".join(rows)
# Render and print the table for every block of the page.
table = get_block_table(text_dict["blocks"])
print(table)

# Identify pink box

In [None]:
# Count the vector drawings on the page and display the first one.
print(len(page.get_drawings()))
drawing_0 = page.get_drawings()[0]
drawing_0

In [None]:
# First element of the first item of that drawing.
drawing_0["items"][0][0]

In [None]:
# Use the fill colour of the first drawing as the reference "pink".
# NOTE(review): assumes the first drawing on the page is a pink box; confirm for other pages/files.
pink_fill = page.get_drawings()[0]['fill']
pink_fill

In [None]:
# All fill-type ('f') drawings whose fill colour equals the reference pink.
pinks = [ drawing for drawing in page.get_drawings() if drawing['fill']==pink_fill and drawing['type']=='f']
len(pinks)

In [None]:

# First item of the first two pink drawings.
# NOTE(review): assumes at least two pink drawings were found.
print(pinks[0]['items'][0])
print(pinks[1]['items'][0])

In [None]:
# Drawings that have a truthy 'color' entry.
colors = [ drawing for drawing in page.get_drawings() if drawing['color'] ]
print(len(colors))

In [None]:
def get_pink_boundary(drawings, pink_fill):
    """
    Return the single rectangle that encloses every pink fill box on the page.

    :param drawings: List of drawing objects from get_drawings()
    :param pink_fill: tuple specifying pink colour. (1.0, 0.8980000019073486, 0.9490000009536743) for 2024 P1
    :return: a fitz.Rect spanning all matching fills, or None when the page has none
    """
    # Only look at pink fill objects.
    pink_rects = [d["rect"] for d in drawings if d["type"] == "f" and d["fill"] == pink_fill]
    if not pink_rects:
        return None

    # A pink box nested inside another one cannot extend the outer bounds, so no
    # containment filtering is needed before taking the extremes. Skipping it also
    # avoids losing every box when two pink rects are identical (each would be
    # "contained" in the other and both would be discarded).
    return fitz.Rect(
        min(rect.x0 for rect in pink_rects),
        min(rect.y0 for rect in pink_rects),
        max(rect.x1 for rect in pink_rects),
        max(rect.y1 for rect in pink_rects),
    )

def in_the_pink(block: dict, king_pink: "Rect | None"):
    """
    Tell whether a block's bounding box lies inside the pink boundary.

    :param block: block dictionary with a 4-tuple under "bbox".
    :param king_pink: boundary rectangle as returned by get_pink_boundary, which is
        None when the page has no pink boxes.
    :return: True if the boundary contains the block's rectangle, otherwise False.
    """
    # Without a pink region nothing can be inside it.
    if king_pink is None:
        return False
    x0, y0, x1, y1 = block['bbox']
    return king_pink.contains(Rect(x0, y0, x1, y1))

# Compute and show the overall pink boundary of the current page.
drawings = page.get_drawings()
king_pink = get_pink_boundary(drawings,pink_fill)

print(king_pink)

# Ordering Dictionary Blocks

- These blocks are not all in the correct order.
- This is ok if it is just a page footer appearing at the top.
- This is not ok in the case of article text appearing in the incorrect reading order.
  - block number 4 should appear after block number 2

- Any text block in a double-column article cannot be wider than half the page width. 

- To know it is two columns: at least two text blocks with the same y position but different x positions. Or at least overlapping y ranges, though separated by their own widths in the x direction.

In [None]:
drawings  = page.get_drawings()
pink_fill = drawings[0]['fill']
king_pink = get_pink_boundary(drawings,pink_fill)

text_dict        = page.get_text("dict")
page_width       = text_dict["width"]
# Widest a block may be and still count as a single column.
W = page_width/2

dual_blocks = []
for block in text_dict["blocks"]:
    # A falsy block type means text; a flag is used instead of rebinding the builtin `type`.
    is_text = not block["type"]
    x0, y0, x1, y1 = block['bbox']
    dx = x1-x0
    # Skip empty text blocks.
    if is_text and not get_dict_block_text(block):
        continue
    # Only blocks in the pink.
    if not in_the_pink(block, king_pink):
        continue
    # Keep blocks in the pink that are at most half a page wide.
    if dx <= W:
        dual_blocks.append(block)
    # If there are other blocks on another side of it?

print(get_block_table(dual_blocks))


In [None]:

def isColumnSize(block, page_width):
    """Return True when the block's bbox is no wider than half of ``page_width``."""
    left, _, right, _ = block['bbox']
    half_page = page_width/2
    return (right - left) <= half_page

def isEmptyBlock(block: dict):
    """
    Return 1 for a text block whose extracted text is empty, otherwise 0.

    Blocks with a truthy "type" (non-text) are never reported as empty.
    """
    if not block["type"]:
        if not get_dict_block_text(block):
            return 1
    return 0


def identify_dual_column(page, king_pink):
    """
    Collect the blocks of ``page`` that could belong to a two-column layout.

    A block is kept when it is at most half a page wide, lies inside ``king_pink``
    and is not an empty text block. The original block order is preserved.
    """
    page_dict = page.get_text("dict")
    width = page_dict["width"]

    selected = []
    for candidate in page_dict["blocks"]:
        if not isColumnSize(candidate, width):
            continue
        if not in_the_pink(candidate, king_pink):
            continue
        if isEmptyBlock(candidate):
            continue
        selected.append(candidate)
    return selected

# Select the candidate two-column blocks and show them in extraction order.
dual_blocks = identify_dual_column(page, king_pink)
table = get_block_table(dual_blocks )
print(table)
# The column-sorted table is printed by the following cell, which is where
# sort_dual_column_blocks is defined; calling it here would raise NameError on a
# fresh kernel (Restart & Run All).

In [None]:
def sort_dual_column_blocks(blocks: list):
    """
    Order blocks in two-column reading order: left column top to bottom, then right column.

    A block belongs to column 0 when its x0 is strictly closer to the leftmost x0 than
    to the rightmost x0 among ``blocks``; otherwise it belongs to column 1.

    Side effect: each block dictionary gets a "col" key (0 or 1).

    :param blocks: list of block dictionaries with a "bbox" 4-tuple.
    :return: new list of the same dictionaries sorted by (col, y0); [] for empty input.
    """
    if not blocks:
        return []

    left_edges = [block["bbox"][0] for block in blocks]
    x0_min, x0_max = min(left_edges), max(left_edges)

    for block in blocks:
        x0 = block["bbox"][0]
        block["col"] = 0 if abs(x0 - x0_min) < abs(x0 - x0_max) else 1

    # sorted() is stable, so ties keep their input order exactly as the previous
    # two-pass sort (by y0, then by column) did.
    return sorted(blocks, key=lambda block: (block["col"], block["bbox"][1]))

# Sort the candidate blocks into column order and print the resulting table.
sort1 = sort_dual_column_blocks(dual_blocks)
print(get_block_table(sort1))

# Splitting blocks

In [None]:
import fitz
# Switch to the page at index 3 and pick one block of its sorted "dict" output to study.
# NOTE(review): the indices 3 and 6 are hard-coded for this particular PDF.
page = doc[3]
page_dict= page.get_text("dict",sort=True)
blocks = page_dict["blocks"]
block = blocks[6]

In [None]:
# NOTE(review): get_block_text is not defined in this notebook; presumably it comes from
# `from utils import *`. Confirm, or use get_dict_block_text defined above.
print(get_block_text(block))

In [None]:
#get line widths:
lines = block['lines']
print(len(lines))
lines[0].keys()

In [None]:
def line_is_empty(line):
    """
    Return True when no span of ``line`` contains visible text.

    Spans whose text is empty ("") count as empty as well as whitespace-only spans;
    str.isspace() alone would report "" as non-empty. A line without spans is empty.
    """
    return all(not span["text"].strip() for span in line["spans"])
# Try the check on the first line of the block.
line_is_empty(lines[0])

In [None]:
# Keep only lines with visible text; `lines` is replaced by an equal, separate list.
good_lines = [ln for ln in lines if not line_is_empty(ln)]
lines = list(good_lines)
print(len(good_lines))

In [None]:
# Table of all blocks on this page (displayed as the cell's result string).
get_block_table(blocks)

In [None]:
def get_line_table(lines: dict):
    '''
    Build a plain-text table with one row per line.

    Each row lists the line's bbox coordinates, width, height, the distinct fonts used
    by its spans (sorted, so the output is reproducible) and the first characters of
    its first span.

    :param lines: iterable of line dictionaries with "bbox" and "spans" keys.
    :return: the table as one string (header, separator, rows, separator, blank lines).
    '''
    table=[f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'fonts':36} {'first_word':10}", "--"*40]
    for line in lines:
        spans          = line["spans"]
        # A set has no stable order; sort so repeated runs print the same row.
        font_list      = sorted(set(span["font"] for span in spans))
        x0, y0, x1, y1 = line['bbox']
        # Lines without spans have no text to preview.
        beginning      = spans[0]["text"][:5] if spans else ""
        row=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {' '.join(font_list):36} {beginning:<10}"
        table.append(row)
    table.extend( ["--"*40,"\n"*2] )
    line_table = "\n".join(table)
    return line_table

def print_line_table(lines:dict):
    """Print the table that get_line_table builds for ``lines``; returns None."""
    rendered = get_line_table(lines)
    print(rendered)

# Print the table for the non-empty lines of the block.
print_line_table(good_lines)

## Get Mode and Common font

In [None]:
# Fonts of every span on the first line, used to try the mode-font helper below.
line = lines[0]
fonts = [ span["font"] for span in line["spans"]]
def get_mode_font(fonts):
    """
    Return the most frequent font name in ``fonts``.

    np.unique returns the distinct names in sorted order together with their counts,
    so the position of the largest count must index that unique array, not the
    original list. Ties resolve to the first name in sorted order.

    :param fonts: non-empty sequence of font-name strings.
    :return: the modal font name as a plain str.
    """
    unique_fonts, counts = np.unique(fonts, return_counts=True)
    return str(unique_fonts[np.argmax(counts)])
# Mode font of the first line, then of the line at index 6.
# NOTE(review): assumes the block has at least seven non-empty lines.
print(f"mode fond of line 1: {get_mode_font(fonts)}")

fonts = [ span["font"]for span in lines[6]["spans"]]
print(f"mode fond of line 7: {get_mode_font(fonts)}")
    

In [None]:
# Fonts of the spans on the first line, used to try the common-font helper below.
fonts = [ span["font"] for span in lines[0]["spans"]  ]
print(fonts)
def common_font_elems(s1,s2):
    """Return the longest leading run of characters that ``s1`` and ``s2`` share."""
    matched = []
    for left, right in zip(s1, s2):
        if left != right:
            break
        matched.append(left)
    return "".join(matched)

def get_common_font(fonts):
    """
    Return the leading part shared by every font name in ``fonts``.

    Starts from the first name and repeatedly narrows it with common_font_elems
    against each following name.
    """
    shared = fonts[0]
    for idx in range(1, len(fonts)):
        shared = common_font_elems(shared, fonts[idx])
    return "".join(shared)
    
# Common leading part of the first line's font names.
get_common_font(fonts)

In [None]:
def get_line_table(lines: dict):
    '''
    Print and return a plain-text table with one row per line.

    Each row lists the line's width, height, the distinct fonts of its spans (sorted,
    so the output is reproducible), the leading part shared by those font names, and
    the first characters of its first span.

    Note: this redefines the earlier get_line_table of this notebook; unlike that
    version it also prints the table.

    :param lines: iterable of line dictionaries with "bbox" and "spans" keys.
    :return: the table as one string (header, separator, rows, separator, blank lines).
    '''
    table=[f"{'dx':8} {'dy':8} {'unique fonts':36} {'base font':20} {'first_word':25}", "--"*40]
    for line in lines:
        spans = line["spans"]
        # A set has no stable order; sort so repeated runs print the same row.
        font_list = sorted(set(span["font"] for span in spans))
        # Lines without spans have neither fonts nor text.
        common_font = get_common_font(font_list) if font_list else ""
        x0, y0, x1, y1 = line['bbox']
        beginning = spans[0]["text"][:25] if spans else ""
        row=f"{x1-x0:<8.2f} {y1-y0:<8.2f} {' '.join(font_list):36} {common_font:20} {beginning:<25}"
        table.append(row)
    table.extend( ["--"*40,"\n"*2] )
    line_table = "\n".join(table)
    print(line_table)
    return line_table

# Pass the list of lines: `line` is a single line dictionary, and iterating over it
# would yield its keys and fail inside the table builder.
get_line_table(lines)

In [None]:
def get_line_text(line: dict) -> str:
    """Concatenate the text of every span in ``line`` with no separator."""
    pieces = []
    for span in line["spans"]:
        pieces.append(span["text"])
    return "".join(pieces)

def get_line_df(lines):
    """
    Build a DataFrame with one row of layout and font features per line.

    Columns: x0, y0, x1, y1 (bbox), dL (difference between the next line's y0 and this
    line's y0; NaN for the last line), n_spans, font_list, common_font, mode_font,
    w and h (bbox width and height) and text.

    :param lines: list of line dictionaries with "bbox" and "spans" keys.
    :return: pandas DataFrame; an empty frame with the same columns for empty input.
    """
    coords     = [line["bbox"] for line in lines]
    span_fonts = [[span["font"] for span in line["spans"]] for line in lines]
    y0         = [coord[1] for coord in coords]

    # The last line has no successor, hence the trailing NaN. With no lines at all
    # nothing is appended, so every column keeps the same (zero) length.
    dL = [below - above for above, below in zip(y0, y0[1:])]
    if coords:
        dL.append(np.nan)

    data_dict = {
        "x0":          [coord[0] for coord in coords],
        "y0":          y0,
        "x1":          [coord[2] for coord in coords],
        "y1":          [coord[3] for coord in coords],
        "dL":          dL,
        "n_spans":     [len(fonts) for fonts in span_fonts],
        "font_list":   span_fonts,
        "common_font": [get_common_font(fonts) for fonts in span_fonts],
        "mode_font":   [get_mode_font(fonts) for fonts in span_fonts],
        "w":           [coord[2]-coord[0] for coord in coords],
        "h":           [coord[3]-coord[1] for coord in coords],
        "text":        [get_line_text(line) for line in lines],
    }
    return pd.DataFrame(data_dict)

# Show floats with two decimals, build the per-line feature frame and preview it.
pd.set_option("display.float_format", "{:.2f}".format)
df = get_line_df(lines)
df.head(22)
    

**Idea:** If a row has NaNs in a particular dimension, simply do not use that dimension for that row in the clustering, and use the others. For example, the last row has no
"distance to next row" element, so that dimension would not be used for it.

In [None]:
X = df.drop(columns=["font_list","text","dL","n_spans"])
# Encode the font columns as 0 for "Calibri,Bold" and 1 for anything else. A vectorised
# comparison is used instead of DataFrame.applymap, which is deprecated in recent pandas.
X[["common_font","mode_font"]] = (X[["common_font","mode_font"]] != "Calibri,Bold").astype(int)
X.head(10)

In [None]:
# NOTE(review): cat_cols is only assigned in a later cell, so this fails on a fresh
# kernel when cells are run top to bottom.
cat_cols

In [None]:

# NOTE(review): ohe is only fitted in a later cell, so this fails on a fresh kernel
# when cells are run top to bottom.
ohe.transform(df[["common_font","mode_font"]])[:4]

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import math
# Feature matrix: drop the columns that are not used for clustering.
X = df.drop(columns=["font_list","text","dL","n_spans"])

# Standardise every numeric column.
num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = StandardScaler().fit_transform(X[num_cols])


# One-hot encode the object-typed (font) columns; with drop="if_binary" a column with
# exactly two categories becomes a single 0/1 column.
# NOTE(review): the assignment back into X[cat_cols] presumably relies on every font
# column having at most two categories, so the encoder output has one column each; confirm.
cat_cols = X.select_dtypes(include="object").columns
# X[["common_font","mode_font"]] = X[["common_font","mode_font"]].applymap(lambda x: 0 if x=="Calibri,Bold" else 1)
ohe = OneHotEncoder(drop="if_binary", sparse_output=False).fit(X[cat_cols])
X[cat_cols] = ohe.transform(X[cat_cols])
X.head(15)

In [None]:
from sklearn.cluster import KMeans
k = 2
# Locate the encoded font column by name rather than by a hard-coded position, so the
# seeds stay correct if the set of feature columns changes.
font_idx = X.columns.get_loc("common_font")

# Seed centroids: the per-column minima with the font flag forced to 1, and the
# per-column maxima with the font flag forced to 0. Explicit copies make the in-place
# edits safe even when pandas hands back a read-only or shared buffer.
top_init    = X.min().to_numpy(copy=True)
top_init[font_idx] = 1
bottom_init = X.max().to_numpy(copy=True)
bottom_init[font_idx] = 0
init_centroids = np.vstack([top_init, bottom_init])
kmeans = KMeans(n_clusters=k, random_state=42,init=init_centroids, n_init="auto")
y_pred = kmeans.fit_predict(X)

In [None]:
# Fitted cluster centres, labelled with the feature names.
pd.DataFrame(kmeans.cluster_centers_,columns= X.columns)

In [None]:
# Column positions of the two encoded font features in X.
binary_features = [4, 5]  # Indices for "common_font" and "mode_font"

# Post-process centroids to enforce binary values (0 or 1)
# This overwrites the fitted centres in place, so the predictions below use the rounded centres.
for idx in binary_features:
    kmeans.cluster_centers_[:, idx] = np.round(kmeans.cluster_centers_[:, idx])

print("Centers:")
display(pd.DataFrame(kmeans.cluster_centers_,columns= X.columns))
print("Predictions:")
print(kmeans.predict(X))
# Row at position 3 is inspected separately as the problematic point.
print("Trouble point:")
display(X.iloc[3:4])
print("distances from each center:")
kmeans.transform(X.iloc[3:4])

In [None]:
# Show the row at position 3 next to both cluster centres, then its predicted cluster.
point = X.iloc[3:4]
centre1 = kmeans.cluster_centers_[0]
centre2 = kmeans.cluster_centers_[1]

print(f"Point:")
display(point)
print(f"Center1: {centre1}")
print(f"Centre2: {centre2}")

kmeans.predict(point)


In [None]:
def separate_lines():
    """
    Placeholder for the line-separation step; not implemented yet, returns None.

    Planned approach:
    - Column lines: all lines with a width <= page_width/2 or king_pink/2, and also
      with the mode of the fonts for the lines with a width of a certain amount.
    - For all width determinations, if a line has under a certain amount of words, it
      must be excluded, and only font used, as well as perhaps proximity to other
      line groups.
    """
    # A def whose body is only comments is a syntax error; the docstring plus an
    # explicit return keeps the cell runnable until the logic is written.
    return None
# as well as perhaps proximity to other line groups.

In [None]:
# Reminder of the raw per-line features before the weighted clustering below.
df.head(6)

In [None]:
# Reduced feature set: vertical position, size and the common font only.
X = df.drop(columns=["font_list","text","dL","n_spans","x0","x1","mode_font"])
# Feature weights, applied as square roots because k-means uses squared distances.
cat_weight = math.sqrt(4.0)
y0_weight = math.sqrt(2.5)

num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = StandardScaler().fit_transform(X[num_cols])
X["y0"] = X["y0"]* y0_weight


cat_cols = X.select_dtypes(include="object").columns
ohe = OneHotEncoder(drop="if_binary", sparse_output=False).fit(X[cat_cols])
X[cat_cols] = ohe.transform(X[cat_cols])
X["common_font"] = X["common_font"]*cat_weight

display(X.head(6))

# Candidate seed centroids. The font column is located by name: after the extra
# column drops above, position 4 is "h", not "common_font".
font_idx = X.columns.get_loc("common_font")
top_init    = X.min().to_numpy(copy=True) ; top_init[font_idx]=cat_weight
bottom_init = X.max().to_numpy(copy=True) ; bottom_init[font_idx]=0
init_centroids = [top_init, bottom_init]

# The estimator that was actually fitted here used random initialisation with 100
# restarts (a seeded estimator was created first but immediately overwritten). That
# choice is kept, with a fixed random_state so reruns give the same clusters.
# To use the seeds instead: KMeans(n_clusters=2, init=init_centroids, n_init=1).
kmeans = KMeans(n_clusters=2,  n_init=100, random_state=42)
y_pred = kmeans.fit_predict(X)

centre1 = kmeans.cluster_centers_[0]
centre2 = kmeans.cluster_centers_[1]
display(pd.DataFrame(np.vstack((centre1,centre2)  ),columns=X.columns) ) 
y_pred

In [None]:
# Distance of the row at position 3 to each cluster centre.
kmeans.transform(X.iloc[3:4])

### Ideas for clustering

There is no real reason for the centroids to be able to take values other than the few defined categorical values.
Maybe there is.

Nevertheless, consider a custom clustering algorithm where the centroid categorical values can only have fixed values.

Consider ignoring the width for lines which have early endings (few words, last word has full stop)

Consider squaring the y distance.