In [290]:
import math
from utils import *
import pandas as pd
import numpy as np

In [1]:
#pdfs = [i for i in os.listdir() if i.endswith(".pdf")]
#print(pdfs)
#pdf_file = pdfs[0] 
pdf_file = "test_pdfs/LC002ALP100EV_2024.pdf"

In [2]:
import fitz
from fitz import Rect
doc              = fitz.open(pdf_file)
page             = doc[6]
textPage         = page.get_textpage()
text_dict        = page.get_text("dict")
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)


# Dictionary output

Here we will explore the format of the Page.get_text("dict") output.

In [7]:
print(f"Text dict: {text_dict.keys()}")
page_width  = text_dict['width']
page_height = text_dict['height']
print(f"width: {page_width}pts height: {page_height}pts")
print(f"width: {text_dict['width']*0.3528:5.2f}mms height: {text_dict['height']*0.3528:5.2f}mms")
print(f"There are {len(text_dict['blocks'])} dict blocks")
print(f"There are {len(text_blocks)} text block elements")

Text dict: dict_keys(['width', 'height', 'blocks'])
width: 595.219970703125pts height: 842.0pts
width: 209.99mms height: 297.06mms
There are 8 dict blocks
There are 7 text block elements


In [8]:
print(f"Block : {text_dict['blocks'][0].keys()}")
print(f'bbox  : {text_dict["blocks"][0]["bbox"]}')
print(f"line  : {text_dict['blocks'][0]['lines'][0].keys()}")
print(f'span  : {text_dict["blocks"][0]["lines"][0]["spans"][0].keys()}')
print(text_dict["blocks"][0]["lines"][0]["spans"][0]["text"])
print(text_dict["blocks"][0]["lines"][0]["spans"][1]["text"])

Block : dict_keys(['number', 'type', 'bbox', 'lines'])
bbox  : (42.540000915527344, 795.9447021484375, 555.2017822265625, 819.104736328125)
line  : dict_keys(['spans', 'wmode', 'dir', 'bbox'])
span  : dict_keys(['size', 'flags', 'bidi', 'char_flags', 'font', 'color', 'alpha', 'ascender', 'descender', 'text', 'origin', 'bbox'])
Leaving Certificate Examination 2024
 


The dict output has blocks which come in a list of dictionaries: 
```dict_keys(['number', 'type', 'bbox', 'lines'])```
- number: just label for block
- type: 0 for txt 1 for img
- bbox: 4 bounding box coords as tuple
- lines: the content of the box separated into lines, which are separated into spans

The lines part of this dictionary has again: `dict_keys(['spans', 'wmode', 'dir', 'bbox'])`

- A span is a continuous part of text in a line all with the same formatting. 
  - Different parts of the same line may have different formatting, so one `line["spans"]` is a list of spans

In [9]:
import json

with open('page_7_block_0_dict.json', 'w') as f:
    json.dump(text_dict["blocks"][0], f, indent=4)

## Lines and spans example

In [10]:
print(len(text_dict["blocks"][0]["lines"]))
for i, line in enumerate(text_dict["blocks"][0]["lines"] ): 
    print(f"Line : {i+1}")
    for j, span in enumerate(line["spans"]):
        #if span["text"].isspace():
        #    continue
        print(f'span {j}: {span["text"]}', end = "\t")
    print("\n")

4
Line : 1
span 0: Leaving Certificate Examination 2024	span 1:  	

Line : 2
span 0: 7 	

Line : 3
span 0:  	

Line : 4
span 0: English – Higher Level – Paper 1 	



In [11]:
def get_dict_block_text(block_dict: dict ):
    '''
    For a given block dictionary element, as output by Page.get_text("dict")["blocks"], this 
    function will return the text of all the lines, joined by a "\n", and with the spans on 
    each line joined with a space. 
    
    The result is one string with newline separtaed lines and space
    separated spans.
    '''
    block_lines = block_dict["lines"]
    line_texts = [" ".join([ span["text"] for span in line["spans"] ]) for line in block_lines ]
    block_text="\n".join( [ i for i in line_texts if not i.isspace() ])
    return block_text

print(get_dict_block_text(text_dict["blocks"][0]))


Leaving Certificate Examination 2024  
7 
English – Higher Level – Paper 1 


images have block type 1, and text has type 0

In [12]:
for block in text_dict["blocks"]:
    if block["type"]==0:
        print(get_dict_block_text(block))
        print("\n")

Leaving Certificate Examination 2024  
7 
English – Higher Level – Paper 1 


N.B. Candidates may NOT answer Question A and Question B on the same text. 
QUESTION A – 50 Marks 
(i) 
What, in your opinion, does the writer reveal about the unique experience of “slow travel” in  
TEXT 3?  Make three points, supporting your response with reference to the text.   
(15)  
(ii)   One of Monisha Rajesh’s fellow travellers, Charles, was wearing a T-shirt saying  Green against the 
Machine .  What are your views on the different ways open to young people to protest, or 
demonstrate their views, on important issues?  Develop three points in your response. 
(15) 
(iii)  Identify four elements of the writer’s style, evident in TEXT 3, and discuss how they 
contribute to making this an informative and appealing piece of travel writing.   
Support  your response with reference to TEXT 3. 
(20) 
QUESTION B – 50 Marks  
Write a series of reflective  diary entries  of a person returning to their homepla

# Blocks output

This is a list of 4 element tuples. Taking a block as `block = page.get_text("blocks")[0]`
- block[0] = x0 of bbox
- block[1] = y0 of bbox
- block[2] = x1 of bbox
- block[3] = y1 of bbox
- block[4] = all lines of the block joined together.

In [None]:
text_blocks      = page.get_text("blocks")
text_blocks[0]

(42.540000915527344,
 795.9447021484375,
 555.2017822265625,
 819.104736328125,
 'Leaving Certificate Examination 2024 \n4 \n \nEnglish – Higher Level – Paper 1 \n',
 0,
 0)

- page.get_text("blocks") outputs a list of tuples

```(x0, y0, x1, y1, "lines in the block", block_no, block_type)```

### Check order of text segments (for page 7)

In [14]:
print(f"x0: {text_blocks[0][0]:5.2f}, x1: {text_blocks[0][2]:5.2f}, y0: {text_blocks[0][1]:5.2f}, y1: {text_blocks[0][3]:5.2f}")
n_lines=text_blocks[0][4].count('\n')
print(f"There are {n_lines} lines")
print("--"*40)
print(text_blocks[0][4])

x0: 42.54, x1: 555.20, y0: 795.94, y1: 819.10
There are 4 lines
--------------------------------------------------------------------------------
Leaving Certificate Examination 2024 
7 
 
English – Higher Level – Paper 1 



In [15]:
only_text_blocks = [i for i in page.get_text("blocks") if i[6]==0]
all_blocks       = [i for i in page.get_text("blocks") ]
print(f"Total text blocks in this page: {len(only_text_blocks)}")
print(f"Total blocks in this page: {len(all_blocks)}")
print("First block:")
only_text_blocks[0]

Total text blocks in this page: 7
Total blocks in this page: 7
First block:


(42.540000915527344,
 795.9447021484375,
 555.2017822265625,
 819.104736328125,
 'Leaving Certificate Examination 2024 \n7 \n \nEnglish – Higher Level – Paper 1 \n',
 0,
 0)

## Compare dictionary and blocks

In [16]:
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)

print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'num':5}")

#(x0, y0, x1, y1, "lines in the block", block_no, block_type)```
for x0, y0, x1, y1, lines, num, typ in all_blocks:
    type = "img" if typ else "txt" 
    print(f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {num:<5}")

x0       x1       y0       y1       dx       dy       type  num  
42.54    555.20   795.94   819.10   512.66   23.16    txt   0    
42.54    549.14   477.78   784.92   506.60   307.14   txt   1    
47.94    280.85   44.94    386.52   232.91   341.58   txt   2    
311.46   532.79   44.94    291.42   221.33   246.48   txt   3    
47.94    279.90   391.20   461.82   231.96   70.62    txt   4    
290.22   292.93   391.20   403.20   2.71     12.00    txt   5    
311.46   312.82   390.00   396.00   1.36     6.00     txt   6    


In [17]:
text_dict        = page.get_text("dict",sort=False)
print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7}")
for block in text_dict["blocks"]:
    type = "img" if block["type"] else "txt" 
    x0, y0, x1, y1 = block['bbox']
    table=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {block['number']:<7}"
    print(table)

x0       x1       y0       y1       dx       dy       type  number 
42.54    555.20   795.94   819.10   512.66   23.16    txt   0      
42.54    549.14   477.78   784.92   506.60   307.14   txt   1      
47.94    280.85   44.94    386.52   232.91   341.58   txt   2      
311.46   532.79   44.94    291.42   221.33   246.48   txt   3      
47.94    279.90   391.20   461.82   231.96   70.62    txt   4      
290.22   292.93   391.20   403.20   2.71     12.00    txt   5      
311.46   312.82   390.00   396.00   1.36     6.00     txt   6      
284.46   527.94   296.16   455.16   243.48   159.00   img   7      


In [18]:
def get_block_table(blocks: dict):
    table=[f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7} {'first_word':10}", "--"*40]
    for block in blocks:
        type = "img" if block["type"] else "txt" 
        x0, y0, x1, y1 = block['bbox']
        beginning=get_dict_block_text(block)[:11] if type =="txt" else "--"
        line=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {block['number']:<7} {beginning:<10}"
        table.append(line)
    return "\n".join(table)
table = get_block_table(text_dict["blocks"])
print(table)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
42.54    555.20   795.94   819.10   512.66   23.16    txt   0       Leaving Cer
42.54    549.14   477.78   784.92   506.60   307.14   txt   1       N.B. Candid
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
290.22   292.93   391.20   403.20   2.71     12.00    txt   5                 
311.46   312.82   390.00   396.00   1.36     6.00     txt   6                 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


# Identify pink box

In [19]:
print(len(page.get_drawings()))
drawing_0 = page.get_drawings()[0]
drawing_0

61


{'items': [('re',
   Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875),
   1)],
 'type': 'f',
 'even_odd': False,
 'fill_opacity': 1.0,
 'fill': (1.0, 0.8980000019073486, 0.9490000009536743),
 'rect': Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875),
 'seqno': 1,
 'layer': '',
 'closePath': None,
 'color': None,
 'width': None,
 'lineCap': None,
 'lineJoin': None,
 'dashes': None,
 'stroke_opacity': None}

In [20]:
drawing_0["items"][0][0]

're'

In [21]:
pink_fill = page.get_drawings()[0]['fill']
pink_fill

(1.0, 0.8980000019073486, 0.9490000009536743)

In [22]:
pinks = [ drawing for drawing in page.get_drawings() if drawing['fill']==pink_fill and drawing['type']=='f']
len(pinks)

54

In [23]:

print(pinks[0]['items'][0])
print(pinks[1]['items'][0])

('re', Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875), 1)
('re', Rect(47.939998626708984, 42.53997802734375, 279.41998291015625, 57.17999267578125), 1)


In [24]:
colors = [ drawing for drawing in page.get_drawings() if drawing['color'] ]
print(len(colors))

0


In [25]:
# No filtering needs to be done before the definition of king_pink. You can just do the 
# min maxing stuff it should be fine.
def get_pink_boundary(drawings, pink_fill):
    """
    Return all pink fill boxes in the page, excluding those pink boxes already contained within
    another pink box on the page.
    :param drawings: List of drawing objects from get_drawings()
    :param pink_fill: tuple specifying pink colour. (1.0, 0.8980000019073486, 0.9490000009536743) for 2024 P1
    :return: Filtered list of drawings without redundant fills
    """
    # Only look at pink fill objects
    pinks = [d for d in drawings if d["type"] == "f" and d["fill"]==pink_fill ]
    if not pinks:
        return None

    def in_the_stink(pink):
        '''
        returns True if the given pink is contained in any other pink on the page.
        '''
        return any( other["rect"].contains(pink["rect"])  for other in pinks if other != pink )

    filtered_pinks = [p for p in pinks if not in_the_stink(p)]

    x0 = min([p['rect'].x0 for p in filtered_pinks] )
    y0 = min([p['rect'].y0 for p in filtered_pinks] )
    x1 = max([p['rect'].x1 for p in filtered_pinks] )
    y1 = max([p['rect'].y1 for p in filtered_pinks] )
    king_pink = fitz.Rect(x0,y0,x1,y1)

    return king_pink

def in_the_pink(block: dict, king_pink: Rect):
    x0, y0, x1, y1 = block['bbox']
    block_rect = Rect(x0,y0,x1,y1)
    return  king_pink.contains(block_rect)

drawings = page.get_drawings()
king_pink = get_pink_boundary(drawings,pink_fill)

print(king_pink)

Rect(42.540000915527344, 42.53997802734375, 535.3800048828125, 467.8800048828125)


# Ordering Dictionary Blocks

- These blocks are not all in the correct order.
- This is ok if it is just a page footer appearing at the top.
- This is not ok in the case of article text appearing in the incorrect reading order.
  - block number 4 should appear after block number 2

- Any text in a double column article cannot be larger than half the page. 

- To know it is two columns: at least two text blocks with the same y position but different x positions. Or at least overlapping y ranges, though separated by their own widths in the x direction.

In [26]:
drawings  = page.get_drawings()
pink_fill = drawings[0]['fill']
king_pink = get_pink_boundary(drawings,pink_fill)

text_dict        = page.get_text("dict")
page_width       = text_dict["width"]
W = page_width/2

dual_blocks = []
for block in text_dict["blocks"]:
    type = "img" if block["type"] else "txt" 
    x0, y0, x1, y1 = block['bbox']
    dx = x1-x0
    # skip empty blocks
    if type == "txt" and not get_dict_block_text(block):
        continue
    # Only blocks in the pink
    if not in_the_pink(block, king_pink):
       continue 
    # If we have a block in the pink less than a page width
    if dx <= W:
        dual_blocks.append(block)
    # If there are other blocks on another side of it?
    
print(get_block_table(dual_blocks))


x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


In [27]:

def isColumnSize(block, page_width):
    x0, y0, x1, y1 = block['bbox']
    col_width = x1 - x0
    return col_width <= page_width/2

def isEmptyBlock(block: dict):
    if block["type"]:
        return 0
    return 0 if get_dict_block_text(block) else 1


def identify_dual_column(page, king_pink):
    text_dict        = page.get_text("dict")
    page_width       = text_dict["width"]
    blocks           = text_dict["blocks"]

    possiBlocks     = [block for block in blocks      if isColumnSize(    block,page_width) ]   
    possiPinks      = [block for block in possiBlocks if in_the_pink(     block,king_pink) ]   
    dual_col_blocks = [block for block in possiPinks  if not isEmptyBlock(block)]

    return dual_col_blocks

dual_blocks = identify_dual_column(page, king_pink)
table = get_block_table(dual_blocks )
print(table)
sorted_blocks = sort_dual_column_blocks(dual_blocks)
sorted_table  = get_block_table(sorted_blocks)
print("\n"*3)
print(sorted_table)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


NameError: name 'sort_dual_column_blocks' is not defined

In [None]:
# col1 = x0 is closer to leftmost x0
# col2 = x0 is closer to rightmost x1, or right most x0
def sort_dual_column_blocks(blocks: dict):
    coords = [block['bbox'] for block in blocks] 
    x0_min = min(coord[0] for coord in coords)
    x0_max = max(coord[0] for coord in coords)
    x1_min = min(coord[1] for coord in coords)
    x1_max = max(coord[1] for coord in coords)

    vert_ordered = sorted(blocks, key = lambda block: block["bbox"][1])

    for block in vert_ordered:
        x0, y0, x1, y1 = block['bbox']
        dl = x0-x0_min
        dr = x0-x0_max
        block["col"] = 0 if abs(dl) < abs(dr) else 1
    
    col_ordered = sorted(vert_ordered,key = lambda x: x['col'])

    return col_ordered

sort1 = sort_dual_column_blocks(dual_blocks)
print(get_block_table(sort1))

# Splitting blocks

In [132]:
import fitz
page = doc[3]
page_dict= page.get_text("dict",sort=True)
blocks = page_dict["blocks"]
block = blocks[6]

In [133]:
print(get_block_text(block))

TEXT 2 is an edited extract from the opening of Paul Murray’s novel,  The Bee Sting,  shortlisted 
for the 2023 Booker Prize.  The novel tells the tragi-comic story of the Barnes family, set in 
contemporary Ireland.  In this extract we meet the teenage daughter, Cass, and her best friend, 
Elaine. 
Cass and Elaine first met in Chemistry class, 
when Elaine poured iodine on Cass’s eczema 
during an experiment.  It was an accident; 
she’d cried more than Cass did, and insisted on 
going with her to the nurse.  They’d been 
friends ever since.  Every morning Cass called 
to Elaine’s house and they walked to school 
together.  At lunchtime, they rolled up their 
long skirts and wandered around the 
supermarket, listening to music from Elaine’s 
phone, eating croissants from the bakery 
section that were gone by the time they got to 
the checkout.   


In [134]:
#get line widths:
lines = block['lines']
print(len(lines))
lines[0].keys()

19


dict_keys(['spans', 'wmode', 'dir', 'bbox'])

In [135]:
def line_is_empty(line):
    return all( [span["text"].isspace() for span in line["spans"]] )
line_is_empty(lines[0])

True

In [136]:
good_lines = [line for line in lines if not line_is_empty(line)]
lines = [line for line in lines if not line_is_empty(line)]
print(len(good_lines))

17


In [137]:
get_block_table(blocks)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    48.85    27.04    31.06    0.91     4.02     txt   1                 
131.28   452.41   34.36    50.38    321.13   16.02    txt   2       TEXT 2 – FR
288.96   312.91   138.54   150.54   23.95    12.00    txt   7                 
405.00   501.72   129.78   255.78   96.72    126.00   img   14      --        
502.50   504.98   249.04   260.02   2.48     10.98    txt   13                
290.88   387.12   118.62   272.58   96.24    153.96   img   12      --        
47.94    525.21   52.66    326.34   477.27   273.68   txt   3       TEXT 2 is a
310.20   522.03   153.24   399.60   211.83   246.36   txt   8       Elaine hate
47.94    276.15   327.34   477.72   228.21   150.38   txt   4       Cass felt s
310.20   528.64   400.60   536.28   218.44   135.67   txt   9       The Tidy To
310.20   526.38   537.34   614.46   216.18   

'x0       x1       y0       y1       dx       dy       type  number  first_word\n--------------------------------------------------------------------------------\n47.94    48.85    27.04    31.06    0.91     4.02     txt   1                 \n131.28   452.41   34.36    50.38    321.13   16.02    txt   2       TEXT 2 – FR\n288.96   312.91   138.54   150.54   23.95    12.00    txt   7                 \n405.00   501.72   129.78   255.78   96.72    126.00   img   14      --        \n502.50   504.98   249.04   260.02   2.48     10.98    txt   13                \n290.88   387.12   118.62   272.58   96.24    153.96   img   12      --        \n47.94    525.21   52.66    326.34   477.27   273.68   txt   3       TEXT 2 is a\n310.20   522.03   153.24   399.60   211.83   246.36   txt   8       Elaine hate\n47.94    276.15   327.34   477.72   228.21   150.38   txt   4       Cass felt s\n310.20   528.64   400.60   536.28   218.44   135.67   txt   9       The Tidy To\n310.20   526.38   537.34   614.4

In [138]:
def get_line_table(lines: dict):
    '''
    This function outputs a string which will list all the blocks in the page along with their coordinates, their
    type, and the first word if it's a text block.
    '''
    table=[f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'fonts':36} {'first_word':10}", "--"*40]
    for line in lines:
        font           = line["spans"][0]["font"] 
        font_list      = list(set(span["font"] for span in line["spans"] ) )
        x0, y0, x1, y1 = line['bbox']
        beginning      = line["spans"][0]["text"][:5]
        line=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {' '.join(font_list):36} {beginning:<10}"
        table.append(line)
    table.extend( ["--"*40,"\n"*2] )
    line_table = "\n".join(table)
    return line_table

def print_line_table(lines:dict):
    print(get_line_table(lines))
    return None

print_line_table(good_lines)

x0       x1       y0       y1       dx       dy       fonts                                first_word
--------------------------------------------------------------------------------
47.94    519.13   65.34    77.34    471.19   12.00    Calibri,Bold Calibri,BoldItalic      TEXT      
47.94    499.81   79.98    91.98    451.87   12.00    Calibri,Bold                         for t     
47.94    525.21   94.62    106.62   477.27   12.00    Calibri,Bold                         conte     
47.94    84.01    109.26   121.26   36.07    12.00    Calibri,Bold                         Elain     
47.94    262.08   138.54   150.54   214.14   12.00    Calibri                              Cass      
47.94    268.27   153.24   165.24   220.33   12.00    Calibri                              when      
47.94    254.96   167.88   179.88   207.02   12.00    Calibri                              durin     
47.94    279.52   182.52   194.52   231.58   12.00    Calibri                              she’d     
4

## Get Mode and Common font

In [143]:
line = lines[0]
fonts = [ span["font"] for span in line["spans"]]
def get_mode_font(fonts):
    font_counts = np.unique(fonts,return_counts=True)
    maxfontarg  = np.argmax(font_counts[1])
    return fonts[maxfontarg]
print(f"mode fond of line 1: {get_mode_font(fonts)}")

fonts = [ span["font"]for span in lines[6]["spans"]]
print(f"mode fond of line 7: {get_mode_font(fonts)}")
    

mode fond of line 1: Calibri,Bold
mode fond of line 7: Calibri


In [None]:
fonts = [ span["font"] for span in lines[0]["spans"]  ]
print(fonts)
def common_font_elems(s1,s2):
    L1, L2 = len(s1), len(s2)
    L = L1 if L1 < L2 else L2
    s3 = ""
    for i in range(L):
        if s1[i]!=s2[i]:
            return s3
        s3 += s1[i]
    return s3

def get_common_font(fonts):
    common_font=fonts[0]
    for font in fonts[1:]:
        common_font =common_font_elems(common_font,font)
    return "".join(common_font)
    
get_common_font(fonts)

['Calibri,Bold', 'Calibri,BoldItalic', 'Calibri,Bold']


'Calibri,Bold'

In [None]:
def get_line_table(lines: dict):
    '''
    This function outputs a string which will list all the blocks in the page along with their coordinates, their
    type, and the first word if it's a text block.
    '''
    table=[f"{'dx':8} {'dy':8} {'unique fonts':36} {'base font':20} {'first_word':25}", "--"*40]
    for line in lines:
        font = line["spans"][0]["font"] 
        font_list = list(set(span["font"] for span in line["spans"] ) )
        common_font = get_common_font(font_list)
        x0, y0, x1, y1 = line['bbox']
        beginning=line["spans"][0]["text"][:25]
        line=f"{x1-x0:<8.2f} {y1-y0:<8.2f} {' '.join(font_list):36} {''.join(common_font):20} {beginning:<25}"
        table.append(line)
    table.extend( ["--"*40,"\n"*2] )
    line_table = "\n".join(table)
    print(line_table)
    return line_table

get_line_table(line)

dx       dy       unique fonts                         base font            first_word               
--------------------------------------------------------------------------------
471.19   12.00    Calibri,Bold Calibri,BoldItalic      Calibri,Bold         TEXT 2 is an edited extra
451.87   12.00    Calibri,Bold                         Calibri,Bold         for the 2023 Booker Prize
477.27   12.00    Calibri,Bold                         Calibri,Bold         contemporary Ireland.  In
36.07    12.00    Calibri,Bold                         Calibri,Bold         Elaine.                  
214.14   12.00    Calibri                              Calibri              Cass and Elaine first met
220.33   12.00    Calibri                              Calibri              when Elaine poured iodine
207.02   12.00    Calibri                              Calibri              during an experiment.  It
231.58   12.00    Calibri                              Calibri              she’d cried more than Cas
2

'dx       dy       unique fonts                         base font            first_word               \n--------------------------------------------------------------------------------\n471.19   12.00    Calibri,Bold Calibri,BoldItalic      Calibri,Bold         TEXT 2 is an edited extra\n451.87   12.00    Calibri,Bold                         Calibri,Bold         for the 2023 Booker Prize\n477.27   12.00    Calibri,Bold                         Calibri,Bold         contemporary Ireland.  In\n36.07    12.00    Calibri,Bold                         Calibri,Bold         Elaine.                  \n214.14   12.00    Calibri                              Calibri              Cass and Elaine first met\n220.33   12.00    Calibri                              Calibri              when Elaine poured iodine\n207.02   12.00    Calibri                              Calibri              during an experiment.  It\n231.58   12.00    Calibri                              Calibri              she’d cried more 

In [182]:
def get_line_text(line: dict) -> str:
    return "".join( [span["text"] for span in line["spans"] ] )

def get_line_df(lines):
    coords         = [line['bbox'] for line in lines]
    x0             = [coord[0] for coord in coords]
    y0             = [coord[1] for coord in coords]
    dL             = [coords[i+1][1] - coords[i][1] for i in range(len(coords)-1)] + [np.nan]
    x1             = [coord[2] for coord in coords]
    y1             = [coord[3] for coord in coords]
    n_spans        = [len(line["spans"]) for line in lines]
    font_list      = [                [span["font"] for span in line["spans"]  ]  for line in lines]
    common_font    = [get_common_font([span["font"] for span in line["spans"]  ]) for line in lines]
    mode_font      = [get_mode_font(  [span["font"] for span in line["spans"]  ]) for line in lines]
    w              = [coord[2]-coord[0] for coord in coords]
    h              = [coord[3]-coord[1] for coord in coords]
    text           = [get_line_text(line) for line in lines]
    
    data_dict={"x0":x0,"y0":y0,"x1":x1,"y1":y1,"dL":dL, "n_spans":n_spans,"font_list":font_list,      
    "common_font":common_font,"mode_font":mode_font,"w":w,"h":h,"text":text}
    return pd.DataFrame(data_dict)

pd.set_option("display.float_format", "{:.2f}".format)
df = get_line_df(lines)
df.head(22)
    

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,w,h,text
0,47.94,65.34,519.13,77.34,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",471.19,12.0,TEXT 2 is an edited extract from the opening o...
1,47.94,79.98,499.81,91.98,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",451.87,12.0,for the 2023 Booker Prize. The novel tells th...
2,47.94,94.62,525.21,106.62,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",477.27,12.0,contemporary Ireland. In this extract we meet...
3,47.94,109.26,84.01,121.26,29.28,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",36.07,12.0,Elaine.
4,47.94,138.54,262.08,150.54,14.7,1,[Calibri],Calibri,Calibri,214.14,12.0,"Cass and Elaine first met in Chemistry class,"
5,47.94,153.24,268.27,165.24,14.64,1,[Calibri],Calibri,Calibri,220.33,12.0,when Elaine poured iodine on Cass’s eczema
6,47.94,167.88,254.96,179.88,14.64,1,[Calibri],Calibri,Calibri,207.02,12.0,during an experiment. It was an accident;
7,47.94,182.52,279.52,194.52,14.64,1,[Calibri],Calibri,Calibri,231.58,12.0,"she’d cried more than Cass did, and insisted on"
8,47.94,197.16,251.64,209.16,14.64,1,[Calibri],Calibri,Calibri,203.7,12.0,going with her to the nurse. They’d been
9,47.94,211.8,270.93,223.8,14.64,1,[Calibri],Calibri,Calibri,222.99,12.0,friends ever since. Every morning Cass called


**Idea** If you have nans in a particular row, you simply do not use that dimension in the clustering, but use the others. So like the last row there where there is no 
"distance to next row" element, we would not use that one.

In [184]:
X = df.drop(columns=["font_list","text","dL","n_spans"])
X[["common_font","mode_font"]] = X[["common_font","mode_font"]].applymap(lambda x: 0 if x=="Calibri,Bold" else 1)
X.head(10)

Unnamed: 0,x0,y0,x1,y1,common_font,mode_font,w,h
0,47.94,65.34,519.13,77.34,0,0,471.19,12.0
1,47.94,79.98,499.81,91.98,0,0,451.87,12.0
2,47.94,94.62,525.21,106.62,0,0,477.27,12.0
3,47.94,109.26,84.01,121.26,0,0,36.07,12.0
4,47.94,138.54,262.08,150.54,1,1,214.14,12.0
5,47.94,153.24,268.27,165.24,1,1,220.33,12.0
6,47.94,167.88,254.96,179.88,1,1,207.02,12.0
7,47.94,182.52,279.52,194.52,1,1,231.58,12.0
8,47.94,197.16,251.64,209.16,1,1,203.7,12.0
9,47.94,211.8,270.93,223.8,1,1,222.99,12.0


In [None]:
cat_cols

Index(['font_list', 'common_font', 'mode_font', 'text'], dtype='object')

In [None]:

ohe.transform(df[["common_font","mode_font"]])[:4]

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import math
X = df.drop(columns=["font_list","text","dL","n_spans"])

num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = StandardScaler().fit_transform(X[num_cols])


cat_cols = X.select_dtypes(include="object").columns
# X[["common_font","mode_font"]] = X[["common_font","mode_font"]].applymap(lambda x: 0 if x=="Calibri,Bold" else 1)
ohe = OneHotEncoder(drop="if_binary", sparse_output=False).fit(X[cat_cols])
X[cat_cols] = ohe.transform(X[cat_cols])
X.head(15)

Unnamed: 0,x0,y0,x1,y1,common_font,mode_font,w,h
0,0.0,-1.68,1.97,-1.68,1.0,1.0,1.97,0.0
1,0.0,-1.49,1.81,-1.49,1.0,1.0,1.81,0.0
2,0.0,-1.3,2.02,-1.3,1.0,1.0,2.02,0.0
3,0.0,-1.1,-1.74,-1.1,1.0,1.0,-1.74,0.0
4,0.0,-0.72,-0.22,-0.72,0.0,0.0,-0.22,0.0
5,0.0,-0.53,-0.17,-0.53,0.0,0.0,-0.17,0.0
6,0.0,-0.34,-0.28,-0.34,0.0,0.0,-0.28,0.0
7,0.0,-0.15,-0.07,-0.15,0.0,0.0,-0.07,0.0
8,0.0,0.05,-0.31,0.05,0.0,0.0,-0.31,0.0
9,0.0,0.24,-0.14,0.24,0.0,0.0,-0.14,0.0


In [None]:
from sklearn.cluster import KMeans
k = 2
top_init    = X.min().values   
top_init[4]=1
bottom_init = X.max().values  
bottom_init[4]=0
init_centroids = [top_init, bottom_init]
kmeans = KMeans(n_clusters=k, random_state=42,init=init_centroids, n_init="auto")
y_pred = kmeans.fit_predict(X)

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [227]:
pd.DataFrame(kmeans.cluster_centers_,columns= X.columns)

Unnamed: 0,x0,y0,x1,y1,common_font,mode_font,w,h
0,0.0,0.32,-0.41,0.32,0.07,0.07,-0.41,0.0
1,0.0,-1.49,1.93,-1.49,1.0,1.0,1.93,0.0


In [239]:
binary_features = [4, 5]  # Indices for "common_font" and "mode_font"

# Post-process centroids to enforce binary values (0 or 1)
for idx in binary_features:
    kmeans.cluster_centers_[:, idx] = np.round(kmeans.cluster_centers_[:, idx])

print("Centers:")
display(pd.DataFrame(kmeans.cluster_centers_,columns= X.columns))
print("Predictions:")
print(kmeans.predict(X))
print("Trouble point:")
display(X.iloc[3:4])
print("distances from each center:")
kmeans.transform(X.iloc[3:4])

Centers:


Unnamed: 0,x0,y0,x1,y1,common_font,mode_font,w,h
0,0.0,0.32,-0.41,0.32,0.0,0.0,-0.41,0.0
1,0.0,-1.49,1.93,-1.49,1.0,1.0,1.93,0.0


Predictions:
[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Trouble point:


Unnamed: 0,x0,y0,x1,y1,common_font,mode_font,w,h
3,0.0,-1.1,-1.74,-1.1,1.0,1.0,-1.74,0.0


distances from each center:


array([[3.09207447, 5.22344259]])

In [303]:
point = X.iloc[3:4]
centre1 = kmeans.cluster_centers_[0]
centre2 = kmeans.cluster_centers_[1]

print(f"Point:")
display(point)
print(f"Center1: {centre1}")
print(f"Centre2: {centre2}")

kmeans.predict(point)


Point:


Unnamed: 0,y0,y1,common_font,w,h
3,-1.75,-1.1,2.0,-1.74,0.0


Center1: [ 6.77189228e-01  4.28292073e-01 -5.55111512e-17 -3.12683565e-01
  0.00000000e+00]
Centre2: [-2.20086499 -1.39194924  2.          1.01622159  0.        ]


array([1], dtype=int32)

In [None]:
def separate_lines():
# Column lines: All lines with a width <= page_with/2 or king_pink/2 , and also with the mode of the fonts for 
# the lines with a width of a certain amount.
# For all width determinations, if a line has under a certain amount of words, it must be excluded, and only font used,
# as well as perhaps proximity to other line groups.

In [302]:
df.head(6)

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,w,h,text
0,47.94,65.34,519.13,77.34,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",471.19,12.0,TEXT 2 is an edited extract from the opening o...
1,47.94,79.98,499.81,91.98,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",451.87,12.0,for the 2023 Booker Prize. The novel tells th...
2,47.94,94.62,525.21,106.62,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",477.27,12.0,contemporary Ireland. In this extract we meet...
3,47.94,109.26,84.01,121.26,29.28,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",36.07,12.0,Elaine.
4,47.94,138.54,262.08,150.54,14.7,1,[Calibri],Calibri,Calibri,214.14,12.0,"Cass and Elaine first met in Chemistry class,"
5,47.94,153.24,268.27,165.24,14.64,1,[Calibri],Calibri,Calibri,220.33,12.0,when Elaine poured iodine on Cass’s eczema


In [301]:
X = df.drop(columns=["font_list","text","dL","n_spans","x0","x1","mode_font"])
cat_weight = math.sqrt(4.0)
y0_weight = math.sqrt(2.5)

num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = StandardScaler().fit_transform(X[num_cols])
X["y0"] = X["y0"]* y0_weight


cat_cols = X.select_dtypes(include="object").columns
ohe = OneHotEncoder(drop="if_binary", sparse_output=False).fit(X[cat_cols])
X[cat_cols] = ohe.transform(X[cat_cols])
X["common_font"] = X["common_font"]*cat_weight

display(X.head(6))

top_init    = X.min().values ; top_init[4]=cat_weight
bottom_init = X.max().values ; bottom_init[4]=0
init_centroids = [top_init, bottom_init]
kmeans = KMeans(n_clusters=2, init=init_centroids, n_init="auto")
kmeans = KMeans(n_clusters=2,  n_init=100)
y_pred = kmeans.fit_predict(X)

centre1 = kmeans.cluster_centers_[0]
centre2 = kmeans.cluster_centers_[1]
display(pd.DataFrame(np.vstack((centre1,centre2)  ),columns=X.columns) ) 
y_pred

Unnamed: 0,y0,y1,common_font,w,h
0,-2.66,-1.68,2.0,1.97,0.0
1,-2.35,-1.49,2.0,1.81,0.0
2,-2.05,-1.3,2.0,2.02,0.0
3,-1.75,-1.1,2.0,-1.74,0.0
4,-1.14,-0.72,0.0,-0.22,0.0
5,-0.84,-0.53,0.0,-0.17,0.0


Unnamed: 0,y0,y1,common_font,w,h
0,0.68,0.43,-0.0,-0.31,0.0
1,-2.2,-1.39,2.0,1.02,0.0


array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [283]:
kmeans.transform(X.iloc[3:4])

array([[2.78500154, 3.96667625]])

### Ideas for clustering

There is no real reason for the centroids to be able to take values other than the few defined categorical values.
maybe there is.

Nevertheless, consider a custom clustering algorithm where the centroid categorical values can only have fixed values.

Consider ignoring the width for lines which have early endings (few words, last word has full stop)

Consider squaring the y distance.