In [4]:
import os

In [5]:
#pdfs = [i for i in os.listdir() if i.endswith(".pdf")]
#print(pdfs)
#pdf_file = pdfs[0] 
pdf_file = "test_pdfs/LC002ALP100EV_2024.pdf"

# pymupdf

In [6]:
import fitz
from fitz import Rect
doc              = fitz.open(pdf_file)
page             = doc[6]
textPage         = page.get_textpage()
text_dict        = page.get_text("dict")
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)


## Dictionary output

Here we will explore the format of the Page.get_text("dict") output.

In [7]:
print(f"Text dict: {text_dict.keys()}")
page_width  = text_dict['width']
page_height = text_dict['height']
print(f"width: {page_width}pts height: {page_height}pts")
print(f"width: {text_dict['width']*0.3528:5.2f}mms height: {text_dict['height']*0.3528:5.2f}mms")
print(f"There are {len(text_dict['blocks'])} dict blocks")
print(f"There are {len(text_blocks)} text block elements")

Text dict: dict_keys(['width', 'height', 'blocks'])
width: 595.219970703125pts height: 842.0pts
width: 209.99mms height: 297.06mms
There are 8 dict blocks
There are 7 text block elements


In [8]:
print(f"Block : {text_dict['blocks'][0].keys()}")
print(f'bbox  : {text_dict["blocks"][0]["bbox"]}')
print(f"line  : {text_dict['blocks'][0]['lines'][0].keys()}")
print(f'span  : {text_dict["blocks"][0]["lines"][0]["spans"][0].keys()}')
print(text_dict["blocks"][0]["lines"][0]["spans"][0]["text"])
print(text_dict["blocks"][0]["lines"][0]["spans"][1]["text"])

Block : dict_keys(['number', 'type', 'bbox', 'lines'])
bbox  : (42.540000915527344, 795.9447021484375, 555.2017822265625, 819.104736328125)
line  : dict_keys(['spans', 'wmode', 'dir', 'bbox'])
span  : dict_keys(['size', 'flags', 'bidi', 'char_flags', 'font', 'color', 'alpha', 'ascender', 'descender', 'text', 'origin', 'bbox'])
Leaving Certificate Examination 2024
 


The dict output has blocks which come in a list of dictionaries: 
```dict_keys(['number', 'type', 'bbox', 'lines'])```
- number: just label for block
- type: 0 for txt 1 for img
- bbox: 4 bounding box coords as tuple
- lines: the content of the box separated into lines, which are separated into spans

In [9]:
import json

with open('page_7_block_0_dict.json', 'w') as f:
    json.dump(text_dict["blocks"][0], f, indent=4)

#### Lines and spans example

In [10]:
print(len(text_dict["blocks"][0]["lines"]))
for i, line in enumerate(text_dict["blocks"][0]["lines"] ): 
    print(f"Line : {i+1}")
    for j, span in enumerate(line["spans"]):
        #if span["text"].isspace():
        #    continue
        print(f'span {j}: {span["text"]}', end = "\t")
    print("\n")

4
Line : 1
span 0: Leaving Certificate Examination 2024	span 1:  	

Line : 2
span 0: 7 	

Line : 3
span 0:  	

Line : 4
span 0: English – Higher Level – Paper 1 	



In [11]:
def get_dict_block_text(block_dict: dict ):
    '''
    For a given block dictionary element, as output by Page.get_text("dict")["blocks"], this 
    function will return the text of all the lines, joined by a "\n", and with the spans on 
    each line joined with a space. 
    
    The result is one string with newline separtaed lines and space
    separated spans.
    '''
    block_lines = block_dict["lines"]
    line_texts = [" ".join([ span["text"] for span in line["spans"] ]) for line in block_lines ]
    block_text="\n".join( [ i for i in line_texts if not i.isspace() ])
    return block_text

print(get_dict_block_text(text_dict["blocks"][0]))


Leaving Certificate Examination 2024  
7 
English – Higher Level – Paper 1 


images have block type 1, and text has type 0

In [12]:
for block in text_dict["blocks"]:
    if block["type"]==0:
        print(get_dict_block_text(block))
        print("\n")

Leaving Certificate Examination 2024  
7 
English – Higher Level – Paper 1 


N.B. Candidates may NOT answer Question A and Question B on the same text. 
QUESTION A – 50 Marks 
(i) 
What, in your opinion, does the writer reveal about the unique experience of “slow travel” in  
TEXT 3?  Make three points, supporting your response with reference to the text.   
(15)  
(ii)   One of Monisha Rajesh’s fellow travellers, Charles, was wearing a T-shirt saying  Green against the 
Machine .  What are your views on the different ways open to young people to protest, or 
demonstrate their views, on important issues?  Develop three points in your response. 
(15) 
(iii)  Identify four elements of the writer’s style, evident in TEXT 3, and discuss how they 
contribute to making this an informative and appealing piece of travel writing.   
Support  your response with reference to TEXT 3. 
(20) 
QUESTION B – 50 Marks  
Write a series of reflective  diary entries  of a person returning to their homepla

## Blocks output

In [13]:
for block in text_blocks: 
    print(block[4])
    print("\n")

Leaving Certificate Examination 2024 
7 
 
English – Higher Level – Paper 1 



N.B. Candidates may NOT answer Question A and Question B on the same text. 
 
QUESTION A – 50 Marks 
(i) 
What, in your opinion, does the writer reveal about the unique experience of “slow travel” in  
  
TEXT 3?  Make three points, supporting your response with reference to the text.   
(15)  
                                             
(ii)   One of Monisha Rajesh’s fellow travellers, Charles, was wearing a T-shirt saying Green against the 
 
Machine.  What are your views on the different ways open to young people to protest, or 
 
demonstrate their views, on important issues?  Develop three points in your response. 
(15) 
 
(iii)  Identify four elements of the writer’s style, evident in TEXT 3, and discuss how they 
 
contribute to making this an informative and appealing piece of travel writing.   
 
Support  your response with reference to TEXT 3. 
 
 
 
 
 
(20) 
 
QUESTION B – 50 Marks  
Write a se

- page.get_text("blocks") outputs a list of tuples

```(x0, y0, x1, y1, "lines in the block", block_no, block_type)```

### Check order of text segments (for page 7)

In [14]:
print(f"x0: {text_blocks[0][0]:5.2f}, x1: {text_blocks[0][2]:5.2f}, y0: {text_blocks[0][1]:5.2f}, y1: {text_blocks[0][3]:5.2f}")
n_lines=text_blocks[0][4].count('\n')
print(f"There are {n_lines} lines")
print("--"*40)
print(text_blocks[0][4])

x0: 42.54, x1: 555.20, y0: 795.94, y1: 819.10
There are 4 lines
--------------------------------------------------------------------------------
Leaving Certificate Examination 2024 
7 
 
English – Higher Level – Paper 1 



In [15]:
only_text_blocks = [i for i in page.get_text("blocks") if i[6]==0]
all_blocks       = [i for i in page.get_text("blocks") ]
print(f"Total text blocks in this page: {len(only_text_blocks)}")
print(f"Total blocks in this page: {len(all_blocks)}")
print("First block:")
only_text_blocks[0]

Total text blocks in this page: 7
Total blocks in this page: 7
First block:


(42.540000915527344,
 795.9447021484375,
 555.2017822265625,
 819.104736328125,
 'Leaving Certificate Examination 2024 \n7 \n \nEnglish – Higher Level – Paper 1 \n',
 0,
 0)

## Compare dictionary and blocks

In [16]:
text_blocks      = page.get_text("blocks")
text_dict_sorted = page.get_text("dict",sort=True)

print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'num':5}")

#(x0, y0, x1, y1, "lines in the block", block_no, block_type)```
for x0, y0, x1, y1, lines, num, typ in all_blocks:
    type = "img" if typ else "txt" 
    print(f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {num:<5}")

x0       x1       y0       y1       dx       dy       type  num  
42.54    555.20   795.94   819.10   512.66   23.16    txt   0    
42.54    549.14   477.78   784.92   506.60   307.14   txt   1    
47.94    280.85   44.94    386.52   232.91   341.58   txt   2    
311.46   532.79   44.94    291.42   221.33   246.48   txt   3    
47.94    279.90   391.20   461.82   231.96   70.62    txt   4    
290.22   292.93   391.20   403.20   2.71     12.00    txt   5    
311.46   312.82   390.00   396.00   1.36     6.00     txt   6    


In [17]:
text_dict        = page.get_text("dict",sort=False)
print(f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7}")
for block in text_dict["blocks"]:
    type = "img" if block["type"] else "txt" 
    x0, y0, x1, y1 = block['bbox']
    table=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {block['number']:<7}"
    print(table)

x0       x1       y0       y1       dx       dy       type  number 
42.54    555.20   795.94   819.10   512.66   23.16    txt   0      
42.54    549.14   477.78   784.92   506.60   307.14   txt   1      
47.94    280.85   44.94    386.52   232.91   341.58   txt   2      
311.46   532.79   44.94    291.42   221.33   246.48   txt   3      
47.94    279.90   391.20   461.82   231.96   70.62    txt   4      
290.22   292.93   391.20   403.20   2.71     12.00    txt   5      
311.46   312.82   390.00   396.00   1.36     6.00     txt   6      
284.46   527.94   296.16   455.16   243.48   159.00   img   7      


In [18]:
def get_block_table(blocks: dict):
    table=[f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'type':5} {'number':7} {'first_word':10}", "--"*40]
    for block in blocks:
        type = "img" if block["type"] else "txt" 
        x0, y0, x1, y1 = block['bbox']
        beginning=get_dict_block_text(block)[:11] if type =="txt" else "--"
        line=f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {type:5} {block['number']:<7} {beginning:<10}"
        table.append(line)
    return "\n".join(table)
table = get_block_table(text_dict["blocks"])
print(table)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
42.54    555.20   795.94   819.10   512.66   23.16    txt   0       Leaving Cer
42.54    549.14   477.78   784.92   506.60   307.14   txt   1       N.B. Candid
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
290.22   292.93   391.20   403.20   2.71     12.00    txt   5                 
311.46   312.82   390.00   396.00   1.36     6.00     txt   6                 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


## Identify pink box

In [19]:
print(len(page.get_drawings()))
drawing_0 = page.get_drawings()[0]
drawing_0

61


{'items': [('re',
   Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875),
   1)],
 'type': 'f',
 'even_odd': False,
 'fill_opacity': 1.0,
 'fill': (1.0, 0.8980000019073486, 0.9490000009536743),
 'rect': Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875),
 'seqno': 1,
 'layer': '',
 'closePath': None,
 'color': None,
 'width': None,
 'lineCap': None,
 'lineJoin': None,
 'dashes': None,
 'stroke_opacity': None}

In [20]:
drawing_0["items"][0][0]

're'

In [21]:
pink_fill = page.get_drawings()[0]['fill']
pink_fill

(1.0, 0.8980000019073486, 0.9490000009536743)

In [22]:
pinks = [ drawing for drawing in page.get_drawings() if drawing['fill']==pink_fill and drawing['type']=='f']
len(pinks)

54

In [23]:

print(pinks[0]['items'][0])
print(pinks[1]['items'][0])

('re', Rect(42.540000915527344, 42.53997802734375, 284.82000732421875, 388.79998779296875), 1)
('re', Rect(47.939998626708984, 42.53997802734375, 279.41998291015625, 57.17999267578125), 1)


In [24]:
colors = [ drawing for drawing in page.get_drawings() if drawing['color'] ]
print(len(colors))

0


In [25]:
# No filtering needs to be done before the definition of king_pink. You can just do the 
# min maxing stuff it should be fine.
def get_pink_boundary(drawings, pink_fill):
    """
    Return all pink fill boxes in the page, excluding those pink boxes already contained within
    another pink box on the page.
    :param drawings: List of drawing objects from get_drawings()
    :param pink_fill: tuple specifying pink colour. (1.0, 0.8980000019073486, 0.9490000009536743) for 2024 P1
    :return: Filtered list of drawings without redundant fills
    """
    # Only look at pink fill objects
    pinks = [d for d in drawings if d["type"] == "f" and d["fill"]==pink_fill ]
    if not pinks:
        return None

    def in_the_stink(pink):
        '''
        returns True if the given pink is contained in any other pink on the page.
        '''
        return any( other["rect"].contains(pink["rect"])  for other in pinks if other != pink )

    filtered_pinks = [p for p in pinks if not in_the_stink(p)]

    x0 = min([p['rect'].x0 for p in filtered_pinks] )
    y0 = min([p['rect'].y0 for p in filtered_pinks] )
    x1 = max([p['rect'].x1 for p in filtered_pinks] )
    y1 = max([p['rect'].y1 for p in filtered_pinks] )
    king_pink = fitz.Rect(x0,y0,x1,y1)

    return king_pink

def in_the_pink(block: dict, king_pink: Rect):
    x0, y0, x1, y1 = block['bbox']
    block_rect = Rect(x0,y0,x1,y1)
    return  king_pink.contains(block_rect)

drawings = page.get_drawings()
king_pink = get_pink_boundary(drawings,pink_fill)

print(king_pink)

Rect(42.540000915527344, 42.53997802734375, 535.3800048828125, 467.8800048828125)


# Ordering Blocks

- These blocks are not all in the correct order.
- This is ok if it is just a page footer appearing at the top.
- This is not ok in the case of article text appearing in the incorrect reading order.
  - block number 4 should appear after block number 2

- Any text in a double column article cannot be larger than half the page. 

- To know it is two columns: at least two text blocks with the same y position but different x positions. Or at least overlapping y ranges, though separated by their own widths in the x direction.

In [26]:
drawings  = page.get_drawings()
pink_fill = drawings[0]['fill']
king_pink = get_pink_boundary(drawings,pink_fill)

text_dict        = page.get_text("dict")
page_width       = text_dict["width"]
W = page_width/2

dual_blocks = []
for block in text_dict["blocks"]:
    type = "img" if block["type"] else "txt" 
    x0, y0, x1, y1 = block['bbox']
    dx = x1-x0
    # skip empty blocks
    if type == "txt" and not get_dict_block_text(block):
        continue
    # Only blocks in the pink
    if not in_the_pink(block, king_pink):
       continue 
    # If we have a block in the pink less than a page width
    if dx <= W:
        dual_blocks.append(block)
    # If there are other blocks on another side of it?
    
print(get_block_table(dual_blocks))


x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


In [27]:

def isColumnSize(block, page_width):
    x0, y0, x1, y1 = block['bbox']
    col_width = x1 - x0
    return col_width <= page_width/2

def isEmptyBlock(block: dict):
    if block["type"]:
        return 0
    return 0 if get_dict_block_text(block) else 1


def identify_dual_column(page, king_pink):
    text_dict        = page.get_text("dict")
    page_width       = text_dict["width"]
    blocks           = text_dict["blocks"]

    possiBlocks     = [block for block in blocks      if isColumnSize(    block,page_width) ]   
    possiPinks      = [block for block in possiBlocks if in_the_pink(     block,king_pink) ]   
    dual_col_blocks = [block for block in possiPinks  if not isEmptyBlock(block)]

    return dual_col_blocks

dual_blocks = identify_dual_column(page, king_pink)
table = get_block_table(dual_blocks )
print(table)
sorted_blocks = sort_dual_column_blocks(dual_blocks)
sorted_table  = get_block_table(sorted_blocks)
print("\n"*3)
print(sorted_table)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        


NameError: name 'sort_dual_column_blocks' is not defined

In [None]:
# col1 = x0 is closer to leftmost x0
# col2 = x0 is closer to rightmost x1, or right most x0
def sort_dual_column_blocks(blocks: dict):
    coords = [block['bbox'] for block in blocks] 
    x0_min = min(coord[0] for coord in coords)
    x0_max = max(coord[0] for coord in coords)
    x1_min = min(coord[1] for coord in coords)
    x1_max = max(coord[1] for coord in coords)

    vert_ordered = sorted(blocks, key = lambda block: block["bbox"][1])

    for block in vert_ordered:
        x0, y0, x1, y1 = block['bbox']
        dl = x0-x0_min
        dr = x0-x0_max
        block["col"] = 0 if abs(dl) < abs(dr) else 1
    
    col_ordered = sorted(vert_ordered,key = lambda x: x['col'])

    return col_ordered

sort1 = sort_dual_column_blocks(dual_blocks)
print(get_block_table(sort1))