In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect

from pdf_scraper.block_utils import identify_dual_column, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block, clean_blocks, print_block_table, get_block_table
from pdf_scraper.draw_utils  import get_pink_boundary

In [15]:
pdf_file         = Path.cwd().parent / "test_pdfs" / "LC002ALP100EV_2024.pdf"
doc              = fitz.open(pdf_file)
page             = doc[6]
textPage         = page.get_textpage()
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
page_width       = text_dict["width"]   # This is a document wide thing doesn't need to be per page.
blocks           = [block for block in text_dict["blocks"] if not is_empty_block(block)]

# Sorting Blocks

The in-built sorting of get_text works well in most cases, but can have issues when:
- there is two column text. 
- Column text and header text are not correctly blocked

Therefore we need to write our own functions which can

- Identify and resort dual column text.
- Identify and split incorrectly blocked lines of text. 


# Identifying and sorting dual column text

In [16]:

pink_fill = (1.0, 0.8980000019073486, 0.9490000009536743)
king_pink = get_pink_boundary(page_drawings, pink_fill)

dual_blocks = identify_dual_column(blocks,page_width, king_pink)
table = get_block_table(dual_blocks )
print(table)
sorted_blocks = sort_dual_column_blocks(dual_blocks)
sorted_table  = get_block_table(sorted_blocks)
print("\n"*3)
print(sorted_table)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
284.46   527.94   296.16   455.16   243.48   159.00   img   7       --        
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
--------------------------------------------------------------------------------







x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    280.85   44.94    386.52   232.91   341.58   txt   2       I found mys
47.94    279.90   391.20   461.82   231.96   70.62    txt   4       undeniably 
311.46   532.79   44.94    291.42   221.33   246.48   txt   3       the window 
284.46   527.94   296.16   455.16

# Splitting blocks

In [17]:
from pdf_scraper.line_utils  import line_is_empty
from pdf_scraper.line_utils  import  print_line_table
from pdf_scraper.line_utils  import get_mode_font, get_common_font

In [18]:
page = doc[3]
page_dict= page.get_text("dict",sort=True)
blocks = page_dict["blocks"]
block = blocks[6]

In [19]:
print(get_block_text(block))

TEXT 2 is an edited extract from the opening of Paul Murray’s novel,  The Bee Sting,  shortlisted 
for the 2023 Booker Prize.  The novel tells the tragi-comic story of the Barnes family, set in 
contemporary Ireland.  In this extract we meet the teenage daughter, Cass, and her best friend, 
Elaine. 
Cass and Elaine first met in Chemistry class, 
when Elaine poured iodine on Cass’s eczema 
during an experiment.  It was an accident; 
she’d cried more than Cass did, and insisted on 
going with her to the nurse.  They’d been 
friends ever since.  Every morning Cass called 
to Elaine’s house and they walked to school 
together.  At lunchtime, they rolled up their 
long skirts and wandered around the 
supermarket, listening to music from Elaine’s 
phone, eating croissants from the bakery 
section that were gone by the time they got to 
the checkout.   


In [20]:
lines = block['lines']
lines = [line for line in lines if not line_is_empty(line)]
print_line_table(lines)

x0       x1       y0       y1       dx       dy       fonts                                beginning                
------------------------------------------------------------------------------------------------------------------------
47.94    519.13   65.34    77.34    471.19   12.00    Calibri,BoldItalic Calibri,Bold      TEXT 2 is an edited extra
47.94    499.81   79.98    91.98    451.87   12.00    Calibri,Bold                         for the 2023 Booker Prize
47.94    525.21   94.62    106.62   477.27   12.00    Calibri,Bold                         contemporary Ireland.  In
47.94    84.01    109.26   121.26   36.07    12.00    Calibri,Bold                         Elaine.                  
47.94    262.08   138.54   150.54   214.14   12.00    Calibri                              Cass and Elaine first met
47.94    268.27   153.24   165.24   220.33   12.00    Calibri                              when Elaine poured iodine
47.94    254.96   167.88   179.88   207.02   12.00    Calibr

## Get Mode and Common font

In [21]:
from pdf_scraper.line_utils import get_line_df

pd.set_option("display.float_format", "{:.2f}".format)
df = get_line_df(lines)
df.head(22)
    

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
0,47.94,65.34,519.13,77.34,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",18,471.19,12.0,TEXT 2 is an edited extract from the opening o...
1,47.94,79.98,499.81,91.98,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",18,451.87,12.0,for the 2023 Booker Prize. The novel tells th...
2,47.94,94.62,525.21,106.62,14.64,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,477.27,12.0,contemporary Ireland. In this extract we meet...
3,47.94,109.26,84.01,121.26,29.28,1,"[Calibri,Bold]","Calibri,Bold","Calibri,Bold",1,36.07,12.0,Elaine.
4,47.94,138.54,262.08,150.54,14.7,1,[Calibri],Calibri,Calibri,8,214.14,12.0,"Cass and Elaine first met in Chemistry class,"
5,47.94,153.24,268.27,165.24,14.64,1,[Calibri],Calibri,Calibri,8,220.33,12.0,when Elaine poured iodine on Cass’s eczema
6,47.94,167.88,254.96,179.88,14.64,1,[Calibri],Calibri,Calibri,7,207.02,12.0,during an experiment. It was an accident;
7,47.94,182.52,279.52,194.52,14.64,1,[Calibri],Calibri,Calibri,10,231.58,12.0,"she’d cried more than Cass did, and insisted on"
8,47.94,197.16,251.64,209.16,14.64,1,[Calibri],Calibri,Calibri,9,203.7,12.0,going with her to the nurse. They’d been
9,47.94,211.8,270.93,223.8,14.64,1,[Calibri],Calibri,Calibri,7,222.99,12.0,friends ever since. Every morning Cass called


# Block split function

- So to make a new block, we need to assign a number, a type, and a bbox. We already have the lines. 
1. The number is just a label, so we will keep the same label for the two blocks; this will furthermore help to identify a split block.  
2. The type will be the same.
3. bbox: write a function which infers a bbox from the line. Check it on known bboxes for blocks.
   - the functin takes as x0 min(x0) for all lines, x1 is max(x1) , y0 is min(y0), and y1 is max(y1)
4. The lines are the lines put into each block according to the clustering labels. 

In [22]:
from pdf_scraper.clustering.customCluster import reblock_lines
from pdf_scraper.block_utils import split_block

block0, block1 = split_block(block)
print_line_table(block0["lines"])
print("\n\n")
print_line_table(block1["lines"])

x0       x1       y0       y1       dx       dy       fonts                                beginning                
------------------------------------------------------------------------------------------------------------------------
47.94    519.13   65.34    77.34    471.19   12.00    Calibri,BoldItalic Calibri,Bold      TEXT 2 is an edited extra
47.94    499.81   79.98    91.98    451.87   12.00    Calibri,Bold                         for the 2023 Booker Prize
47.94    525.21   94.62    106.62   477.27   12.00    Calibri,Bold                         contemporary Ireland.  In
47.94    84.01    109.26   121.26   36.07    12.00    Calibri,Bold                         Elaine.                  
------------------------------------------------------------------------------------------------------------------------






x0       x1       y0       y1       dx       dy       fonts                                beginning                
--------------------------------------------------

- Also, perhaps the bbox calculated without the empty lines are more informative. The new bboxes will not have empty lines in them. 

## Identify badly blocked blocks

So far the only type of bad blocking we have seen which interferes with block ordering, is when the title above a dual column is joined
to one of the columns. To identify this, we may use the following characteristics:

- Two font distributions
- Two width distributions (excluding low word lines and empty lines)
- Discontinuity in dL (excluding empty lines)
- In the pink

In [23]:
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks
def line_space_discont(lines):
    lines = [line for line in lines if not line_is_empty(line)]
    df = get_line_df(lines)
    dLs = np.array(df.dL[:-1])
    
    for i, val in enumerate(dLs):
        temp = np.delete(dLs, i, 0)
        if all(val > temp*1.6):
            #print(i, all(val > temp*1.6) )
            return True
    return False

def find_width_peaks(lines):
    df = get_line_df(lines)
    df = df[df.n_words > 4]
    w  = np.array(df.w)
    if len(w)==0:
        return []
    elif len(w) <=2:
        return [w.mean()]
    x_grid = np.linspace(w.min()-50, w.max()+50,1000)
    kde=gaussian_kde(w,bw_method='silverman')
    kde_vals = kde(x_grid)
    peaks, _ = find_peaks(kde_vals, prominence = 0.0001)
    return peaks



In [24]:
from pdf_scraper.block_utils import in_the_pink, clean_blocks
from pdf_scraper.line_utils  import find_width_peaks, line_space_discont
def detect_bad_block(block,king_pink):
    '''
    This function
    '''
    lines=[line for line in block["lines"] if not line_is_empty(line)]
    df = get_line_df(lines)
    pink = in_the_pink(block, king_pink)
    n_base_fonts  = len(df.common_font.value_counts()) >= 2
    n_width_modes = len(find_width_peaks(lines)) >=2
    space_discont = line_space_discont(lines)
    two_o_three   = [n_base_fonts, n_width_modes, space_discont]

    if pink and sum(two_o_three) >=2:
        return True
    return False

detect_bad_block(blocks[9],king_pink)

False

In [25]:
print_block_table(blocks)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
47.94    48.85    27.04    31.06    0.91     4.02     txt   1                 
131.28   452.41   34.36    50.38    321.13   16.02    txt   2       TEXT 2 – FR
288.96   312.91   138.54   150.54   23.95    12.00    txt   7                 
405.00   501.72   129.78   255.78   96.72    126.00   img   14      --        
502.50   504.98   249.04   260.02   2.48     10.98    txt   13                
290.88   387.12   118.62   272.58   96.24    153.96   img   12      --        
47.94    525.21   52.66    326.34   477.27   273.68   txt   3       TEXT 2 is a
310.20   522.03   153.24   399.60   211.83   246.36   txt   8       Elaine hate
47.94    276.15   327.34   477.72   228.21   150.38   txt   4       Cass felt s
310.20   528.64   400.60   536.28   218.44   135.67   txt   9       The Tidy To
310.20   526.38   537.34   614.46   216.18   

# Clean Blocks

In [26]:
blocks = clean_blocks(blocks)
new_blocks = []
for i, block in enumerate(blocks):
    if block["type"]:
        new_blocks.append(block)
        continue
    if len(block["lines"]) <=1:
        new_blocks.append(block)
        continue
    if detect_bad_block(block,king_pink):
        two_blocks = split_block(block)
        new_blocks.extend(two_blocks)
        continue
    new_blocks.append(block)

print_block_table(new_blocks)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
131.28   452.41   34.36    50.38    321.13   16.02    txt   2       TEXT 2 – FR
405.00   501.72   129.78   255.78   96.72    126.00   img   14      --        
290.88   387.12   118.62   272.58   96.24    153.96   img   12      --        
47.94    121.26   65.34    525.21   73.32    459.87   txt   3       TEXT 2 is a
47.94    326.34   138.54   279.52   278.40   140.98   txt   3       Cass and El
310.20   522.03   153.24   399.60   211.83   246.36   txt   8       Elaine hate
47.94    276.15   327.34   477.72   228.21   150.38   txt   4       Cass felt s
310.20   528.64   400.60   536.28   218.44   135.67   txt   9       The Tidy To
310.20   526.38   537.34   614.46   216.18   77.12    txt   10      I’m not bei
47.94    276.13   478.72   658.38   228.19   179.66   txt   5       Elaine hate
310.20   526.81   615.46   765.78   216.61

In [27]:
def preproc_blocks(blocks: list[dict]):
    blocks = clean_blocks(blocks)
    new_blocks = []
    for i, block in enumerate(blocks):
        if block["type"]:
            new_blocks.append(block)
            continue
        if len(block["lines"]) <=1:
            new_blocks.append(block)
            continue
        if detect_bad_block(block,king_pink):
            two_blocks = split_block(block)
            new_blocks.extend(two_blocks)
            continue
        new_blocks.append(block)
    return new_blocks


In [28]:
from pdf_scraper.block_utils import clean_blocks

In [29]:
page             = doc[5]
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
blocks           =  clean_blocks(text_dict["blocks"]) 

In [30]:
print_block_table(blocks)

x0       x1       y0       y1       dx       dy       type  number  first_word
--------------------------------------------------------------------------------
154.20   428.08   47.50    63.52    273.88   16.02    txt   2       TEXT 3 – CO
316.26   499.92   565.50   687.60   183.66   122.10   img   6       --        
290.22   533.16   120.00   770.40   242.94   650.40   txt   4       new carpets
47.94    506.21   65.47    783.84   458.27   718.37   txt   3       TEXT 3 is a
42.54    555.20   795.94   819.10   512.66   23.16    txt   0       Leaving Cer
--------------------------------------------------------------------------------





In [100]:
block = blocks[3]
lines = block["lines"]
line_df = get_line_df(lines)
line_df.head(50)

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
0,47.94,76.02,506.21,88.02,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",17,458.27,12.0,TEXT 3 is an edited article from the travel se...
1,47.94,90.66,498.05,102.66,29.34,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,450.11,12.0,"journalist and travel writer, entitled To Ista..."
2,47.94,120.0,272.52,132.0,14.64,1,[Calibri],Calibri,Calibri,9,224.58,12.0,I’d arrived in Strasbourg on the Eurostar from
3,47.94,134.64,269.23,146.64,14.64,1,[Calibri],Calibri,Calibri,10,221.29,12.0,"London, the first leg of my voyage by train to"
4,47.94,149.28,277.28,161.28,14.64,1,[Calibri],Calibri,Calibri,7,229.34,12.0,"Istanbul. No stranger to epic railway journeys,"
5,47.94,163.92,266.91,175.92,14.64,1,[Calibri],Calibri,Calibri,10,218.97,12.0,I was keen to retrace the route of the Orient
6,47.94,178.56,269.59,190.56,14.64,1,[Calibri],Calibri,Calibri,9,221.65,12.0,"Express 140 years after its inaugural run, at a"
7,47.94,193.2,278.31,205.2,14.64,1,[Calibri],Calibri,Calibri,9,230.37,12.0,time when Europe’s sleeper trains are seeing a
8,47.94,207.84,281.34,219.84,14.7,1,[Calibri],Calibri,Calibri,9,233.4,12.0,renaissance owing to the rise of the slow-travel
9,47.94,222.54,241.47,234.54,14.64,1,[Calibri],Calibri,Calibri,5,193.53,12.0,"movement, climate change and private"


In [32]:
14.64*1.45

21.228

In [None]:
line_df.loc[0:1]

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
0,47.94,76.02,506.21,88.02,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",17,458.27,12.0,TEXT 3 is an edited article from the travel se...
1,47.94,90.66,498.05,102.66,29.34,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,450.11,12.0,"journalist and travel writer, entitled To Ista..."


In [131]:
indices_pp = [i+1 for i in indices]
np_dfs = np.split(line_df, indices_pp, axis=0)

  return bound(*args, **kwds)


In [158]:
list[np_dfs[0].index]

list[RangeIndex(start=0, stop=2, step=1)]

In [None]:
print(indices)
mask = np.zeros(len(line_df))
n_df = 0
for i in range(len(indices)):
    if i==0:
        mask[:indices[i]+1] =n_df
        n_df +=1
        continue
    mask[indices[i-1]+1:indices[i]+1] = n_df
    n_df +=1
mask[indices[-1]:] = n_df
mask

[1, 11, 18, 26]


array([0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 3., 3., 3., 3., 3., 3., 3., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])

In [117]:
median = np.median(line_df.dL[:-1])
indices = []
for i, dL in enumerate(line_df.dL):
    if dL > 1.45*median:
        indices.append(i)
#print(indices, len(indices), indices[len(indices)-1])
print(indices)

dfs = [line_df[0:indices[0]+1]]
print(0, indices[0]+1)
for i, val in enumerate(indices[:-1]):
    print(val+1, indices[i+1]+1)
    dfs.append(line_df[val+1:indices[i+1]+1])
dfs.append(line_df[indices[-1]+1:])
print(indices[-1]+1, ":")

#dfs.append(line_df[indices[len(indices)-1]+1:indices[val+1]])

[1, 11, 18, 26]
0 2
2 12
12 19
19 27
27 :


In [124]:
dfs[4].head(50)

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
27,47.94,508.14,272.17,520.14,14.64,1,[Calibri],Calibri,Calibri,7,224.23,12.0,"Austria’s lower Alps looked magnificent, their"
28,47.94,522.78,262.71,534.78,14.7,1,[Calibri],Calibri,Calibri,8,214.77,12.0,scalps turning pink in the sun. Farms flitted
29,47.94,537.48,272.65,549.48,14.64,1,[Calibri],Calibri,Calibri,8,224.71,12.0,"past the window along with families of deer,"
30,47.94,552.12,265.42,564.12,14.64,1,[Calibri],Calibri,Calibri,7,217.48,12.0,then snow-heavy forests packed with pencil-
31,47.94,566.76,271.67,578.76,14.64,1,[Calibri],Calibri,Calibri,8,223.73,12.0,thin trees. I watched curtains draw open and
32,47.94,581.4,271.43,593.4,14.64,1,[Calibri],Calibri,Calibri,8,223.49,12.0,"lights flick on, catching the eye of commuters"
33,47.94,596.04,248.52,608.04,14.64,1,[Calibri],Calibri,Calibri,7,200.58,12.0,clearing snow from their cars. Within 20
34,47.94,610.68,252.07,622.68,14.64,1,[Calibri],Calibri,Calibri,8,204.13,12.0,minutes of stepping on to the platform in
35,47.94,625.32,244.03,637.32,14.7,3,"[Calibri, Calibri,Italic, Calibri]",Calibri,Calibri,10,196.09,12.0,"Vienna I was in front of Klimt’s The Kiss,"
36,47.94,640.02,277.23,652.02,14.64,1,[Calibri],Calibri,Calibri,9,229.29,12.0,gasping at the sight of my favourite painting in


In [111]:
dfs[1].head()

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
2,47.94,120.0,272.52,132.0,14.64,1,[Calibri],Calibri,Calibri,9,224.58,12.0,I’d arrived in Strasbourg on the Eurostar from
3,47.94,134.64,269.23,146.64,14.64,1,[Calibri],Calibri,Calibri,10,221.29,12.0,"London, the first leg of my voyage by train to"
4,47.94,149.28,277.28,161.28,14.64,1,[Calibri],Calibri,Calibri,7,229.34,12.0,"Istanbul. No stranger to epic railway journeys,"
5,47.94,163.92,266.91,175.92,14.64,1,[Calibri],Calibri,Calibri,10,218.97,12.0,I was keen to retrace the route of the Orient
6,47.94,178.56,269.59,190.56,14.64,1,[Calibri],Calibri,Calibri,9,221.65,12.0,"Express 140 years after its inaugural run, at a"


In [None]:
indices = [0]
for i, dL in enumerate(line_df.dL):
    if dL > 1.45*median:
        indices.append(i)
indices.append(-1)
print(indices)

dfs = []
for i, index in enumerate(indices):
    if i==0:
        dfs.append(line_df[indices[i]:indices[i+1]+1])
        print(f"{i} {indices[i+1]}+1")
        continue
    elif i == len(indices)-1:
        dfs.append(line_df[indices[i]+1:])
        print(f"{indices[i]+1} :{indices[i+1]+1}")
        continue
    dfs.append(line_df[indices[i]+1:indices[i+1]+1])
    print(f"{indices[i]+1} :{indices[i+1]+1}")


[0, 1, 11, 18, 26, -1]
0 1+1
2 :12
12 :19
19 :27
27 :0


In [78]:
dfs[1].head()

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
1,47.94,90.66,498.05,102.66,29.34,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,450.11,12.0,"journalist and travel writer, entitled To Ista..."


In [75]:
fart= list(range(40))
print(indices[0],indices[1]+1)
print(fart[indices[0]:indices[1]+1])
print(indices[1]+1,indices[2]+1)
print(fart[indices[1]+1:indices[2]+1])

0 2
[0, 1]
2 12
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [69]:
dfs[0]

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
1,47.94,90.66,498.05,102.66,29.34,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,450.11,12.0,"journalist and travel writer, entitled To Ista..."


In [62]:
line_df[indices[0]:indices[1]]

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
0,47.94,76.02,506.21,88.02,14.64,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",17,458.27,12.0,TEXT 3 is an edited article from the travel se...


In [59]:
dfs[1].head()

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text
1,47.94,90.66,498.05,102.66,29.34,3,"[Calibri,Bold, Calibri,BoldItalic, Calibri,Bold]","Calibri,Bold","Calibri,Bold",15,450.11,12.0,"journalist and travel writer, entitled To Ista..."
2,47.94,120.0,272.52,132.0,14.64,1,[Calibri],Calibri,Calibri,9,224.58,12.0,I’d arrived in Strasbourg on the Eurostar from
3,47.94,134.64,269.23,146.64,14.64,1,[Calibri],Calibri,Calibri,10,221.29,12.0,"London, the first leg of my voyage by train to"
4,47.94,149.28,277.28,161.28,14.64,1,[Calibri],Calibri,Calibri,7,229.34,12.0,"Istanbul. No stranger to epic railway journeys,"
5,47.94,163.92,266.91,175.92,14.64,1,[Calibri],Calibri,Calibri,10,218.97,12.0,I was keen to retrace the route of the Orient


In [None]:
from pdf_scraper.line_utils import count_vert_space_discont

In [None]:
def line_space_discont(lines):
    lines = [line for line in lines if not line_is_empty(line)]
    df = get_line_df(lines)

    dLs = np.array(df.dL[:-1])
    median = np.median(line_df.dL[:-1])

    for i, val in enumerate(dLs):
        temp = np.delete(dLs, i, 0)
        if val > 1.45*median:
            #print(i, all(val > temp*1.6) )
            return True
    return False
line_space_discont(lines)

True

In [None]:
count_vert_space_discont(lines)

0

In [None]:
print(get_block_text(block))

TEXT 3 is an edited article from the travel section of the  Financial Times  by Monisha Rajesh, 
journalist and travel writer, entitled  To Istanbul by Train .  It was published in March 2023. 
I’d arrived in Strasbourg on the Eurostar from 
London, the first leg of my voyage by train to 
Istanbul.  No stranger to epic railway journeys, 
I was keen to retrace the route of the Orient 
Express 140 years after its inaugural run, at a 
time when Europe’s sleeper trains are seeing a 
renaissance owing to the rise of the slow-travel 
movement, climate change and private 
companies proposing grand plans to revive old 
routes in style. 
It was now Sunday night and if all went to plan 
I’d be in Istanbul on Friday morning – not the 
fastest route possible but a leisurely one with 
time to dine, unwind and wander around each 
city.  On board the Nightjet at Strasbourg, I 
pressed in earplugs and dozed off as the train 
followed the bends of the Rhine. 
The next morning, as we tumbled over the 
S