In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect

from pdf_scraper.block_utils import identify_dual_column, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block, clean_blocks, print_block_table, get_block_table, rebox_blocks
from pdf_scraper.block_utils import preproc_blocks
from pdf_scraper.draw_utils  import get_pink_boundary, get_fill_df, in_the_pink
from pdf_scraper.draw_utils  import draw_rectangle_on_page, get_fill_colours
from pdf_scraper.line_utils  import get_line_df, print_line_table, get_all_lines

# Render every float in displayed DataFrames with exactly three decimals.
pd.options.display.float_format = "{:.3f}".format

In [3]:
# --- Locate the exam paper ---------------------------------------------------
level    = "AL"
year     = 2016
fname    = f"LC002ALP100EV_{year}.pdf"
examDir  = Path.cwd().parent.parent / "Exams" / "english" / level
pdf_file = examDir / fname

# --- Document-wide properties ------------------------------------------------
doc          = fitz.open(pdf_file)
fill_colours = get_fill_colours(doc)

# The page size is read once from the second page and treated as a
# document-wide value (assumes every page has the same size -- TODO confirm).
size_dict    = doc[1].get_text("dict")
page_width   = size_dict["width"]
page_height  = size_dict["height"]

# --- Page under study --------------------------------------------------------
page          = doc[2]                          # zero-based index 2, i.e. page 3
text_dict     = page.get_text("dict", sort=True)
page_drawings = page.get_drawings()
blocks        = text_dict["blocks"]

# --- Keep only the text blocks that sit inside the pink box -------------------
bounding_pink = get_pink_boundary(page_drawings, fill_colours)

# Named `cleaned_blocks` on purpose: the previous name `clean_blocks` rebound
# the `clean_blocks` function imported from pdf_scraper.block_utils, making
# that function unreachable for the rest of the kernel session.
cleaned_blocks = preproc_blocks(blocks, bounding_pink)

pink_blocks = [block for block in cleaned_blocks if in_the_pink(block["bbox"], bounding_pink)]
pink_lines  = get_all_lines(pink_blocks)
pink_df     = get_line_df(pink_lines)

print_block_table(pink_blocks)

# NOTE(review): the page argument here is 4, while the page analysed above is
# doc[2] and a later cell passes 3 to the same helper -- confirm which page
# number draw_rectangle_on_page expects before relying on out.pdf.
draw_rectangle_on_page(pdf_file, "out.pdf", 4, bounding_pink)

x0       x1       y0       y1       dx       dy       type  number  n_lines first_word
--------------------------------------------------------------------------------
56.70    295.42   34.13    337.21   238.72   303.08   txt   0       22      It was June
317.22   557.68   34.13    282.01   240.46   247.88   txt   1       18      but it had 
317.22   557.63   296.33   447.61   240.41   151.28   txt   2       15      As we worke
56.70    295.45   351.53   447.61   238.75   96.08    txt   3       7       For most of
72.66    548.15   455.52   467.67   475.49   12.15    txt   4       1       These texts
--------------------------------------------------------------------------------





In [29]:
# Build a narrow, full-height vertical strip around the horizontal midpoint of
# the pink box and draw it into out.pdf for visual inspection.
CENTRE_STRIP_HALF_WIDTH = 12  # same units as the bounding-box coordinates

pink_centre = (bounding_pink.x0 + bounding_pink.x1) / 2
centre_rect = Rect(
    pink_centre - CENTRE_STRIP_HALF_WIDTH,
    0,
    pink_centre + CENTRE_STRIP_HALF_WIDTH,
    page_height,
)
draw_rectangle_on_page(pdf_file, "out.pdf", 3, centre_rect)

In [22]:
# Inspect the raw dict of the first line in the first pink block, to see which
# fields (spans, font, size, bbox, ...) are available per line.
pink_blocks[0]["lines"][0]

{'spans': [{'size': 12.0,
   'flags': 4,
   'bidi': 0,
   'char_flags': 16,
   'font': 'TimesNewRomanPSMT',
   'color': 0,
   'alpha': 255,
   'ascender': 0.890999972820282,
   'descender': -0.2160000056028366,
   'text': 'It was June 2012, and I had come to the Globe ',
   'origin': (56.70000076293945, 44.8197021484375),
   'bbox': (56.70000076293945,
    34.12770080566406,
    295.35845947265625,
    47.41170120239258)}],
 'wmode': 0,
 'dir': (1.0, 0.0),
 'bbox': (56.70000076293945,
  34.12770080566406,
  295.35845947265625,
  47.41170120239258)}

In [4]:
# Show the per-line table built from the pink blocks (at most the first 100 rows).
pink_df.head(100)

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text,font_sizes,font_size
0,56.700,34.128,295.358,47.412,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,11,238.658,13.284,"It was June 2012, and I had come to the Globe",[12.0],12.000
1,56.700,47.928,295.382,61.212,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,7,238.682,13.284,Theatre in London. The company was called,[12.0],12.000
2,56.700,61.728,295.327,75.012,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,9,238.627,13.284,"Rah-e-Sabz (‘Path to Hope’), and they were",[12.0],12.000
3,56.700,75.528,295.343,88.812,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,8,238.643,13.284,from Afghanistan; they were about to perform a,[12.0],12.000
4,56.700,89.328,295.342,102.612,13.800,3,"[TimesNewRomanPSMT, TimesNewRomanPS-ItalicMT, ...",TimesNewRomanPS,TimesNewRomanPS-ItalicMT,8,238.642,13.284,version of The Comedy of Errors translated into,"[12.0, 12.0, 12.0]",12.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,56.700,392.928,295.381,406.212,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,5,238.681,13.284,"improbabilities, Elizabethan wordplay, its corny",[12.0],12.000
59,56.700,406.728,295.366,420.012,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,8,238.666,13.284,"sight gags. But as I watched the performance,",[12.0],12.000
60,56.700,420.528,295.454,433.812,13.800,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,7,238.754,13.284,"set in contemporary Kabul, I saw something",[12.0],12.000
61,56.700,434.328,289.931,447.612,21.189,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,9,233.231,13.284,quite new. The word “comedy” was in the title,[12.0],12.000


In [14]:
# Lift the row limit so the frames displayed from here on are shown in full.
pd.set_option('display.max_rows', None)

# Round the four bounding-box columns to whole numbers so that lines sharing a
# margin fall on the same value. This overwrites the columns of pink_df, so
# every cell run afterwards sees the rounded coordinates.
for coord in ("x0", "x1", "y0", "y1"):
    pink_df[coord] = pink_df[coord].map(round)

display(pink_df[["x0","x1","y0","y1","common_font", "n_words"]])

Unnamed: 0,x0,x1,y0,y1,common_font,n_words
0,57,295,34,47,TimesNewRomanPSMT,11
1,57,295,48,61,TimesNewRomanPSMT,7
2,57,295,62,75,TimesNewRomanPSMT,9
3,57,295,76,89,TimesNewRomanPSMT,8
4,57,295,89,103,TimesNewRomanPS,8
5,57,295,103,116,TimesNewRomanPSMT,8
6,57,295,117,130,TimesNewRomanPSMT,8
7,57,295,131,144,TimesNewRomanPSMT,10
8,57,295,145,158,TimesNewRomanPSMT,7
9,57,295,158,172,TimesNewRomanPSMT,7


In [13]:
# Look at rows 50-60 (by position) of the per-line table.
pink_df.iloc[50:61]

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text,font_sizes,font_size
50,317,379.128,558,392.412,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,7,240.372,13.284,"trying to know and comprehend its culture,",[12.0],12.0
51,317,392.928,558,406.212,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,8,240.307,13.284,finding both less and more than you ever,[12.0],12.0
52,317,406.728,558,420.012,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,9,240.408,13.284,imagined – asked a question that lay at the root,[12.0],12.0
53,317,420.528,558,433.812,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,8,240.384,13.284,of global Shakespeare. What does it really feel,[12.0],12.0
54,317,434.328,386,447.612,-82.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,3,69.022,13.284,like to travel?,[12.0],12.0
55,57,351.528,295,364.812,13.8,2,"[TimesNewRomanPSMT, TimesNewRomanPS-ItalicMT]",TimesNewRomanPS,TimesNewRomanPSMT,11,238.631,13.284,"For most of its history on stage in the west, ...","[12.0, 12.0]",12.0
56,57,365.328,295,378.612,13.8,2,"[TimesNewRomanPS-ItalicMT, TimesNewRomanPSMT]",TimesNewRomanPS,TimesNewRomanPS-ItalicMT,8,238.628,13.284,Comedy of Errors has been dismissed as a,"[12.0, 12.0]",12.0
57,57,379.128,295,392.412,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,7,238.675,13.284,creaky and mechanistic farce with its rampant,[12.0],12.0
58,57,392.928,295,406.212,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,5,238.681,13.284,"improbabilities, Elizabethan wordplay, its corny",[12.0],12.0
59,57,406.728,295,420.012,13.8,1,[TimesNewRomanPSMT],TimesNewRomanPSMT,TimesNewRomanPSMT,8,238.666,13.284,"sight gags. But as I watched the performance,",[12.0],12.0


In [15]:
# Count how many lines share each x0 (left-edge) value.
pink_df.x0.value_counts()

x0
57     29
317    29
356     1
381     1
440     1
500     1
73      1
Name: count, dtype: int64

In [16]:
# Pull out the line(s) whose left edge is at x0 == 73.
pink_df.loc[pink_df["x0"] == 73]

Unnamed: 0,x0,y0,x1,y1,dL,n_spans,font_list,common_font,mode_font,n_words,w,h,text,font_sizes,font_size
62,73,456,548,468,,2,"[TimesNewRomanPS-ItalicMT, TimesNewRomanPSMT]",TimesNewRomanPS,TimesNewRomanPS-ItalicMT,16,475.485,12.155,"These texts have been adapted, for the purpose...","[10.979999542236328, 10.020000457763672]",10.02


In [17]:
# Count how many lines share each x1 (right-edge) value.
pink_df.x1.value_counts()

x1
295    27
558    25
270     1
555     1
402     1
344     1
369     1
428     1
488     1
557     1
386     1
290     1
548     1
Name: count, dtype: int64

# Definition of column boundaries

In [None]:
The column boundaries will be a pair of intervals: b1 = (x00, x01) and b2 = (x10, x11). Between x01 and x10 there is a
strip of empty space within the pink bounding box.

In [None]:
This is a boundary within which:
    - All the text will have the standard font size (the mode fontsize over the last several pages)
    - The majority of the text will have the standard font type (the mode font type over the last few pages)
    - The majority of the text is enclosed (bounded) by it.

In [None]:
- In the pink
- Correct font size
- Does not cross the centre line (does not start or stop on the centre line).