In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect

from pdf_scraper.block_utils import identify_dual_column, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block, clean_blocks, print_block_table, get_block_table, rebox_blocks
from pdf_scraper.block_utils import preproc_blocks
from pdf_scraper.draw_utils  import get_pink_boundary, get_fill_df, in_the_pink
from pdf_scraper.draw_utils  import draw_rectangles_on_page, get_fill_colours
from pdf_scraper.line_utils  import get_line_df, print_line_table, get_all_lines
from pdf_scraper.page_utils  import get_page_line_df

pd.set_option("display.float_format", "{:.3f}".format)

In [7]:
level    = "AL"
year     = 2025
fname    = f"LC002ALP100EV_{year}.pdf"
examDir  = Path.cwd().parent.parent / "Exams"  / "english" / level
pdf_file = examDir / fname


doc              = fitz.open(pdf_file)

fill_colours     = get_fill_colours(doc)

page_width       = doc[1].get_text("dict")["width"]   # This is a document wide thing doesn't need to be per page.
page_height      = doc[1].get_text("dict")["height"]   # This is a document wide thing doesn't need to be per page.


page             = doc[5]
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
blocks           = text_dict["blocks"]



bounding_pink    = get_pink_boundary(page_drawings, fill_colours)
clean_blocks     = preproc_blocks(blocks, bounding_pink)

pink_blocks      = [block for block in clean_blocks if in_the_pink(block["bbox"], bounding_pink) ]
pink_lines       = get_all_lines(pink_blocks)
pink_df          = get_line_df(pink_lines)

print_block_table(pink_blocks)
draw_rectangles_on_page(pdf_file, "out.pdf", 4,[bounding_pink] )

x0       x1       y0       y1       dx       dy       type  number  n_lines first_word
--------------------------------------------------------------------------------
126.36   465.49   53.74    69.76    339.13   16.02    txt   0       1       TEXT 3 – Pl
47.94    535.08   73.68    114.96   487.14   41.28    txt   1       3       TEXT 3 cons
47.94    540.89   101.82   143.10   492.95   41.28    txt   2       3       Text 3 cons
303.12   533.18   137.64   178.92   230.06   41.28    txt   3       3       constellati
47.94    281.42   137.64   296.10   233.48   158.46   txt   4       11      Six of them
303.12   543.25   191.34   364.50   240.13   173.16   txt   5       12      At night th
47.94    277.98   308.52   481.68   230.04   173.16   txt   6       12      Some alien 
303.12   542.63   376.92   520.74   239.51   143.82   txt   7       10      Soon things
47.94    277.85   494.10   608.64   229.91   114.54   txt   8       8       Its beauty 
308.82   528.84   533.22   742.98   220.

In [40]:
pink_df.h = pink_df.h.map(lambda x: round(x,3))
pink_df.dL = pink_df.dL.map(lambda x: round(x,3))
pink_df[["text","dL","h","font_size"]].head(40)

Unnamed: 0,text,dL,h,font_size
0,TEXT 3 – JOURNEY INTO SPACE,27.462,17.734,16.02
1,This edited text is adapted from a speech deli...,13.8,13.284,12.0
2,"Aeronautics and Space Administration (NASA), K...",12.036,13.284,12.0
3,"extract he acknowledges the history, and outli...",29.244,15.476,12.0
4,We will start by increasing NASA’s budget by,13.8,13.284,12.0
5,$6 billion over the next five years. We will,13.8,13.284,12.0
6,"ramp up robotic exploration of the solar system,",13.8,13.284,12.0
7,including a probe of the Sun’s atmosphere; new,13.8,13.284,12.0
8,scouting missions to Mars and other destinations,13.8,13.284,12.0
9,"and an advanced telescope to follow Hubble,",13.8,13.284,12.0


In [28]:
pink_df.h.mode()[0]

np.float64(13.284)

In [35]:
new_line

np.float64(1.038843721770551)

In [None]:
print(pink_df.dL.median())
print(pink_df.dL.mode()[0])
dL_median = pink_df.dL.median()
h_median  = pink_df.h.median()
new_line = dL_median/h_median

print(dL_median/h_median)
x00, y00, x01, y01 = pink_lines[10]["bbox"]
x10, y10, x11, y11 = pink_lines[11]["bbox"]
print(y00-y10)
print(new_line*h_median)

13.8
13.8
1.038843721770551
-13.79998779296875
13.8


In [11]:
pink_df.x0.map(lambda x : round(x))

0     314
1      57
2      57
3      92
4     166
     ... 
58     57
59    324
60    324
61    324
62     75
Name: x0, Length: 63, dtype: int64

In [None]:
1. Look at the widths of all pages.
2. Create a bin range based on this, and then do the binning.
3. Then you can look at this binning altogether all at once.
4. Then you can use these bins for further processing and identification of dual columns.

In [7]:

val_counts = pink_df.x1.value_counts()
def bin_values(val_counts, bin_width=10):
    '''
    make bins around values. If two values are within bin_width of each other,
    they get both put into a bin centred on the average of them.
    '''
    # what we want is to take most present values, and then merge into them all
    # other values that are within bin_width of them up or down.
    #
    # So if we could take the first value, make a sublist of all things within bin_width from it,
    # then remove this value and all the values in the sublist from the original list
    # continue the loop over the list.

    # Maybe you could use a recursive function for that?
    x      = val_counts.index.values
    counts = val_counts.values
    for i in range(2):#range(len(x)):
        sublist = [float(x[i])]
        for j in range(i+1,len(x)):
            if abs(x[i]-x[j]) < bin_width:
                #print(x[i],x[j], (x[i]+x[j])/2)
                sublist.append(float(x[j]))
        print(sublist)
bin_values(val_counts,6)

[298.46636962890625, 298.55999755859375, 298.5264587402344, 298.46875, 298.532470703125, 298.5528869628906, 298.48797607421875, 298.56964111328125, 298.5480041503906, 298.54205322265625, 295.26727294921875]
[298.55999755859375, 298.5264587402344, 298.46875, 298.532470703125, 298.5528869628906, 298.48797607421875, 298.56964111328125, 298.5480041503906, 298.54205322265625, 295.26727294921875]


In [None]:
from scipy.cluster.hierarchy import fclusterdata

x = pink_df.x1.value_counts()
# Cluster by proximity (within 5 units)
data = x.index.to_numpy().reshape(-1, 1)
groups = fclusterdata(data, t=5, criterion='distance')

# Build a DataFrame for easier grouping
df = pd.DataFrame({'x0': x.index, 'count': x.values, 'group': groups})

# Group by cluster and compute sum and mean label
grouped = df.groupby('group').agg(
    total_count=('count', 'sum'),
    mean_x0=('x0', 'mean')
)

# Set the mean x0 as index
result = grouped.set_index('mean_x0')['total_count'].sort_values(ascending=False)

print(result)

In [None]:
def get_clean_bins(x:pd.Series,bin_width):
    min = x.min()
    max = x.max()

    bins = np.arange(start=min-bin_width/2, stop=max + 2*bin_width, step=bin_width)

    x_binned = pd.cut(x, bins=bins).apply(lambda i: i.mid).value_counts()

    return x_binned[x_binned !=0]

get_clean_bins(pink_df.x1,10)

In [None]:
counts = df.x0.value_counts()
xs     = counts.index.values

xl   = xs[0]          if xs[0] < xs[1] else xs[1]
n_xl = counts.iloc[0] if xs[0] < xs[1] else counts.iloc[1]
xr = xs[0]            if xs[0] > xs[1] else xs[1]
n_xr = counts.iloc[0] if xs[0] > xs[1] else counts.iloc[1]

third   = np.nan if len(xs) <3 else xs[2]
n_third = np.nan if len(xs) <3 else counts.iloc[2]

In [None]:
year = 2010
fname = f"LC002ALP100EV_{year}.pdf"
examDir=Path.cwd().parent.parent / "Exams"  / "english" / level
pdf_file = examDir / fname


doc              = fitz.open(pdf_file)
page2_drawings   = doc[1].get_drawings()
fill_colour      = get_fill_df(page2_drawings).fill.mode().values[0]

xs = []
for page in doc[1:7]:
    text_dict        = page.get_text("dict",sort=True)
    page_drawings    = page.get_drawings()
    blocks           = text_dict["blocks"]

    bounding_pink    = get_pink_boundary(page_drawings, fill_colour)
    clean_blocks     = preproc_blocks(blocks, bounding_pink)

    pink_blocks      = [block for block in clean_blocks if in_the_pink(block["bbox"], bounding_pink) ]
    pink_lines       = get_all_lines(pink_blocks)
    pink_df          = get_line_df(pink_lines)
    #xs.append(pink_df.x0.value_counts())
    x2 =pink_df.x0.value_counts().index.values[:2]
    xl = x2[0] if x2[0] < x2[1] else x2[1]
    xr = x2[0] if x2[0] > x2[1] else x2[1]
    xs.append({"xl":xl,"xr":xr})

col_df = pd.DataFrame(xs,index=[f'page_{i+2}' for i in range(len(xs))])
col_df.head()

In [None]:
year = 2010
fname = f"LC002ALP100EV_{year}.pdf"
examDir=Path.cwd().parent.parent / "Exams"  / "english" / level
pdf_file = examDir / fname


doc              = fitz.open(pdf_file)
page2_drawings   = doc[1].get_drawings()
fill_colour      = get_fill_df(page2_drawings).fill.mode().values[0]

xs = []
#for page in doc[1:7]:
page=doc[2]
text_dict        = page.get_text("dict",sort=True)
page_drawings    = page.get_drawings()
blocks           = text_dict["blocks"]


fill_colour2      = get_fill_df(page_drawings).fill.mode().values[0]



bounding_pink    = get_pink_boundary(page_drawings, fill_colour)
clean_blocks     = preproc_blocks(blocks, bounding_pink)

bounding_pink

#pink_blocks      = [block for block in clean_blocks if in_the_pink(block["bbox"], bounding_pink) ]
#pink_lines       = get_all_lines(pink_blocks)
#pink_df          = get_line_df(pink_lines)
##xs.append(pink_df.x0.value_counts())
#x2 =pink_df.x0.value_counts().index.values[:2]
#xl = x2[0] if x2[0] < x2[1] else x2[1]
#xr = x2[0] if x2[0] > x2[1] else x2[1]
#xs.append({"xl":xl,"xr":xr})
#
#col_df = pd.DataFrame(xs,index=[f'page_{i+2}' for i in range(len(xs))])
#col_df.head()

In [None]:
definitely_dual_column

- width limit
- has correct font size
- starts within a given tolerance of one of two possibile start positions.