In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect

from pdf_scraper.block_utils import identify_dual_column, get_block_text, sort_dual_column_blocks
from pdf_scraper.block_utils import is_empty_block, clean_blocks, print_block_table, get_block_table, rebox_blocks
from pdf_scraper.block_utils import preproc_blocks
from pdf_scraper.draw_utils  import get_pink_boundary, get_fill_df, in_the_pink
from pdf_scraper.draw_utils  import draw_rectangle_on_page, get_fill_colours
from pdf_scraper.line_utils  import get_line_df, print_line_table, get_all_lines

pd.set_option("display.float_format", "{:.8f}".format)

In [None]:
# --- Select the exam paper -------------------------------------------------
level = "AL"
year = 2011
fname = f"LC002ALP100EV_{year}.pdf"
examDir = Path.cwd().parent.parent / "Exams" / "english" / level
pdf_file = examDir / fname

doc = fitz.open(pdf_file)

# Candidate fill colours for the whole document (project helper from draw_utils).
fill_colours = get_fill_colours(doc)

# Page size is a document-wide property, so read it once from a single page
# instead of extracting the text dict twice.
page_dims = doc[1].get_text("dict")
page_width = page_dims["width"]
page_height = page_dims["height"]

# --- Load one page ---------------------------------------------------------
# Zero-based index of the page under inspection; also used for the debug PDF below.
page_index = 4
page = doc[page_index]
text_dict = page.get_text("dict", sort=True)
page_drawings = page.get_drawings()
blocks = text_dict["blocks"]

# --- Restrict to the pink region --------------------------------------------
bounding_pink = get_pink_boundary(page_drawings, fill_colours)
# Named `cleaned_blocks` so the `clean_blocks` function imported from
# pdf_scraper.block_utils is not overwritten by a list.
cleaned_blocks = preproc_blocks(blocks, bounding_pink)

pink_blocks = [block for block in cleaned_blocks if in_the_pink(block["bbox"], bounding_pink)]
pink_lines = get_all_lines(pink_blocks)
pink_df = get_line_df(pink_lines)

print_block_table(pink_blocks)
# Writes a copy of the PDF to out.pdf with the boundary rectangle drawn on the
# inspected page (presumably for visual checking - confirm in draw_utils).
draw_rectangle_on_page(pdf_file, "out.pdf", page_index, bounding_pink)

In [None]:
# Tabulate the lines gathered from the pink-region blocks (pink_lines comes from get_all_lines(pink_blocks)).
print_line_table(pink_lines)

In [None]:

# Frequency of each distinct x1 value in the pink-line table; value_counts orders
# the result by descending count, so the first index entries are the most common values.
val_counts = pink_df.x1.value_counts()
def bin_values(val_counts, bin_width=10):
    '''
    Print, for each of the two most frequent values, the group of values that
    lie within ``bin_width`` of it.

    This is an exploratory sketch of a "merge nearby values" binning. The
    intended end state is: take the most frequent value, collect every other
    value within ``bin_width`` of it, remove that group from the pool, and
    repeat on what is left. Only the grouping step is implemented here; the
    groups are printed, nothing is merged or averaged yet.

    Parameters
    ----------
    val_counts : pd.Series
        Result of ``Series.value_counts()``: the index holds the distinct
        values, ordered from most to least frequent.
    bin_width : float, default 10
        Two values are grouped when their absolute difference is strictly
        smaller than this.

    Returns
    -------
    None
        The groups are written to stdout, one list per seed value.
    '''
    x = val_counts.index.values

    # Only the two most frequent values act as seeds for now; clamp so that a
    # series with fewer than two distinct values does not raise IndexError.
    n_seeds = min(2, len(x))
    for i in range(n_seeds):
        sublist = [float(x[i])]
        # Only look at less frequent values (j > i) so a pair is reported once.
        for j in range(i + 1, len(x)):
            if abs(x[i] - x[j]) < bin_width:
                sublist.append(float(x[j]))
        print(sublist)
# Run the grouping sketch on the x1 frequencies; it prints its groups rather than returning them.
bin_values(val_counts)

In [None]:
from scipy.cluster.hierarchy import fclusterdata

# Distinct x1 values of the pink lines together with how often each occurs.
x = pink_df.x1.value_counts()

# fclusterdata wants an (n_samples, n_features) matrix, so turn the 1-D array
# of values into a single-column 2-D array before clustering with a distance
# threshold of 5.
data = x.index.to_numpy()[:, np.newaxis]
groups = fclusterdata(data, 5, criterion='distance')

# One row per distinct value: the value, its frequency and its cluster label.
# NOTE(review): the column is labelled 'x0' although the values come from pink_df.x1.
df = pd.DataFrame({'x0': x.index, 'count': x.values, 'group': groups})

# Per cluster: total number of lines and the mean position of its members.
grouped = df.groupby('group').agg(
    total_count=pd.NamedAgg(column='count', aggfunc='sum'),
    mean_x0=pd.NamedAgg(column='x0', aggfunc='mean'),
)

# Index the totals by the cluster's mean position, largest cluster first.
result = (
    grouped
    .set_index('mean_x0')
    .loc[:, 'total_count']
    .sort_values(ascending=False)
)

print(result)

In [None]:
def get_clean_bins(x:pd.Series,bin_width):
    """
    Histogram ``x`` into equal-width bins and return the non-empty bins only.

    The first bin is centred on the smallest value of ``x`` (its left edge is
    ``x.min() - bin_width/2``); bins continue in steps of ``bin_width`` past
    the largest value. Each bin is labelled by its midpoint.

    Parameters
    ----------
    x : pd.Series
        Numeric values to bin.
    bin_width : number
        Width of every bin.

    Returns
    -------
    pd.Series
        Number of values per bin, indexed by bin midpoint, with zero-count
        bins removed.
    """
    lowest, highest = x.min(), x.max()

    # Edges start half a bin below the minimum and overshoot the maximum so
    # that the largest value always falls inside some bin.
    edges = np.arange(lowest - bin_width/2, highest + 2*bin_width, bin_width)

    # Replace each interval label by its midpoint, then count per midpoint.
    midpoints = pd.cut(x, bins=edges).apply(lambda interval: interval.mid)
    occupancy = midpoints.value_counts()

    # The counts can include bins nobody fell into; keep the occupied ones.
    return occupancy[occupancy != 0]

# Bin the x1 values of the pink lines into 10-unit-wide bins and show the occupied ones.
get_clean_bins(pink_df.x1,10)

In [None]:
# Frequency of each distinct x0 among the pink lines, most common first.
# This reads pink_df (the line table) rather than `df`: at this point `df` is
# the clustering frame, whose x0 entries are all distinct, so every count
# would be 1 and "most frequent" would be meaningless.
counts = pink_df.x0.value_counts()
xs     = counts.index.values

if len(xs) >= 2:
    # Of the two most frequent values, the smaller is taken as the left
    # candidate and the larger as the right one.
    left_pos, right_pos = (0, 1) if xs[0] < xs[1] else (1, 0)
    xl, n_xl = xs[left_pos],  counts.iloc[left_pos]
    xr, n_xr = xs[right_pos], counts.iloc[right_pos]
else:
    # Fewer than two distinct values: there is no second candidate. Fall back
    # to NaN (same convention as `third` below) instead of raising IndexError.
    xl   = xs[0]          if len(xs) else np.nan
    n_xl = counts.iloc[0] if len(xs) else np.nan
    xr, n_xr = np.nan, np.nan

# Third most frequent value, if any.
third   = np.nan if len(xs) < 3 else xs[2]
n_third = np.nan if len(xs) < 3 else counts.iloc[2]

In [None]:
year = 2010
fname = f"LC002ALP100EV_{year}.pdf"
examDir = Path.cwd().parent.parent / "Exams" / "english" / level
pdf_file = examDir / fname

doc            = fitz.open(pdf_file)
page2_drawings = doc[1].get_drawings()
# Most common fill on the second page, used as the reference colour for every page below.
# NOTE(review): the first load cell passes the full get_fill_colours(doc) result to
# get_pink_boundary; here a single colour is passed - confirm both forms are accepted.
fill_colour    = get_fill_df(page2_drawings).fill.mode().values[0]

# For each of pages 2-7 record the two most frequent x0 values of the pink
# lines: the smaller as "xl", the larger as "xr".
xs = []
for page in doc[1:7]:
    text_dict     = page.get_text("dict", sort=True)
    page_drawings = page.get_drawings()
    blocks        = text_dict["blocks"]

    bounding_pink  = get_pink_boundary(page_drawings, fill_colour)
    # Named `cleaned_blocks` so the imported clean_blocks function is not overwritten.
    cleaned_blocks = preproc_blocks(blocks, bounding_pink)

    pink_blocks = [block for block in cleaned_blocks if in_the_pink(block["bbox"], bounding_pink)]
    pink_lines  = get_all_lines(pink_blocks)
    pink_df     = get_line_df(pink_lines)

    x2 = pink_df.x0.value_counts().index.values[:2]
    if len(x2) == 2:
        xl, xr = x2.min(), x2.max()
    else:
        # Page with fewer than two distinct x0 values: keep the row (so page
        # labels stay aligned) but mark the missing candidate(s) as NaN.
        xl = x2[0] if len(x2) else np.nan
        xr = np.nan
    xs.append({"xl": xl, "xr": xr})

# Row i corresponds to doc[i + 1], i.e. 1-based page number i + 2.
col_df = pd.DataFrame(xs, index=[f'page_{i+2}' for i in range(len(xs))])
col_df.head()

In [None]:
# Debug a single page of the 2010 paper: compute and display its pink boundary.
year = 2010
fname = f"LC002ALP100EV_{year}.pdf"
examDir = Path.cwd().parent.parent / "Exams" / "english" / level
pdf_file = examDir / fname

doc            = fitz.open(pdf_file)
page2_drawings = doc[1].get_drawings()
# Reference colour: most common fill on the second page of the document.
fill_colour    = get_fill_df(page2_drawings).fill.mode().values[0]

page          = doc[2]
text_dict     = page.get_text("dict", sort=True)
page_drawings = page.get_drawings()
blocks        = text_dict["blocks"]

# Most common fill on the inspected page itself, kept for comparison with fill_colour.
fill_colour2  = get_fill_df(page_drawings).fill.mode().values[0]

bounding_pink  = get_pink_boundary(page_drawings, fill_colour)
# Named `cleaned_blocks` so the imported clean_blocks function is not overwritten.
cleaned_blocks = preproc_blocks(blocks, bounding_pink)

bounding_pink

In [None]:
# TODO: criteria for deciding that a page is `definitely_dual_column`:
#   - width limit
#   - has correct font size

# Check the pink fill colour across the last 20 years (2005-2024)

In [None]:

# Inspect the fills drawn on the second page (index 1) of the 2009 paper.
# examDir is reused from the earlier load cells.
year = 2009
fname = f"LC002ALP100EV_{year}.pdf"
pdf_file = examDir / fname
page2 = fitz.open(pdf_file)[1]
page_drawings    = page2.get_drawings()
# NOTE(review): `df` is also the name of the clustering frame built earlier; this overwrites it.
df = get_fill_df(page_drawings)
# Show up to the first 40 rows of the fill table.
df.head(40)


In [None]:
# Pull the individual attributes of every drawing into parallel lists
# (entry i of each list describes draws[i]).
draws = page_drawings
n_items       = [len(draw['items']) for draw in draws]
# First element of each item tuple, collected per drawing.
item_types    = [ [item[0] for item in draw["items"] ] for draw in draws ]
# Called `draw_types` rather than `type` so the builtin type() stays usable
# in the rest of the notebook.
draw_types    = [draw['type'] for draw in draws]
fill_opacity  = [draw['fill_opacity'] for draw in draws]
# Colour channels are left disabled.
# NOTE(review): presumably because 'fill' can be None for unfilled paths - confirm against PyMuPDF.
#r             = [draw['fill'][0] for draw in draws]
#g             = [draw['fill'][1] for draw in draws]
#b             = [draw['fill'][2] for draw in draws]
# Corners of each drawing's bounding rectangle.
x0            = [draw['rect'].x0 for draw in draws]
y0            = [draw['rect'].y0 for draw in draws]
x1            = [draw['rect'].x1 for draw in draws]
y1            = [draw['rect'].y1 for draw in draws]

In [None]:
# For every paper from 2005 to 2024, collect the distinct (r, g, b) fills
# found on its second page, tagged with the year.
dfs = []
for year in range(2005, 2025):
    fname = f"LC002ALP100EV_{year}.pdf"
    pdf_file = examDir / fname

    # Open each PDF in a context manager so the file handle is released as
    # soon as the drawings have been extracted, instead of leaking 20 documents.
    with fitz.open(pdf_file) as exam_doc:
        page2 = exam_doc[1]
        page_drawings = page2.get_drawings()

    # Build the frame in one chain; .assign returns a new frame, which avoids
    # writing a column into the result of a slice + drop_duplicates.
    df = (
        get_fill_df(page_drawings)[["fill_r", "fill_g", "fill_b"]]
        .drop_duplicates()
        .assign(year=year)
    )
    # Label rows "<year>_<n>" so they stay unique after concatenation.
    df.index = [f"{year}_{i}" for i in range(len(df.index))]
    dfs.append(df)

bigDf = pd.concat(dfs)
#bigDf.groupby("year").count()

In [None]:
# Distinct (fill_r, fill_g, fill_b) combinations across every year collected in bigDf.
bigDf[["fill_r","fill_g","fill_b"]].drop_duplicates()

# Check if in the last 20 years there has ever been more than one font size per line

In [None]:
# Sanity-check pandas' mode on a tiny example: it returns a Series holding the
# most frequent value(s) (here a single entry, 1).
pd.Series([1, 1, 1, 4]).mode()

In [None]:
# Line table for one block (index 3) of whichever page `blocks` was last loaded from.
get_line_df(blocks[3]["lines"])

In [None]:
from scipy.stats import mode
# Same check with scipy: the result object's `.mode` attribute holds the most frequent value.
mode([1,1,1,4]).mode

In [None]:
def open_exam(year:int):
    """
    Open the exam PDF for ``year``.

    The file is looked up as ``LC002ALP100EV_<year>.pdf`` inside
    ``Exams/english/AL``, two directory levels above the current working
    directory.

    Parameters
    ----------
    year : int
        Year used in the file name.

    Returns
    -------
    The document object returned by ``fitz.open``.
    """
    exam_root = Path.cwd().parent.parent
    exam_path = exam_root.joinpath("Exams", "english", 'AL', f"LC002ALP100EV_{year}.pdf")
    return fitz.open(exam_path)

dfs = []
#for year in range(2005,2025):
year = 2011
# Banner: rule, year, rule - one item per line.
print("--"*40, year, "--"*40, sep="\n")
doc = open_exam(year)

# Most common fill of each of pages 2-7; pages without any fill are reported and skipped.
fill_colours = []
for i in range(1, 7):
    page2_drawings = doc[i].get_drawings()
    fill_df = get_fill_df(page2_drawings)
    if len(fill_df) > 0:
        fill_colour = fill_df.fill.mode().values[0]
        fill_colours.append(fill_colour)
    else:
        print(f"page {i} no fills.")


In [None]:
# Is the third collected fill colour one of the distinct colours?
# The distinct set is computed here so this cell does not depend on a cell
# further down the notebook.
uni_cols = np.unique(fill_colours, axis=0)

# `x in ndarray` means (ndarray == x).any(), which is already True when a
# single component matches. Compare whole entries instead: every component of
# a row must equal the target.
# Assumes every fill colour has the same number of components - TODO confirm.
_rows   = uni_cols.reshape(len(uni_cols), -1)
_target = np.asarray(fill_colours[2]).reshape(-1)
bool((_rows == _target).all(axis=1).any())

In [None]:
def get_fill_colours(doc):
    """
    Return the distinct dominant fill colours found on pages 2-7 of ``doc``.

    For each page the most frequent value of the ``fill`` column of its fill
    table is taken; pages whose fill table is empty are skipped. Documents
    with fewer than seven pages are handled by stopping at the last page.

    NOTE(review): this definition replaces the ``get_fill_colours`` imported
    from ``pdf_scraper.draw_utils`` for the rest of the kernel session.

    Parameters
    ----------
    doc
        Opened document supporting ``len(doc)`` and ``doc[i].get_drawings()``.

    Returns
    -------
    np.ndarray
        The unique per-page dominant fills (``np.unique(..., axis=0)``).
    """
    fill_colours = []
    # Index 0 is skipped; never read past the end of a short document.
    stop = min(7, len(doc))
    for i in range(1, stop):
        page_drawings = doc[i].get_drawings()
        fill_df = get_fill_df(page_drawings)
        if len(fill_df) == 0:
            continue
        # mode() can return several values on a tie; the first one is used.
        fill_colours.append(fill_df.fill.mode().values[0])

    return np.unique(fill_colours, axis=0)

In [None]:
# Reduce the per-page fill colours to the distinct ones; axis=0 makes np.unique
# compare whole entries (rows) instead of flattening to single components.
uni_cols = np.unique(fill_colours,axis=0)
uni_cols

In [None]:
# Print each distinct fill colour on its own line.
for i in uni_cols:
    print(i)

In [None]:
# The same de-duplication through pandas: one row per collected fill colour, repeated rows dropped.
pd.DataFrame(fill_colours).drop_duplicates()