In [81]:
import math
import re
from utils import *
import pandas as pd
import numpy  as np
import fitz
from fitz import Rect
from line_utils import *

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose  import ColumnTransformer, make_column_selector, make_column_transformer

In [25]:
# Which page of the document, and which block of that page's block list, this
# notebook studies. Change them here to inspect a different block.
PAGE_INDEX  = 3
BLOCK_INDEX = 6

pdf_file = "test_pdfs/LC002ALP100EV_2024.pdf"
doc              = fitz.open(pdf_file)
page             = doc[PAGE_INDEX]
# The "dict" output is consumed below as blocks -> lines -> spans.
page_dict        = page.get_text("dict",sort=True)
blocks           = page_dict["blocks"]
block            = blocks[BLOCK_INDEX]
lines            = block['lines']

In [26]:
# Print the selected block's text. `get_block_text` is not defined in this
# notebook; presumably it comes from one of the star imports -- TODO confirm.
print(get_block_text(block))

In [27]:
# Show which keys a line dict exposes, and how many lines the block has
# before the empty ones are filtered out.
print(lines[0].keys()) 
print(len(lines))
def line_is_empty(line):
    """Return True when a line carries no visible text.

    A line counts as empty when the "text" of every span is either the empty
    string or whitespace only. A line with no spans at all is also empty.

    ``str.isspace()`` is False for ``""``, so the check uses ``strip()`` to
    make zero-length spans count as empty too.
    """
    return all(not span["text"].strip() for span in line["spans"])
# Keep only the lines that are not empty. `lines` is rebound, so every later
# cell works on the filtered list; the count is printed again for comparison.
lines = [line for line in lines if not line_is_empty(line)]
print(len(lines))

In [28]:
def get_line_table(lines: list) -> str:
    '''
    Build a fixed-width text table with one row per line dict.

    Each row shows the line's bounding box (x0, x1, y0, y1), its width (dx)
    and height (dy), the distinct fonts used by its spans, and the first 25
    characters of its first span.

    Parameters
    ----------
    lines : iterable of dict
        Line dicts with a "bbox" entry (x0, y0, x1, y1) and a "spans" list
        whose items have "font" and "text" entries.

    Returns
    -------
    str
        Header, separator, one row per line, a closing separator and trailing
        blank lines, joined with newlines.
    '''
    separator = "--"*60
    table = [f"{'x0':8} {'x1':8} {'y0':8} {'y1':8} {'dx':8} {'dy':8} {'fonts':36} {'beginning':25}", separator]
    for line in lines:
        spans = line["spans"]
        # Sorted so the row is deterministic; a set has no stable order.
        fonts = " ".join(sorted({span["font"] for span in spans}))
        x0, y0, x1, y1 = line['bbox']
        # A line without spans has no text to preview.
        beginning = spans[0]["text"][:25] if spans else ""
        table.append(
            f"{x0:<8.2f} {x1:<8.2f} {y0:<8.2f} {y1:<8.2f} {x1-x0:<8.2f} {y1-y0:<8.2f} {fonts:36} {beginning:<25}"
        )
    table.extend([separator, "\n"*2])
    return "\n".join(table)

def print_line_table(lines: list) -> None:
    """Print the text table that ``get_line_table`` builds for ``lines``.

    Parameters
    ----------
    lines : iterable of dict
        Passed unchanged to ``get_line_table``.
    """
    print(get_line_table(lines))

# Show the table for the non-empty lines of the selected block.
print_line_table(lines)

In [50]:
def get_line_text(line: dict) -> str:
    """Return the line's text: the "text" of every span, concatenated in span order."""
    pieces = (span["text"] for span in line["spans"])
    return "".join(pieces)

def get_line_words(line:dict) -> list:
    """Return the words found in the line's text, in order of appearance.

    The text comes from ``get_line_text``; a word is a run of regex word
    characters delimited by word boundaries.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(get_line_text(line))
    

def get_line_df(lines):
    """Tabulate geometric and font features of text lines, one row per line.

    Parameters
    ----------
    lines : sequence of dict
        Line dicts with a "bbox" entry (x0, y0, x1, y1) and a "spans" list
        whose items have "font" and "text" entries. It is iterated more than
        once, so it must not be a one-shot iterator.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: x0, y0, x1, y1 (bounding box), dL (y0 of the
        next line minus y0 of this line; NaN for the last line), n_spans,
        font_list (font of every span), common_font and mode_font (results of
        ``get_common_font`` / ``get_mode_font`` on the span fonts), n_words,
        w (x1 - x0), h (y1 - y0), text.
        An empty ``lines`` gives an empty frame with the same columns.
    """
    coords = [line['bbox'] for line in lines]
    x0 = [coord[0] for coord in coords]
    y0 = [coord[1] for coord in coords]
    x1 = [coord[2] for coord in coords]
    y1 = [coord[3] for coord in coords]

    # Gap between consecutive line tops. The last line has no successor, so it
    # gets NaN; with no lines at all the column stays empty so that every
    # column has the same length.
    dL = [below - above for above, below in zip(y0, y0[1:])]
    if y0:
        dL.append(np.nan)

    # Build the per-line font list once and reuse it for every font feature.
    font_list   = [[span["font"] for span in line["spans"]] for line in lines]
    n_spans     = [len(line["spans"]) for line in lines]
    # The helpers receive copies so they cannot alter the stored font_list.
    common_font = [get_common_font(list(fonts)) for fonts in font_list]
    mode_font   = [get_mode_font(list(fonts)) for fonts in font_list]

    w       = [right - left for left, right in zip(x0, x1)]
    h       = [bottom - top for top, bottom in zip(y0, y1)]
    text    = [get_line_text(line) for line in lines]
    n_words = [len(get_line_words(line)) for line in lines]

    # Column order is part of the contract: later cells derive their feature
    # order from it.
    data_dict = {
        "x0": x0, "y0": y0, "x1": x1, "y1": y1, "dL": dL,
        "n_spans": n_spans, "font_list": font_list,
        "common_font": common_font, "mode_font": mode_font,
        "n_words": n_words, "w": w, "h": h, "text": text,
    }
    return pd.DataFrame(data_dict)

# Show floats with two decimals in every DataFrame rendered from here on.
pd.options.display.float_format = "{:.2f}".format

# One row of features per non-empty line of the selected block.
df = get_line_df(lines)
df.head(10)

# Preprocessing dataframe

In [130]:
# Feature frame: the line table without the columns listed below.
X = df.drop(columns=["font_list","text","n_spans","dL","n_words"])

num_vars = list( X.select_dtypes(include=np.number).columns )
cat_vars = list( X.select_dtypes(include='object').columns  )

# Manual preprocessing of X, kept as a DataFrame for the cells that index it by name.
X[num_vars] = StandardScaler().fit_transform(X[num_vars])

ohe = OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="error" )
cat_encoded = ohe.fit_transform(X[cat_vars])
# The encoding can only be stored back under the original column names when
# every categorical column was encoded as exactly one output column.
if cat_encoded.shape[1] != len(cat_vars):
    raise ValueError(
        f"One-hot encoding produced {cat_encoded.shape[1]} columns for "
        f"{len(cat_vars)} categorical variables {cat_vars}; they cannot be "
        "stored back under the original column names."
    )
X[cat_vars] = cat_encoded

# The same preprocessing expressed as a reusable transformer that works on `df` directly.
basic_preproc = make_column_transformer(
    (StandardScaler(), num_vars),
    (OneHotEncoder(drop="if_binary",sparse_output=False, handle_unknown="error"), cat_vars),
    remainder="drop"
    )
# Fixed seed so the cluster labels are the same on every run.
basic_kmeans = make_pipeline(basic_preproc, KMeans(n_clusters=2,  n_init=400, random_state=42))


display(X.head(2))
# Label the transformer output with the feature names it reports itself, so the
# preview still works when a categorical column expands into several columns.
preproc_preview = basic_preproc.fit_transform(df)
display(pd.DataFrame(preproc_preview, columns=basic_preproc.get_feature_names_out()).head(2))

# Clustering

## Default K-means

In [92]:
# Copy of the line table with the label assigned by the preprocessing + K-means pipeline.
display_df = df.assign(cluster=basic_kmeans.fit_predict(df))
display_df.head(7)

In [88]:
# Start the two centroids at the first and the last row of the preprocessed frame.
top_init, bottom_init = X.iloc[0], X.iloc[-1]
init_centroids = [top_init, bottom_init]

kmeans = KMeans(
    n_clusters=2,
    init=init_centroids,
    n_init="auto",
    random_state=42,
)
y_pred = kmeans.fit_predict(X)
y_pred

## Weighted K-means

In [96]:
# Scaling a column by sqrt(k) multiplies its squared differences by k.
y_weight    = math.sqrt(2)
font_weight = math.sqrt(4)

# Copy of X with the vertical-position and font columns rescaled; the other
# columns are carried over unchanged.
X_weighted = X.assign(
    **{col: X[col] * y_weight for col in ("y0", "y1")},
    **{col: X[col] * font_weight for col in ("common_font", "mode_font")},
)
X_weighted.head()

In [103]:
# Start the centroids at the first and last rows of the *weighted* data, so
# they live in the same rescaled space as the points being clustered.
init_centroids = X_weighted.iloc[[0, -1]].to_numpy()
kmeans = KMeans(n_clusters=2, random_state=42, init=init_centroids, n_init="auto")
cluster_pred = kmeans.fit_predict(X_weighted)
# Attach the labels by position and preview the first rows.
X_weighted.assign(Cluster=cluster_pred).head(7)

# Full custom K-means

## Preprocess X

In [181]:
# NOTE: from here on `X` is the output of the column transformer, no longer
# the DataFrame built in the preprocessing section; the cells below index it
# by position (rows first, then feature positions).
X = basic_preproc.fit_transform(df)
X.shape

## Initialise clusters

In [136]:
# Initial centroids: the first and the last row of X.
clust0, clust1 = X[0], X[-1]

## Calculate cluster distances

In [180]:
# Squared Euclidean distance from the first row of X to each initial centroid.
# The difference vector is dotted with itself; squaring it element-wise first
# would sum fourth powers instead of squares.
diff0 = clust0 - X[0]
diff1 = clust1 - X[0]
dist0 = diff0 @ diff0
dist1 = diff1 @ diff1
print(dist0)
print(dist1)

In [189]:

# Preview the transformer output, labelled with the feature names the fitted
# transformer reports, so the column count always matches the data.
preproc_preview = basic_preproc.fit_transform(df)
display(pd.DataFrame(preproc_preview, columns=basic_preproc.get_feature_names_out()).head(2))

In [196]:
# Per-feature squared differences between the last row of X and the first
# centroid. The row is taken from X directly rather than from a loop variable
# left behind by another cell, so this cell also runs on a fresh kernel.
(X[-1]-clust0)**2

In [195]:
# Column header; the widths match the row format used below.
print(f"{'l0':<5} {'l1':<5} {'clust':<5} {'d0_w':8} {'d1_w':8} {'d0_y0':8} {'d1_y0':8} {'d0_y1':8} {'d1_y1':8} {'d0_font':8} {'d1_font':8}")
for i, x in enumerate(X):
    # Per-feature squared differences and Euclidean distance to each centroid.
    v0, v1 = (x - clust0)**2, (x - clust1)**2
    l0, l1 = np.linalg.norm(x - clust0), np.linalg.norm(x - clust1)
    # Feature positions 4, 1, 3 and 6 are the ones reported as w, y0, y1 and font.
    d0_w, d0_y0, d0_y1, d0_font = v0[4], v0[1], v0[3], v0[6]
    d1_w, d1_y0, d1_y1, d1_font = v1[4], v1[1], v1[3], v1[6]
    # Cluster 0 unless the row is not at least as close to centroid 0.
    cluster = int(not (l0 <= l1))
    # Only the row at index 3 is printed.
    if i != 3:
        continue
    print(f"{l0:<5.2f} {l1:<5.2f} {cluster:<5} {d0_w:<8.2f} {d1_w:<8.2f} {d0_y0:<8.2f} {d1_y0:<8.2f} {d0_y1:<8.2f} {d1_y1:<8.2f} {d0_font:<8.2f} {d1_font:<8.2f}")
