In [None]:
import math
from utils import *
import pandas as pd
import numpy  as np
from numpy.linalg import norm
import fitz
from fitz import Rect
from line_utils import *
import re

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose  import ColumnTransformer, make_column_selector, make_column_transformer

In [None]:
# Open the sample PDF and drill down to one text block on one page.
# Nothing in this cell closes `doc`; it stays open for the cells below.
pdf_file = "test_pdfs/LC002ALP100EV_2024.pdf"
doc = fitz.open(pdf_file)
page = doc[3]  # page index 3 (PyMuPDF pages are 0-based)
page_dict = page.get_text("dict", sort=True)
blocks = page_dict["blocks"]
block = blocks[6]  # the block examined throughout the rest of the notebook
lines = block["lines"]

In [None]:
# Print the text of the selected block. get_block_text is not defined in this
# notebook — presumably it comes from one of the star-imports (utils / line_utils).
print(get_block_text(block))

In [None]:
# Quick inspection: which keys a line dict carries, and how many lines the
# block has before any filtering.
print(lines[0].keys()) 
print(len(lines))
def line_is_empty(line):
    """Return True when none of the line's spans contains visible text.

    Parameters
    ----------
    line : dict
        A line dict with a "spans" list; each span is a dict with a "text" string.

    Returns
    -------
    bool
        True if every span's text is empty or whitespace-only (also True when
        the line has no spans at all), False otherwise.
    """
    # str.isspace() is False for "", so an empty-text span would wrongly make
    # the line count as non-empty; strip() treats "" and "   " the same way.
    return all(not span["text"].strip() for span in line["spans"])
# Keep only the lines that line_is_empty does not flag, then report the new count.
lines = list(filter(lambda ln: not line_is_empty(ln), lines))
print(len(lines))

In [None]:
# Tabular dump of the remaining lines. print_line_table is not defined in this
# notebook — presumably provided by the utils / line_utils star-imports.
print_line_table(lines)

In [None]:
# Render floats with two decimals in every DataFrame displayed from here on.
pd.options.display.float_format = "{:.2f}".format

# Per-line feature table built by get_line_df (defined outside this notebook).
df = get_line_df(lines)
df.head(n=10)

# Preprocessing dataframe

In [None]:
# Feature frame: every column of df except the ones named in the drop call.
X = df.drop(columns=["font_list", "text", "n_spans", "dL", "n_words"])

# Split the remaining columns by dtype so each group gets its own transformer.
num_vars = X.select_dtypes(include=np.number).columns.tolist()
cat_vars = X.select_dtypes(include="object").columns.tolist()

# --- Manual preprocessing, written back into X ---------------------------
# Standardise the numeric columns.
X[num_vars] = StandardScaler().fit_transform(X[num_vars])

# One-hot encode the object columns. The encoded array is assigned back onto
# the same column labels, which only fits when the encoder emits exactly one
# output column per input column.
ohe = OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="error")
X[cat_vars] = ohe.fit_transform(X[cat_vars])

# --- The same steps as a reusable transformer, plus a K-means pipeline ----
basic_preproc = make_column_transformer(
    (StandardScaler(), num_vars),
    (OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="error"), cat_vars),
    remainder="drop",
)
basic_kmeans = make_pipeline(basic_preproc, KMeans(n_clusters=2, n_init=400))

# Show both results so the manual and the pipeline output can be compared by eye.
display(X.head(2))
display(
    pd.DataFrame(basic_preproc.fit_transform(df), columns=num_vars + cat_vars).head(2)
)

# Clustering

## Default K-means

In [None]:
# Copy of df with the pipeline's cluster label attached as an extra column.
# The KMeans step in basic_kmeans has no random_state, so labels may differ between runs.
display_df = df.assign(cluster=basic_kmeans.fit_predict(df))
display_df.head(n=7)

In [None]:
# Seed the two centroids with the first and the last row of the feature frame.
top_init = X.iloc[0]
bottom_init = X.iloc[-1]
init_centroids = [top_init, bottom_init]

kmeans = KMeans(
    n_clusters=2,
    init=init_centroids,
    n_init="auto",
    random_state=42,
)
y_pred = kmeans.fit_predict(X)
y_pred

## Weighted K-means

In [None]:
# Re-scale selected features so they count for more in the Euclidean distance.
# A factor w on a column multiplies its squared-distance contribution by w**2,
# hence the square roots: y0/y1 count double, the font columns count fourfold.
y_weight = math.sqrt(2)
font_weight = math.sqrt(4)

X_weighted = X.copy()
X_weighted[["y0", "y1"]] = X[["y0", "y1"]].mul(y_weight)
X_weighted[["common_font", "mode_font"]] = X[["common_font", "mode_font"]].mul(font_weight)
X_weighted.head()

In [None]:
# Seed the centroids with the first and last rows of the *weighted* frame so
# the starting centres live in the same re-scaled space as the data being fit
# (taking them from the unweighted X would start from points that are not in
# the fitted data set).
init_centroids = [X_weighted.iloc[0], X_weighted.iloc[-1]]
kmeans = KMeans(n_clusters=2, random_state=42, init=init_centroids, n_init="auto")
cluster_pred = kmeans.fit_predict(X_weighted)

# assign() attaches the labels by position, so the result does not depend on
# X_weighted carrying a default 0..n-1 index (pd.concat with a fresh Series
# aligns on index labels instead).
X_weighted.assign(Cluster=cluster_pred).head(7)

# Full custom K-means

## Preprocess X

In [None]:
# Rebind X to the output of the column transformer fitted on df. X was a
# DataFrame in the earlier cells; the cells below index it positionally
# (X[0], X[:, np.newaxis, :]), i.e. they treat it as a 2-D array.
X = basic_preproc.fit_transform(df)
X.shape

In [None]:
# Labelled view of the preprocessed matrix: numeric columns first, then the
# encoded categorical ones (the order in which the transformer lists them).
X_df = pd.DataFrame(
    data=basic_preproc.fit_transform(df),
    columns=[*num_vars, *cat_vars],
)
X_df.head(n=6)

## Initialise clusters

In [None]:
# Initial centroids: copies of the first and the last row of X.
clusts = X[[0, -1]]
clust0 = clusts[0]
clust1 = clusts[1]
clusts.shape

## Calculate cluster distances

In [None]:
# Distance from the first row of X to each centroid, computed two ways.
# 1) By hand: squared distance as an inner product, square root taken when printing.
dist0 = (clust0 - X[0]) @ (clust0 - X[0])
dist1 = (clust1 - X[0]) @ (clust1 - X[0])
print(np.sqrt(dist0), np.sqrt(dist1))

# 2) With numpy's norm; the two printed lines should agree.
dists = [norm(clust - X[0]) for clust in clusts]
print(*dists[:2])

In [None]:
# Distances from every row of X to each initial centroid, one centroid at a time.
print(X.shape)
print(clusts.shape)
dist0 = norm(X - clusts[0], axis=1)
dist1 = norm(X - clusts[1], axis=1)

# One column per centroid -> shape (n_samples, 2).
dists = np.column_stack((dist0, dist1))
print(dists.shape)

### Fully vectorised

In [None]:
# Broadcast rows against centroids: (n_samples, 1, n_features) - (1, n_clusters, n_features).
diff = X[:, None, :] - clusts[None, :, :]  # (n_samples, n_clusters, n_features)
# Collapse the feature axis to get one distance per (row, centroid) pair.
dists = norm(diff, axis=-1)  # (n_samples, n_clusters)

### Examine distance components for edge point

In [None]:
# Break the distance of one row of interest into per-feature contributions.
edge_idx = 3  # the row being inspected (previously a hard-coded `i == 3` filter)

print(f"{'i':<5} {'clust':<5} {'l':<8} {'dy0':8} {'dx1':8} {'dw':8} {'dfont':8}")
if edge_idx < len(X):  # with fewer rows than that, only the header is printed
    x = X[edge_idx]
    for j, clust in enumerate(clusts):
        l = norm(x - clust)      # full Euclidean distance to centroid j
        dr = (x - clust)**2      # per-feature squared differences
        # NOTE(review): these positions assume a fixed feature order in X
        # (num_vars followed by cat_vars) — confirm they still match the columns.
        dw    = dr[4]
        dy0   = dr[3]
        dx1   = dr[2]
        dfont = dr[6]
        print(f"{edge_idx:<5} {j:<5} {l:<8.2f} {dy0:<8.2f} {dx1:<8.2f} {dw:<8.2f} {dfont:<8.2f}")

## Label data points

In [None]:
# A point belongs to cluster 0 when it is strictly closer to centroid 0;
# ties therefore go to cluster 1.
y_bool = dists[:, 0] < dists[:, 1]
y = y_bool.astype(np.int64)

print("Cluster 0\nShape:", X[y_bool].shape)
print("Cluster 1\nShape:", X[~y_bool].shape)

# Attach the integer labels to the named-column view of the features.
X_df_labelled = pd.concat([X_df, pd.Series(y, name="cluster")], axis="columns")
X_df_labelled.head(5)


### Fully vectorised

In [None]:
# Index of the nearest centroid for every row -> shape (n_samples,).
labels = dists.argmin(axis=1)
k = len(clusts)  # number of clusters

# New centroid for each label = mean of the rows carrying that label.
new_clusts = np.array([X[labels == c].mean(axis=0) for c in range(k)])

## Recalculate cluster centres

In [None]:
# Move each centroid to the mean of the points currently assigned to it.
clust0 = X[y_bool].mean(axis=0)
clust1 = X[~y_bool].mean(axis=0)
new_clusts = np.vstack([clust0, clust1])

In [None]:
# Distances from every row of X to each updated centroid, one centroid at a time.
print(X.shape)
print(new_clusts.shape)
dist0 = norm(X - new_clusts[0], axis=1)
dist1 = norm(X - new_clusts[1], axis=1)

# One column per centroid -> shape (n_samples, 2).
dists = np.column_stack((dist0, dist1))
dists.shape

In [None]:
# Vectorised equivalent of the previous cell: distances from every row of X to
# each *updated* centroid in one broadcasted operation. Using the old `clusts`
# here would overwrite `dists` with distances to the stale centres.
print(X[:, np.newaxis, :].shape)
print(new_clusts[np.newaxis, :, :].shape)
diff = X[:, np.newaxis, :] - new_clusts[np.newaxis, :, :]  # (n_samples, n_clusters, n_features)
dists = np.linalg.norm(diff, axis=2)  # (n_samples, n_clusters)

## Check cluster displacement

In [None]:
# How far each centroid moved in this update: one Euclidean length per cluster.
dclust = np.subtract(new_clusts, clusts)
print(dclust.shape)
clust_delta = np.linalg.norm(dclust, axis=1)

In [None]:
# Repeat the assignment / update steps until no centroid moves more than `tol`.
# (A bare `break` is only valid inside a loop, so the convergence check has to
# live in one.)
tol = 1e-4       # convergence threshold on each centroid's displacement
max_iter = 100   # safety cap so a non-converging run cannot loop forever

for n_iter in range(1, max_iter + 1):
    # Assignment step: label every point with its nearest current centroid.
    dists = norm(X[:, np.newaxis, :] - clusts[np.newaxis, :, :], axis=2)
    labels = np.argmin(dists, axis=1)

    # Update step: each centroid moves to the mean of its points. A cluster
    # that received no points keeps its previous centre instead of becoming NaN.
    new_clusts = np.vstack([
        X[labels == i].mean(axis=0) if np.any(labels == i) else clusts[i]
        for i in range(clusts.shape[0])
    ])

    # Displacement of every centroid in this iteration.
    clust_delta = norm(new_clusts - clusts, axis=1)
    clusts = new_clusts
    if np.all(clust_delta < tol):
        break

print(f"stopped after {n_iter} iteration(s); largest centroid shift = {clust_delta.max():.2e}")

In [None]:
# Index of the last feature column of X.
# NOTE(review): the name suggests this is the "n_words" column, but "n_words"
# is dropped before num_vars / cat_vars are derived, so the last column of the
# transformed X comes from the categorical encoder — confirm the intent.
print(X.shape)
i_nword = X.shape[1]-1

In [None]:
# All rows of X, with the columns from position i_nword onwards left out
# (i.e. every column except the last one when i_nword = X.shape[1] - 1).
full_vect = X[:,:i_nword]
full_vect.shape