In [None]:
import math
import pandas as pd
import numpy  as np
from numpy.linalg import norm
import fitz
from fitz import Rect
import re

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose  import ColumnTransformer, make_column_selector, make_column_transformer

from pdf_scraper.block_utils import get_block_text, print_block_table, clean_blocks
from pdf_scraper.doc_utils import open_exam
from pdf_scraper.line_utils import line_is_empty, print_line_table, get_line_df

Below we see that on page 4 of the 2024 exam, the 7th block groups a subtitle together with dual-column text.

In [None]:
# Page 4 of the 2024 exam (zero-based index 3), read as a sorted text dict.
doc       = open_exam(2024)
page      = doc[3]
page_dict = page.get_text("dict", sort=True)
blocks    = page_dict["blocks"]

# The 7th block (index 6) is the one examined in this notebook; keep only
# the lines that line_is_empty does not reject.
block = blocks[6]
lines = [ln for ln in block["lines"] if not line_is_empty(ln)]

In [None]:
# Show the text of the selected block as returned by get_block_text.
print(get_block_text(block))

In [None]:
# Tabulate the block's non-empty lines (layout is defined by print_line_table).
print_line_table(lines)

In [None]:
# Render floats with two decimals in every DataFrame displayed from here on.
pd.set_option("display.float_format", "{:.2f}".format)
# Feature frame built from the lines; presumably one row per line — schema is
# defined by get_line_df.
df = get_line_df(lines)
df.head(10)

# Preprocessing dataframe

In [None]:
# Columns excluded from the clustering features.
bad_nums = ["n_spans","dL","x0","n_words","x1","h"]
bad_cats = ["font_list","text", "font_sizes","category" ]

num_vars = [ col for col in df.select_dtypes(include=np.number).columns if col not in bad_nums ]
cat_vars = [ col for col in df.select_dtypes(include='object').columns  if col not in bad_cats ]
X_cols   = num_vars + cat_vars


# Single encoder definition, reused by the column transformer below.
ohe = OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="error" )

# Standardise numeric features, one-hot encode categorical ones, drop the rest.
basic_preproc = make_column_transformer(
    (StandardScaler(), num_vars),
    (ohe, cat_vars),
    remainder="drop"
    )
# Seeded so the clustering is reproducible under Restart & Run All.
basic_kmeans = make_pipeline(basic_preproc, KMeans(n_clusters=2,  n_init=400, random_state=42))

X    = basic_preproc.fit_transform(df)
# X_cols only labels the output correctly when every categorical column is
# binary, because drop="if_binary" then yields exactly one column per variable.
assert X.shape[1] == len(X_cols), (
    f"Expected {len(X_cols)} feature columns but the preprocessor produced {X.shape[1]}; "
    "a categorical column probably has more than two categories."
)
X_df = pd.DataFrame(X, columns=X_cols)

display(X_df.head(4))

# Clustering

## Default Kmeans

In [None]:
# Attach the pipeline's cluster label to a copy of the raw frame for inspection.
display_df = df.assign(cluster=basic_kmeans.fit_predict(df))
display_df.head(7)

We see that the default K-means fails with these features: the one-word line "Elaine." is assigned to the wrong block.

In [None]:
# Seed one centroid at each end of the data: the first and the last row of X.
top_init    = X[0]
bottom_init = X[-1]  
init_centroids = [ top_init, bottom_init ]

# KMeans starts from the two explicit centroids given via `init`.
kmeans = KMeans(n_clusters=2, random_state=42,init=init_centroids, n_init="auto")
y_pred = kmeans.fit_predict(X)
y_pred

## Weighted Kmeans

In [None]:
# Scaling a feature by s multiplies its contribution to the squared distance
# by s**2, so these factors weight the y and font features by 2 and 4.
y_weight, font_weight = math.sqrt(2), math.sqrt(4)

X_weighted = X_df.copy()
for cols, weight in (
    (["y0","y1"], y_weight),
    (["common_font","mode_font"], font_weight),
):
    X_weighted[cols] = X_df[cols] * weight
X_weighted.head()

### Manual centroid initialisation

In [None]:
# Take the initial centroids from the first and last rows of the *weighted*
# matrix so they live in the same feature space as the data being clustered.
init_centroids = X_weighted.iloc[[0, -1]].to_numpy()
kmeans = KMeans(n_clusters=2, random_state=42,init=init_centroids, n_init="auto",verbose=True)
cluster_pred = kmeans.fit_predict(X_weighted)
# Show the weighted features next to their assigned cluster.
pd.concat((X_weighted,pd.Series(cluster_pred,name="Cluster") ),axis=1).head(7)

### 1000 random centroids

In [None]:
# 1000 random initialisations; seeded so the selected run and its inertia are
# reproducible between kernel restarts.
kmeans = KMeans(n_clusters=2, n_init=1000, random_state=42)
cluster_pred = kmeans.fit_predict(X_weighted)
print(kmeans.inertia_)
pd.concat((X_weighted,pd.Series(cluster_pred,name="Cluster") ),axis=1).head(18)

# Full custom K-means

## Pre proc X

In [None]:
# Fit the preprocessor once and reuse the result for both the array and the
# labelled frame (the second fit produced the same matrix).
X    = basic_preproc.fit_transform(df)
X_df = pd.DataFrame(X, columns=X_cols)
X_df.head(6)

## Initialise clusters

In [None]:
# Initial centroids: the first and the last row of X, stacked into a 2-row array.
clusts = X[[0, X.shape[0]-1]]
clust0, clust1 = clusts
clusts.shape

## Calculate cluster distances

In [None]:
# Distance from row 0 to each centroid, computed two ways as a cross-check.
# First: squared distance as the dot product of the difference with itself.
dist0 = (clust0-X[0]).T@(clust0-X[0])
dist1 = (clust1-X[0]).T@(clust1-X[0])
print(np.sqrt(dist0),np.sqrt(dist1))
# Second: the same distances via numpy's norm; both printed lines should agree.
dists = [norm( clust - X[0]) for clust in clusts]
print(dists[0],dists[1])

In [None]:
# Row-wise distance of every point to each centroid (one 1-D array per centroid).
dist0 = norm(X-clusts[0],axis=1)
dist1 = norm(X-clusts[1],axis=1)
# Stack and transpose so that column j holds the distances to centroid j.
dists = np.vstack((dist0, dist1)).T
print(X.shape)
print(clusts.shape)
print(dists.shape)

### Fully vectorised

In [None]:
# Broadcast every row of X against every centroid in a single subtraction.
diff  = X[:, np.newaxis, :] - clusts[np.newaxis, :, :]  #  (rows, centroids, features)
dists = norm(diff, axis=2)  #  (rows, centroids)

### Examine distance components for edge point

In [None]:
# Break the squared distance of row 3 into per-feature contributions.
# NOTE(review): indices 0, 2 and 3 are assumed to be the y0, w and font
# columns of X — confirm against X_cols.
print(f"{'i':<5} {'clust':<5} {'l':<8} {'dy0':8} {'dw':8} {'dfont':8}")
for i, x in enumerate(X):
    for j, clust in enumerate(clusts):
        l = norm(x - clust)      # Euclidean distance from row i to centroid j
        dr = (x - clust)**2      # squared difference per feature
        dw    = dr[2]
        dy0   = dr[0]
        dfont = dr[3]
        # Only row 3 is printed; every other row is computed and discarded.
        if i == 3:  
            print(f"{i:<5} {j:<5} {l:<8.2f} {dy0:<8.2f} {dw:<8.2f} {dfont:<8.2f}")

In [None]:
# Distances of row 3 to each centroid, for comparison with the table above.
dists[3]

## Label data points

In [None]:
# True where the row is closer to centroid 1 than to centroid 0, so casting the
# mask to int gives the cluster label directly (False -> 0, True -> 1).
y_bool = dists[:,0]> dists[:,1]
y = np.array( y_bool ,dtype= np.int64 )

# ~y_bool selects the cluster-0 rows and y_bool the cluster-1 rows.
print("Cluster 0\nShape:",X[~y_bool].shape)
print("Cluster 1\nShape:",X[y_bool].shape)

X_df_labelled = pd.concat((X_df,pd.Series(y,name="cluster")), axis=1)
X_df_labelled.head(6)


### Fully vectorised

In [None]:
# Label each row with the index of its nearest centroid.
labels = np.argmin(dists, axis=1)  # one label per row of dists
k = clusts.shape[0]  # number of clusters (e.g. 2)

# New centroid i is the mean of the rows currently labelled i.
new_clusts = np.vstack([X[labels == i].mean(axis=0) for i in range(k)])

## Recalculate cluster centres

In [None]:
# y_bool is True for rows nearer centroid 1, so cluster 0 is the complement.
# Keeping this order makes row j of new_clusts correspond to row j of clusts.
clust0 = np.mean(X[~y_bool], axis=0)
clust1 = np.mean(X[y_bool], axis=0)
new_clusts = np.vstack( (clust0,clust1))

In [None]:
print(X.shape)
print(new_clusts.shape)
# Row-wise distance of every point to each updated centroid.
dist0 = np.linalg.norm(X-new_clusts[0],axis=1)
dist1 = np.linalg.norm(X-new_clusts[1],axis=1)

# Column j holds the distances to updated centroid j.
dists = np.vstack((dist0, dist1)).T
dists.shape

In [None]:
# Vectorised distances to the *updated* centroids: broadcast every row of X
# against every row of new_clusts.
print(X[:, np.newaxis, :].shape)
print(new_clusts[np.newaxis, :, :].shape)
diff = X[:, np.newaxis, :] - new_clusts[np.newaxis, :, :]  #  (rows, centroids, features)
dists = np.linalg.norm(diff, axis=2)  #  (rows, centroids)

## Check cluster displacement

In [None]:
# Per-centroid displacement between the updated and the initial centroids.
dclust = new_clusts - clusts
print(dclust.shape)
# Euclidean length of each centroid's move.
clust_delta = norm(dclust, axis=1)

In [None]:
print(X.shape)
# Index of the last column of X.
# NOTE(review): the name suggests this is meant to be the n_words column —
# confirm against X_cols.
i_nword = X.shape[1]-1

In [None]:
# Every column of X except the last one.
full_vect = X[:,:i_nword]
full_vect.shape

## One Iteration Custom Cluster

### Define dataframe and word mask

In [None]:
# Rebuild the line feature frame from the same `lines` used above.
df        = get_line_df(lines)

# Boolean mask of the rows whose number of words is below 4.
word_mask = df["n_words"].to_numpy() < 4

print("Raw lines dataframe:")
display(df.head(10))

## Preprocess data frame

In [None]:
# Columns that are not informative for text-block clustering are excluded.
bad_nums = ["n_spans","dL","x1","n_words","x0","h","y1"]
bad_cats = ["font_list","text", "mode_font", "font_sizes","category"]

numeric_cols = df.select_dtypes(include=np.number).columns
object_cols  = df.select_dtypes(include='object').columns
num_vars = [col for col in numeric_cols if col not in bad_nums]
cat_vars = [col for col in object_cols  if col not in bad_cats]
X_cols   = num_vars + cat_vars

# Standardise the numeric features and one-hot encode the categorical ones;
# every other column is dropped.
basic_preproc = make_column_transformer(
    (StandardScaler(), num_vars),
    (OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="error"), cat_vars),
    remainder="drop",
)
X    = basic_preproc.fit_transform(df)
X_df = pd.DataFrame(X, columns=X_cols)

print(f"Preprocessed dataframe of shape {X.shape}:")
print(X_df.head(8),"\n")

## Initialise clusters

In [None]:
# Initialise the centroids with the first and last data points, which are the
# top and bottom of the page.
k=2
m, n = X.shape
clusts  = X[[0, m-1]]
# Norm of each initial centroid vector.
d_clust = norm(clusts,axis=1)
# Sum of the squared centroid norms. NOTE(review): this is not the k-means
# inertia (sum of squared point-to-centroid distances) — confirm the intent.
inertia = d_clust.T@d_clust
# Position of the width feature "w" among the preprocessed columns.
i_w       = X_cols.index("w")
print(clusts.shape)
print(d_clust , inertia)

## Normal distance calc

In [None]:
# Rows outside word_mask use the full feature vector for the distance; the
# masked (few-word) rows are handled separately with one feature fewer.
full_vect  = X[~word_mask, :]
full_clust = clusts[:, :]

full_diff   = full_vect[:, np.newaxis, :] - full_clust[np.newaxis, :, :]  #  (m_full, 2, n)
full_dists  = norm(full_diff, axis=2)                                     #  (m_full, 2)

print(f"Full vector of shape {full_vect.shape}")
#print(pd.DataFrame(full_vect, columns= X_cols).head(8),"\n\n" )


## Distance for few-word lines

In [None]:
# For a line with few words the width is no longer a good clustering variable,
# so the "w" column is removed from both the points and the centroids.
small_vect  = np.delete(X[word_mask], i_w, axis=1)
small_clust = np.delete(clusts,       i_w, axis=1)

small_diff   = small_vect[:, np.newaxis, :] - small_clust[np.newaxis, :, :]  #  (m_small, 2, n -1)
small_dists  = norm(small_diff, axis=2)                                      #  (m_small, 2)

# Column names matching small_vect, i.e. X_cols without "w".
small_cols = [i for i in X_cols if i != "w" ]

print(f"Width-excluded vector of shape {small_vect.shape}")
print(pd.DataFrame(small_vect, columns = small_cols).head(2),"\n\n")

## Combine distances  - label points

In [None]:
# Merge the two distance tables back into row order and label each row with
# the index of its nearest centroid.
dists = np.empty((m, k))
dists[word_mask]  = small_dists
dists[~word_mask] = full_dists
labels = np.argmin(dists, axis=1)

# Adds a "cluster" column to X_df in place.
X_df["cluster"] = pd.Series(labels)
X_df.head(8)

## Calculate new clusters

In [None]:
tol = 0.01

# Each new centroid is the mean of the rows currently assigned to it.
new_clusts = np.vstack([X[labels == i].mean(axis=0) for i in range(k)])

# Size of each centroid's move relative to the centroid's own norm.
norm_change    = norm(clusts - new_clusts, axis=1)
norm_clust     = norm(clusts, axis=1)
relative_shift = norm_change / norm_clust

if (relative_shift < tol).all():
    print("clust has barely moved")

In [None]:
def get_variable_diffs(X, clusts, X_cols, word_mask, i_w):
    """
    Compute squared per-variable differences between each point and each cluster.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix, one row per point.
    clusts : ndarray of shape (k, n)
        Cluster centres in the same feature space as ``X``.
    X_cols : sequence of str
        Names of the ``n`` feature columns, used to label the output.
    word_mask : boolean ndarray of shape (m,)
        Rows for which the width variable is ignored.
    i_w : int
        Column index of the width variable in ``X``.

    Returns
    -------
    pandas.DataFrame
        Shape (m, k * n) with columns ``d{cluster}_{col}`` (d0_w, d1_w, d0_y0, ...).
        For rows selected by ``word_mask`` the width entries are 0.
    """
    n_clusters = clusts.shape[0]

    # Squared difference of every point to every cluster for every variable,
    # shape (m, k, n). Stored as float64 like the original np.empty buffer.
    sq_diffs = ((X[:, np.newaxis, :] - clusts[np.newaxis, :, :]) ** 2).astype(np.float64)

    # Deleting the width column and re-inserting a zero, as was done before,
    # is the same as zeroing it in place; this form also handles a negative i_w.
    sq_diffs[word_mask, :, i_w] = 0

    per_cluster = [
        pd.DataFrame(sq_diffs[:, c, :], columns=[f"d{c}_{col}" for col in X_cols])
        for c in range(n_clusters)
    ]
    return pd.concat(per_cluster, axis=1)

In [None]:

# Inline walk-through of the same steps as get_variable_diffs, ending with the
# squared differences of the few-word rows.
# NOTE(review): this duplicates the function body above — consider calling it.
m, n = X.shape
k = clusts.shape[0]

# Rows outside word_mask: squared difference on every variable.
full_vect = X[~word_mask]
full_diffs = (full_vect[:, np.newaxis, :] - clusts[np.newaxis, :, :]) ** 2

# Rows inside word_mask: squared difference with the width column removed.
small_vect  = np.delete(X[word_mask], i_w, axis=1)
small_clust = np.delete(clusts, i_w, axis=1)
small_diffs = (small_vect[:, np.newaxis, :] - small_clust[np.newaxis, :, :]) ** 2

all_diffs = np.empty((m, k, n))
all_diffs[~word_mask] = full_diffs
# Fill small_diffs into all_diffs for word_mask rows (with width excluded):
# columns before i_w keep their position, columns after it shift right by one,
# and the width slot itself is set to 0.
all_diffs[word_mask, :, :i_w]   = small_diffs[:, :, :i_w]
all_diffs[word_mask, :, i_w+1:] = small_diffs[:, :, i_w:]
all_diffs[word_mask, :, i_w]    = 0  
all_diffs[word_mask]

# Test custom Clustering

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from pdf_scraper.clustering.customCluster import reblock_lines
# Page 2 of the 2024 exam (zero-based index 1), read as a sorted text dict.
doc              = open_exam(2024)
page             = doc[1]
page_dict        = page.get_text("dict",sort=True)
blocks           = page_dict["blocks"]
# Blocks are passed through clean_blocks before being tabulated.
blocks           = clean_blocks(blocks)
print_block_table(blocks)

In [None]:
# Second block of the cleaned page, keeping only its non-empty lines.
block            = blocks[1]
lines            = [line for line in block["lines"] if not line_is_empty(line)]
print(get_block_text(block))

In [None]:
# Run the custom clustering once and check the stored result.
line_labels = reblock_lines(lines)
# From the block text printed above: the first line is its own block and the
# next two lines form the second block.
expected    = np.array([0,1,1])
# assert_array_equal also fails cleanly when the shapes differ.
np.testing.assert_array_equal(line_labels, expected)