In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import fitz
from fitz import Rect
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

from pdf_scraper.doc_utils     import open_exam, get_doc_line_df, identify_section_headers, identify_text_headers, get_path_from_doc
from pdf_scraper.doc_utils     import identify_footers, identify_instructions, identify_subtitles, identify_subsubtitles
from pdf_scraper.line_utils    import clean_line_df, get_category_boxes, get_df_bbox
from pdf_scraper.doc_utils     import get_images, filter_images, assign_in_image_captions, identify_vertical_captions
from pdf_scraper.doc_utils     import enrich_doc_df_with_images
from pdf_scraper.clustering.cluster_utils import get_vert_neigh_dist, split_cluster, hdbscan, find_y0_dL
from pdf_scraper.general_utils import df_bbox_dist, df_bbox_next_row_dist
from pdf_scraper.image_utils   import get_bboxed_page_image

# Notebook-wide display settings: floats are rendered with two decimals and
# string columns may show up to 200 characters before pandas truncates them.
pd.options.display.float_format = "{:.2f}".format
pd.options.display.max_colwidth = 200

In [None]:
year = 2001
page = 2


def check_year_page(year, page):
    """Build the line-level DataFrame for a single page of an exam document.

    The document is opened with ``open_exam(year, "english", "al", 1)`` and
    turned into a line DataFrame.  Images are extracted and filtered,
    in-image captions are assigned, the frame is cleaned, and vertical
    captions are identified for images on pages 2 to 8 (inclusive).  The
    frame is then restricted to ``page`` and enriched with that page's images
    before the vertical-neighbour distances are computed.

    Parameters
    ----------
    year
        Forwarded to ``open_exam`` to select the document.
    page
        Page number used to filter both the line rows and the images.

    Returns
    -------
    pandas.DataFrame
        The output of ``enrich_doc_df_with_images`` for the requested page,
        with two extra columns:

        * ``dL_e2e`` - ``get_vert_neigh_dist`` evaluated on ``["y0", "y1"]``
        * ``dL_y0``  - ``get_vert_neigh_dist`` evaluated on ``["y0"]``
    """
    doc = open_exam(year, "english", "al", 1)
    df = get_doc_line_df(doc)

    images = get_images(doc)
    images = filter_images(images)
    # Return value is not used; presumably annotates df in place - TODO confirm.
    assign_in_image_captions(df, images)

    df = clean_line_df(df)
    for image in images:
        # Only images on pages 2..8 are checked for vertical captions.
        if image["page"] < 2 or image["page"] > 8:
            continue
        identify_vertical_captions(df, image)
    page_images = [image for image in images if image["page"] == page]

    page_df = df.loc[df.page == page, ["text", "x0", "y0", "x1", "y1", "page", "w"]].copy()
    page_df = enrich_doc_df_with_images(page_df, page_images)

    page_df["dL_e2e"] = page_df.apply(lambda row: get_vert_neigh_dist(row, page_df, ["y0", "y1"]), axis=1)
    page_df["dL_y0"] = page_df.apply(lambda row: get_vert_neigh_dist(row, page_df, ["y0"]), axis=1)

    # Hand the result back to the caller; without this the function only
    # produced a local frame that was discarded on exit.
    return page_df
    

In [None]:
# Top-level run of the same steps as check_year_page, so that every
# intermediate object (doc, images, page_images, page_df) stays available
# to the cells below.
year = 2001
page = 2

# Line-level frame for the whole document; it is narrowed to `page` further down.
doc = open_exam(year, "english", "al", 1)
page_df = get_doc_line_df(doc)

images = filter_images(get_images(doc))
assign_in_image_captions(page_df, images)

page_df = clean_line_df(page_df)
for image in images:
    # Vertical captions are only looked for on images from pages 2..8.
    if not (image["page"] < 2 or image["page"] > 8):
        identify_vertical_captions(page_df, image)
page_images = [img for img in images if img["page"] == page]

# Keep only the requested page and the columns used downstream.
page_df = page_df.loc[
    page_df["page"] == page,
    ["text", "x0", "y0", "x1", "y1", "page", "w"],
].copy()
page_df = enrich_doc_df_with_images(page_df, page_images)

# Vertical neighbour distances, once on (y0, y1) and once on y0 alone.
page_df["dL_e2e"] = page_df.apply(get_vert_neigh_dist, axis=1, args=(page_df, ["y0", "y1"]))
page_df["dL_y0"] = page_df.apply(get_vert_neigh_dist, axis=1, args=(page_df, ["y0"]))


In [None]:
# Preview the first 20 rows of page_df, including the dL_e2e / dL_y0 columns added above.
page_df.head(20)

In [None]:
def nn_line_distance(df, row):
    """Return the smallest ``df_bbox_dist`` from ``row`` to a line below it.

    Candidate rows are those of ``df`` that

    * are on the same page as ``row``,
    * fall on the same side of the page's horizontal midpoint (judged by ``x0``),
    * have a strictly larger ``y0`` than ``row``, and
    * do not have ``category == "image"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``page``, ``x0``, ``x1``, ``y0``, ``y1`` and
        ``category``.
    row : pandas.Series
        A row of ``df`` (or one with the same fields).

    Returns
    -------
    float
        Minimum distance over the candidates, computed on ``["y0", "y1"]``
        with ``df_bbox_dist``; ``np.nan`` when there is no candidate.
    """
    same_page = df.page == row.page
    page_rows = df[same_page]
    # The midpoint is derived from the row's own page only.
    middle = (page_rows.x0.min() + page_rows.x1.max()) / 2

    same_side = (df.x0 < middle) == (row.x0 < middle)
    below = df.y0 > row.y0
    not_image = df.category != "image"

    # same_page has to be part of the mask (as in second_nn_line_distance);
    # otherwise rows from other pages of df would be compared against row.
    other_rows = df.loc[same_page & same_side & below & not_image]

    if len(other_rows) == 0:
        return np.nan

    cols = ["y0", "y1"]
    distances = pairwise_distances(
        row[cols].values.reshape(1, -1),
        Y=other_rows[cols].values,
        metric=df_bbox_dist,
    )
    return distances.min()

def second_nn_line_distance(df, row):
    """Return the second-smallest ``df_bbox_dist`` from ``row`` to lines below it.

    Candidates are rows of ``df`` on the same page as ``row``, on the same
    side of that page's horizontal midpoint (judged by ``x0``), with a
    strictly larger ``y0`` and a ``category`` other than ``"image"``.
    Distances are computed on the ``["y0", "y1"]`` columns.

    Returns ``np.nan`` when fewer than two candidates exist.
    """
    page_mask = row.page == df.page
    on_page = df[page_mask]
    midpoint = (on_page.x0.min() + on_page.x1.max()) / 2

    row_is_left = row.x0 < midpoint
    candidates = df.loc[
        (row_is_left == (df.x0 < midpoint))
        & (df.y0 > row.y0)
        & (df.category != "image")
        & page_mask
    ]

    # A second neighbour needs at least two candidate rows.
    if len(candidates) < 2:
        return np.nan

    cols = ["y0", "y1"]
    dists = pairwise_distances(
        row[cols].values.reshape(1, -1),
        Y=candidates[cols].values,
        metric=df_bbox_dist,
    )

    # dists has a single row; index 1 of its ascending sort is the runner-up.
    return np.sort(dists[0])[1]

In [None]:
# Attach the nearest and second-nearest line distances to every row of
# page_df, then show the first 60 rows.
page_df["nn_dist"] = page_df.apply(
    lambda line: nn_line_distance(page_df, line),
    axis="columns",
)
page_df["2nn_dist"] = page_df.apply(
    lambda line: second_nn_line_distance(page_df, line),
    axis="columns",
)
page_df.head(60)

In [None]:
def all_line_distances(page_df, row):
    """Return the distances from ``row`` to every candidate line below it.

    Candidates are rows of ``page_df`` on the same side of the frame's
    horizontal midpoint as ``row`` (judged by ``x0``), with a strictly larger
    ``y0`` and a ``category`` other than ``"image"``.  No page filter is
    applied, so ``page_df`` is expected to hold a single page.

    Returns the ``pairwise_distances`` result for the single query row
    against the candidates, or the scalar ``np.nan`` when there are none.
    """
    cols = ["y0", "y1"]
    midpoint = (page_df.x0.min() + page_df.x1.max()) / 2

    on_same_side = (row.x0 < midpoint) == (page_df.x0 < midpoint)
    is_below = page_df.y0 > row.y0
    not_an_image = page_df.category != "image"
    candidates = page_df.loc[on_same_side & is_below & not_an_image]

    if len(candidates) == 0:
        return np.nan

    # With the two-column list above this always picks df_bbox_dist; the
    # euclidean branch would only apply to a single-column list.
    if len(cols) == 1:
        metric = "euclidean"
    else:
        metric = df_bbox_dist

    return pairwise_distances(
        row[cols].values.reshape(1, -1),
        Y=candidates[cols].values,
        metric=metric,
    )

In [None]:
# Second-smallest distance from the row labelled 57 to its candidate lines.
# NOTE(review): indexing with [0] fails if all_line_distances returns np.nan (no candidates).
np.sort(all_line_distances(page_df, page_df.loc[57])[0])[1]

In [None]:
# All distances from the row labelled 0 to its candidate lines, in ascending order.
# NOTE(review): indexing with [0] fails if all_line_distances returns np.nan (no candidates).
np.sort(all_line_distances(page_df, page_df.loc[0])[0])

In [None]:
# Step-by-step replay of second_nn_line_distance for the row labelled 0, kept
# at top level so each intermediate (mask, other_rows, distances) can be inspected.
row = page_df.loc[0]
same_page  = (row.page == page_df.page)
middle     = (page_df[same_page].x0.min() + page_df[same_page].x1.max())/2
same_side  = (row.x0 < middle ) == (page_df.x0 < middle)
below      = (page_df.y0 > row.y0)
not_image  = (page_df.category != "image")

mask       = same_side & below & not_image & same_page
other_rows = page_df.loc[mask]

# NOTE: this global shadows the builtin dir() for the rest of the session.
dir = ["y0","y1"]

if len(other_rows)<=1:
    # The function would return here; mirror that instead of falling through
    # to the distance computation, which cannot produce a second neighbour
    # from zero or one candidate.
    print("return np.nan")
    nn_2 = np.nan
else:
    distances = pairwise_distances(row[dir].values.reshape(1,-1) , Y=other_rows[dir].values,  metric=df_bbox_dist)
    nn_2 = np.sort(distances[0])[1]

In [None]:
# Evaluate second_nn_line_distance for every row of page_df; the resulting
# Series is only displayed as the cell output, not stored.
page_df.apply(lambda row: second_nn_line_distance(page_df, row), axis=1)