### We want to create the dataset required to build an NLP based text calssification model to detect PD data in window-wise-search results

In [1]:
import numpy as np
import os
import glob
import pandas as pd
import datetime
import dateutil.parser
import dateparser
from dateutil.relativedelta import relativedelta
import re
import json
import math
import jaro
from collections import Counter
#import fitz
import cv2
import difflib

### Load JSON using filename

In [2]:
def load_json(file_number):
    with open("my_pdfs\\"+str(file_number)+'.json', 'r') as f:
        myjson = json.load(f)
    return myjson

### New way to view rotation correction in text documents

In [3]:
def polar_rotate_on_origin(point, page_angle):
    px, py = point
    px += 0.0000001 ## to ensure tan theta does not go to infinity
    theta = round(math.atan(py/px),2)
    del_theta = -(math.radians(page_angle))
    new_theta = theta+del_theta
    rx = math.sqrt(px**2 + py**2)
    # print("Distance from origin: " + str(rx))
    new_px = rx*math.cos(new_theta)
    new_py = rx*math.sin(new_theta)
    return (new_px,new_py)


def rotation_correction(bbox, page_angle, h, w):
    del_theta = -(page_angle)
    if -90<=del_theta<=0:
        phi = abs(math.radians(del_theta))
        del_x = 0
        del_y = w*math.sin(phi)
    elif 90<del_theta<=180:
        phi = math.radians(abs(del_theta)-90)
        del_x = h*math.sin(phi)
        del_y = (h*math.cos(phi)) + (w*math.sin(phi))
        
    elif 0<=del_theta<=90:
        phi = abs(math.radians(del_theta))
        del_x = h*math.sin(phi)
        del_y = 0
        
    else:
        phi = math.radians(180-abs(del_theta))
        del_x = w*math.cos(phi)
        del_y = (h*math.cos(phi)) + (w*math.sin(phi)) 
    rect_bb = np.reshape(bbox, (4,2))
    new_bb = []
    for corner in rect_bb:
        rotation_corrected = polar_rotate_on_origin(corner, page_angle)
        r_px = round(rotation_corrected[0],3)
        r_py = round(rotation_corrected[1],3)
        final_x , final_y = r_px + del_x , r_py + del_y
        new_bb.append(final_x)
        new_bb.append(final_y)
    return new_bb

### Serializing the json and computing page-wise-meta-data

In [4]:
def serialize_json(myjson):
    serialized_json = []
    try: 
        page_list = myjson["ocrStep"]["result"][0]["analyzeResult"]["readResults"]
        #print("First format found")
    except:
        page_list = myjson["OCRStep"]["Result"][0]["analyzeResult"]["readResults"]
    meta_data = []
    for page_index,page in enumerate(page_list):
        angle = page['angle']
        page_num = page_index+1
        height = page["height"]
        width = page["width"]
        meta_dict = {"page" : page_num,
                    "height" : height,
                    "width" : width,
                    "angle" : angle}
        meta_data.append(meta_dict)
        for line_index,line in enumerate(page["lines"]):
            line_num = line_index+1
            for word_in, word in enumerate(line["words"]):
                ##print("Word: " + str(word))
                bbox_ = word["boundingBox"]
                ## correcting for page rotation
                bbox_ = rotation_correction(bbox_, angle, height, width)
                ## Computing centroids
                rect_bbox = np.reshape(np.asarray(bbox_),(4,2))
                x_center = np.mean(rect_bbox,axis=0)[0]
                y_center = np.mean(rect_bbox,axis=0)[1]
                # #Rotate x_center and y_center
                # x_center, y_center = rotate((0,0), (x_center, y_center), math.radians(round(angle)))
                word_num = word_in+1
                word_dict = {"page": page_num, 
                             "line_num": line_num, 
                             "word_num" : word_num, 
                             "text" : word["text"],
                             "x_center" : x_center,
                             "y_center" : y_center,
                             "bbox": bbox_,
                            "confidence" : word["confidence"],
                            "status" : "Active"} ## Active, Inactive, Superactive
                serialized_json.append(word_dict)
    return serialized_json, meta_data

### Finding Window-Width by isolating only digits 1-9, because PD data only looks that way

In [5]:
def find_window_width(serialized_json):
    ## Computing a Window Width with all words in serialized JSON
    height_list = []
    for word in serialized_json:
        bbox = word["bbox"]
        if word["text"].isdigit():
            height = abs(bbox[1] - bbox[7])
            height_list.append(height)
    try:
        mean = sum(height_list)/len(height_list)
        return mean * 1.2, serialized_json ## making window slightly wider than the average font height
    except:
        return 0 , serialized_json

### window-search based on Serialized json and meta_data 

In [6]:
def window_search_using_serialized_JSON(serialized_json, meta_data): #meta data contains page height and widths
    window_width, serialized_json = find_window_width(serialized_json)
    if window_width == 0:
        return []
    else:
        total_window = 0
        for page in meta_data:
            h = page["height"]
            w = page["width"]
            page_angle = page["angle"]
            del_theta = math.radians(abs(page_angle))
            new_h = round(abs((h*math.cos(del_theta))) + abs((w*math.sin(del_theta))),2)
            new_w = round(abs((h*math.sin(del_theta))) + abs((w*math.cos(del_theta))),2)
            num_windows = math.ceil(new_h/window_width)
            page["window_count"] = num_windows
            total_window += num_windows
        windows = list(range(total_window))
        word_window_mapping = []
        for word in serialized_json:
            if word["status"] != "inActive":
                y_center = word["y_center"]
                page = word["page"]
                prev_page_windows = sum([x["window_count"] for x in meta_data if x["page"] < page]) - 1
                window_current_page = math.ceil(y_center/window_width)
                if prev_page_windows > 0:
                    window_number = prev_page_windows + window_current_page
                else:
                    window_number = window_current_page - 1
                temp_dict = {"window" : window_number,
                            "word" : word}
                word_window_mapping.append(temp_dict)
            else:
                continue
        window_wise_list = []
        for window in windows:
            word_list = [x["word"] for x in word_window_mapping if x["window"] == window]
            window_wise_list.append(word_list)
    return window_wise_list

### Downloading and setting up window ground truths

In [7]:
df = pd.read_excel("Window_ground_truth.xlsx")
doc_list = list(df.T.to_dict().values())
doc_list[5]

{'Document': 6,
 'PD Windows': '105_121_131_146',
 'Tooth Windows': 'None',
 'DCM': '104_147',
 'Max_Window': 175}

In [8]:
# concatenate text for each window of each document
# add lebels to each window
# DF to have only two columns "text" and "cat"

In [9]:
text_cat_map = []
for doc in doc_list:
    doc_num = doc['Document']
    if doc['PD Windows'] != 'None':
        pd_windows = doc['PD Windows'].split('_')
    else:
        pd_windows = []
    if doc['Tooth Windows'] != 'None':
        tooth_windows = doc['Tooth Windows'].split('_')
    else:
        tooth_windows = []
    if doc['DCM'] != 'None':
        dcm_windows = doc['DCM'].split('_')
    else:
        dcm_windows = []
    myjson = load_json(doc_num)
    serialized_json, meta_data = serialize_json(myjson)
    window_wise_list = window_search_using_serialized_JSON(serialized_json, meta_data)
    for i, window in enumerate(window_wise_list):
        label = None
        window_text = [word['text'] for word in window]
        window_text = " ".join(window_text)
        if str(i) in pd_windows:
            label = "PD"
        elif str(i) in tooth_windows:
            label = 'T'
        elif str(i) in dcm_windows:
            label = 'D'
        else:
            label = 'None'
        if window_text != '':
            text_cat_map.append({"text": window_text, "label":label})

In [10]:
len(text_cat_map)

4433

In [11]:
text_cat_df = pd.DataFrame(text_cat_map)

In [12]:
text_cat_df['label'].value_counts()

None    4107
PD       228
T         65
D         33
Name: label, dtype: int64

In [19]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp38-cp38-win_amd64.whl (3.5 MB)
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [68]:
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.0.0-cp38-cp38-win_amd64.whl (172.3 MB)
Collecting torchvision
  Downloading torchvision-0.15.1-cp38-cp38-win_amd64.whl (1.2 MB)
Collecting torchaudio
  Downloading torchaudio-2.0.1-cp38-cp38-win_amd64.whl (2.1 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.0.0 torchaudio-2.0.1 torchvision-0.15.1


In [13]:
from transformers import BertTokenizerFast, BertForSequenceClassification

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
model_name = "bert-base-uncased"
max_length = 512

In [16]:
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [17]:
labels=text_cat_df.label

In [18]:
window_text = text_cat_df.text

In [19]:
target_names = list(text_cat_df.label.unique())

In [20]:
(train_texts,valid_texts,train_labels,valid_labels)=train_test_split(window_text, labels, test_size=0.3)

In [21]:
#train_texts.to_list()

In [22]:
train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True, max_length=max_length)

In [23]:
model=BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

SSLError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/097417381d6c7230bd9e3557456d726de6e83245ec8b24f529f60198a67b203a?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1681386261&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL2JlcnQtYmFzZS11bmNhc2VkLzA5NzQxNzM4MWQ2YzcyMzBiZDllMzU1NzQ1NmQ3MjZkZTZlODMyNDVlYzhiMjRmNTI5ZjYwMTk4YTY3YjIwM2E~cmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIiwiQ29uZGl0aW9uIjp7IkRhdGVMZXNzVGhhbiI6eyJBV1M6RXBvY2hUaW1lIjoxNjgxMzg2MjYxfX19XX0_&Signature=TgwuOm2CCxMJOCO~u~8cdiJRF6l1FZYJ4KVJMeXoyKfuxIXLaOvlh109lWF~KEeNjYKu7BtRp~Ku-EBE70l34bXxTtpn47k-YXEioyFegZAczGT1KW1y1MTxOAoH-I36dWOwh9V7QFPkfMyGSxMMLFwA2aDICqos9gssyuPdYlnbdMu-YjT4VAsqA0ZSRd2N3kF1CTSKaWKRWtruZcqSDQg-2z7URVrndZixXm9POvpEJWHgx7FIEQzW9wyg66YGPNYSqSe11W2Cw2Q9PtsZJNgaX350wXJ11B3-nlwEZ12oDv1fLDjch-CNkv33DsZC1DUNodEVFnmPf4fXN5vezw__&Key-Pair-Id=KVTP0A1DKRTAX (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1125)')))

### Test Code

In [9]:
def print_windows(file_number):
    myjson = load_json(file_number)
    serialized_json, meta_data = serialize_json(myjson)
    window_wise_list = window_search_using_serialized_JSON(serialized_json, meta_data)
    print("total_windows: ", len(window_wise_list)-1)
    
    for i, window in enumerate(window_wise_list):
        print("--------------------------Window ", i, "-----------------------------")
        sentence = ""
        for word in window:
            sentence=sentence+" "+str(word["text"])
        print(sentence)

In [67]:
print_windows(54)

total_windows:  65
--------------------------Window  0 -----------------------------

--------------------------Window  1 -----------------------------

--------------------------Window  2 -----------------------------

--------------------------Window  3 -----------------------------

--------------------------Window  4 -----------------------------

--------------------------Window  5 -----------------------------

--------------------------Window  6 -----------------------------

--------------------------Window  7 -----------------------------

--------------------------Window  8 -----------------------------

--------------------------Window  9 -----------------------------

--------------------------Window  10 -----------------------------

--------------------------Window  11 -----------------------------

--------------------------Window  12 -----------------------------

--------------------------Window  13 -----------------------------

--------------------------Window  14 --