# Convert OCR Output to target .JSON output

In [None]:
import json
import pandas as pd
import numpy as np
import os
os.chdir("../")
from utils import *
import re

In [None]:
import glob

# glob.glob() returns paths in arbitrary, filesystem-dependent order, so both
# listings are sorted; otherwise files_ocr[idx] and files_labels[idx] are not
# guaranteed to line up from one run or machine to the next.
# NOTE(review): index-based pairing still presumes both folders hold the same
# documents under matching names — confirm.
files_ocr = sorted(glob.glob('./FR_output/*.json'))
files_labels = sorted(glob.glob('./GT_check/*.json'))

# Position of the document to process in both sorted listings.
idx = 0
filename_ocr = files_ocr[idx]
filename_label = files_labels[idx]

# The glob results already start with "./", so they are opened as returned.
with open(filename_ocr, 'r', encoding="utf8") as f:
    data = json.load(f)

with open(filename_label, 'r', encoding="utf8") as f:
    data_label = json.load(f)

In [None]:
# Derive the image file name from the path of the OCR result file.
# Raw strings keep "\/" from being read as an (invalid) string escape.
regx_img_name_png = re.compile(r"(^(.*?)PNG)")
regx_img_name_jpeg = re.compile(r"(^(.*?)jpeg)")
regx_img = re.compile(r"([^\/]+$)")

# findall() yields (outer group, inner group) tuples; [0][0] is the path up to
# and including the first "PNG"/"jpeg". ".jpeg" takes precedence when both
# markers occur in the path.
if ".jpeg" in filename_ocr:
    img_path = regx_img_name_jpeg.findall(filename_ocr)[0][0]
elif ".PNG" in filename_ocr:
    img_path = regx_img_name_png.findall(filename_ocr)[0][0]
else:
    # Fail here with a clear message instead of a NameError on img_path below.
    raise ValueError(
        f"Cannot derive image name: {filename_ocr!r} contains neither '.PNG' nor '.jpeg'"
    )

# Keep only the part after the last "/".
img_name = regx_img.findall(img_path)[0]


In [None]:
from PIL import Image

# Only the dimensions are read, so the image file is closed as soon as they
# have been captured instead of being left open for the rest of the session.
with Image.open(f'./images/{img_name}') as im:
    width, height = im.size

print(width, height)

## Parse Document Text

### FR Output

In [None]:
base_data = data["analyzeResult"]["pages"]

# Parse original output into five parallel lists (one entry per text element).
page_list2, text_list2, bb_list2, confidence_list2, level2 = parse_text_ocr(base_data)

# Stack the lists as rows and transpose, so each text element becomes one row.
df2 = pd.DataFrame([page_list2, text_list2, bb_list2, confidence_list2, level2]).T
df2.columns = ["page", "text", "bbox", "confidence", "level"]
df = df2.copy()

# Every parsed element is expected to carry a confidence value.
assert df["confidence"].notna().all(), "OCR rows without a confidence value"

# convert bounding boxes
df["bbox_formatted"] = [convert_inches_pixel(list(bbox)) for bbox in df["bbox"]]

# Format text into the target format: one list of formatted rows per page,
# pages kept in order of first appearance.
text_by_page_formatted = []
for page in df["page"].unique():
    df_page = df[df.page == page]
    text_by_page_formatted.append(
        [format_json_sublevel(df_page, u) for u in range(len(df_page))]
    )

### Label Output

In [None]:
base_data = data_label["labels"]

# Split the raw label entries into four parallel lists.
page_list2, text_list2, bb_list2, level_cat = parse_text_labels(base_data)

# Stack the lists as rows, then transpose so every label becomes one row.
df2 = pd.DataFrame([page_list2, text_list2, bb_list2, level_cat]).T
df2.columns = ["page", "text", "bbox", "level"]
df_label = df2.copy()

# Convert each bounding box, passing the image width and height as the
# x and y conversion factors.
df_label["bbox_formatted"] = [
    convert_inches_pixel_normalized_vector(
        list(raw_bbox),
        pixel_conv_x=width,
        pixel_conv_y=height,
    )
    for raw_bbox in df_label["bbox"]
]

# Collect the formatted labels page by page, in order of first appearance.
text_by_page_formatted_label = []
for page_no in df_label["page"].unique():
    page_rows = df_label.loc[df_label["page"] == page_no]
    formatted_rows = [
        format_json_sublevel_label(page_rows, pos)
        for pos in range(len(page_rows))
    ]
    text_by_page_formatted_label.append(formatted_rows)

### Parse Tables

In [None]:
# Start from an empty result so that tables parsed for a previously processed
# document can never be carried over into this one.
tables_by_page_formatted = []

try:
    # "pageResults" is only present when the document contains tables.
    base_data_ = data["analyzeResult"]["pageResults"]
except KeyError:
    # No tables present in this document.
    print("no tables")
else:
    try:
        (page_index, text_list3, bb_list3, confidence_,
         columnIndex, rowIndex, tableIndex) = parse_tables(base_data_)

        # Stack the parallel lists as rows and transpose: one row per table cell.
        df_table = pd.DataFrame(
            [page_index, text_list3, bb_list3, confidence_, columnIndex, rowIndex, tableIndex]
        ).T
        df_table.columns = ["page", "text", "bbox", "confidence", "columnIndex", "rowIndex", "tableIndex"]
        df_table["bbox_formatted"] = [
            convert_inches_pixel(list(bbox)) for bbox in df_table["bbox"]
        ]

        # Iterate per page per table; pages kept in order of first appearance.
        tables_by_page_formatted = [
            iterate_table_per_page(df_table[df_table.page == page])
            for page in df_table.page.unique()
        ]
    except KeyError as err:
        # Best-effort: table parsing is skipped when an expected key is missing
        # (presumably pages without table data — TODO confirm), but the skip is
        # reported instead of being swallowed silently.
        print(f"table parsing skipped, missing key: {err}")


## OPTIONAL: Format .JSON

```
# Merge text and tables page by page. This is best-effort: whenever the tables
# are missing, cover a different number of pages, or fail to merge, the
# text-only output is used instead.
try:
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(text_by_page_formatted) != len(tables_by_page_formatted):
        raise ValueError("text and table outputs cover a different number of pages")
    d_all_pages = [
        merge_dicts(text_page, table_page)
        for text_page, table_page in zip(text_by_page_formatted, tables_by_page_formatted)
    ]
except Exception as err:
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(f"tables not merged ({err!r}); using text output only")
    d_all_pages = text_by_page_formatted
```
