## Post processing

This notebook performs data refinement on JSON files (API result JSON files) derived from PDFs. It extracts the essential fields found under the 'content' key, using the bounding-box coordinates obtained from the PDFs. It then organizes this data into a structured format and stores it in an output JSON file, where each field is paired with its corresponding key.

#### Importing the necessary libraries

In [None]:
import json
import pandas as pd
import os
import re
import time

#### Custom Error class
This class defines `ProcessingError`, which is raised if the logic fails to process a document.


In [None]:
class ProcessingError(Exception):
    """
    Signal that a document could not be post-processed.

    Attributes:
        message (str): Human-readable description of what went wrong.
    """

    def __init__(self, message="Error during document processing"):
        """
        Build the exception with an explanatory message.

        Args:
            message (str, optional): Description of the failure. Defaults to
                "Error during document processing".
        """
        # Let the base class record the message in ``args`` first, then keep
        # a named copy for callers that read ``.message`` directly.
        super().__init__(message)
        self.message = message

### Post Processing Layer
This layer processes the JSON files, extracts all the necessary fields from the input JSON, and generates an output JSON file containing all the fields extracted by `post_processing_layer`.

In [None]:
# Year labels recognised as table column headers when the caller gives none.
_DEFAULT_YEARS = tuple(str(year) for year in range(2008, 2024))


def _load_lines_frame(json_path):
    """Load one API result JSON file into a DataFrame with one row per text line."""
    with open(json_path, encoding="utf-8") as f:
        res_json = json.load(f)

    result_dict = {
        "page_num": [],
        "content": [],
        "top_left_x": [],
        "top_left_y": [],
        "top_right_x": [],
        "bottom_left_y": [],
    }

    for pg_data in res_json["pages"]:
        for line in pg_data["lines"]:
            # polygon[0], [1] and [3] are treated as the top-left, top-right
            # and bottom-left corners of the line's bounding box.
            polygon = line["polygon"]
            result_dict["page_num"].append(pg_data["page_number"])
            result_dict["content"].append(line["content"])
            result_dict["top_left_x"].append(polygon[0]["x"])
            result_dict["top_left_y"].append(polygon[0]["y"])
            result_dict["top_right_x"].append(polygon[1]["x"])
            result_dict["bottom_left_y"].append(polygon[3]["y"])

    df = pd.DataFrame(result_dict)

    # Centre of each line, used to cluster lines into rows and columns.
    df["mid_x"] = (df["top_left_x"] + df["top_right_x"]) / 2
    df["mid_y"] = (df["top_left_y"] + df["bottom_left_y"]) / 2
    return df


def _group_rows(df2):
    """Cluster the lines of each page into table rows: {page: {first cell: other cells}}."""
    nested_row_dict = {}

    for page_num, page_df in df2.groupby("page_num"):
        # Lines whose vertical centres differ by no more than the page's mean
        # consecutive difference are considered part of the same row.
        # (For a single-line page this is NaN, which never matches.)
        threshold = page_df["mid_y"].diff().mean()

        grouped_content = []
        current_group = []
        prev_mid_y = None

        for content, mid_y in zip(page_df["content"], page_df["mid_y"]):
            if prev_mid_y is None or abs(mid_y - prev_mid_y) <= threshold:
                current_group.append(content)
            else:
                grouped_content.append(current_group)
                current_group = [content]
            prev_mid_y = mid_y

        if current_group:
            grouped_content.append(current_group)

        # The first cell of a row is its label; a repeated label overwrites
        # the earlier row with the same label.
        nested_row_dict[page_num] = {
            group[0]: group[1:] for group in grouped_content
        }

    return nested_row_dict


def _group_columns(df2, years_present):
    """Collect, per page and per year header, the cells lying in that year's column."""
    col_page_dict = {}

    for page_num, group in df2.groupby("page_num"):
        col_page_dict[page_num] = {}

        # Columns are only extracted from pages that carry every year header.
        page_contents = set(group["content"])
        if not all(year in page_contents for year in years_present):
            continue
        if not years_present:
            # Without any year header there is no column to anchor on; the
            # document is reported as failed by the caller.
            raise ValueError("no year header found in document")

        # Horizontal centre of the first occurrence of each year on the page.
        first_x = {
            year: group.loc[group["content"] == year, "mid_x"].iloc[0]
            for year in years_present
        }

        # Column half-width per year = distance between neighbouring headers.
        threshold_list = [
            abs(first_x[left] - first_x[right])
            for left, right in zip(years_present, years_present[1:])
        ]
        if len(threshold_list) > 1:
            # NOTE(review): the last column reuses the second-to-last gap
            # (index -2), kept as in the original logic - confirm intent.
            threshold_list.append(threshold_list[-2])
        elif threshold_list:
            threshold_list.append(threshold_list[0])
        else:
            # NOTE(review): with a single year the header's x position itself
            # is used as the width, kept as in the original logic.
            threshold_list = [first_x[years_present[0]]]

        for i, year in enumerate(years_present):
            threshold = threshold_list[i] - 0.5
            in_column = (group["mid_x"] - first_x[year]).abs() < threshold
            not_header = group["content"] != year
            col_page_dict[page_num][year] = group.loc[
                in_column & not_header, "content"
            ].tolist()

    return col_page_dict


def _extract_company_number(lines):
    """Return the text following 'Company No' in the first line containing it, else None."""
    for line in lines:
        if "Company No" in line:
            company_number = line.split("Company No")[1].strip()
            # Strip again after removing punctuation so that an input such as
            # "Company No. 123" does not keep the space that followed the dot.
            return re.sub(r"[^\w\s-]", "", company_number).strip()
    return None


def _process_document(json_path, output_folder_path, years):
    """Extract the year-wise table of one JSON file and write it to the output folder."""
    df = _load_lines_frame(json_path)

    # Filter out rows with content 'RM'
    df2 = df[df["content"] != "RM"]

    nested_row_dict = _group_rows(df2)

    years_present = list(df2.loc[df2["content"].isin(years), "content"].unique())
    print(years_present)

    col_page_dict = _group_columns(df2, years_present)

    # Final dictionary in required format: {page: {year: {row label: value}}}
    table_data_dict = {}
    for page_num, columns in col_page_dict.items():
        table_data_dict[page_num] = {}
        for year in years_present:
            year_cells = {}
            table_data_dict[page_num][year] = year_cells
            if year in columns:
                for key, values in nested_row_dict[page_num].items():
                    for value in values:
                        # If several cells of the row fall in the column, the
                        # last one wins.
                        if value in columns[year]:
                            year_cells[key] = value

    # Attach the company ("local") number to the first non-empty page entry.
    # It is looked up per document so a value from a previous file can never
    # leak into this one.
    company_number = _extract_company_number(df["content"].tolist())
    if company_number is None:
        print("Company number not found.")
    else:
        for inner_dict in table_data_dict.values():
            if inner_dict:
                inner_dict["Local No"] = company_number
                break

    dest_path = os.path.join(output_folder_path, os.path.basename(json_path))

    # JSON object keys are always strings; converting explicitly also copes
    # with NumPy integer page keys, which json would otherwise reject.
    serialisable = {str(page): data for page, data in table_data_dict.items()}
    with open(dest_path, "w", encoding="utf-8") as json_file:
        json.dump(serialisable, json_file, indent=2)


def post_processing_layer(
    api_result_file_name, input_folder_path, output_folder_path, years=None
):
    """
    Perform post-processing on API result json data extracted from PDF files.

    For every input JSON file (API result JSON file) the text lines found
    under the 'content' key are clustered into table rows and year columns
    using their bounding-box coordinates. The resulting
    ``{page: {year: {row label: value}}}`` mapping, plus a ``"Local No"``
    entry when a 'Company No' line is present, is written to a JSON file of
    the same name in the output folder.

    All files are attempted; failures are collected and reported together
    once the whole batch has been processed.

    Args:
        api_result_file_name (list): List of filenames of the extracted data in JSON format.
        input_folder_path (str): Path to the directory containing the extracted data in JSON format.
        output_folder_path (str): Path to the output folder where the processed data will be stored.
        years (iterable, optional): Year labels to recognise as column
            headers. Defaults to "2008" through "2023".

    Raises:
        ProcessingError: If logic fails to process one or more documents.

    Returns:
        None
    """
    year_labels = (
        list(_DEFAULT_YEARS) if years is None else [str(year) for year in years]
    )

    # os.path.join works whether or not the folder ends with a separator.
    json_paths = [
        os.path.join(input_folder_path, name) for name in api_result_file_name
    ]

    failed_docs = []

    for position, json_path in enumerate(json_paths):
        try:
            _process_document(json_path, output_folder_path, year_labels)
        except Exception as exc:
            # Batch boundary: record the failure and carry on with the
            # remaining files instead of aborting the whole run.
            failed_docs.append(json_path)
            print(
                f"-----------------Logic falied for {json_path}-----------------------"
            )
            print(f"Reason: {type(exc).__name__}: {exc}")
        else:
            print()
            print("----------------" + str(position) + "-------------")
            print()

    # Report failures only after every document has been attempted.
    if failed_docs:
        raise ProcessingError(
            f"Failed to process the following documents: {failed_docs}"
        )

### Description:

- This code snippet performs post-processing on data extracted from a PDF file.
- Defines input and output paths
- Records start and end times
- Executes the post-processing function
- Calculates the time taken, and prints the result.


In [None]:
# Inputs for the run. These are placeholder values: replace them with the
# real result file names and folder locations before executing the cell.
api_result_file_name = ["API_result_file_name.json"]
input_folder_path = "path/to/input/folder"
output_folder_path = "path/to/output/folder"

# Record start time (wall-clock seconds from time.time())
start_time = time.time()

# Perform post-processing; the function returns nothing and writes its
# results as JSON files into output_folder_path.
post_processing_layer(api_result_file_name, input_folder_path, output_folder_path)

# Record end time
end_time = time.time()

# Calculate the time taken, in seconds
time_taken = end_time - start_time

# Print the time taken
print(f"Time taken for the Post Processing: {time_taken} seconds")

['2017', '2016']

----------------0-------------

Time taken for the Post Processing: 0.39785170555114746 seconds
