In [8]:
import uuid
import json
import base64
import requests
import copy

# Path of the PDF that is uploaded for OCR processing in this cell
pdf_path = "/content/Invoices.pdf"

# Function to send PDF for OCR processing
def get_ocr_data(pdf_path, access_token=None, timeout=120):
    """Send a PDF to the Document AI processor and return its JSON response.

    Args:
        pdf_path: Path of the PDF file to upload.
        access_token: OAuth 2.0 bearer token. When omitted, the value of the
            ``GOOGLE_OAUTH_ACCESS_TOKEN`` environment variable is used.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The decoded JSON body of the ``:process`` response.

    Raises:
        RuntimeError: If no access token was supplied or configured.
        Exception: If the service answers with a status other than 200.
    """
    import os

    # Credentials are supplied at call time instead of being embedded in the
    # source: bearer tokens are short-lived secrets and must not be committed.
    token = access_token or os.environ.get("GOOGLE_OAUTH_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No OAuth access token available: pass access_token or set the "
            "GOOGLE_OAUTH_ACCESS_TOKEN environment variable."
        )

    # Google Document AI endpoint (project and processor are fixed here)
    api_url = "https://us-documentai.googleapis.com/v1/projects/205398200267/locations/us/processors/2dde9d14469e70c9:process"

    # The request body carries the PDF bytes as base64 text
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

    payload = {
        "skipHumanReview": True,
        "inlineDocument": {
            "mimeType": "application/pdf",
            "content": encoded_pdf
        }
    }

    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection would block the cell forever
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Error: {response.status_code}, {response.text}")

# Run OCR on the PDF; the response's document text and pages are reused by the mapping step below
ocr_response = get_ocr_data(pdf_path)

def to_camel_case(snake_str):
    """Return ``snake_str`` as camelCase: lowercase head, title-cased remaining parts."""
    head, *tail = snake_str.lower().split('_')
    return head + ''.join(map(str.title, tail))

def find_word_indices(content, word):
    """Locate the first case-insensitive occurrence of ``word`` in ``content``.

    Args:
        content: Text to search in.
        word: Text to look for.

    Returns:
        A ``(start, end)`` tuple (``end`` exclusive), or ``None`` when ``word``
        is empty or does not occur in ``content``.
    """
    if not word:
        # str.find("") reports a hit at offset 0; an empty query has no real location
        return None
    start_index = content.lower().find(word.lower())
    if start_index == -1:
        return None  # Word not found
    return start_index, start_index + len(word)

def map_kili_to_gcp(kili_response, digitized_content, ocr_response):
    """Map Kili annotations to GCP-compatible JSON format.

    Args:
        kili_response: Parsed Kili export. Only the first element's
            ``latestLabel.jsonResponse`` is read; any value that is not a
            non-empty list yields a result without entities.
        digitized_content: OCR text in which each annotation's content is
            located to fill the text-segment indices.
        ocr_response: OCR response whose ``document.text`` and
            ``document.pages`` are copied into the result.

    Returns:
        A dict with the keys ``entities``, ``pages``, ``shardInfo``, ``text``
        and ``uri``. One entity is emitted per sub-annotation of every Kili
        annotation entry.
    """
    gcp_response = {
        "entities": [],
        "pages": [],
        "shardInfo": {
            "shardCount": "1"
        },
        "text": ocr_response['document']['text'],
        "uri": ""  # Placeholder value; adjust if a source URI is needed
    }

    # Pages are passed through from the OCR response unchanged
    gcp_response["pages"].extend(ocr_response['document']['pages'])

    if not kili_response or not isinstance(kili_response, list):
        return gcp_response

    # Only the first exported asset is converted
    kili_json_response = kili_response[0].get("latestLabel", {}).get("jsonResponse", {})

    for job_value in kili_json_response.values():
        # Jobs without an annotation list (or with a non-dict payload) carry nothing to map
        if not isinstance(job_value, dict) or "annotations" not in job_value:
            continue

        for annotation_entry in job_value["annotations"]:
            sub_annotations = annotation_entry.get("annotations", [])
            if not sub_annotations:
                continue

            # These values are identical for every sub-annotation of the entry,
            # so they are computed once instead of once per bounding polygon.
            mention_text = (annotation_entry.get("content") or "").upper()
            categories = annotation_entry.get("categories") or [{}]
            entity_type = to_camel_case(categories[0].get("name", "UNKNOWN"))

            indices = find_word_indices(digitized_content, mention_text) if mention_text else None
            if indices is None:
                print(f"Warning: '{mention_text}' not found in digitized content.")
                start_index, end_index = "", ""
            else:
                # Convert explicitly: a truthiness test would drop a valid match at offset 0
                start_index, end_index = str(indices[0]), str(indices[1])

            for annotation in sub_annotations:
                # An empty or missing boundingPoly list falls back to an empty polygon
                polygons = annotation.get("boundingPoly") or [{}]
                bounding_poly = polygons[0].get("normalizedVertices", [{}])

                gcp_response["entities"].append({
                    "confidence": 1,
                    # Fresh random identifier per entity (first 16 chars of a UUID4)
                    "id": str(uuid.uuid4())[:16],
                    "mentionText": mention_text,
                    "pageAnchor": {
                        "pageRefs": [
                            {
                                "boundingPoly": {
                                    "normalizedVertices": bounding_poly
                                },
                                "layoutType": "VISUAL_ELEMENT"
                            }
                        ]
                    },
                    "textAnchor": {
                        "content": mention_text + "\n",
                        "textSegments": [
                            {
                                "endIndex": end_index,
                                "startIndex": start_index
                            }
                        ]
                    },
                    "type": entity_type
                })

    return gcp_response

def flatten_normalized_vertices(data):
    """
    Walk a JSON-like structure and return a copy in which every non-empty
    ``normalizedVertices`` list wrapped in an extra list is replaced by its
    first inner list. Other values are rebuilt recursively; scalars are
    returned unchanged.
    """
    if isinstance(data, dict):
        rebuilt = {}
        for name, child in data.items():
            if name == 'normalizedVertices' and isinstance(child, list) and child:
                # Unwrap one nesting level only when the first element is itself a list
                first = child[0]
                rebuilt[name] = first if isinstance(first, list) else child
            else:
                rebuilt[name] = flatten_normalized_vertices(child)
        return rebuilt

    if isinstance(data, list):
        return list(map(flatten_normalized_vertices, data))

    return data

def process_kili_json(kili_json):
    """Return a flattened version of ``kili_json`` built from a deep copy of the input."""
    # Flatten a private deep copy so the caller's structure is left untouched
    return flatten_normalized_vertices(copy.deepcopy(kili_json))

# Load the Kili export; JSON text is UTF-8, so do not rely on the platform default encoding
with open('Kili_invoice_response.json', 'r', encoding='utf-8') as file:
    kili_json = json.load(file)

# OCR text in which the annotation contents are located
digitized_content = ocr_response['document']['text']

# Flatten nested normalizedVertices lists on a copy of the export
modified_data = process_kili_json(kili_json)

# Map Kili annotations to GCP format
gcp_json = map_kili_to_gcp(modified_data, digitized_content, ocr_response)

# Save to a JSON file
output_file = "python_pdf_kili_to_gcp_invoice_11.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(gcp_json, file, ensure_ascii=False, indent=4)

print(f"GCP JSON saved to {output_file}")

GCP JSON saved to python_pdf_kili_to_gcp_invoice_11.json


In [None]:
## Resume with single line

In [13]:
import uuid
import json
import base64
import requests
import copy

# Path of the PDF that is uploaded for OCR processing in this cell
pdf_path = "/content/CriminalJusticeResumedocx.pdf"

# Function to send PDF for OCR processing
def get_ocr_data(pdf_path, access_token=None, timeout=120):
    """Send a PDF to the Document AI processor and return its JSON response.

    Args:
        pdf_path: Path of the PDF file to upload.
        access_token: OAuth 2.0 bearer token. When omitted, the value of the
            ``GOOGLE_OAUTH_ACCESS_TOKEN`` environment variable is used.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The decoded JSON body of the ``:process`` response.

    Raises:
        RuntimeError: If no access token was supplied or configured.
        Exception: If the service answers with a status other than 200.
    """
    import os

    # Credentials are supplied at call time instead of being embedded in the
    # source: bearer tokens are short-lived secrets and must not be committed.
    token = access_token or os.environ.get("GOOGLE_OAUTH_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No OAuth access token available: pass access_token or set the "
            "GOOGLE_OAUTH_ACCESS_TOKEN environment variable."
        )

    # Google Document AI endpoint (project and processor are fixed here)
    api_url = "https://us-documentai.googleapis.com/v1/projects/205398200267/locations/us/processors/2dde9d14469e70c9:process"

    # The request body carries the PDF bytes as base64 text
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

    payload = {
        "skipHumanReview": True,
        "inlineDocument": {
            "mimeType": "application/pdf",
            "content": encoded_pdf
        }
    }

    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection would block the cell forever
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Error: {response.status_code}, {response.text}")

# Run OCR on the PDF; the response's document text and pages are reused by the mapping step below
ocr_response = get_ocr_data(pdf_path)

def to_camel_case(snake_str):
    """Return ``snake_str`` as camelCase: lowercase head, title-cased remaining parts."""
    head, *tail = snake_str.lower().split('_')
    return head + ''.join(map(str.title, tail))

def find_word_indices(content, word):
    """Locate the first case-insensitive occurrence of ``word`` in ``content``.

    Args:
        content: Text to search in.
        word: Text to look for.

    Returns:
        A ``(start, end)`` tuple (``end`` exclusive), or ``None`` when ``word``
        is empty or does not occur in ``content``.
    """
    if not word:
        # str.find("") reports a hit at offset 0; an empty query has no real location
        return None
    start_index = content.lower().find(word.lower())
    if start_index == -1:
        return None  # Word not found
    return start_index, start_index + len(word)

def map_kili_to_gcp(kili_response, digitized_content, ocr_response):
    """Map Kili annotations to GCP-compatible JSON format.

    Args:
        kili_response: Parsed Kili export. Only the first element's
            ``latestLabel.jsonResponse`` is read; any value that is not a
            non-empty list yields a result without entities.
        digitized_content: OCR text in which each annotation's content is
            located to fill the text-segment indices.
        ocr_response: OCR response whose ``document.text`` and
            ``document.pages`` are copied into the result.

    Returns:
        A dict with the keys ``entities``, ``pages``, ``shardInfo``, ``text``
        and ``uri``. One entity is emitted per sub-annotation of every Kili
        annotation entry.
    """
    gcp_response = {
        "entities": [],
        "pages": [],
        "shardInfo": {
            "shardCount": "1"
        },
        "text": ocr_response['document']['text'],
        "uri": ""  # Placeholder value; adjust if a source URI is needed
    }

    # Pages are passed through from the OCR response unchanged
    gcp_response["pages"].extend(ocr_response['document']['pages'])

    if not kili_response or not isinstance(kili_response, list):
        return gcp_response

    # Only the first exported asset is converted
    kili_json_response = kili_response[0].get("latestLabel", {}).get("jsonResponse", {})

    for job_value in kili_json_response.values():
        # Jobs without an annotation list (or with a non-dict payload) carry nothing to map
        if not isinstance(job_value, dict) or "annotations" not in job_value:
            continue

        for annotation_entry in job_value["annotations"]:
            sub_annotations = annotation_entry.get("annotations", [])
            if not sub_annotations:
                continue

            # These values are identical for every sub-annotation of the entry,
            # so they are computed once instead of once per bounding polygon.
            mention_text = (annotation_entry.get("content") or "").upper()
            categories = annotation_entry.get("categories") or [{}]
            entity_type = to_camel_case(categories[0].get("name", "UNKNOWN"))

            indices = find_word_indices(digitized_content, mention_text) if mention_text else None
            if indices is None:
                print(f"Warning: '{mention_text}' not found in digitized content.")
                start_index, end_index = "", ""
            else:
                # Convert explicitly: a truthiness test would drop a valid match at offset 0
                start_index, end_index = str(indices[0]), str(indices[1])

            for annotation in sub_annotations:
                # An empty or missing boundingPoly list falls back to an empty polygon
                polygons = annotation.get("boundingPoly") or [{}]
                bounding_poly = polygons[0].get("normalizedVertices", [{}])

                gcp_response["entities"].append({
                    "confidence": 1,
                    # Fresh random identifier per entity (first 16 chars of a UUID4)
                    "id": str(uuid.uuid4())[:16],
                    "mentionText": mention_text,
                    "pageAnchor": {
                        "pageRefs": [
                            {
                                "boundingPoly": {
                                    "normalizedVertices": bounding_poly
                                },
                                "layoutType": "VISUAL_ELEMENT"
                            }
                        ]
                    },
                    "textAnchor": {
                        "content": mention_text + "\n",
                        "textSegments": [
                            {
                                "endIndex": end_index,
                                "startIndex": start_index
                            }
                        ]
                    },
                    "type": entity_type
                })

    return gcp_response

def flatten_normalized_vertices(data):
    """
    Walk a JSON-like structure and return a copy in which every non-empty
    ``normalizedVertices`` list wrapped in an extra list is replaced by its
    first inner list. Other values are rebuilt recursively; scalars are
    returned unchanged.
    """
    if isinstance(data, dict):
        rebuilt = {}
        for name, child in data.items():
            if name == 'normalizedVertices' and isinstance(child, list) and child:
                # Unwrap one nesting level only when the first element is itself a list
                first = child[0]
                rebuilt[name] = first if isinstance(first, list) else child
            else:
                rebuilt[name] = flatten_normalized_vertices(child)
        return rebuilt

    if isinstance(data, list):
        return list(map(flatten_normalized_vertices, data))

    return data

def process_kili_json(kili_json):
    """Return a flattened version of ``kili_json`` built from a deep copy of the input."""
    # Flatten a private deep copy so the caller's structure is left untouched
    return flatten_normalized_vertices(copy.deepcopy(kili_json))

# Load the Kili export; JSON text is UTF-8, so do not rely on the platform default encoding
with open('kili_resume_sample.json', 'r', encoding='utf-8') as file:
    kili_json = json.load(file)

# OCR text in which the annotation contents are located
digitized_content = ocr_response['document']['text']

# Flatten nested normalizedVertices lists on a copy of the export
modified_data = process_kili_json(kili_json)

# Map Kili annotations to GCP format
gcp_json = map_kili_to_gcp(modified_data, digitized_content, ocr_response)

# Save to a JSON file
output_file = "python_pdf_kili_to_gcp_resume_01.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(gcp_json, file, ensure_ascii=False, indent=4)

print(f"GCP JSON saved to {output_file}")

GCP JSON saved to python_pdf_kili_to_gcp_resume_01.json


In [None]:
## Resume with multiple labels within a label

In [23]:
import uuid
import json
import base64
import requests
import copy

# Path of the PDF that is uploaded for OCR processing in this cell
pdf_path = "/content/CriminalJusticeResumedocx.pdf"

# Function to send PDF for OCR processing
def get_ocr_data(pdf_path, access_token=None, timeout=120):
    """Send a PDF to the Document AI processor and return its JSON response.

    Args:
        pdf_path: Path of the PDF file to upload.
        access_token: OAuth 2.0 bearer token. When omitted, the value of the
            ``GOOGLE_OAUTH_ACCESS_TOKEN`` environment variable is used.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The decoded JSON body of the ``:process`` response.

    Raises:
        RuntimeError: If no access token was supplied or configured.
        Exception: If the service answers with a status other than 200.
    """
    import os

    # Credentials are supplied at call time instead of being embedded in the
    # source: bearer tokens are short-lived secrets and must not be committed.
    token = access_token or os.environ.get("GOOGLE_OAUTH_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No OAuth access token available: pass access_token or set the "
            "GOOGLE_OAUTH_ACCESS_TOKEN environment variable."
        )

    # Google Document AI endpoint (project and processor are fixed here)
    api_url = "https://us-documentai.googleapis.com/v1/projects/205398200267/locations/us/processors/2dde9d14469e70c9:process"

    # The request body carries the PDF bytes as base64 text
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

    payload = {
        "skipHumanReview": True,
        "inlineDocument": {
            "mimeType": "application/pdf",
            "content": encoded_pdf
        }
    }

    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection would block the cell forever
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Error: {response.status_code}, {response.text}")

# Run OCR on the PDF; the response's document text and pages are reused by the mapping step below
ocr_response = get_ocr_data(pdf_path)

def to_camel_case(snake_str):
    """Return ``snake_str`` as camelCase: lowercase head, title-cased remaining parts."""
    head, *tail = snake_str.lower().split('_')
    return head + ''.join(map(str.title, tail))

def find_word_indices(content, word):
    """Locate the first case-insensitive occurrence of ``word`` in ``content``.

    Args:
        content: Text to search in.
        word: Text to look for.

    Returns:
        A ``(start, end)`` tuple (``end`` exclusive), or ``None`` when ``word``
        is empty or does not occur in ``content``.
    """
    if not word:
        # str.find("") reports a hit at offset 0; an empty query has no real location
        return None
    start_index = content.lower().find(word.lower())
    if start_index == -1:
        return None  # Word not found
    return start_index, start_index + len(word)

def map_kili_to_gcp(kili_response, digitized_content, ocr_response):
    """Map Kili annotations to GCP-compatible JSON format.

    Args:
        kili_response: Parsed Kili export. Only the first element's
            ``latestLabel.jsonResponse`` is read; any value that is not a
            non-empty list yields a result without entities.
        digitized_content: OCR text in which each annotation's content is
            located to fill the text-segment indices.
        ocr_response: OCR response whose ``document.text`` and
            ``document.pages`` are copied into the result.

    Returns:
        A dict with the keys ``entities``, ``pages``, ``shardInfo``, ``text``
        and ``uri``. One entity is emitted per sub-annotation of every Kili
        annotation entry.
    """
    gcp_response = {
        "entities": [],
        "pages": [],
        "shardInfo": {
            "shardCount": "1"
        },
        "text": ocr_response['document']['text'],
        "uri": ""  # Placeholder value; adjust if a source URI is needed
    }

    # Pages are passed through from the OCR response unchanged
    gcp_response["pages"].extend(ocr_response['document']['pages'])

    if not kili_response or not isinstance(kili_response, list):
        return gcp_response

    # Only the first exported asset is converted
    kili_json_response = kili_response[0].get("latestLabel", {}).get("jsonResponse", {})

    for job_value in kili_json_response.values():
        # Jobs without an annotation list (or with a non-dict payload) carry nothing to map
        if not isinstance(job_value, dict) or "annotations" not in job_value:
            continue

        for annotation_entry in job_value["annotations"]:
            sub_annotations = annotation_entry.get("annotations", [])
            if not sub_annotations:
                continue

            # These values are identical for every sub-annotation of the entry,
            # so they are computed once instead of once per bounding polygon.
            mention_text = (annotation_entry.get("content") or "").upper()
            categories = annotation_entry.get("categories") or [{}]
            entity_type = to_camel_case(categories[0].get("name", "UNKNOWN"))

            indices = find_word_indices(digitized_content, mention_text) if mention_text else None
            if indices is None:
                print(f"Warning: '{mention_text}' not found in digitized content.")
                start_index, end_index = "", ""
            else:
                # Convert explicitly: a truthiness test would drop a valid match at offset 0
                start_index, end_index = str(indices[0]), str(indices[1])

            for annotation in sub_annotations:
                # An empty or missing boundingPoly list falls back to an empty polygon
                polygons = annotation.get("boundingPoly") or [{}]
                bounding_poly = polygons[0].get("normalizedVertices", [{}])

                gcp_response["entities"].append({
                    "confidence": 1,
                    # Fresh random identifier per entity (first 16 chars of a UUID4)
                    "id": str(uuid.uuid4())[:16],
                    "mentionText": mention_text,
                    "pageAnchor": {
                        "pageRefs": [
                            {
                                "boundingPoly": {
                                    "normalizedVertices": bounding_poly
                                },
                                "layoutType": "VISUAL_ELEMENT"
                            }
                        ]
                    },
                    "textAnchor": {
                        "content": mention_text + "\n",
                        "textSegments": [
                            {
                                "endIndex": end_index,
                                "startIndex": start_index
                            }
                        ]
                    },
                    "type": entity_type
                })

    return gcp_response

def flatten_normalized_vertices(data):
    """
    Walk a JSON-like structure and return a copy in which every non-empty
    ``normalizedVertices`` list wrapped in an extra list is replaced by its
    first inner list. Other values are rebuilt recursively; scalars are
    returned unchanged.
    """
    if isinstance(data, dict):
        rebuilt = {}
        for name, child in data.items():
            if name == 'normalizedVertices' and isinstance(child, list) and child:
                # Unwrap one nesting level only when the first element is itself a list
                first = child[0]
                rebuilt[name] = first if isinstance(first, list) else child
            else:
                rebuilt[name] = flatten_normalized_vertices(child)
        return rebuilt

    if isinstance(data, list):
        return list(map(flatten_normalized_vertices, data))

    return data

def process_kili_json(kili_json):
    """Return a flattened version of ``kili_json`` built from a deep copy of the input."""
    # Flatten a private deep copy so the caller's structure is left untouched
    return flatten_normalized_vertices(copy.deepcopy(kili_json))

# Load the Kili export; JSON text is UTF-8, so do not rely on the platform default encoding
with open('kili_sublable.json', 'r', encoding='utf-8') as file:
    kili_json = json.load(file)

# OCR text in which the annotation contents are located
digitized_content = ocr_response['document']['text']

# Flatten nested normalizedVertices lists on a copy of the export
modified_data = process_kili_json(kili_json)

# Map Kili annotations to GCP format
gcp_json = map_kili_to_gcp(modified_data, digitized_content, ocr_response)

# Save to a JSON file
output_file = "python_pdf_kili_to_gcp_resume_07.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(gcp_json, file, ensure_ascii=False, indent=4)

print(f"GCP JSON saved to {output_file}")

GCP JSON saved to python_pdf_kili_to_gcp_resume_07.json


In [None]:
## Resume with multiple lines

In [None]:
import uuid
import json
import base64
import requests
import copy

# Path of the PDF that is uploaded for OCR processing in this cell
pdf_path = "/content/CriminalJusticeResumedocx.pdf"

# Function to send PDF for OCR processing
def get_ocr_data(pdf_path, access_token=None, timeout=120):
    """Send a PDF to the Document AI processor and return its JSON response.

    Args:
        pdf_path: Path of the PDF file to upload.
        access_token: OAuth 2.0 bearer token. When omitted, the value of the
            ``GOOGLE_OAUTH_ACCESS_TOKEN`` environment variable is used.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The decoded JSON body of the ``:process`` response.

    Raises:
        RuntimeError: If no access token was supplied or configured.
        Exception: If the service answers with a status other than 200.
    """
    import os

    # Credentials are supplied at call time instead of being embedded in the
    # source: bearer tokens are short-lived secrets and must not be committed.
    token = access_token or os.environ.get("GOOGLE_OAUTH_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No OAuth access token available: pass access_token or set the "
            "GOOGLE_OAUTH_ACCESS_TOKEN environment variable."
        )

    # Google Document AI endpoint (project and processor are fixed here)
    api_url = "https://us-documentai.googleapis.com/v1/projects/205398200267/locations/us/processors/2dde9d14469e70c9:process"

    # The request body carries the PDF bytes as base64 text
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

    payload = {
        "skipHumanReview": True,
        "inlineDocument": {
            "mimeType": "application/pdf",
            "content": encoded_pdf
        }
    }

    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection would block the cell forever
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Error: {response.status_code}, {response.text}")

# Run OCR on the PDF; the response's document text and pages are reused by the mapping step below
ocr_response = get_ocr_data(pdf_path)

def to_camel_case(snake_str):
    """Return ``snake_str`` as camelCase: lowercase head, title-cased remaining parts."""
    head, *tail = snake_str.lower().split('_')
    return head + ''.join(map(str.title, tail))

def find_word_indices(content, word):
    """Locate the first case-insensitive occurrence of ``word`` in ``content``.

    Args:
        content: Text to search in.
        word: Text to look for.

    Returns:
        A ``(start, end)`` tuple (``end`` exclusive), or ``None`` when ``word``
        is empty or does not occur in ``content``.
    """
    if not word:
        # str.find("") reports a hit at offset 0; an empty query has no real location
        return None
    start_index = content.lower().find(word.lower())
    if start_index == -1:
        return None  # Word not found
    return start_index, start_index + len(word)

def map_kili_to_gcp(kili_response, digitized_content, ocr_response):
    """Map Kili annotations to GCP-compatible JSON format.

    Args:
        kili_response: Parsed Kili export. Only the first element's
            ``latestLabel.jsonResponse`` is read; any value that is not a
            non-empty list yields a result without entities.
        digitized_content: OCR text in which each annotation's content is
            located to fill the text-segment indices.
        ocr_response: OCR response whose ``document.text`` and
            ``document.pages`` are copied into the result.

    Returns:
        A dict with the keys ``entities``, ``pages``, ``shardInfo``, ``text``
        and ``uri``. One entity is emitted per sub-annotation of every Kili
        annotation entry.
    """
    gcp_response = {
        "entities": [],
        "pages": [],
        "shardInfo": {
            "shardCount": "1"
        },
        "text": ocr_response['document']['text'],
        "uri": ""  # Placeholder value; adjust if a source URI is needed
    }

    # Pages are passed through from the OCR response unchanged
    gcp_response["pages"].extend(ocr_response['document']['pages'])

    if not kili_response or not isinstance(kili_response, list):
        return gcp_response

    # Only the first exported asset is converted
    kili_json_response = kili_response[0].get("latestLabel", {}).get("jsonResponse", {})

    for job_value in kili_json_response.values():
        # Jobs without an annotation list (or with a non-dict payload) carry nothing to map
        if not isinstance(job_value, dict) or "annotations" not in job_value:
            continue

        for annotation_entry in job_value["annotations"]:
            sub_annotations = annotation_entry.get("annotations", [])
            if not sub_annotations:
                continue

            # These values are identical for every sub-annotation of the entry,
            # so they are computed once instead of once per bounding polygon.
            mention_text = (annotation_entry.get("content") or "").upper()
            categories = annotation_entry.get("categories") or [{}]
            entity_type = to_camel_case(categories[0].get("name", "UNKNOWN"))

            indices = find_word_indices(digitized_content, mention_text) if mention_text else None
            if indices is None:
                print(f"Warning: '{mention_text}' not found in digitized content.")
                start_index, end_index = "", ""
            else:
                # Convert explicitly: a truthiness test would drop a valid match at offset 0
                start_index, end_index = str(indices[0]), str(indices[1])

            for annotation in sub_annotations:
                # An empty or missing boundingPoly list falls back to an empty polygon
                polygons = annotation.get("boundingPoly") or [{}]
                bounding_poly = polygons[0].get("normalizedVertices", [{}])

                gcp_response["entities"].append({
                    "confidence": 1,
                    # Fresh random identifier per entity (first 16 chars of a UUID4)
                    "id": str(uuid.uuid4())[:16],
                    "mentionText": mention_text,
                    "pageAnchor": {
                        "pageRefs": [
                            {
                                "boundingPoly": {
                                    "normalizedVertices": bounding_poly
                                },
                                "layoutType": "VISUAL_ELEMENT"
                            }
                        ]
                    },
                    "textAnchor": {
                        "content": mention_text + "\n",
                        "textSegments": [
                            {
                                "endIndex": end_index,
                                "startIndex": start_index
                            }
                        ]
                    },
                    "type": entity_type
                })

    return gcp_response

def flatten_normalized_vertices(data):
    """
    Walk a JSON-like structure and return a copy in which every non-empty
    ``normalizedVertices`` list wrapped in an extra list is replaced by its
    first inner list. Other values are rebuilt recursively; scalars are
    returned unchanged.
    """
    if isinstance(data, dict):
        rebuilt = {}
        for name, child in data.items():
            if name == 'normalizedVertices' and isinstance(child, list) and child:
                # Unwrap one nesting level only when the first element is itself a list
                first = child[0]
                rebuilt[name] = first if isinstance(first, list) else child
            else:
                rebuilt[name] = flatten_normalized_vertices(child)
        return rebuilt

    if isinstance(data, list):
        return list(map(flatten_normalized_vertices, data))

    return data

def process_kili_json(kili_json):
    """Return a flattened version of ``kili_json`` built from a deep copy of the input."""
    # Flatten a private deep copy so the caller's structure is left untouched
    return flatten_normalized_vertices(copy.deepcopy(kili_json))

# Load the Kili export; JSON text is UTF-8, so do not rely on the platform default encoding
with open('kili_multiple_lines.json', 'r', encoding='utf-8') as file:
    kili_json = json.load(file)

# OCR text in which the annotation contents are located
digitized_content = ocr_response['document']['text']

# Flatten nested normalizedVertices lists on a copy of the export
modified_data = process_kili_json(kili_json)

# Map Kili annotations to GCP format
gcp_json = map_kili_to_gcp(modified_data, digitized_content, ocr_response)

# Save to a JSON file
output_file = "python_pdf_kili_to_gcp_resume_08.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(gcp_json, file, ensure_ascii=False, indent=4)

print(f"GCP JSON saved to {output_file}")

In [30]:
import uuid
import json
import base64
import requests
import copy
import re

# Path of the PDF that is uploaded for OCR processing in this cell
pdf_path = "/content/CriminalJusticeResumedocx.pdf"

# Function to send PDF for OCR processing
def get_ocr_data(pdf_path, access_token=None, timeout=120):
    """Send a PDF to the Document AI processor and return its JSON response.

    Args:
        pdf_path: Path of the PDF file to upload.
        access_token: OAuth 2.0 bearer token. When omitted, the value of the
            ``GOOGLE_OAUTH_ACCESS_TOKEN`` environment variable is used.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The decoded JSON body of the ``:process`` response.

    Raises:
        RuntimeError: If no access token was supplied or configured.
        Exception: If the service answers with a status other than 200.
    """
    import os

    # Credentials are supplied at call time instead of being embedded in the
    # source: bearer tokens are short-lived secrets and must not be committed.
    token = access_token or os.environ.get("GOOGLE_OAUTH_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No OAuth access token available: pass access_token or set the "
            "GOOGLE_OAUTH_ACCESS_TOKEN environment variable."
        )

    # Google Document AI endpoint (project and processor are fixed here)
    api_url = "https://us-documentai.googleapis.com/v1/projects/205398200267/locations/us/processors/2dde9d14469e70c9:process"

    # The request body carries the PDF bytes as base64 text
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

    payload = {
        "skipHumanReview": True,
        "inlineDocument": {
            "mimeType": "application/pdf",
            "content": encoded_pdf
        }
    }

    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection would block the cell forever
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Error: {response.status_code}, {response.text}")

# Run OCR on the PDF and keep the parsed JSON response for the steps that follow
ocr_response = get_ocr_data(pdf_path)

def to_camel_case(snake_str):
    """Return ``snake_str`` as camelCase: lowercase head, title-cased remaining parts."""
    head, *tail = snake_str.lower().split('_')
    return head + ''.join(map(str.title, tail))

def normalize_text(text):
    """
    Return a comparison-friendly form of ``text``.

    The input is uppercased, every whitespace run (including line breaks) is
    collapsed to one space, characters other than word characters, whitespace
    and ``. , ● -`` are dropped, and the ends are stripped. Anything that is
    not a string yields an empty string.
    """
    if isinstance(text, str):
        # Collapse whitespace before filtering, then trim the ends last
        collapsed = re.sub(r'\s+', ' ', text.upper())
        filtered = re.sub(r'[^\w\s.,●-]', '', collapsed)
        return filtered.strip()
    return ""

def _normalize_with_offsets(text):
    """Normalize *text* for matching and remember where each character came from.

    Normalization: uppercase, drop every character that is not alphanumeric,
    whitespace or one of ``_ . , ● -``, collapse whitespace runs into a single
    space, and trim both ends.

    Returns ``(normalized, offsets)`` where ``offsets[i]`` is the index in
    *text* of the character that produced ``normalized[i]``.
    """
    kept_punctuation = '_.,●-'
    chars = []
    offsets = []
    # Index of the first character of the whitespace run currently being
    # skipped, or None when we are not inside a run.
    pending_space = None

    for index, original_char in enumerate(text):
        # upper() can expand one character into several (e.g. "ß" -> "SS");
        # every resulting character maps back to the same original index.
        for char in original_char.upper():
            if char.isspace():
                if pending_space is None:
                    pending_space = index
                continue
            if not (char.isalnum() or char in kept_punctuation):
                # Special character: dropped without ending a whitespace run.
                continue
            if pending_space is not None:
                # Emit the single separating space lazily so that leading and
                # trailing whitespace never reaches the output.
                if chars:
                    chars.append(' ')
                    offsets.append(pending_space)
                pending_space = None
            chars.append(char)
            offsets.append(index)

    return ''.join(chars), offsets


def find_word_indices(content, word):
    """
    Find the start and end indices of a word in the digitized content,
    using normalized versions of both texts for comparison.

    Matching ignores case, special characters and differences in whitespace.
    It is a substring match, not a whole-word match.

    Args:
        content: The full text to search in.
        word: The word or phrase to locate.

    Returns:
        ``(start, end)`` such that ``content[start:end]`` is the matched
        region of the ORIGINAL content (end exclusive), or ``None`` when
        nothing matches. If the whole phrase is not present, the longest
        leading run of its words that does occur is matched instead, and the
        returned span covers only that matched part.
    """
    if not isinstance(content, str) or not isinstance(word, str):
        return None

    normalized_content, offsets = _normalize_with_offsets(content)
    normalized_word, _ = _normalize_with_offsets(word)

    # A word made only of whitespace/special characters normalizes to "" and
    # would otherwise "match" at position 0.
    if not normalized_content or not normalized_word:
        return None

    words = normalized_word.split(' ')
    # Try the full phrase first, then progressively shorter prefixes, so the
    # longest available match wins.
    for count in range(len(words), 0, -1):
        phrase = ' '.join(words[:count])
        start = normalized_content.find(phrase)
        if start == -1:
            continue
        last = start + len(phrase) - 1
        # Translate normalized positions back to positions in the original text.
        return offsets[start], offsets[last] + 1

    return None

def map_kili_to_gcp(kili_response, digitized_content, ocr_response):
    """Build a GCP-style document dict from Kili annotations and an OCR response.

    Args:
        kili_response: Kili export; only the first element is read, via its
            ``latestLabel`` -> ``jsonResponse`` mapping. Anything falsy or
            not a list produces a document without entities.
        digitized_content: Text in which each annotation's content is located
            to obtain its start/end indices.
        ocr_response: OCR result; ``document.text`` and ``document.pages``
            are copied into the output.

    Returns:
        A dict with the keys ``entities``, ``pages``, ``shardInfo``, ``text``
        and ``uri``. One entity is emitted per inner annotation of every
        annotation entry. Each entity has a fixed confidence of 1 and an id
        taken from the first 16 characters of a random UUID string.
    """
    document = ocr_response['document']
    gcp_response = {
        "entities": [],
        "pages": [],
        "shardInfo": {"shardCount": "1"},
        "text": document['text'],
        "uri": "",
    }

    # Add pages to GCP response
    gcp_response["pages"].extend(document['pages'])

    has_labels = kili_response and isinstance(kili_response, list)
    if not has_labels:
        return gcp_response

    latest_label = kili_response[0].get("latestLabel", {})
    jobs = latest_label.get("jsonResponse", {})
    entities = gcp_response["entities"]

    for job in jobs.values():
        if "annotations" not in job:
            continue

        for entry in job["annotations"]:
            for region in entry.get("annotations", []):
                mention_text = entry.get("content", "")
                categories = entry.get("categories", [{}])

                # GCP "type" is the camelCase form of the Kili category name
                entity_type = to_camel_case(categories[0].get("name", "UNKNOWN"))

                # Unique ID for this entity
                entity_id = str(uuid.uuid4())[:16]

                # Locate the mention inside the digitized content
                span = find_word_indices(digitized_content, mention_text)
                if span is not None:
                    start_index, end_index = span
                else:
                    print(f"Warning: Could not find exact match for: '{mention_text}'")
                    print(f"Normalized version: '{normalize_text(mention_text)}'")
                    # Fallback values
                    start_index, end_index = 0, len(mention_text)

                # Vertices of the first bounding polygon of this region
                polygons = region.get("boundingPoly", [{}])
                vertices = polygons[0].get("normalizedVertices", [{}])

                page_ref = {
                    "boundingPoly": {"normalizedVertices": vertices},
                    "layoutType": "VISUAL_ELEMENT",
                }
                text_segment = {
                    "endIndex": str(end_index),
                    "startIndex": str(start_index),
                }
                entities.append({
                    "confidence": 1,
                    "id": entity_id,
                    "mentionText": mention_text,
                    "pageAnchor": {"pageRefs": [page_ref]},
                    "textAnchor": {
                        "content": mention_text + "\n",
                        "textSegments": [text_segment],
                    },
                    "type": entity_type,
                })

    return gcp_response

def flatten_normalized_vertices(data):
    """
    Walk a JSON-like structure and un-nest ``normalizedVertices`` lists.

    Dicts and lists are rebuilt recursively; any other value is returned
    unchanged. A non-empty list stored under the key ``normalizedVertices``
    is not recursed into: if its first element is itself a list, that first
    element replaces it, otherwise the list is kept as it is.
    """
    def _convert(key, value):
        if key == 'normalizedVertices' and isinstance(value, list) and value:
            first = value[0]
            # Remove one level of nesting if it exists
            return first if isinstance(first, list) else value
        return flatten_normalized_vertices(value)

    if isinstance(data, dict):
        return {key: _convert(key, value) for key, value in data.items()}

    if isinstance(data, list):
        return [flatten_normalized_vertices(item) for item in data]

    return data

def process_kili_json(kili_json):
    """Return a copy of *kili_json* with nested normalizedVertices flattened.

    The input is deep-copied first, so the caller's data is never modified
    and shares no objects with the result.
    """
    return flatten_normalized_vertices(copy.deepcopy(kili_json))

# Load the Kili export. The encoding is given explicitly (JSON is UTF-8)
# rather than relying on the platform default, matching how the output file
# is written below.
with open('final_kili_check.json', 'r', encoding='utf-8') as file:
    kili_json = json.load(file)

# Full OCR text; annotation text is located inside this string.
digitized_content = ocr_response['document']['text']

# Flatten nested normalizedVertices lists on a copy of the Kili export
modified_data = process_kili_json(kili_json)

# Map Kili annotations to GCP format
gcp_json = map_kili_to_gcp(modified_data, digitized_content, ocr_response)

# Save to a JSON file
output_file = "python_pdf_kili_to_gcp_resume_10.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(gcp_json, file, ensure_ascii=False, indent=4)

print(f"GCP JSON saved to {output_file}")

GCP JSON saved to python_pdf_kili_to_gcp_resume_10.json
