In [1]:
from google.cloud import storage
from google.oauth2 import service_account
import json
import pandas as pd
import os

# Variables
bucket_name = 'this-is-the-final-dataset'  # Updated bucket name
service_account_key = '/Users/aritraghatak/Downloads/rossi-lab-ocr-f2d1ece09b96.json'  # Update with your actual service account key file path
output_directory = '/Users/aritraghatak/Desktop/Rossi_lab/Final_Excel/'  # Path to save Excel files

# Ensure output directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to extract entities with bounding polygons
def extract_entities_with_bounding_polys(json_data, entity_type):
    entities_with_polys = []
    try:
        data = json.loads(json_data)
        entities = data.get('entities', [])
        for entity in entities:
            if entity.get('type', '') == entity_type:
                page_refs = entity.get('pageAnchor', {}).get('pageRefs', [])
                for page_ref in page_refs:
                    bounding_poly = page_ref.get('boundingPoly', {}).get('normalizedVertices', [])
                    entities_with_polys.append({
                        'text': entity.get('mentionText', ''),
                        'bounding_poly': bounding_poly
                    })
    except Exception as e:
        print(f"Error processing JSON data: {e}")
    return entities_with_polys

# Function to extract text from JSON based on entity type and target polygons
def extract_text_from_json(json_data, entity_type, target_polys):
    found_texts = []
    try:
        data = json.loads(json_data)
        entities = data.get('entities', [])
        for entity in entities:
            if entity.get('type', '') == entity_type:
                mention_text = entity.get('mentionText', '')
                page_refs = entity.get('pageAnchor', {}).get('pageRefs', [])
                for page_ref in page_refs:
                    bounding_poly = page_ref.get('boundingPoly', {}).get('normalizedVertices', [])
                    if bounding_poly in target_polys:
                        found_texts.append(mention_text)
    except Exception as e:
        print(f"Error processing JSON data: {e}")
    return found_texts

# Function to handle each entity type and extract its data
def process_entity_type(json_data, entity_type):
    bounding_polys = extract_entities_with_bounding_polys(json_data, entity_type)
    results = extract_text_from_json(json_data, entity_type, [bp['bounding_poly'] for bp in bounding_polys])
    return {entity_type: results}

# Function to process all JSON files in a bucket folder (both test and train)
def process_json_files_in_bucket(bucket_name, output_directory, folder_prefixes):
    try:
        # Explicitly create credentials from the service account file
        credentials = service_account.Credentials.from_service_account_file(service_account_key)

        # Initialize a client with the explicitly created credentials
        storage_client = storage.Client(credentials=credentials)

        # Get the bucket
        bucket = storage_client.bucket(bucket_name)

        # List of entity types to process
        entity_types = [
            "Defocus", "Video_Number", "Target", "Steering_Mirror",
            "Measured_Optical_Power", "Date", "Time", "OD_or_OS"
        ]

        # Process each folder (test/ and train/)
        for prefix in folder_prefixes:
            blobs = list(bucket.list_blobs(prefix=prefix))

            # Process each JSON file in the folder
            for blob in blobs:
                if blob.name.endswith('.json'):
                    print(f"Processing {blob.name}...")

                    # Download the JSON file's content as bytes
                    json_data = blob.download_as_string()

                    # Process each entity type and collect results
                    results = {}
                    for entity_type in entity_types:
                        results.update(process_entity_type(json_data, entity_type))

                    # Convert results to a pandas DataFrame
                    df = pd.DataFrame.from_dict(results, orient='index').transpose()

                    # Generate a unique Excel filename for each JSON file
                    json_filename = os.path.basename(blob.name).replace('.json', '')
                    excel_file_path = os.path.join(output_directory, f"{json_filename}.xlsx")

                    # Export DataFrame to Excel
                    df.to_excel(excel_file_path, index=False)
                    print(f"Results exported to {excel_file_path}")

    except Exception as e:
        print(f"Error processing files in bucket: {e}")

# List of folder prefixes to process (test/ and train/)
folder_prefixes = ['test/', 'train/']

# Call the function to process all files in both folders
process_json_files_in_bucket(bucket_name, output_directory, folder_prefixes)


Processing test/152e3e5c5a54315c/RLAB-0441_2022.04.07_note_page-0001.json...
Error processing files in bucket: 403 GET https://storage.googleapis.com/download/storage/v1/b/this-is-the-final-dataset/o/test%2F152e3e5c5a54315c%2FRLAB-0441_2022.04.07_note_page-0001.json?generation=1726542192069818&alt=media: The billing account for the owning project is disabled in state closed: ('Request failed with status code', 403, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.PARTIAL_CONTENT: 206>)
