In [1]:
# Import requirements
import json
import pandas as pd
from ord_schema.message_helpers import load_message
from ord_schema.proto import dataset_pb2
from google.protobuf.json_format import MessageToJson

In [2]:
input_fname = "ord_dataset-0c75d67751634f0594b24b9f498b77c2.pb.gz"
dataset = load_message(
    input_fname,
    dataset_pb2.Dataset,
)

reactions_json = []

for index, rxn in enumerate(dataset.reactions):
    rxn_json = json.loads(
        MessageToJson(
            message=rxn,
            including_default_value_fields=False,
            preserving_proto_field_name=True,
            indent=2,
            sort_keys=False,
            use_integers_for_enums=False,
            descriptor_pool=None,
            float_precision=None,
            ensure_ascii=True,
        )
    )
    reactions_json.append(rxn_json)
    print(f"Converted reaction {index + 1}/{len(dataset.reactions)} to JSON.")

output_fname = 'Imidazole_Arylation.json'

with open(output_fname, 'w', encoding='utf-8') as f:
    json.dump(reactions_json, f, ensure_ascii=False, indent=4)

print(f"All reactions from {input_fname} have been converted and saved to {output_fname}")

Converted reaction 1/256 to JSON.
Converted reaction 2/256 to JSON.
Converted reaction 3/256 to JSON.
Converted reaction 4/256 to JSON.
Converted reaction 5/256 to JSON.
Converted reaction 6/256 to JSON.
Converted reaction 7/256 to JSON.
Converted reaction 8/256 to JSON.
Converted reaction 9/256 to JSON.
Converted reaction 10/256 to JSON.
Converted reaction 11/256 to JSON.
Converted reaction 12/256 to JSON.
Converted reaction 13/256 to JSON.
Converted reaction 14/256 to JSON.
Converted reaction 15/256 to JSON.
Converted reaction 16/256 to JSON.
Converted reaction 17/256 to JSON.
Converted reaction 18/256 to JSON.
Converted reaction 19/256 to JSON.
Converted reaction 20/256 to JSON.
Converted reaction 21/256 to JSON.
Converted reaction 22/256 to JSON.
Converted reaction 23/256 to JSON.
Converted reaction 24/256 to JSON.
Converted reaction 25/256 to JSON.
Converted reaction 26/256 to JSON.
Converted reaction 27/256 to JSON.
Converted reaction 28/256 to JSON.
Converted reaction 29/256 to 

In [8]:
with open('Imidazole_Arylation.json', 'r', encoding='utf-8') as file:
    try:
        data = json.load(file)
    except json.JSONDecodeError:
        print("Error decoding JSON. Ensure the file is formatted correctly.")
        
if isinstance(data, dict):
    data = [data]

reaction_data = []

for reaction in data:
    if isinstance(reaction, dict):
        reaction_row = {}
        reaction_row['reaction_id'] = reaction.get('reaction_id', None)
        temperature_info = reaction.get('conditions', {}).get('temperature', {})
        reaction_row['temperature_value'] = temperature_info.get('setpoint', {}).get('value', None)
        reaction_row['temperature_units'] = temperature_info.get('setpoint', {}).get('units', None)
        
        component_counts = {}
        def add_component_to_row(role, component, prefix):
            component_name = next((ident['value'] for ident in component.get('identifiers', []) if ident['type'] == 'NAME'), None)
            component_smiles = next((ident['value'] for ident in component.get('identifiers', []) if ident['type'] == 'SMILES'), None)
            mass_value = component.get('amount', {}).get('mass', {}).get('value', None)
            mass_units = component.get('amount', {}).get('mass', {}).get('units', None)
            mole_value = component.get('amount', {}).get('moles', {}).get('value', None)
            mole_units = component.get('amount', {}).get('moles', {}).get('units', None)
            volume_value = component.get('amount', {}).get('volume', {}).get('value', None)
            volume_units = component.get('amount', {}).get('volume', {}).get('units', None)

            reaction_row[f'{prefix}_{role}_Name'] = component_name
            reaction_row[f'{prefix}_{role}_SMILES'] = component_smiles
            reaction_row[f'{prefix}_{role}_Mass_Value'] = mass_value
            reaction_row[f'{prefix}_{role}_Mass_Units'] = mass_units
            reaction_row[f'{prefix}_{role}_Mole_Value'] = mole_value
            reaction_row[f'{prefix}_{role}_Mole_Units'] = mole_units
            reaction_row[f'{prefix}_{role}_Volume_Value'] = volume_value
            reaction_row[f'{prefix}_{role}_Volume_Units'] = volume_units

        for role, role_data in reaction.get('inputs', {}).items():
            for component in role_data.get('components', []):
                # Determine component type based on reaction role
                component_type = component.get('reaction_role', 'Other')
                
                if component_type not in component_counts:
                    component_counts[component_type] = 0
                    
                prefix = f"{component_type}_{component_counts[component_type]}"
                add_component_to_row(role=component_counts[component_type], component=component, prefix=component_type)
                component_counts[component_type] += 1

        for outcome in reaction.get('outcomes', []):
            for product in outcome.get('products', []):
                product_name = next((ident['value'] for ident in product.get('identifiers', []) if ident['type'] == 'NAME'), None)
                product_smiles = next((ident['value'] for ident in product.get('identifiers', []) if ident['type'] == 'SMILES'), None)
                yield_percent = next((m['percentage']['value'] for m in product.get('measurements', []) if 'percentage' in m), None)
                prefix = f"Product_{component_counts.get('Product', 0)}"
                reaction_row[f'{prefix}_Name'] = product_name
                reaction_row[f'{prefix}_SMILES'] = product_smiles
                reaction_row[f'{prefix}_Yield_Percent'] = yield_percent
                component_counts['Product'] = component_counts.get('Product', 0) + 1
                
        reaction_data.append(reaction_row)

df = pd.DataFrame(reaction_data)
df = df.dropna(axis='columns')
df.to_csv('full_imidazole_arylation_data.csv', encoding='utf-8', index=False)
print("CSV file created successfully.")

CSV file created successfully.
