In [8]:
# Imports
import json
import re

# Functions
def _map_datatype(json_type):
    """Translate a JSON Schema type name into the output datatype name.

    "number" becomes "numeric"; a missing/empty type falls back to "string";
    every other value is passed through unchanged.
    """
    if json_type == "number":
        return "numeric"
    return json_type if json_type else "string"


def _build_column(column, column_def, required_fields):
    """Build the output column object for a single schema property.

    Args:
        column: Property name as it appears in the input schema.
        column_def: The property's definition dict.
        required_fields: List of the owning table's required property names.

    Returns:
        A ``(column_dict, target_table)`` tuple. ``target_table`` is the name
        captured after ``$defs/`` in the property's ``$ref``, or ``None`` when
        the property has no ``$ref`` (or the ``$ref`` has no ``$defs/`` part).
    """
    final_column_name = column
    target_table = None
    ref = column_def.get("$ref")
    if ref:
        # A "_fk" suffix on a referencing column is dropped from the output name.
        fk_col_search = re.search(r"(^.*)_fk$", column)
        if fk_col_search:
            final_column_name = fk_col_search.group(1)
        target_tab_search = re.search(r"\$defs\/(.*)$", ref)
        if target_tab_search:
            target_table = target_tab_search.group(1)

    declared_type = column_def.get("type")
    array_of = declared_type == "array"
    if array_of:
        # Report the element type rather than "array". "items" may be absent
        # or not an object, in which case the default datatype is used.
        items = column_def.get("items")
        item_type = items.get("type") if isinstance(items, dict) else None
        final_data_type = _map_datatype(item_type)
    else:
        final_data_type = _map_datatype(declared_type)

    column_dict = {
        "name": final_column_name,
        "datatype": final_data_type,
        "array_of": array_of,
        # Requiredness is looked up by the original (un-stripped) property name.
        "required": column in required_fields,
    }
    return column_dict, target_table


def convert_schema(input_file, output_file):
    """Convert the ``$defs`` of a JSON Schema file into a tables/relationships file.

    Every ``$defs`` entry whose ``type`` is ``"object"`` and that has non-empty
    ``properties`` becomes a table. Each property becomes a column; properties
    carrying a ``$ref`` to another such table additionally produce a
    relationship pointing at the target table's first required field.

    Args:
        input_file: Path of the JSON Schema file to read.
        output_file: Path of the JSON file to write.

    Returns:
        None. The result is written to ``output_file``. Nothing is written
        when the input has no (or an empty) ``$defs`` section.
    """
    # Read in input file
    with open(input_file, encoding="utf-8") as infile:
        input_schema = json.load(infile)

    definitions = input_schema.get("$defs")
    if not definitions:
        return

    # Determine the set of tables to consider
    input_schema_filtered = {
        table: table_def
        for table, table_def in definitions.items()
        if isinstance(table_def, dict)
        and table_def.get("type") == "object"
        and table_def.get("properties")
    }

    table_list = []
    primary_keys = {}
    foreign_keys = []

    # Loop through tables to consider and build table objects
    for table, table_def in input_schema_filtered.items():
        # "required" is optional in JSON Schema; treat a missing one as empty
        # instead of failing on indexing / membership tests.
        required_fields = table_def.get("required") or []
        if required_fields:
            # The first required field is the column relationships point to.
            primary_keys[table] = required_fields[0]

        column_list = []
        for column, column_def in table_def["properties"].items():
            column_dict, target_table = _build_column(column, column_def, required_fields)
            column_list.append(column_dict)
            # Record foreign keys, but only towards tables that are being emitted
            if target_table in input_schema_filtered:
                foreign_keys.append(
                    {
                        "from_table": table,
                        "from_column": column_dict["name"],
                        "to_table": target_table,
                    }
                )

        # NOTE(review): primaryKey lists the raw required names, so a required
        # "<name>_fk" reference column appears here with its suffix while the
        # column object uses the stripped name — confirm this is intended.
        table_list.append(
            {"name": table, "columns": column_list, "primaryKey": list(required_fields)}
        )

    # Loop through recorded foreign keys and build relationship objects
    relationship_list = []
    for fk_entry in foreign_keys:
        from_table = fk_entry["from_table"]
        from_column = fk_entry["from_column"]
        to_table = fk_entry["to_table"]
        to_column = primary_keys.get(to_table)
        if to_column is None:
            # Target table declares no required field, so there is no column
            # to point the relationship at; skip rather than fail.
            continue
        rel_name = from_table + "." + from_column + "_to_" + to_table + "." + to_column
        relationship_list.append(
            {
                "name": rel_name,
                "from": {"table": from_table, "column": from_column},
                "to": {"table": to_table, "column": to_column},
            }
        )

    # Add table and relationship objects to output schema
    output_schema = {}
    if table_list:
        output_schema["tables"] = table_list
    if relationship_list:
        output_schema["relationships"] = relationship_list

    # Write out output file
    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(output_schema, outfile)

# Parameters: path of the schema to read and path of the file to write
input_file = "input/AnVILBioCoreMinimal.schema.json"
output_file = "output/biocore_tdr_schema.json"

# Execution: run the conversion using the paths configured above
convert_schema(input_file=input_file, output_file=output_file)

In [9]:
# Print a marker so it is visible that this cell has run
print("done")

done
