In [1]:
# Imports
import json
import re

# Functions
def convert_schema(input_file, output_file):
    # Read in input file
    with open(input_file) as infile:
        input_schema = json.load(infile)

    # Parse and convert input file
    output_schema = {}
    table_list = []
    relationship_list = []
    primary_keys = {}
    foreign_keys = []
    if input_schema.get("$defs"):
        # Loop through tables and build table objects
        for table, table_def in input_schema["$defs"].items():
            required_fields = table_def.get("required")
            primary_keys[table] = required_fields[0]
            properties = table_def.get("properties")
            column_list = []
            if properties:
                # Loop through columns and build column objects
                for column, column_def in properties.items():
                    # Record foreign keys
                    if column_def.get("$ref"):
                        final_column_name = re.search(r"(^.*)_fk$", column).group(1) 
                        target_table = re.search(r"\$defs\/(.*)$", column_def.get("$ref")).group(1)
                        foreign_keys.append({"from_table": table, "from_column": final_column_name, "to_table": target_table})
                    else:
                        final_column_name = column
                    # Build column object
                    array_of = False
                    required = True if column in required_fields else False
                    initial_data_type = column_def.get("type")
                    if initial_data_type == "array":
                        final_data_type = column_def["items"].get("type")
                        array_of = True
                    elif initial_data_type:
                        final_data_type = initial_data_type
                    else:
                        final_data_type = "string"
                    column_dict = {"name": final_column_name, "datatype": final_data_type, "array_of": array_of, "required": required}
                    column_list.append(column_dict)   
            primary_key = required_fields if required_fields else []
            table_dict = {"name": table, "columns": column_list, "primaryKey": primary_key}
            table_list.append(table_dict)

        # Loop through recorded foreign keys and build relationship objects
        for fk_entry in foreign_keys:
            from_table = fk_entry["from_table"]
            from_column = fk_entry["from_column"]
            to_table = fk_entry["to_table"]
            to_column = primary_keys.get(to_table)
            rel_name = from_table + "." + from_column + "_to_" + to_table + "." + to_column
            relationship_dict = {"name": rel_name, "from": {"table": from_table, "column": from_column}, "to": {"table": to_table, "column": to_column}}
            relationship_list.append(relationship_dict)

        # Add table and relationship objects to output schema
        if table_list:
            output_schema["tables"] = table_list
        if relationship_list:
            output_schema["relationships"] = relationship_list

        # Write out output file
        with open(output_file, "w") as outfile:
            json.dump(output_schema, outfile)

# Parameters
file_path = "/home/cox/git/biocore-data-model/content/"
input_file = file_path + "json_schema/AnVILBioCoreMinimal.schema.json"
output_file = "output/AnVILBioCoreMinimal.tdr.json"
# Execution
convert_schema(input_file, output_file)

AttributeError: 'NoneType' object has no attribute 'group'

In [4]:
    with open(input_file) as infile:
        input_schema = json.load(infile)



In [5]:
input_schema

{'$defs': {'AnvilBioSample': {'additionalProperties': False,
   'description': 'Contains information about the sample(s) included in the study.',
   'properties': {'anatomical_site': {'description': 'A reference to the site within the organism from which the BioSample was taken.',
     'type': 'string'},
    'apriori_cell_type': {'description': 'A priori cell type(s) for the sample, a human assignment of cell type. This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.',
     'items': {'type': 'string'},
     'type': 'array'},
    'biosample_id': {'type': 'string'},
    'biosample_type': {'description': 'The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.',
     'type': 'string'},
    'disease': {'description': 'A property that identifies a disease or condition has been reported in this entity.',
     'type': 'string'},
    'donor_age_at_collection_lower_bound': {'description': 'L