### This notebook contains Python code to extract the minimal metadata from the MaveDB data dump.

The notebook reads the MaveDB data dump, extracts the relevant metadata for each experiment (study details and the details of the associated publications), and writes the extracted metadata to a separate flat JSON file.

In [5]:
import json

In [6]:
# Load the MaveDB dump and keep only its list of experiment sets.
# The encoding is stated explicitly: JSON text is UTF-8, whereas open() without an
# `encoding` argument falls back to the platform's locale encoding, which can
# mis-decode non-ASCII characters on systems whose default is not UTF-8.
with open('Dump/mavedb-data.20240520/main.json', encoding='utf-8') as input_file:
    original_json = input_file.read()

json_data = json.loads(original_json)['experimentSets']


In [7]:
# Build one flat metadata record per experiment found in `json_data`.
# The records are accumulated in `output_data`, which a later cell writes to disk.

# Publication-related keys of an output record, in the order they are emitted.
PUBLICATION_FIELDS = (
    'pub_url',
    'pub_dbname',
    'pub_title',
    'pub_abstract',
    'pub_authors',
    'pub_doi',
    'pub_date',
)


def _collapse(values):
    """Reduce a per-publication list of values to the flat form used in the output.

    A list with exactly one element is replaced by that element (the common case of
    a single publication); longer lists are returned unchanged. If the result is an
    empty list it is replaced by None.
    """
    collapsed = values[0] if len(values) == 1 else values
    return None if collapsed == [] else collapsed


def _summarise_publications(publications):
    """Extract the publication fields from a list of publication entries.

    Parameters
    ----------
    publications : list of dict or None
        Publication entries of one experiment. Each entry is read for the keys
        'url', 'dbName', 'title', 'abstract', 'authors' (a list of dicts with a
        'name' key), 'publicationDoi' and 'publicationYear'; 'preprintDoi' and
        'preprintDate' are read only when the corresponding publication value
        is None.

    Returns
    -------
    dict or None
        A dict keyed by PUBLICATION_FIELDS with the collapsed values (see
        `_collapse`), or None when there are no publications at all.

    Raises
    ------
    KeyError
        If a publication entry lacks one of the keys that is read.
    """
    if not publications:
        return None

    columns = {field: [] for field in PUBLICATION_FIELDS}
    for pub in publications:
        columns['pub_url'].append(pub['url'])
        columns['pub_dbname'].append(pub['dbName'])
        columns['pub_title'].append(pub['title'])
        columns['pub_abstract'].append(pub['abstract'])
        columns['pub_authors'].append([author['name'] for author in pub['authors']])
        # Prefer the published DOI; fall back to the preprint DOI when it is absent.
        columns['pub_doi'].append(
            pub['publicationDoi'] if pub['publicationDoi'] is not None else pub['preprintDoi']
        )
        # Same fallback for the date. Note the two sources differ in kind:
        # 'publicationYear' versus 'preprintDate'.
        columns['pub_date'].append(
            pub['publicationYear'] if pub['publicationYear'] is not None else pub['preprintDate']
        )

    return {field: _collapse(values) for field, values in columns.items()}


output_data = []

# Running identifier assigned to the records, starting at 1.
exp_id = 1
for exp_set in json_data:
    for exp in exp_set['experiments']:
        primary = _summarise_publications(exp['primaryPublicationIdentifiers'])
        secondary = _summarise_publications(exp['secondaryPublicationIdentifiers'])

        # Choose the publication source once per experiment: primary publications
        # when there are any, otherwise secondary ones. Deciding per field instead
        # would mix sources whenever a single primary publication has one empty
        # value (e.g. no authors or no title), pairing it with a value taken from
        # an unrelated secondary publication.
        if primary is not None:
            publication = primary
        elif secondary is not None:
            publication = secondary
        else:
            publication = dict.fromkeys(PUBLICATION_FIELDS)  # every field is None

        record = {
            'exp_id': exp_id,
            # NOTE(review): this stores the URN of the experiment *set*, read from the
            # experiment entry - confirm that the experiment's own URN was not intended.
            'mave_urn': exp['experimentSetUrn'],
            'study_title': exp['title'],
            'study_abstract': exp['abstractText'],
            'study_short_summary': exp['shortDescription'],
            'study_published_date': exp['publishedDate'],
            # The experiment's DOI identifiers (exp['doiIdentifiers']) are not
            # included in the output record.
            'keywords': exp['keywords'],
        }
        record.update(publication)
        output_data.append(record)

        exp_id += 1



In [8]:
# Save the collected records as pretty-printed JSON (4-space indentation).
# The path is relative, so the file is created in the current working directory.
with open("MaveDB_minimal_metadata.json", mode="w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(output_data, indent=4))