### This notebook contains python code to extract the **full** metadata from the MaveDB data dump. 

The notebook reads the MaveDB data, parses it to extract all the metadata (apart from some irrelevant fields), and writes a separate json and csv with the extracted metadata.

In [2]:
import json
import pandas as pd

In [3]:
# read json file
with open('Dump/mavedb-data.20240520/main.json') as input_file:
    original_json = input_file.read()

# extract experimentSets which contains the data
json_data = json.loads(original_json)['experimentSets']

In [8]:
# create the output dataframe to store the data
output_df = pd.DataFrame()

# loop through the json data and extract the required fields
for exp_set in json_data:
    experiments = exp_set["experiments"]
    # loop through the experiments and extract all the data
    # this way I disregard the experiment set as a structure
    for exp in experiments:
        exp_meta_df = pd.DataFrame(
            {
                "exp_title": exp["title"],
                "exp_short_description": exp["shortDescription"],
                "exp_abstract": exp["abstractText"],
                "exp_published_date": exp["publishedDate"],
                "exp_associated_datasets": [exp["rawReadIdentifiers"]],
                "exp_urn": exp["urn"]
                # "keywords": exp["keywords"]
            },
            index=[0]
        )
        
        # extract the score set data
        score_set_output_df = pd.DataFrame()
        
        for score_set in exp["scoreSets"]:
            # extract the relevant score set data
            score_set_df = pd.DataFrame(
                {
                    "score_set_title": score_set["title"],
                    "score_set_method": score_set["methodText"],
                    "score_set_abstract": score_set["abstractText"],
                    "score_set_short_description": score_set["shortDescription"],
                    "score_set_num_variants": score_set["numVariants"],
                    "score_set_published_date": score_set["publishedDate"],
                    "score_set_keywords": [score_set["keywords"]],
                    "score_set_dataset_count_columns": [score_set["datasetColumns"]['countColumns']],
                    "score_set_dataset_score_columns": [score_set["datasetColumns"]['scoreColumns']],
                    "score_set_processing_state": score_set["processingState"],
                    "exp_urn": exp["urn"],
                    "score_set_urn": score_set["urn"]
                },
                index=[0]
            )
            # extract the primary and secondary publication data
            primary_pub_df = pd.DataFrame(score_set["primaryPublicationIdentifiers"])
            # remove the id column
            if 'id' in primary_pub_df.columns:
                primary_pub_df = primary_pub_df.drop(columns=["id"])
            # add prefix to column names
            primary_pub_df.columns = ["primary_pub_" + col for col in primary_pub_df.columns]
            primary_pub_df["score_set_urn"] = score_set["urn"]
            primary_pub_df["exp_urn"] = exp["urn"]
            secondary_pub_df = pd.DataFrame(score_set["secondaryPublicationIdentifiers"])
            # remove the id column
            if 'id' in secondary_pub_df.columns:
                secondary_pub_df = secondary_pub_df.drop(columns=["id"])
            # add prefix to column names
            secondary_pub_df.columns = ["secondary_pub_" + col for col in secondary_pub_df.columns]
            secondary_pub_df["score_set_urn"] = score_set["urn"]
            secondary_pub_df["exp_urn"] = exp["urn"]
            
            # join score set + primary and secondary publication dataframes
            score_set_df = pd.merge(score_set_df, primary_pub_df, on=["score_set_urn", "exp_urn"], how="outer")
            score_set_df = pd.merge(score_set_df, secondary_pub_df, on=["score_set_urn", "exp_urn"], how="outer")

            # extract the target genes data
            target_genes_df = pd.DataFrame()
            
            # if the targetGenes field is empty, create an empty dataframe with all the columns and score_set_urn
            # else, loop through the targetGenes and extract the required fields
            if score_set['targetGenes'] == []:
                target_gene_df = pd.DataFrame(
                    {
                        "gene_name": None,
                        "gene_category": None,
                        "sequence_type": None,
                        "sequence": None,
                        "label": None,
                        "taxId": None,
                        "organismName": None,
                        "score_set_urn": score_set["urn"]
                    },
                    index=[0]
                )
                # append the target gene dataframe to the target genes dataframe
                target_genes_df = pd.concat([target_genes_df, target_gene_df], axis=0)
            else:
                for gene in score_set['targetGenes']:
                    target_gene_df = pd.DataFrame(
                        {
                            "gene_name": gene["name"],
                            "gene_category": gene["category"]
                        },
                        index=[0]
                    )
                    for gene_external_id in gene['externalIdentifiers']:
                        gene_external_id_df = pd.DataFrame(gene_external_id['identifier'], index=[0])
                        gene_external_id_df.columns = [gene_external_id['identifier']['dbName'] + '_' + col for col in gene_external_id_df.columns]
                        target_gene_df = pd.concat([target_gene_df, gene_external_id_df], axis=1)
                    target_gene_df["sequence_type"] = gene["targetSequence"]["sequenceType"]
                    target_gene_df["sequence"] = gene["targetSequence"]["sequence"]
                    target_gene_df["label"] = gene["targetSequence"]["label"]
                    target_gene_taxonomy_df = pd.DataFrame(gene["targetSequence"]["taxonomy"], index=[0]).drop(columns=["id"])
                    target_gene_df = pd.concat([target_gene_df, target_gene_taxonomy_df], axis=1)
                    target_gene_df["score_set_urn"] = score_set["urn"]
                    # append the target gene dataframe to the target genes dataframe
                    target_genes_df = pd.concat([target_genes_df, target_gene_df], axis=0)
            
            # join the score set dataframe with the target genes dataframe      
            score_set_df = pd.merge(score_set_df, target_genes_df, on="score_set_urn", how="outer")
                        
            # append the score set dataframe to the score set output dataframe
            score_set_output_df = pd.concat([score_set_output_df, score_set_df], axis=0)
            
        # join the experiment metadata dataframe with the score set output dataframe
        exp_meta_df = pd.merge(exp_meta_df, score_set_output_df, on="exp_urn", how="outer")

        # append the experiment metadata dataframe to the output dataframe
        output_df = pd.concat([output_df, exp_meta_df], axis=0)
        
output_df.reset_index(drop=True, inplace=True)

  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = pd.concat([output_df, exp_meta_df], axis=0)
  output_df = 

In [10]:
# save the output dataframe as a csv file
output_df.to_csv("MaveDB_metadata.csv", index=False)

# save the output dataframe as a json file
output_df.to_json("MaveDB_metadata.json", orient="records", lines=False, indent=4)

In [9]:
output_df.head()

Unnamed: 0,exp_title,exp_short_description,exp_abstract,exp_published_date,exp_associated_datasets,exp_urn,score_set_title,score_set_method,score_set_abstract,score_set_short_description,...,secondary_pub_url,secondary_pub_referenceHtml,secondary_pub_title,secondary_pub_abstract,secondary_pub_authors,secondary_pub_publicationDoi,secondary_pub_preprintDoi,secondary_pub_publicationYear,secondary_pub_preprintDate,secondary_pub_publicationJournal
0,UBE2I yeast complementation,A Deep Mutational Scan of the human SUMO E2 co...,Although we now routinely sequence human genom...,2018-06-26,"[{'identifier': 'SRP109101', 'id': 2, 'url': '...",urn:mavedb:00000001-a,UBE2I imputed & refined,##Scoring procedure:\r\nDMS-BarSeq and DMS-Til...,Although we now routinely sequence human genom...,A joint Deep Mutational Scan of the human SUMO...,...,,,,,,,,,,
1,UBE2I yeast complementation,A Deep Mutational Scan of the human SUMO E2 co...,Although we now routinely sequence human genom...,2018-06-26,"[{'identifier': 'SRP109101', 'id': 2, 'url': '...",urn:mavedb:00000001-a,UBE2I DMS-BarSeq,##Scoring procedure:\r\nDMS-BarSeq and reads w...,Although we now routinely sequence human genom...,A Deep Mutational Scan of the human SUMO E2 co...,...,,,,,,,,,,
2,UBE2I yeast complementation,A Deep Mutational Scan of the human SUMO E2 co...,Although we now routinely sequence human genom...,2018-06-26,"[{'identifier': 'SRP109101', 'id': 2, 'url': '...",urn:mavedb:00000001-a,UBE2I DMS-TileSeq,##Scoring procedure:\r\nDMS-TileSeq reads were...,Although we now routinely sequence human genom...,A Deep Mutational Scan of the human SUMO E2 co...,...,,,,,,,,,,
3,UBE2I yeast complementation,A Deep Mutational Scan of the human SUMO E2 co...,Although we now routinely sequence human genom...,2018-06-26,"[{'identifier': 'SRP109101', 'id': 2, 'url': '...",urn:mavedb:00000001-a,UBE2I joint data,##Scoring procedure:\r\nDMS-BarSeq and DMS-Til...,Although we now routinely sequence human genom...,A joint Deep Mutational Scan dataset of the hu...,...,,,,,,,,,,
4,SUMO1 yeast complementation,A Deep Mutational Scan of the human SUMO1 usin...,Although we now routinely sequence human genom...,2018-06-29,"[{'identifier': 'SRP109119', 'id': 3, 'url': '...",urn:mavedb:00000001-b,SUMO1 imputed and refined,##Scoring procedure:\r\nDMS-TileSeq reads were...,Although we now routinely sequence human genom...,A machine-learning imputed and refined Deep Mu...,...,,,,,,,,,,
