In [1]:
import json
import os

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

from main import load_data, prior_from_weights


In [2]:
# Identifiers of every dataset summarised by this notebook, in processing order.
dataset_ids = [
    "IGVFFI5226LVQD",
    "IGVFFI5890AHYL",
    "urn_mavedb_00000013-a-1",
    "urn_mavedb_00000054-a-1",
    "urn_mavedb_00000068-b-1",
    "urn_mavedb_00000097-0-1",
    "urn_mavedb_00000108-a-2",
    "urn_mavedb_00000001-c-1",
    "urn_mavedb_00000050-a-1",
    "urn_mavedb_00000068-a-1",
    "urn_mavedb_00000068-c-1",
    "urn_mavedb_00000108-a-1",
    "Adomovich_BRCA1_CR",
    "Boettcher_TP53",
    "Hu_BRCA2",
    "Adomovich_BRCA1_HDR",
    "CHK2_MAVE",
    "Kozek_KCNH2",
    "Adomovich_BRCA1_Low_Throughput",
]

In [3]:
# Echo the list so the dataset IDs can be checked by eye.
dataset_ids

['IGVFFI5226LVQD',
 'IGVFFI5890AHYL',
 'urn_mavedb_00000013-a-1',
 'urn_mavedb_00000054-a-1',
 'urn_mavedb_00000068-b-1',
 'urn_mavedb_00000097-0-1',
 'urn_mavedb_00000108-a-2',
 'urn_mavedb_00000001-c-1',
 'urn_mavedb_00000050-a-1',
 'urn_mavedb_00000068-a-1',
 'urn_mavedb_00000068-c-1',
 'urn_mavedb_00000108-a-1',
 'Adomovich_BRCA1_CR',
 'Boettcher_TP53',
 'Hu_BRCA2',
 'Adomovich_BRCA1_HDR',
 'CHK2_MAVE',
 'Kozek_KCNH2',
 'Adomovich_BRCA1_Low_Throughput']

In [4]:
# Build one summary record per dataset for the selected configuration.
#
# `statistics` maps dataset ID -> dict with three column sums of S, the number
# of rows in S, the control sample name taken from the dataset's config file,
# and a prior estimate derived from the stored result weights (NaN when no
# result file exists for that dataset).

# Root folders used below. DATA_DIRECTORY keeps its trailing slash because it
# is passed verbatim to load_data.
DATA_DIRECTORY = '/mnt/d/mave_calibration/data/'
RESULTS_DIRECTORY = '/mnt/d/mave_calibration/results'

statistics = {}
config_name = 'missense_config'

# Name of the third count column, derived from the configuration prefix
# (the text before the first underscore); it does not change per dataset.
variant_count_key = f'n_{config_name.split("_")[0]}'

for dataset_id in dataset_ids:
    # Only S is used in this cell; X and the third return value are kept
    # unpacked under their original names.
    # NOTE(review): S is indexed as a 2-D array below; presumably a per-variant
    # sample indicator matrix -- confirm against load_data.
    X, S, _ = load_data(dataset_id=dataset_id, config_name=config_name, data_directory=DATA_DIRECTORY)

    config_path = os.path.join(DATA_DIRECTORY, dataset_id, f"{config_name}.json")
    with open(config_path) as f:
        config = json.load(f)

    # A missing result file means the dataset has not been fit yet; record NaN.
    # Only the file read is guarded, so an error raised while computing the
    # prior is not mistaken for a missing result.
    result_path = os.path.join(RESULTS_DIRECTORY, config_name, dataset_id, "result.json")
    try:
        with open(result_path) as f:
            result = json.load(f)
    except FileNotFoundError:
        prior_estimate = np.nan
    else:
        prior_estimate = prior_from_weights(np.array(result['weights'])).item()

    statistics[dataset_id] = {
        'n_pathogenic' : S[:,0].sum(),
        'n_control' : S[:,1].sum(),
        variant_count_key : S[:,2].sum(),
        'total' : len(S),
        # The control sample is taken to be the second entry of 'sample_definitions'.
        'control_type' : list(config['sample_definitions'].keys())[1],
        'prior_estimate' : prior_estimate,
    }

In [None]:
import pandas as pd

In [None]:
# Publication metadata for each dataset, keyed by dataset ID.
# Every entry carries first_author / year / gene_symbol; 'url' and 'note' are
# optional. Key order inside each entry is significant for the files and table
# columns generated from this mapping, so it is kept as originally written.
publication_info = {
    "Kozek_KCNH2": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/32522694/",
        "first_author": "Kozek",
        "year": "2020",
        "gene_symbol": "KCNH2",
    },
    "Boettcher_TP53": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/31395785/",
        "first_author": "Boettcher",
        "year": "2019",
        "gene_symbol": "TP53",
    },
    "Hu_BRCA2": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/38417439/",
        "first_author": "Hu",
        "year": "2024",
        "gene_symbol": "BRCA2",
    },
    "CHK2_MAVE": {
        "url": "https://www.biorxiv.org/content/10.1101/2024.02.13.579700v1",
        "first_author": "Gebbia",
        "year": "2024",
        "gene_symbol": "CHEK2",
    },
    "Adomovich_BRCA1_CR": {
        "url": "https://www.cell.com/ajhg/fulltext/S0002-9297(22)00050-7",
        "first_author": "Adomovich",
        "year": "2022",
        "note": "cisplatin resistance",
        "gene_symbol": "BRCA1",
    },
    "Adomovich_BRCA1_HDR": {
        "url": "https://www.cell.com/ajhg/fulltext/S0002-9297(22)00050-7",
        "first_author": "Adomovich",
        "year": "2022",
        "note": "homology-directed repair",
        "gene_symbol": "BRCA1",
    },
    "urn_mavedb_00000013-a-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/29785012/",
        "first_author": "Matreyek",
        "year": "2018",
        "gene_symbol": "PTEN",
    },
    "urn_mavedb_00000001-c-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/29269382/",
        "first_author": "Weile",
        "year": "2017",
        "gene_symbol": "CALM1",
    },
    "urn_mavedb_00000050-a-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/33357406/",
        "first_author": "Jia",
        "year": "2021",
        "gene_symbol": "MSH2",
    },
    "urn_mavedb_00000054-a-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/29706350/",
        "first_author": "Mighell",
        "year": "2018",
        "gene_symbol": "PTEN",
    },
    "urn_mavedb_00000068-a-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/30224644/",
        "first_author": "Giacomelli",
        "year": "2018",
        "gene_symbol": "TP53",
        "note": "Mutated p53 paired with wildtype under nutlin-3",
    },
    "urn_mavedb_00000068-b-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/30224644/",
        "first_author": "Giacomelli",
        "year": "2018",
        "gene_symbol": "TP53",
        "note": "Mutated p53 with nutlin-3",
    },
    "urn_mavedb_00000068-c-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/30224644/",
        "first_author": "Giacomelli",
        "year": "2018",
        "gene_symbol": "TP53",
        "note": "Mutated p53 with etoposide",
    },
    "urn_mavedb_00000097-0-1": {
        "url": "https://pubmed.ncbi.nlm.nih.gov/30209399/",
        "first_author": "Findlay",
        "year": "2018",
        "gene_symbol": "BRCA1",
    },
    "urn_mavedb_00000108-a-1": {
        "url": "https://www.cell.com/ajhg/fulltext/S0002-9297(23)00288-4",
        "first_author": "van Loggerenberg",
        "year": "2023",
        "gene_symbol": "HMBS",
        "note": "Missense variant effect map for the human ubiquitous HMBS isoform",
    },
    "urn_mavedb_00000108-a-2": {
        "url": "https://www.cell.com/ajhg/fulltext/S0002-9297(23)00288-4",
        "first_author": "van Loggerenberg",
        "year": "2023",
        "gene_symbol": "HMBS",
        "note": "Missense variant effect map for the human erythroid-specific HMBS isoform",
    },
    # The two IGVF datasets have no URL or note recorded.
    "IGVFFI5226LVQD": {
        "first_author": "IGVF",
        "gene_symbol": "F9",
        "year": "2024",
    },
    "IGVFFI5890AHYL": {
        "first_author": "IGVF",
        "gene_symbol": "CYP2C19",
        "year": "2024",
    },
    "Adomovich_BRCA1_Low_Throughput": {
        "url": "https://www.cell.com/ajhg/fulltext/S0002-9297(22)00050-7",
        "first_author": "Adomovich",
        "year": "2022",
        "note": "Supp. Table S7",
        "gene_symbol": "BRCA1",
    },
}

In [None]:
# Write each dataset's publication metadata to an info_<config_name>.json file
# inside that dataset's data folder.
for dataset_id, dataset_info in publication_info.items():
    info_path = os.path.join("/mnt/d/mave_calibration/data", dataset_id, f"info_{config_name}.json")
    with open(info_path, 'w') as info_file:
        info_file.write(json.dumps(dataset_info))

In [None]:
# Combine the computed statistics with the publication metadata, side by side.
# The statistics frame is transposed before joining; both pieces are then
# concatenated column-wise (axis=1).
statistics_table = pd.DataFrame.from_records(statistics).T
publication_table = pd.DataFrame.from_dict(publication_info, orient='index')
stat_table = pd.concat([statistics_table, publication_table], axis=1)

In [None]:
# Export the summary, ordered by the 'total' column, as JSON records and as CSV.
summary_path_stem = f"/mnt/d/mave_calibration/dataset_summaries_{config_name}"
stat_table_by_total = stat_table.sort_values(by='total')
stat_table_by_total.to_json(summary_path_stem + ".json", orient='records')
stat_table_by_total.to_csv(summary_path_stem + ".csv")

In [None]:
# Show the summary table ordered by the 'total' column.
stat_table.sort_values(by='total')