# Verify Data Fields
2023-04-05 ZD  

This notebook will compare high-level data elements between versions of data received from CHoP for use in MTP.

## Import modules and define relative paths
Data structure will be based on the `verify_data_displayed_in_mtp.ipynb` DV3 Notebook. Edit only the core older/newer version constants below; all other file paths are derived from them.

In [None]:
import pandas as pd
import os

In [None]:
# Versions on the "older" side of the comparison (edit these)
OLD_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION = 'v12pre'
OLD_OPENPEDCAN_GENE_EXPRESSION_VERSION = 'v12pre'
OLD_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION = 'v12pre'

# Versions on the "newer" side of the comparison (edit these)
NEW_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION = 'v12pre2'
NEW_OPENPEDCAN_GENE_EXPRESSION_VERSION = 'v12pre2'
NEW_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION = 'v12pre2'

# Do not change below
# Everything from here on is derived from the version constants above.
# ------

# Methylation file name stems (Epigenetic Modification).
# These are the same for every data version.
METHYL_FILENAME = 'isoform-methyl-beta-values-summary'
METHYLGENE_FILENAME = 'gene-methyl-beta-values-summary'

# ---

# Older data: per-category raw directories
OLD_CHOP_SA_PATH = f'data/raw/chopOpenPedCan/somaticAlterations/{OLD_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION}/'
OLD_CHOP_GX_PATH = f'data/raw/chopOpenPedCan/geneExpression/{OLD_OPENPEDCAN_GENE_EXPRESSION_VERSION}/'
OLD_CHOP_EM_PATH = f'data/raw/chopOpenPedCan/epigeneticModification/{OLD_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION}/'

# Older data: Somatic Alterations files
OLD_CNV_PATH = f'{OLD_CHOP_SA_PATH}gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz'
OLD_SNVGENE_PATH = f'{OLD_CHOP_SA_PATH}gene-level-snv-consensus-annotated-mut-freq.jsonl.gz'
OLD_SNV_PATH = f'{OLD_CHOP_SA_PATH}variant-level-snv-consensus-annotated-mut-freq.jsonl.gz'
OLD_FUSIONGENE_PATH = f'{OLD_CHOP_SA_PATH}putative-oncogene-fused-gene-freq.jsonl.gz'
OLD_FUSION_PATH = f'{OLD_CHOP_SA_PATH}putative-oncogene-fusion-freq.jsonl.gz'

# Older data: Gene Expression files
OLD_TPMGENE_PATH = f'{OLD_CHOP_GX_PATH}long_n_tpm_mean_sd_quantile_gene_wise_zscore.jsonl.gz'
OLD_TPMGROUP_PATH = f'{OLD_CHOP_GX_PATH}long_n_tpm_mean_sd_quantile_group_wise_zscore.jsonl.gz'

# Older data: raw Methylation files (large)
OLD_METHYL_PATH = f'{OLD_CHOP_EM_PATH}{METHYL_FILENAME}.jsonl.gz'
OLD_METHYLGENE_PATH = f'{OLD_CHOP_EM_PATH}{METHYLGENE_FILENAME}.jsonl.gz'

# Older data: grouped Methylation directories (aggregated small files)
OLD_GROUPED_CHOP_EM_PATH = f'data/processed/chopOpenPedCan/epigeneticModification_grouped/{OLD_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION}/'
OLD_GROUPED_METHYL_PATH = f'{OLD_GROUPED_CHOP_EM_PATH}{METHYL_FILENAME}/'
OLD_GROUPED_METHYLGENE_PATH = f'{OLD_GROUPED_CHOP_EM_PATH}{METHYLGENE_FILENAME}/'

# ------

# Newer data: per-category raw directories
NEW_CHOP_SA_PATH = f'data/raw/chopOpenPedCan/somaticAlterations/{NEW_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION}/'
NEW_CHOP_GX_PATH = f'data/raw/chopOpenPedCan/geneExpression/{NEW_OPENPEDCAN_GENE_EXPRESSION_VERSION}/'
NEW_CHOP_EM_PATH = f'data/raw/chopOpenPedCan/epigeneticModification/{NEW_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION}/'

# Newer data: Somatic Alterations files
NEW_CNV_PATH = f'{NEW_CHOP_SA_PATH}gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz'
NEW_SNVGENE_PATH = f'{NEW_CHOP_SA_PATH}gene-level-snv-consensus-annotated-mut-freq.jsonl.gz'
NEW_SNV_PATH = f'{NEW_CHOP_SA_PATH}variant-level-snv-consensus-annotated-mut-freq.jsonl.gz'
NEW_FUSIONGENE_PATH = f'{NEW_CHOP_SA_PATH}putative-oncogene-fused-gene-freq.jsonl.gz'
NEW_FUSION_PATH = f'{NEW_CHOP_SA_PATH}putative-oncogene-fusion-freq.jsonl.gz'

# Newer data: Gene Expression files
NEW_TPMGENE_PATH = f'{NEW_CHOP_GX_PATH}long_n_tpm_mean_sd_quantile_gene_wise_zscore.jsonl.gz'
NEW_TPMGROUP_PATH = f'{NEW_CHOP_GX_PATH}long_n_tpm_mean_sd_quantile_group_wise_zscore.jsonl.gz'

# Newer data: raw Methylation files (large)
NEW_METHYL_PATH = f'{NEW_CHOP_EM_PATH}{METHYL_FILENAME}.jsonl.gz'
NEW_METHYLGENE_PATH = f'{NEW_CHOP_EM_PATH}{METHYLGENE_FILENAME}.jsonl.gz'

# Newer data: grouped Methylation directories (aggregated small files)
NEW_GROUPED_CHOP_EM_PATH = f'data/processed/chopOpenPedCan/epigeneticModification_grouped/{NEW_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION}/'
NEW_GROUPED_METHYL_PATH = f'{NEW_GROUPED_CHOP_EM_PATH}{METHYL_FILENAME}/'
NEW_GROUPED_METHYLGENE_PATH = f'{NEW_GROUPED_CHOP_EM_PATH}{METHYLGENE_FILENAME}/'


## Compare data versions

In [None]:
def load_jsonl_as_chunks(path:str, limit=None, chunksize=2.5e5):
    """
    Load large jsonl file in chunks and then combine to save memory.

    :param path: Relative filepath to jsonl file
    :param limit: max number of chunks to include in concat. Use to
        load a subset of the data. ``None`` (default) loads every chunk;
        ``0`` yields an empty DataFrame.
    :param chunksize: number of lines per chunk; passed straight to
        ``pd.read_json``
    :return: DataFrame holding the loaded records with a fresh
        0..n-1 index, or an empty DataFrame when no chunk was read

    """
    chunks = []

    # Use the reader as a context manager so the underlying file handle is
    # closed even when we stop early because `limit` was reached.
    with pd.read_json(path, orient='records', lines=True, chunksize=chunksize) as reader:
        for label, chunk in enumerate(reader):
            if label == limit:
                break
            chunks.append(chunk)

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not chunks:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(chunks, ignore_index=True)

In [None]:
# (old, new) path pairs, one per data file, in the order they are reported
pathTuples = [
    # Somatic Alterations
    (OLD_CNV_PATH, NEW_CNV_PATH),
    (OLD_SNVGENE_PATH, NEW_SNVGENE_PATH),
    (OLD_SNV_PATH, NEW_SNV_PATH),
    (OLD_FUSIONGENE_PATH, NEW_FUSIONGENE_PATH),
    (OLD_FUSION_PATH, NEW_FUSION_PATH),
    # Gene Expression
    (OLD_TPMGENE_PATH, NEW_TPMGENE_PATH),
    (OLD_TPMGROUP_PATH, NEW_TPMGROUP_PATH),
    # Epigenetic Modification (raw methylation)
    (OLD_METHYL_PATH, NEW_METHYL_PATH),
    (OLD_METHYLGENE_PATH, NEW_METHYLGENE_PATH),
]


### Check fields present across file versions

In [None]:
# Output version descriptions
introText = (
    f'Comparing data fields between versions:\n---\n'
    f'    Somatic Alterations:        {OLD_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION:8}'
                            f'-->     {NEW_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION:8}\n'
    f'    Gene Expression:            {OLD_OPENPEDCAN_GENE_EXPRESSION_VERSION:8}'
                            f'-->     {NEW_OPENPEDCAN_GENE_EXPRESSION_VERSION:8}\n'
    f'    Epigenetic Modification:    {OLD_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION:8}'
                            f'-->     {NEW_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION:8}\n')
print(introText, '---\n')

# Iterate through the (old, new) path pairs
for oldPath, newPath in pathTuples:

    # Get filename(s) for output labeling; flag files renamed between versions
    oldName = os.path.basename(oldPath)
    newName = os.path.basename(newPath)
    filename = oldName if oldName == newName else oldName+' renamed '+newName

    print(f'{filename} ...')

    # If new or old file is missing, skip comparison
    if not (os.path.exists(oldPath) and os.path.exists(newPath)):
        print('    Comparison not available - File(s) not found.')
        continue

    # Load first 10,000 lines of each dataset to gather columns
    # (Loading more than 1 to avoid dropping columns with NaN values)
    old = load_jsonl_as_chunks(oldPath, limit=1, chunksize=10_000)
    new = load_jsonl_as_chunks(newPath, limit=1, chunksize=10_000)

    # Find different columns between versions.
    # Sorted so the report is identical from run to run (set iteration
    # order for strings changes between kernel sessions).
    old_columns = set(old.columns)
    new_columns = set(new.columns)
    old_unique = sorted(old_columns - new_columns)
    new_unique = sorted(new_columns - old_columns)

    # Print output with filename label if any have mismatches
    if not new_unique and not old_unique:
        print('    All good. No field changes.')
    if new_unique:
        print('    New fields added:', *new_unique, sep='\n        ')
    if old_unique:
        print('    Old fields dropped:', *old_unique, sep='\n        ')


## Show sample first line of data for each new file

In [None]:
# Output intro descriptions
introText = (
f'Sample first record from each datasource:\n---\n'
f'    Somatic Alterations:        {NEW_OPENPEDCAN_SOMATIC_ALTERATIONS_VERSION:8}\n'
f'    Gene Expression:            {NEW_OPENPEDCAN_GENE_EXPRESSION_VERSION:8}\n'
f'    Epigenetic Modification:    {NEW_OPENPEDCAN_EPIGENETIC_MODIFICATION_VERSION:8}\n')
print(introText, '---\n')

# Iterate through the path pairs; only the new-version path is used here
for _oldPath, newPath in pathTuples:

    # Get filename for output labeling
    filename = os.path.basename(newPath)

    print(f'{filename} ...\n')

    # If new file is missing, skip output
    if not os.path.exists(newPath):
        print('    File not found. \n---\n')
        continue

    # Load first 10,000 lines of the dataset before showing the first record
    # (Loading more than 1 to avoid dropping columns with NaN values)
    new = load_jsonl_as_chunks(newPath, limit=1, chunksize=10_000)

    # An existing file with no records would make .iloc[0] raise IndexError
    if new.empty:
        print('    File contains no records. \n---\n')
        continue

    print(f'{new.iloc[0]} \n---\n')
