In [None]:
import pandas as pd


In [None]:
# Coding consequence terms grouped into the row categories of the CDS tables.
# The terms look like Ensembl VEP / Sequence Ontology names — TODO confirm
# against the annotation pipeline. Key order is the row order of the tables.
grouped_coding_consequences = {
    # Every coding term, including three that belong to no narrower group below.
    'all': [
        'synonymous_variant',
        'stop_retained_variant',
        'start_retained_variant',
        'missense_variant',
        'stop_gained',
        'stop_lost',
        'start_lost',
        'splice_acceptor_variant',
        'splice_donor_variant',
        'frameshift_variant',
        'inframe_insertion',
        'inframe_deletion',
        'protein_altering_variant',
        'incomplete_terminal_codon_variant',
        'coding_sequence_variant',
    ],
    'synonymous': [
        'synonymous_variant',
        'stop_retained_variant',
        'start_retained_variant',
    ],
    'non-synonymous': [
        'missense_variant',
        'stop_gained',
        'stop_lost',
        'start_lost',
    ],
    'splice': [
        'splice_acceptor_variant',
        'splice_donor_variant',
    ],
    'frameshift': [
        'frameshift_variant',
    ],
    'inframe': [
        'inframe_insertion',
        'inframe_deletion',
    ],
}

# Non-coding consequence terms grouped into the row categories of the
# "outside CDS" and intergenic tables. Key order is the row order of the tables.
grouped_noncoding_consequences = {
    # Every non-coding term, including several that belong to no narrower group below.
    'all': [
        '3_prime_UTR_variant',
        '5_prime_UTR_variant',
        'NMD_transcript_variant',
        'TFBS_ablation',
        'TFBS_amplification',
        'TF_binding_site_variant',
        'downstream_gene_variant',
        'feature_elongation',
        'feature_truncation',
        'intergenic_variant',
        'intron_variant',
        'mature_miRNA_variant',
        'non_coding_transcript_exon_variant',
        'non_coding_transcript_variant',
        'regulatory_region_ablation',
        'regulatory_region_amplification',
        'regulatory_region_variant',
        'splice_region_variant',
        'transcript_ablation',
        'transcript_amplification',
        'upstream_gene_variant',
    ],
    '3_prime_UTR_variant': [
        '3_prime_UTR_variant',
    ],
    '5_prime_UTR_variant': [
        '5_prime_UTR_variant',
    ],
    # Splice-region variants are counted together with intronic ones.
    'intron_variant': [
        'intron_variant',
        'splice_region_variant',
    ],
    'intergenic': [
        'downstream_gene_variant',
        'upstream_gene_variant',
        'intergenic_variant',
    ],
    'regulatory': [
        'TFBS_ablation',
        'TFBS_amplification',
        'TF_binding_site_variant',
        'regulatory_region_variant',
        'regulatory_region_ablation',
        'regulatory_region_amplification',
    ],
}


In [None]:
# Value of the FILTER column whose variants are summarised in every table below.
FILTER = 'FAIL'

# Chromosomes whose per-chromosome summaries are loaded: 1 through 22, then X.
# Leave out the append to restrict the analysis to the autosomes.
chromosomes = list(map(str, range(1, 23)))
chromosomes.append('X')

In [None]:
# One tab-separated summary table per chromosome, read from the working directory.
filenames = ['chr{}.summary.txt'.format(c) for c in chromosomes]

# Stack the per-chromosome tables, then add up the remaining columns over all
# rows that share the same five key columns.
summary = (
    pd.concat([pd.read_csv(path, sep='\t') for path in filenames])
    .groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'CONSEQUENCE', 'COUNT'])
    .sum()
    .reset_index()
)



## 1. Variants genome-wide
---

In [None]:
# Genome-wide table: one row per variant type, with total count, singleton
# percentage, CpG percentage and Ts/Tv ratios for all, novel and dbSNP variants.
index = []
counts = []
# Placeholder for values that cannot be computed. NaN (rather than None) keeps a
# ratio column numeric even when every row lacks the value, so the float
# formatter below never receives None.
missing = float('nan')

for variant_type in ['ALL', 'SNV', 'INDEL']:
    # Genome-wide rows for this variant type; the COUNT column tells them apart.
    genome_wide = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == variant_type) &
        (summary.CONTEXT_TYPE == 'ALL') &
        (summary.CONSEQUENCE == 'ALL')]
    df_n = genome_wide[genome_wide.COUNT == 'N']
    df_cpg = genome_wide[genome_wide.COUNT == 'CpG']
    df_ts = genome_wide[genome_wide.COUNT == 'TS']

    # Without a total there is nothing to report (and nothing to divide by).
    if df_n.empty:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        all_column = 'ALL' + suffix
        singletons_column = 'SINGLETONS' + suffix
        n_all = df_n[all_column].values[0]
        n_singletons = df_n[singletons_column].values[0]

        pct_singletons = n_singletons / n_all * 100 if n_all > 0 else missing

        pct_cpg = missing
        if not df_cpg.empty and n_all > 0:
            pct_cpg = df_cpg[all_column].values[0] / n_all * 100

        tstv = missing
        tstv_singletons = missing
        if not df_ts.empty:
            ts_all = df_ts[all_column].values[0]
            ts_singletons = df_ts[singletons_column].values[0]
            # The ratio labelled TsTv is TS / (N - TS); skip it when the
            # denominator is not positive instead of producing inf/nan warnings.
            if n_all - ts_all > 0:
                tstv = ts_all / (n_all - ts_all)
            if n_singletons - ts_singletons > 0:
                tstv_singletons = ts_singletons / (n_singletons - ts_singletons)

        row.extend([n_all, pct_singletons, pct_cpg, tstv, tstv_singletons])

    index.append(variant_type)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([
        ['All', 'Novel', 'In dbSNP'], ['N', '% singletons', '% CpG', 'TsTv', 'TsTv singletons']], names=['', '']))

# Counts get thousands separators; every percentage and ratio gets one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})



## 2. Variants in protein coding genes
---
### 2.1. Variants inside CDS

In [None]:
# Table of variants inside CDS of protein coding genes: one row per coding
# consequence group, with total count and singleton percentage for all, novel
# and dbSNP variants.
index = []
counts = []
# Placeholder for a percentage whose denominator is zero (displayed as 'nan').
missing = float('nan')

for group, consequences in grouped_coding_consequences.items():
    # Add up the counts of every consequence in the group. numeric_only keeps
    # the string CONSEQUENCE column out of the sum.
    df = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'ALL') &
        (summary.CONTEXT_TYPE == 'CODING') &
        (summary.CONSEQUENCE.isin(consequences)) &
        (summary.COUNT == 'N')].groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()

    # No matching rows at all: there is no first row to read, so skip the group.
    if df.empty:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        n_all = df['ALL' + suffix].values[0]
        n_singletons = df['SINGLETONS' + suffix].values[0]
        row.append(n_all)
        row.append(round(n_singletons / n_all * 100, 1) if n_all > 0 else missing)

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([['All', 'Novel', 'In dbSNP'], ['N', '% singletons']], names=['', '']))

# Counts get thousands separators; percentages get one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})

### 2.2. SNVs inside CDS

In [None]:
# Table of SNVs inside CDS of protein coding genes: one row per coding
# consequence group, with total count, singleton percentage, CpG percentage and
# Ts/Tv ratios for all, novel and dbSNP SNVs.
index = []
counts = []
# Placeholder for values that cannot be computed. NaN (rather than None) keeps a
# ratio column numeric even when every row lacks the value, so the float
# formatter below never receives None.
missing = float('nan')

for group, consequences in grouped_coding_consequences.items():
    # Add up the counts of every consequence in the group, separately for each
    # COUNT kind. numeric_only keeps the string CONSEQUENCE column out of the sum.
    coding_snvs = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'SNV') &
        (summary.CONTEXT_TYPE == 'CODING') &
        (summary.CONSEQUENCE.isin(consequences))]
    totals = coding_snvs.groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()
    df_n = totals[totals.COUNT == 'N']
    df_cpg = totals[totals.COUNT == 'CpG']
    df_ts = totals[totals.COUNT == 'TS']

    # Skip groups with no SNVs, whether the rows are absent or sum to zero.
    if df_n.empty or df_n.ALL.values[0] == 0:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        all_column = 'ALL' + suffix
        singletons_column = 'SINGLETONS' + suffix
        n_all = df_n[all_column].values[0]
        n_singletons = df_n[singletons_column].values[0]

        pct_singletons = n_singletons / n_all * 100 if n_all > 0 else missing

        pct_cpg = missing
        if not df_cpg.empty and n_all > 0:
            pct_cpg = df_cpg[all_column].values[0] / n_all * 100

        tstv = missing
        tstv_singletons = missing
        if not df_ts.empty:
            ts_all = df_ts[all_column].values[0]
            ts_singletons = df_ts[singletons_column].values[0]
            # The ratio labelled TsTv is TS / (N - TS); skip it when the
            # denominator is not positive instead of producing inf/nan warnings.
            if n_all - ts_all > 0:
                tstv = ts_all / (n_all - ts_all)
            if n_singletons - ts_singletons > 0:
                tstv_singletons = ts_singletons / (n_singletons - ts_singletons)

        row.extend([n_all, pct_singletons, pct_cpg, tstv, tstv_singletons])

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='SNVs'),
    columns=pd.MultiIndex.from_product([
        ['All', 'Novel', 'In dbSNP'],
        ['N', '% singletons', '% CpG', 'TsTv', 'TsTv singletons']], names=['', '']))

# Counts get thousands separators; every percentage and ratio gets one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})

### 2.3. Variants outside CDS

In [None]:
# Table of variants outside CDS in protein coding genes: one row per non-coding
# consequence group that has variants, with total count and singleton
# percentage for all, novel and dbSNP variants.
index = []
counts = []
# Placeholder for a percentage whose denominator is zero (displayed as 'nan').
missing = float('nan')

for group, consequences in grouped_noncoding_consequences.items():
    # Add up the counts of every consequence in the group. numeric_only keeps
    # the string CONSEQUENCE column out of the sum.
    df = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'ALL') &
        (summary.CONTEXT_TYPE == 'CODING') &
        (summary.CONSEQUENCE.isin(consequences)) &
        (summary.COUNT == 'N')].groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()

    # Skip groups without variants, whether the rows are absent or sum to zero.
    if df.empty or df.ALL.values[0] <= 0:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        n_all = df['ALL' + suffix].values[0]
        n_singletons = df['SINGLETONS' + suffix].values[0]
        row.append(n_all)
        # Novel or known counts may be zero even when the overall total is not.
        row.append(round(n_singletons / n_all * 100, 1) if n_all > 0 else missing)

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([['All', 'Novel', 'In dbSNP'], ['N', 'Singletons (%)']], names=['', '']))

# Counts get thousands separators; percentages get one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})


## 3. Variants in other genes
---
### 3.1. Variants inside CDS

In [None]:
# Table of variants with coding consequences in the NONCODING context: one row
# per coding consequence group, with total count and singleton percentage for
# all, novel and dbSNP variants.
index = []
counts = []
# Placeholder for a percentage whose denominator is zero (displayed as 'nan').
missing = float('nan')

for group, consequences in grouped_coding_consequences.items():
    # Add up the counts of every consequence in the group. numeric_only keeps
    # the string CONSEQUENCE column out of the sum.
    df = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'ALL') &
        (summary.CONTEXT_TYPE == 'NONCODING') &
        (summary.CONSEQUENCE.isin(consequences)) &
        (summary.COUNT == 'N')].groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()

    # No matching rows at all: there is no first row to read, so skip the group.
    if df.empty:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        n_all = df['ALL' + suffix].values[0]
        n_singletons = df['SINGLETONS' + suffix].values[0]
        row.append(n_all)
        row.append(round(n_singletons / n_all * 100, 1) if n_all > 0 else missing)

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([['All', 'Novel', 'In dbSNP'], ['N', 'Singletons (%)']], names=['', '']))

# Counts get thousands separators; percentages get one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})


### 3.2. Variants outside CDS

In [None]:
# Table of variants with non-coding consequences in the NONCODING context: one
# row per non-coding consequence group that has variants, with total count and
# singleton percentage for all, novel and dbSNP variants.
index = []
counts = []
# Placeholder for a percentage whose denominator is zero (displayed as 'nan').
missing = float('nan')

for group, consequences in grouped_noncoding_consequences.items():
    # Add up the counts of every consequence in the group. numeric_only keeps
    # the string CONSEQUENCE column out of the sum.
    df = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'ALL') &
        (summary.CONTEXT_TYPE == 'NONCODING') &
        (summary.CONSEQUENCE.isin(consequences)) &
        (summary.COUNT == 'N')].groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()

    # Skip groups without variants, whether the rows are absent or sum to zero.
    if df.empty or df.ALL.values[0] <= 0:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        n_all = df['ALL' + suffix].values[0]
        n_singletons = df['SINGLETONS' + suffix].values[0]
        row.append(n_all)
        # Novel or known counts may be zero even when the overall total is not.
        row.append(round(n_singletons / n_all * 100, 1) if n_all > 0 else missing)

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([['All', 'Novel', 'In dbSNP'], ['N', 'Singletons (%)']], names=['', '']))

# Counts get thousands separators; percentages get one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})


## 4. Variants in intergenic regions
---

In [None]:
# Table of variants in the INTERGENIC context: one row per non-coding
# consequence group that has variants, with total count and singleton
# percentage for all, novel and dbSNP variants.
index = []
counts = []
# Placeholder for a percentage whose denominator is zero (displayed as 'nan').
missing = float('nan')

for group, consequences in grouped_noncoding_consequences.items():
    # Add up the counts of every consequence in the group. numeric_only keeps
    # the string CONSEQUENCE column out of the sum.
    df = summary[
        (summary.FILTER == FILTER) &
        (summary.VARIANT_TYPE == 'ALL') &
        (summary.CONTEXT_TYPE == 'INTERGENIC') &
        (summary.CONSEQUENCE.isin(consequences)) &
        (summary.COUNT == 'N')].groupby(['FILTER', 'VARIANT_TYPE', 'CONTEXT_TYPE', 'COUNT']).sum(numeric_only=True).reset_index()

    # Skip groups without variants, whether the rows are absent or sum to zero.
    if df.empty or df.ALL.values[0] <= 0:
        continue

    row = []
    # Column suffixes line up with the 'All', 'Novel', 'In dbSNP' header groups.
    for suffix in ['', '_NOVEL', '_KNOWN']:
        n_all = df['ALL' + suffix].values[0]
        n_singletons = df['SINGLETONS' + suffix].values[0]
        row.append(n_all)
        # Novel or known counts may be zero even when the overall total is not.
        row.append(round(n_singletons / n_all * 100, 1) if n_all > 0 else missing)

    index.append(group)
    counts.append(row)

df = pd.DataFrame(counts,
    index=pd.Index(index, name='Variants'),
    columns=pd.MultiIndex.from_product([['All', 'Novel', 'In dbSNP'], ['N', 'Singletons (%)']], names=['', '']))

# Counts get thousands separators; percentages get one decimal.
df.style.format(formatter={
    (header, metric): '{:,.0f}' if metric == 'N' else '{:.1f}'
    for header, metric in df.columns
})