# Parse the Ella Analysis

In the previous notebook I demonstrated how to query the database directly to create a custom report. This is an example of you could then query and output the data.

Note that the analysis output format may change!

This assumes that you ran the 'Analysis-brca_sample_1.HB0C' notebook to generate the data file.

In [1]:
import json
import pandas as pd
import os
import copy

In [2]:
# There was an illegal escape sequence in the original file 
# Make sure not to have any slashes in front of analysis_data. (not /analysis_data)
analysis_data_file = '/data/analysis_data.json'
analysis_data_file

'/data/analysis_data.json'

In [3]:
with open(analysis_data_file, 'r') as reader:
    analysis_data = json.load(reader)
    
acmg_config = []
#Class 1 benign
#Class 2 likely benign
#Class 3 variant of uncertain significance (VUS)
#Class 4 likely pathogenic
#Class 5 pathogenic

acmg_codes = {
    '1': 'Benign',
    '2': 'Likely Benign',
    '3': 'Variant of Uncertain Significance',
    '4': 'Likely Pathogenic',
    '5': 'Pathogenic',
    'U': 'Unknown'
}

for idx, assessment in enumerate(analysis_data['alleleassessment']):
    acmg_config_t = {
        'id': assessment['id'],
        'acmg_code' : assessment['classification'],
        'acmg_class' : acmg_codes[assessment['classification']],
        'allele_id': assessment['allele_id']
    }
    acmg_eval = assessment['evaluation']['acmg']['included']
    codes = map(lambda x: x['code'], acmg_eval)
    codes = list(codes)
    acmg_config_t['codes'] = ', '.join(codes)
    acmg_config.append(acmg_config_t)
    assessment['acmg_class'] = acmg_config_t['acmg_class']
    
analysis_data.keys()

dict_keys(['sample', 'alleleassessment', 'allelereport', 'geneassessment', 'analysisinterpretation', 'analysis', 'annotation', 'annotationshadowtranscript', 'allele', 'genotype', 'genotypesampledata', 'filtered_allele_data', 'filtered_allele_ids'])

## Sample

In [4]:
pd.DataFrame.from_records(analysis_data['sample'])

Unnamed: 0,id,identifier,analysis_id,sample_type,date_deposited,affected,family_id,father_id,mother_id,sibling_id,proband,sex
0,2,brca_sample_1,2,HTS,1603104403434,True,,,,,True,


## Gene Panel : HB0C v01

## Genotypes

In [5]:
genotype_df = pd.DataFrame.from_records(analysis_data['genotype'])
genotype_df.head()

Unnamed: 0,id,allele_id,secondallele_id,sample_id,variant_quality,filter_status
0,3,3,,2,5000,PASS
1,4,4,,2,5000,PASS
2,5,5,,2,5000,PASS
3,6,6,,2,5000,PASS
4,7,7,,2,5000,PASS


## Alleles

In [6]:
allele_df = pd.DataFrame.from_records(analysis_data['allele'])
allele_df

Unnamed: 0,id,genome_reference,chromosome,start_position,open_end_position,change_from,change_to,change_type,vcf_pos,vcf_ref,vcf_alt
0,3,GRCh37,13,32890606,32890607,G,T,SNP,32890607,G,T
1,4,GRCh37,13,32890645,32890647,AC,,del,32890645,GAC,G
2,5,GRCh37,13,32890665,32890666,T,A,SNP,32890666,T,A
3,6,GRCh37,13,32893217,32893218,A,T,SNP,32893218,A,T
4,7,GRCh37,13,32893242,32893243,G,T,SNP,32893243,G,T
5,8,GRCh37,13,32893343,32893344,A,G,SNP,32893344,A,G


In [7]:
# just get the allele ids

#allele_df.id.isin(filtered_allele_ids)

## Transcripts

In [8]:
annotation_shadow_transcript_df =  pd.DataFrame.from_records(analysis_data['annotationshadowtranscript'])
annotation_shadow_transcript_df.head()

Unnamed: 0,id,allele_id,hgnc_id,symbol,transcript,hgvsc,protein,hgvsp,consequences,exon_distance,coding_region_distance
0,49,3,37116,ZAR1L,ENST00000345108,,,,[upstream_gene_variant],,
1,50,3,1101,BRCA2,ENST00000380152,c.10G>T,ENSP00000369497.3,p.Gly4Ter,[stop_gained],0.0,0.0
2,51,3,1101,BRCA2,ENST00000530893,c.-356G>T,,,[5_prime_UTR_variant],0.0,-356.0
3,52,3,37116,ZAR1L,ENST00000533490,,,,[upstream_gene_variant],,
4,53,3,1101,BRCA2,ENST00000544455,c.10G>T,ENSP00000439902.1,p.Gly4Ter,[stop_gained],0.0,0.0


## Gene Symbols

In [9]:
gene_symbols_df = annotation_shadow_transcript_df[['allele_id', 'hgnc_id', 'symbol']].copy()
gene_symbols_df = gene_symbols_df.drop_duplicates()
gene_symbols_df.head()

Unnamed: 0,allele_id,hgnc_id,symbol
0,3,37116,ZAR1L
1,3,1101,BRCA2
7,4,37116,ZAR1L
8,4,1101,BRCA2
14,5,37116,ZAR1L


## ACMG Assessments

In [10]:
# TODO Get Gene Symbol from transcripts
acmg_config_df = pd.DataFrame.from_records(acmg_config)
acmg_config_df.head()

Unnamed: 0,id,acmg_code,acmg_class,allele_id,codes
0,1,4,Likely Pathogenic,3,"PVS1, PPxPM2"
1,2,4,Likely Pathogenic,4,"PVS1, PPxPM2"
2,3,4,Likely Pathogenic,5,"PVS1, PPxPM2"
3,4,U,Unknown,6,
4,5,4,Likely Pathogenic,7,"PVS1, PPxPM2"


In [11]:
allele_data_df = acmg_config_df.join(gene_symbols_df.set_index('allele_id'), on='allele_id')
allele_data_df.join(allele_df.set_index('id'), on='allele_id')
allele_data_df.head()

Unnamed: 0,id,acmg_code,acmg_class,allele_id,codes,hgnc_id,symbol
0,1,4,Likely Pathogenic,3,"PVS1, PPxPM2",37116,ZAR1L
0,1,4,Likely Pathogenic,3,"PVS1, PPxPM2",1101,BRCA2
1,2,4,Likely Pathogenic,4,"PVS1, PPxPM2",37116,ZAR1L
1,2,4,Likely Pathogenic,4,"PVS1, PPxPM2",1101,BRCA2
2,3,4,Likely Pathogenic,5,"PVS1, PPxPM2",37116,ZAR1L


## Allele Report

These contain custom comments by the Genetic Counselor.

In [12]:
allele_report_df = pd.DataFrame.from_records(analysis_data['allelereport'])
allele_report_df.head()

Unnamed: 0,id,evaluation,user_id,date_created,date_superceeded,previous_report_id,allele_id,analysis_id,alleleassessment_id,usergroup_id
0,1,{'comment': ''},1,1603104584640,,,3,2,,1
1,2,{'comment': ''},1,1603104600778,,,4,2,,1
2,3,{'comment': ''},1,1603104614366,,,5,2,,1
3,4,{'comment': ''},1,1603104626999,,,6,2,,1
4,5,{'comment': ''},1,1603104638892,,,7,2,,1


## Genotype Sample Data

In [13]:
genotype_sample_data_report_df = pd.DataFrame.from_records(analysis_data['genotypesampledata'])
genotype_sample_data_report_df.head()

Unnamed: 0,id,genotype_id,secondallele,multiallelic,type,sample_id,genotype_quality,sequencing_depth,genotype_likelihood,allele_depth,allele_ratio
0,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",0.427807
1,4,4,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (GAC)': 107}",0.427807
2,5,5,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'A': 80, 'REF (T)': 107}",0.427807
3,6,6,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (A)': 107}",0.427807
4,7,7,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",0.427807


## Generate a Merged table with ALL THE ALLELE THINGS

Obviously,throwing everything into a single table is not suitable for very large datasets. Then we want to use something smarter like Xarray + Dask or proper SQL queries.

In [14]:
genotype_data_df = pd.merge(genotype_sample_data_report_df, genotype_df, left_on=['genotype_id','sample_id'], right_on=['id', 'sample_id'])
genotype_data_df.head()

Unnamed: 0,id_x,genotype_id,secondallele,multiallelic,type,sample_id,genotype_quality,sequencing_depth,genotype_likelihood,allele_depth,allele_ratio,id_y,allele_id,secondallele_id,variant_quality,filter_status
0,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",0.427807,3,3,,5000,PASS
1,4,4,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (GAC)': 107}",0.427807,4,4,,5000,PASS
2,5,5,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'A': 80, 'REF (T)': 107}",0.427807,5,5,,5000,PASS
3,6,6,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (A)': 107}",0.427807,6,6,,5000,PASS
4,7,7,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",0.427807,7,7,,5000,PASS


In [15]:
analysis_data_merged_report = pd.merge(genotype_data_df, allele_data_df, left_on='allele_id', right_on='allele_id')
analysis_data_merged_report.head()

Unnamed: 0,id_x,genotype_id,secondallele,multiallelic,type,sample_id,genotype_quality,sequencing_depth,genotype_likelihood,allele_depth,...,allele_id,secondallele_id,variant_quality,filter_status,id,acmg_code,acmg_class,codes,hgnc_id,symbol
0,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,3,,5000,PASS,1,4,Likely Pathogenic,"PVS1, PPxPM2",37116,ZAR1L
1,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,3,,5000,PASS,1,4,Likely Pathogenic,"PVS1, PPxPM2",1101,BRCA2
2,4,4,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (GAC)': 107}",...,4,,5000,PASS,2,4,Likely Pathogenic,"PVS1, PPxPM2",37116,ZAR1L
3,4,4,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (GAC)': 107}",...,4,,5000,PASS,2,4,Likely Pathogenic,"PVS1, PPxPM2",1101,BRCA2
4,5,5,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'A': 80, 'REF (T)': 107}",...,5,,5000,PASS,3,4,Likely Pathogenic,"PVS1, PPxPM2",37116,ZAR1L


In [16]:
analysis_data_merged_report = pd.merge(analysis_data_merged_report, annotation_shadow_transcript_df, left_on='allele_id', right_on='allele_id')
analysis_data_merged_report.head()

Unnamed: 0,id_x,genotype_id,secondallele,multiallelic,type,sample_id,genotype_quality,sequencing_depth,genotype_likelihood,allele_depth,...,id_y,hgnc_id_y,symbol_y,transcript,hgvsc,protein,hgvsp,consequences,exon_distance,coding_region_distance
0,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,49,37116,ZAR1L,ENST00000345108,,,,[upstream_gene_variant],,
1,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,50,1101,BRCA2,ENST00000380152,c.10G>T,ENSP00000369497.3,p.Gly4Ter,[stop_gained],0.0,0.0
2,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,51,1101,BRCA2,ENST00000530893,c.-356G>T,,,[5_prime_UTR_variant],0.0,-356.0
3,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,52,37116,ZAR1L,ENST00000533490,,,,[upstream_gene_variant],,
4,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,53,1101,BRCA2,ENST00000544455,c.10G>T,ENSP00000439902.1,p.Gly4Ter,[stop_gained],0.0,0.0


In [17]:
allele_assessment_df = pd.DataFrame.from_records(analysis_data['alleleassessment']) 
allele_assessment_df.head()

Unnamed: 0,id,classification,evaluation,user_id,date_created,date_superceeded,previous_assessment_id,allele_id,genepanel_name,genepanel_version,analysis_id,annotation_id,custom_annotation_id,usergroup_id,acmg_class
0,1,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,1603104584630,,,3,HBOCUTV,v01,2,3,,1,Likely Pathogenic
1,2,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,1603104600770,,,4,HBOCUTV,v01,2,4,,1,Likely Pathogenic
2,3,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,1603104614357,,,5,HBOCUTV,v01,2,5,,1,Likely Pathogenic
3,4,U,"{'acmg': {'included': [], 'suggested': [{'op':...",1,1603104626990,,,6,HBOCUTV,v01,2,6,,1,Unknown
4,5,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,1603104638883,,,7,HBOCUTV,v01,2,7,,1,Likely Pathogenic


In [18]:
allele_assessment_df.iloc[0]['classification']

'4'

In [19]:
analysis_data_merged_report = pd.merge(analysis_data_merged_report, allele_assessment_df, left_on='allele_id', right_on='allele_id')
analysis_data_merged_report

Unnamed: 0,id_x,genotype_id,secondallele,multiallelic,type,sample_id,genotype_quality,sequencing_depth,genotype_likelihood,allele_depth,...,date_created,date_superceeded,previous_assessment_id,genepanel_name,genepanel_version,analysis_id,annotation_id,custom_annotation_id,usergroup_id,acmg_class_y
0,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,1603104584630,,,HBOCUTV,v01,2,3,,1,Likely Pathogenic
1,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,1603104584630,,,HBOCUTV,v01,2,3,,1,Likely Pathogenic
2,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,1603104584630,,,HBOCUTV,v01,2,3,,1,Likely Pathogenic
3,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,1603104584630,,,HBOCUTV,v01,2,3,,1,Likely Pathogenic
4,3,3,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'T': 80, 'REF (G)': 107}",...,1603104584630,,,HBOCUTV,v01,2,3,,1,Likely Pathogenic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,8,8,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (A)': 107}",...,1603104647757,,,HBOCUTV,v01,2,8,,1,Unknown
68,8,8,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (A)': 107}",...,1603104647757,,,HBOCUTV,v01,2,8,,1,Unknown
69,8,8,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (A)': 107}",...,1603104647757,,,HBOCUTV,v01,2,8,,1,Unknown
70,8,8,False,False,Heterozygous,2,99,187,"[2048, 0, 2917]","{'G': 80, 'REF (A)': 107}",...,1603104647757,,,HBOCUTV,v01,2,8,,1,Unknown


This is a lot of data, so let's subset just a few to keep in our report.

Disclaimer: I am not a variant scientist. The examples given are meant to help to give an understanding of the data structures for people who *know what they are doing* to generate reports.

In [20]:
keep_these = ['c.10G>T', 'c.-172A>G']

subset_df = analysis_data_merged_report[analysis_data_merged_report["hgvsc"].isin(keep_these)]
#subset_df = subset_df[0:2]
subset_df.columns

Index(['id_x', 'genotype_id', 'secondallele', 'multiallelic', 'type',
       'sample_id', 'genotype_quality', 'sequencing_depth',
       'genotype_likelihood', 'allele_depth', 'allele_ratio', 'id_y',
       'allele_id', 'secondallele_id', 'variant_quality', 'filter_status',
       'id_x', 'acmg_code', 'acmg_class_x', 'codes', 'hgnc_id_x', 'symbol_x',
       'id_y', 'hgnc_id_y', 'symbol_y', 'transcript', 'hgvsc', 'protein',
       'hgvsp', 'consequences', 'exon_distance', 'coding_region_distance',
       'id', 'classification', 'evaluation', 'user_id', 'date_created',
       'date_superceeded', 'previous_assessment_id', 'genepanel_name',
       'genepanel_version', 'analysis_id', 'annotation_id',
       'custom_annotation_id', 'usergroup_id', 'acmg_class_y'],
      dtype='object')

In [21]:
df1 = subset_df[['symbol_y',  'hgvsc', 'type', 'acmg_class_y']]
df1 = df1.drop_duplicates()
df1


Unnamed: 0,symbol_y,hgvsc,type,acmg_class_y
1,BRCA2,c.10G>T,Heterozygous,Likely Pathogenic
63,BRCA2,c.-172A>G,Heterozygous,Unknown


In [22]:
df1.columns = ['Gene', 'Variant', 'Zygosity', 'Variant Classification']
df1

Unnamed: 0,Gene,Variant,Zygosity,Variant Classification
1,BRCA2,c.10G>T,Heterozygous,Likely Pathogenic
63,BRCA2,c.-172A>G,Heterozygous,Unknown


In [23]:
! pip install tabulate



In [24]:
from tabulate import tabulate
markdown_table_str = tabulate(df1, headers='keys', tablefmt='pipe', showindex=False)

# Create a Custom Report Template

This is just an example of how you could create a custom report. Every institute is going to have it's own custom reporting.

In [25]:
report_data = {
    "sample_name": "Sample-N",
    "patient_data": {
        "patient_first_name": "Todd",
        "patient_last_name": "Smith",
        "patient_dob": "12/11/1977",
        "patient_gender": "Male",
    },
    "sample_data": {
        "sample_type": "blood",
        "sample_collection_date": "12/9/2020",
        "sample_id": "1234567",
        "sample_accession": "1234567",
        "sample_accession_date": "12/9/2020",
    },
    "test_data": {
        "test_ordered": "BRCA2 sequencing & Deletion/Duplication analysis",
        "test_code": "ABCD",
        "test_indication": "Patient has family history of breast cancer.",
        "test_accession": "BRCA122345",
        "test_requesting_physician": "Dr. Who",
    },
    "report_data": {
        "report_date": "12/9/2020",
    },
    "methods": {
      "mlpa": """MLPA.""",
      "amplicon": """AMPLICON""",  
    },
    #"mlpa_analysis_data": None,
    #"ngs_analysis_data": None,
    #"analysis_data": df1,
    "markdown_table_str" : markdown_table_str,
}

with open('/data/mock_patient_data.json', 'w') as fp:
    json.dump(report_data, fp, sort_keys=True, indent=4)
    
report_data['analysis_data'] = df1

In [26]:
from jinja2 import Environment, BaseLoader


report_header_table = """
# Report

## Sample-1

---

|  |  |  |  |
| --- | --- | --- | --- |
| **Patient Name:** | {{ patient_data.patient_last_name }}, {{ patient_data.patient_first_name }} | **Ordered By:** | {{test_data.test_requesting_physician}} |
| **Date of Birth:** | {{patient_data.patient_dob}} | **Sample Collection Date:** | {{ sample_data.sample_collection_date }} | 
| **Gender**: | {{patient_data.patient_gender}} | **Sample Accession Date:** | {{ sample_data.sample_accession_date }} |
| **Sample Type:** | {{sample_data.sample_type}} | **Report Data:** | {{report_data.report_date}} |
| **Sample ID/MRN:** | {{ sample_data.sample_id }} | | |
| **Test Accession:** | {{ test_data.test_accession }} | | | 
| **Test Ordered:** | {{test_data.test_ordered}} | | | 
| **Test Code:** | {{ test_data.test_code }} | | |
| **Test Indictation** | {{ test_data.test_indication }} | | |

---

## Result: Postitive

---

{{ markdown_table_str }}

"""

rtemplate = Environment(loader=BaseLoader()).from_string(report_header_table)
rendered_report_header_table = rtemplate.render(**report_data)
print(rendered_report_header_table)



# Report

## Sample-1

---

|  |  |  |  |
| --- | --- | --- | --- |
| **Patient Name:** | Smith, Todd | **Ordered By:** | Dr. Who |
| **Date of Birth:** | 12/11/1977 | **Sample Collection Date:** | 12/9/2020 | 
| **Gender**: | Male | **Sample Accession Date:** | 12/9/2020 |
| **Sample Type:** | blood | **Report Data:** | 12/9/2020 |
| **Sample ID/MRN:** | 1234567 | | |
| **Test Accession:** | BRCA122345 | | | 
| **Test Ordered:** | BRCA2 sequencing & Deletion/Duplication analysis | | | 
| **Test Code:** | ABCD | | |
| **Test Indictation** | Patient has family history of breast cancer. | | |

---

## Result: Postitive

---

| Gene   | Variant   | Zygosity     | Variant Classification   |
|:-------|:----------|:-------------|:-------------------------|
| BRCA2  | c.10G>T   | Heterozygous | Likely Pathogenic        |
| BRCA2  | c.-172A>G | Heterozygous | Unknown                  |



In [27]:
# This doesn't work in the jupyterhub output
from IPython.display import display, Markdown, Latex, HTML
display(Markdown(rendered_report_header_table))



# Report

## Sample-1

---

|  |  |  |  |
| --- | --- | --- | --- |
| **Patient Name:** | Smith, Todd | **Ordered By:** | Dr. Who |
| **Date of Birth:** | 12/11/1977 | **Sample Collection Date:** | 12/9/2020 | 
| **Gender**: | Male | **Sample Accession Date:** | 12/9/2020 |
| **Sample Type:** | blood | **Report Data:** | 12/9/2020 |
| **Sample ID/MRN:** | 1234567 | | |
| **Test Accession:** | BRCA122345 | | | 
| **Test Ordered:** | BRCA2 sequencing & Deletion/Duplication analysis | | | 
| **Test Code:** | ABCD | | |
| **Test Indictation** | Patient has family history of breast cancer. | | |

---

## Result: Postitive

---

| Gene   | Variant   | Zygosity     | Variant Classification   |
|:-------|:----------|:-------------|:-------------------------|
| BRCA2  | c.10G>T   | Heterozygous | Likely Pathogenic        |
| BRCA2  | c.-172A>G | Heterozygous | Unknown                  |
