In [1]:
import os
import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy import inspect
import json
from pprint import pprint
import pandas as pd


In [2]:
# Show which database this notebook targets without leaking the password:
# the parsed URL object renders its password as *** when displayed.
sa.engine.url.make_url(os.environ['DB_URL'])

'postgresql://postgres:***@postgresql/postgres'

In [3]:
# Index the environment directly so a missing DB_URL fails right here with a
# clear KeyError, instead of handing None to create_engine.
engine = sa.create_engine(os.environ['DB_URL'])

In [4]:
# Load the `sql` IPython extension; it supplies the %sql / %%sql magics used by the query cells.
%load_ext sql

In [5]:
# Connect the %sql magic with the DB_URL environment variable rather than a
# hard-coded URL, so the database password is not stored in the notebook source.
# IPython evaluates the {...} expression before the magic receives the line.
%sql {os.environ['DB_URL']}

In [6]:
# Get column information
def get_column_info(tables, db_inspector=None):
    """Print each table name followed by its pretty-printed column metadata.

    Args:
        tables: Iterable of table names to describe.
        db_inspector: Optional object exposing ``get_columns(table_name)``.
            When omitted, the notebook-level ``inspector`` variable is used,
            so it must already be defined when the function is called.

    Returns:
        None. Everything is written to stdout.
    """
    # Resolve the inspector once, up front; an explicit argument wins so the
    # function no longer depends on a global that is created after it.
    active_inspector = inspector if db_inspector is None else db_inspector
    for table in tables:
        print(table)
        pprint(active_inspector.get_columns(table))

# `engine` was already created by the earlier `sa.create_engine(...)` cell from the
# same DB_URL value, so it is reused here instead of building a second, identical
# engine (and a second connection pool) that silently replaces the first.
# `db_uri` stays defined because other cells may still read it.
db_uri = os.environ.get('DB_URL')

inspector = inspect(engine)

# Get table information
tables = inspector.get_table_names()
pprint(tables)

['annotation',
 'gene',
 'allele',
 'transcript',
 'phenotype',
 'usergroup',
 'genepanel_transcript',
 'usergroupgenepanel',
 'user',
 'customannotation',
 'usersession',
 'useroldpassword',
 'attachment',
 'genotype',
 'alleleinterpretation',
 'reference',
 'interpretationstatehistory',
 'resourcelog',
 'alleleassessmentattachment',
 'alleleassessmentreferenceassessment',
 'alembic_version',
 'analysisinterpretation',
 'annotationjob',
 'genepanel',
 'genepanel_phenotype',
 'sample',
 'analysis',
 'broadcast',
 'genotypesampledata',
 'clilog',
 'usergroupfilterconfig',
 'jsonschema',
 'filterconfig',
 'alleleinterpretationsnapshot',
 'analysisinterpretationsnapshot',
 'interpretationlog',
 'alleleassessment',
 'allelereport',
 'referenceassessment',
 'geneassessment',
 'annotationshadowfrequency',
 'annotationshadowtranscript']


## Report Generation

Most of the time a report is generated for a particular sample. 

Sample Id -> Analysis Id -> List(Interpretation Ids) -> List(Allele Id)

In [7]:
%%sql

-- List every row of the sample table so the identifier and analysis_id pairs can be inspected.
SELECT * FROM sample;

 * postgresql://postgres:***@postgresql/postgres
7 rows affected.


id,identifier,analysis_id,sample_type,date_deposited,affected,family_id,father_id,mother_id,sibling_id,proband,sex
1,brca_allfiltered,1,HTS,2020-10-19 07:29:42.471284+00:00,True,,,,,True,
2,brca_sample_1,2,HTS,2020-10-19 07:29:46.227075+00:00,True,,,,,True,
4,brca_sample_3,5,HTS,2020-10-19 07:30:02.294520+00:00,True,,,,,True,
5,brca_sample_1,6,HTS,2020-10-19 07:30:05.715108+00:00,True,,,,,True,
6,brca_sample_master,7,HTS,2020-10-19 07:30:09.451317+00:00,True,,,,,True,
7,brca_sample_2,8,HTS,2020-10-19 07:30:12.751708+00:00,True,,,,,True,
8,brca_sample_master,9,HTS,2020-10-19 07:30:16.355499+00:00,True,,,,,True,


## Analysis brca_sample_1.HBOCUTV_v01

Since the same sample identifier can appear in more than one analysis, make sure you select both the correct sample record and the correct analysis. The UI needs filters for sample_name, sample_type, date_deposited, genepanel name, and/or genepanel version.

Generally, the sample name will be read in from a LIMS somewhere anyway.

In [8]:
# Reminder for the %sql queries in this section: quote string literals with
# single quotes ('...'), not double quotes.

# Container that collects each query result for the report.
report_data = dict()

In [9]:
# Fetch the sample row for this report. Both analysis_id and identifier are filtered
# because the same identifier occurs under more than one analysis_id in the sample table.
sample = %sql select * from sample where analysis_id = 2 AND identifier = 'brca_sample_1';
# Turn the %sql result set into a DataFrame and display it.
sample_df = sample.DataFrame()
sample_df

 * postgresql://postgres:***@postgresql/postgres
1 rows affected.


Unnamed: 0,id,identifier,analysis_id,sample_type,date_deposited,affected,family_id,father_id,mother_id,sibling_id,proband,sex
0,2,brca_sample_1,2,HTS,2020-10-19 07:29:46.227075+00:00,True,,,,,True,


In [11]:
# Store the sample rows in the report as a list of one dict per row.
report_data['sample'] = sample_df.to_dict(orient='records')

In [12]:
# Look up the analysis record by its name.
analysis = %sql SELECT * FROM analysis where name = 'brca_sample_1.HBOCUTV_v01';
# Turn the %sql result set into a DataFrame and display it.
analysis_df = analysis.DataFrame()
analysis_df

 * postgresql://postgres:***@postgresql/postgres
1 rows affected.


Unnamed: 0,id,name,genepanel_name,genepanel_version,warnings,report,date_deposited,properties,date_requested
0,2,brca_sample_1.HBOCUTV_v01,HBOCUTV,v01,,,2020-10-19 07:29:46.204898+00:00,,


In [13]:
# Store the analysis rows in the report as a list of one dict per row.
report_data['analysis'] = analysis_df.to_dict(orient='records')

In [14]:
# TODO: at some point this should only return interpretations of analyses that are complete.

# Fetch the interpretation rows that belong to analysis 2.
# NOTE(review): "interprtetation" in the DataFrame name below is a typo for
# "interpretation"; a later cell reads this variable, so rename every use together.
analysis_interpretation = %sql select * from analysisinterpretation where analysis_id = 2;
analysis_interprtetation_df = analysis_interpretation.DataFrame()
analysis_interprtetation_df

 * postgresql://postgres:***@postgresql/postgres
1 rows affected.


Unnamed: 0,id,genepanel_name,genepanel_version,user_state,state,status,date_last_update,date_created,analysis_id,user_id,finalized,workflow_status
0,2,HBOCUTV,v01,"{'allele': {'3': {'sections': {}, 'allele_id':...",{'allele': {'3': {'report': {'included': True}...,Ongoing,2020-10-19 08:01:33.706453+00:00,2020-10-19 07:29:46.217740+00:00,2,1,,Interpretation


In [15]:
# Store the interpretation rows in the report as a list of one dict per row.
report_data['analysis_interprtetation'] = analysis_interprtetation_df.to_dict(orient='records')

In [16]:
# Fetch the allele assessments recorded for analysis 2.
# NOTE(review): the query does not filter on date_superceeded; confirm whether
# superseded assessments should be left out of the report.
allele_assessment = %sql select * from alleleassessment where analysis_id = 2;
allele_assessment_df = allele_assessment.DataFrame()

# Store the assessment rows in the report as a list of one dict per row.
report_data['allele_assessment'] = allele_assessment_df.to_dict('records')

allele_assessment_df

 * postgresql://postgres:***@postgresql/postgres
6 rows affected.


Unnamed: 0,id,classification,evaluation,user_id,date_created,date_superceeded,previous_assessment_id,allele_id,genepanel_name,genepanel_version,analysis_id,annotation_id,custom_annotation_id,usergroup_id
0,1,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,2020-10-19 07:43:59.585285+00:00,,,3,HBOCUTV,v01,2,3,,1
1,2,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,2020-10-19 07:44:15.013829+00:00,,,4,HBOCUTV,v01,2,4,,1
2,3,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,2020-10-19 07:44:30.335239+00:00,,,5,HBOCUTV,v01,2,5,,1
3,4,U,"{'acmg': {'included': [], 'suggested': [{'op':...",1,2020-10-19 07:44:43.334923+00:00,,,6,HBOCUTV,v01,2,6,,1
4,5,4,"{'acmg': {'included': [{'op': None, 'code': 'P...",1,2020-10-19 07:44:58.945348+00:00,,,7,HBOCUTV,v01,2,7,,1
5,6,U,"{'acmg': {'included': [], 'suggested': [{'op':...",1,2020-10-19 07:45:11.324654+00:00,,,8,HBOCUTV,v01,2,8,,1


In [19]:
# Now let's grab the allele ids
allele_ids = allele_assessment_df['allele_id'].tolist()

# Use an "expanding" bind parameter for the IN clause: SQLAlchemy renders one
# placeholder per id at execution time, so the query no longer depends on the
# driver adapting a Python tuple, and an empty id list (SQLAlchemy 1.3+) does
# not produce the invalid SQL `IN ()`.
query = sa.text("""
    SELECT *
    FROM annotation
    WHERE allele_id IN :values
""").bindparams(sa.bindparam('values', value=allele_ids, expanding=True))

# Load the annotation rows for those alleles and display them.
allele_annotation_df = pd.read_sql(query, engine)
allele_annotation_df

Unnamed: 0,id,allele_id,annotations,previous_annotation_id,date_superceeded,date_created,schema_version
0,3,3,"{'external': {'HGMD': {'tag': 'DM', 'codon': 4...",,,2020-10-19 07:29:46.327638+00:00,1
1,4,4,"{'external': {'HGMD': {'tag': 'DM', 'codon': 1...",,,2020-10-19 07:29:46.327657+00:00,1
2,5,5,"{'external': {'HGMD': {'tag': 'DM', 'acc_num':...",,,2020-10-19 07:29:46.327665+00:00,1
3,6,6,"{'external': {'HGMD': {'tag': 'DM', 'codon': 2...",,,2020-10-19 07:29:46.327672+00:00,1
4,7,7,"{'external': {'HGMD': {'tag': 'DM', 'codon': 3...",,,2020-10-19 07:29:46.327678+00:00,1
5,8,8,"{'external': {'HGMD': {'tag': 'DM?', 'codon': ...",,,2020-10-19 07:29:46.327685+00:00,1


In [20]:
# Store the annotation rows in the report as a list of one dict per row.
report_data['allele_annotation'] = allele_annotation_df.to_dict(orient='records')

## Sample Analysis DataSet

The dataset from a sample analysis should look like:

In [22]:
# Print the top-level keys collected in report_data.
pprint(report_data.keys())

dict_keys(['sample', 'analysis', 'analysis_interprtetation', 'allele_assessment', 'allele_annotation'])


In [23]:
# Display the complete report_data structure (the output can be long).
report_data

{'sample': [{'id': 2,
   'identifier': 'brca_sample_1',
   'analysis_id': 2,
   'sample_type': 'HTS',
   'date_deposited': Timestamp('2020-10-19 07:29:46.227075+0000', tz='psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)'),
   'affected': True,
   'family_id': None,
   'father_id': None,
   'mother_id': None,
   'sibling_id': None,
   'proband': True,
   'sex': None}],
 'analysis': [{'id': 2,
   'name': 'brca_sample_1.HBOCUTV_v01',
   'genepanel_name': 'HBOCUTV',
   'genepanel_version': 'v01',
   'report': None,
   'date_deposited': Timestamp('2020-10-19 07:29:46.204898+0000', tz='psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)'),
   'properties': None,
   'date_requested': None}],
 'analysis_interprtetation': [{'id': 2,
   'genepanel_name': 'HBOCUTV',
   'genepanel_version': 'v01',
   'user_state': {'allele': {'3': {'sections': {},
      'allele_id': 3,
      'showExcludedReferences': False},
     '4': {'sections': {}, 'allele_id': 4, 'showExcludedReferences': False},
     '5