In [1]:
import os
from pprint import pprint
from importlib import import_module

from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload, subqueryload, Load, load_only
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.dialects import postgresql

from dataservice.extensions import db
from dataservice.utils import iterate_pairwise
from dataservice import create_app
from dataservice.api.investigator.models import Investigator
from dataservice.api.study.models import Study
from dataservice.api.participant.models import Participant, AliasGroup
from dataservice.api.biospecimen.models import Biospecimen
from dataservice.api.family.models import Family
from dataservice.api.family_relationship.models import FamilyRelationship
from dataservice.api.diagnosis.models import Diagnosis
from dataservice.api.outcome.models import Outcome
from dataservice.api.phenotype.models import Phenotype
from dataservice.api.genomic_file.models import GenomicFile
from dataservice.api.sequencing_experiment.models import SequencingExperiment
from dataservice.api.workflow.models import Workflow, WorkflowGenomicFile
from dataservice.api.study_file.models import StudyFile

from dataservice.util.data_import.utils import to_camel_case
from dataservice.util.data_import.etl.defaults import DEFAULT_ENTITY_TYPES

In [2]:
def setup():
    """
    Start a clean database for the notebook session.

    Builds the application with the 'testing' configuration, pushes its
    application context, and recreates the schema from scratch.

    :returns: the pushed application context, to be passed to teardown()
    """
    application = create_app('testing')
    # Uncomment to echo the emitted SQL:
    # application.config['SQLALCHEMY_ECHO'] = True
    ctx = application.app_context()
    ctx.push()
    # Drop before create so tables left by an earlier run do not linger
    db.drop_all()
    db.create_all()
    return ctx

def teardown(app_context):
    """
    Undo setup(): remove the session, drop all tables, pop the context.

    :param app_context: the application context returned by setup()
    """
    db.session.remove()
    db.drop_all()
    app_context.pop()

In [3]:
# Push a 'testing' app context and rebuild the schema; keep the handle so
# teardown() can pop it later.
app_context = setup()

In [4]:
# Create entities: one study, two families and four participants
study = Study(external_id='study_0')
f1, f2 = Family(external_id='f1'), Family(external_id='f2')
for i in range(4):
    p = Participant(external_id='p{}'.format(i), is_proband=True)
    # Even-indexed participants join f1, odd-indexed ones join f2
    (f1 if i % 2 == 0 else f2).participants.append(p)
    study.participants.append(p)
# NOTE(review): only the study is added explicitly; presumably participants
# and families reach the session through relationship cascades - confirm.
db.session.add(study)
db.session.commit()

In [5]:
# Delete participants from a family to orphan it.
# (Use external_id='f1' below to orphan the other family instead.)
orphan_target = Family.query.filter_by(external_id='f2').one()
for p in orphan_target.participants:
    db.session.delete(p)
db.session.commit()

In [None]:
# NOTE(review): teardown() drops every table and pops the app context, yet the
# cells below still use db.session - presumably this cell is meant to be run
# last (or setup() re-run afterwards); confirm the intended order.
teardown(app_context)

In [6]:
# Delete whichever family the query returns first, then commit the change
first_family = Family.query.first()
db.session.delete(first_family)
db.session.commit()

# ETL Demo

In [None]:
# Bulk-delete families that have no participants left. Recipe:
# https://bitbucket.org/zzzeek/sqlalchemy/wiki/UsageRecipes/ManyToManyOrphan
has_no_participants = ~Family.participants.any()
q = db.session.query(Family).filter(has_no_participants)
q.delete(synchronize_session='fetch')
# Debugging aids:
# print(q.statement.compile(dialect=postgresql.dialect()))
# q.count()

In [52]:
import os
import random

import pandas as pd

# Directory the demo CSV files are written to. Set the KF_DEMO_DATA_DIR
# environment variable to point somewhere else; the original location is kept
# as the default so existing runs behave as before.
DATA_DIR = os.environ.get(
    'KF_DEMO_DATA_DIR', '/Users/singhn4/Projects/kids_first/data/demo')

# Seed the module-level generator so randomly generated demo values repeat
# from one fresh run to the next.
SEED = 42
random.seed(SEED)

In [55]:
subjects = []
families = []
data = {}
family = {}
for i in range(100):
    # Subject i always points at FAM_<i>; a family record for that id is only
    # generated when i is a multiple of 3.
    fam_id = 'FAM_{}'.format(i)
    subject = {
        'subject_id': 'SUBJ_{}'.format(i),
        'family_id': fam_id,
        'is_proband': random.choice([True, False]),
    }
    if i % 3 == 0:
        family = {'family_id': fam_id, 'members': random.randint(2, 5)}
        families.append(family)
    subjects.append(subject)

# Single-row study table
studies = [{'study_name': 'Demo study', 'version': 1.0, 'external_id': 'demo_study_0'}]

# Build the three frames: study, subject, family
study_df = pd.DataFrame(studies)
subject_df = pd.DataFrame(subjects)
family_df = pd.DataFrame(families)

# Write each frame to DATA_DIR as CSV without the index column
for out_df, out_name in ((study_df, 'study.txt'),
                         (subject_df, 'subjects.txt'),
                         (family_df, 'families.txt')):
    out_df.to_csv(os.path.join(DATA_DIR, out_name), index=False)

In [33]:
# Preview the first rows of the study frame
study_df.head()

Unnamed: 0,external_id,study_name,version
0,demo_study_0,Demo study,1.0


In [34]:
# Row/column count, then summary statistics restricted to object-dtype columns
print(subject_df.shape)
subject_df.describe(include='O')

(100, 3)


Unnamed: 0,family_id,subject_id
count,100,100
unique,100,100
top,FAM_49,SUBJ_32
freq,1,1


In [35]:
# Row/column count, then summary statistics for the object-dtype columns.
# The bare family_df.head() that sat between these two lines was removed: its
# value was discarded, because only a cell's final expression is rendered.
print(family_df.shape)
family_df.describe(include='O')

(34, 2)


Unnamed: 0,family_id
count,34
unique,34
top,FAM_30
freq,1


In [36]:
# Inner-join subjects to families on their shared family_id column.
# NOTE(review): every subject carries its own FAM_<i>, but family rows are only
# generated for every third index, so this join keeps just those subjects -
# confirm that dropping the rest is intended.
participant_df = subject_df.merge(family_df, how='inner', on='family_id')

In [37]:
# Row/column count of the merged frame, then a preview of its first rows
print(participant_df.shape)
participant_df.head()

(34, 4)


Unnamed: 0,family_id,is_proband,subject_id,members
0,FAM_0,True,SUBJ_0,3
1,FAM_3,False,SUBJ_3,3
2,FAM_6,True,SUBJ_6,3
3,FAM_9,False,SUBJ_9,4
4,FAM_12,False,SUBJ_12,5


In [45]:
# Add study to participant: the external_id of the study row labelled 0 is
# broadcast to every participant row
participant_df['study_external_id'] = study_df.loc[0, 'external_id']

In [46]:
# Re-inspect the frame, which now carries the study_external_id column
participant_df.head()

Unnamed: 0,family_id,is_proband,subject_id,members,study_external_id
0,FAM_0,True,SUBJ_0,3,demo_study_0
1,FAM_3,False,SUBJ_3,3,demo_study_0
2,FAM_6,True,SUBJ_6,3,demo_study_0
3,FAM_9,False,SUBJ_9,4,demo_study_0
4,FAM_12,False,SUBJ_12,5,demo_study_0


In [42]:
# Look up the external_id of the study row labelled 0
study_df.loc[0, 'external_id']

'demo_study_0'