In [1]:
# if running for first time, you may need to install spacy model
# > python -m spacy download en_core_web_sm
import dotenv
dotenv.load_dotenv()


from pathlib import Path
from datetime import datetime
import pandas as pd
import pyreadr
import sqlalchemy as sa
import sqlalchemy.orm as so
from py_hemonc.matchers.spacy_config import matchers, nlp, rules, match_entities, get_modifier_child, get_nouns  
from py_hemonc.model.hemonc_model import Hemonc_Study, Hemonc_Condition, Hemonc_Component, Hemonc_Modality, Hemonc_Component_Class, Hemonc_Regimen, Hemonc_Variant, Hemonc_Regimen_Part, Part_Phase, Hemonc_Cycle_Sig, Hemonc_Sig, Sig_Days, Base, component_to_class_map, Hemonc_Ref, Hemonc_Component_Role, Hemonc_Context, Hemonc_Branch_Conditional
from py_hemonc.model.hemonc_enums import Intent, Setting, Phase, Risk, Phenotype, PriorTherapy, Phase, StudyDesign, SponsorType, BranchConditionalType, ComponentRole

oa_configurator.config - DEBUG - Application config path set: /Users/georginakennedy/cloudstor/CBDRH/ACDN/py_hemonc/oa_system_config.yaml
oa_configurator.config - DEBUG - Log path set: /Users/georginakennedy/cloudstor/CBDRH/ACDN/py_hemonc/py_hemonc/logs
oa_configurator.config - DEBUG - DB connection string: sqlite:////Users/georginakennedy/cloudstor/CBDRH/ACDN/py_hemonc/py_hemonc/data/dash.db


In [2]:
# show which .env file python-dotenv resolves, to confirm the expected environment file is picked up
dotenv.find_dotenv()

'/Users/georginakennedy/cloudstor/CBDRH/ACDN/py_hemonc/.env'

In [3]:
# display the kernel's working directory; the relative TABLE_PATH / CDM_PATH used below resolve against it
import os
os.getcwd()

'/Users/georginakennedy/cloudstor/CBDRH/ACDN/py_hemonc/hemonc_sig_updates/notebooks'

In [4]:
# locations of the input data, relative to the notebook's working directory
# TABLE_PATH: HemOnc table exports (csv / RData) read in the next cell
# CDM_PATH: tab-delimited vocabulary files (CONCEPT.csv etc.) read further down


TABLE_PATH = Path('../..') / 'Hemonc KB' / 'Tables'
CDM_PATH = Path('..') / 'py_hemonc' / 'db' 

In [5]:
# columns kept from the concept_stage / concept_relationship_stage frames stored in omop.RData
concept_columns = ['concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code','valid_start_date', 'valid_end_date', 'invalid_reason']
concept_relationship_columns = ['concept_code_1', 'concept_code_2', 'vocabulary_id_1', 'vocabulary_id_2', 'relationship_id', 'valid_start_date', 'valid_end_date', 'invalid_reason']

sigs = pd.read_csv(TABLE_PATH / 'sigs.csv')
# pyreadr.read_r returns the R objects in the file as DataFrames keyed by object name
omop = pyreadr.read_r(TABLE_PATH / 'omop.RData')
# .copy() so later column assignments do not write back into the frames held in `omop`
concept_stage = omop['concept_stage'][concept_columns].copy()
concept_relationship_stage = omop['concept_relationship_stage'][concept_relationship_columns].copy()
concept_synonym_stage = omop['concept_synonym_stage'].copy()

# using RData files where possible, to avoid versioning issues
context = pyreadr.read_r(TABLE_PATH / 'context.table.RData')['context.table']
page = pyreadr.read_r(TABLE_PATH / 'page.table.RData')['page.table']
pointer = pyreadr.read_r(TABLE_PATH / 'pointer.table.RData')['pointer.table']
ref = pyreadr.read_r(TABLE_PATH / 'ref.table.RData')['ref.table']
sequence = pyreadr.read_r(TABLE_PATH / 'sequence.table.RData')['sequence.table']
study = pyreadr.read_r(TABLE_PATH / 'study.table.RData')['study.table']
# the three RData reads below are disabled in favour of the csv conversions that follow
#author = pyreadr.read_r(TABLE_PATH / 'author.table.RData')['author.table']
#person = pyreadr.read_r(TABLE_PATH / 'person.table.RData')['person.table']
#variant = pyreadr.read_r(TABLE_PATH / 'variant.table.RData')['variant.table']

# following files had non-utf characters, requiring R-based conversion to csv
# ('Unnamed: 0' is the unnamed first column of the csv export and is dropped)
author = pd.read_csv(TABLE_PATH / 'author.table.csv').drop(columns=['Unnamed: 0'])
person = pd.read_csv(TABLE_PATH / 'person.table.csv', encoding="ISO-8859-1").drop(columns=['Unnamed: 0'])
variant = pd.read_csv(TABLE_PATH / 'variant.table.csv', encoding="ISO-8859-1").drop(columns=['Unnamed: 0'])


# distinct (regimen, regimen_cui) pairs, skipping pointer rows whose CUI is still the 'NOT YET ASSIGNED' placeholder
regimen = pointer[pointer.regimen_cui!='NOT YET ASSIGNED'][['regimen', 'regimen_cui']].drop_duplicates().copy()

In [7]:
# concept table carrying concept_id values; the file is tab-delimited despite its .csv extension
concepts_with_ids = pd.read_csv(CDM_PATH / 'CONCEPT.csv', delimiter='\t', low_memory=False)

In [38]:
# relationship and ancestor tables from the same vocabulary folder (also tab-delimited)
concept_relationships = pd.read_csv(CDM_PATH / 'CONCEPT_RELATIONSHIP.csv', delimiter='\t', low_memory=False)
concept_ancestors = pd.read_csv(CDM_PATH / 'CONCEPT_ANCESTOR.csv', delimiter='\t', low_memory=False)

In [8]:
# percentage of distinct studies whose `start` contains 'NR', after subtracting 265 from that count
# NOTE(review): 265 is an unexplained magic number — document what it represents or derive it from the data
100*(study[study.start.str.contains('NR')].study.nunique() - 265)/study.study.nunique()

1.364522417153996

In [9]:
# earlier approach (local sqlite file), kept for reference:
# engine = sa.create_engine(f"sqlite:///{DATABASE_OUT_PATH}/ho.db")
# Base.metadata.create_all(engine)

import omop_alchemy as oa

# use the engine exposed by omop_alchemy's configuration and create the tables declared on oa.Base
engine = oa.oa_config.engine
oa.Base.metadata.create_all(engine)

In [10]:
# peek at the first rows of the concept table to check columns and formats
concepts_with_ids.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,1146945,concept.concept_id,Metadata,CDM,Field,S,CDM1,20141111,20991231,
1,1146954,concept.invalid_reason,Metadata,CDM,Field,S,CDM10,20141111,20991231,
2,1147044,observation_period.observation_period_id,Metadata,CDM,Field,S,CDM100,20141111,20991231,
3,756315,metadata.metadata_type_concept_id,Metadata,CDM,Field,S,CDM1000,20210925,20991231,
4,756316,metadata.name,Metadata,CDM,Field,S,CDM1001,20210925,20991231,


In [142]:
# export the Component Class concepts whose invalid_reason is the empty string to component_class.csv
# NOTE(review): concept_stage_id is only created in a cell further down this notebook, so this cell
# fails on Restart & Run All — it should be moved below that definition
concept_stage_id[(concept_stage_id.concept_class_id=='Component Class')&(concept_stage_id.invalid_reason=='')].to_csv('component_class.csv', index=False)

In [11]:
# HemOnc concept classes excluded whenever staging concepts are compared with the concept table
# (presumably because they have no OMOP counterpart — TODO confirm)
non_omop_class = ['Author', 'Reference', 'Study', 'PubMedURL', 'ReferenceDOI', 'Clinical trial ID', 'Regimen Variant', 
                  'ReferenceURL', 'PubMedCentralURL', 'City', 'Cycle Sigs', 'Regimen Stub', 'Study Group', 'Journal', 
                  'Endpoint', 'Duration', 'Year', 'Numeric', 'Endpoint Type', 'Experimental design', 'Study Class']

# left join keeps every remaining staging concept; concept_id is NaN where no concept row shares the code
# NOTE(review): the join key is concept_code alone, so a code can also match rows from other vocabularies
cc = concept_stage[~concept_stage.concept_class_id.isin(non_omop_class)].merge(concepts_with_ids, on='concept_code', how='left')

In [131]:
# fraction of joined rows that did not receive a concept_id
len(cc[cc.concept_id.isna()])/len(cc)

0.111341059602649

In [12]:
# which current HemOnc concepts are missing IDs from Athena?
# (concept_class_id_x is the staging-side column; the merge suffixed it because both frames carry concept_class_id)

cc[cc.concept_id.isna()].concept_class_id_x.value_counts()

concept_class_id_x
Regimen            945
Regimen Class      413
Brand Name         137
Component           57
Component Class     39
Unit                12
Condition            8
Component Role       3
Name: count, dtype: int64

In [13]:
# strip hyphens from NDC target codes before matching; every other vocabulary keeps its code untouched
# NOTE: this overwrites concept_code_2 in place, so the original hyphenated codes are lost once the cell has run
concept_relationship_stage.concept_code_2 = concept_relationship_stage.apply(lambda x: x.concept_code_2.replace('-', '') if x.vocabulary_id_2 == "NDC" else x.concept_code_2, axis=1)
# look up a concept row for every non-HemOnc relationship target on (code, vocabulary); concept_id stays NaN without a match
# NOTE(review): vocabulary_id_2 labels such as 'ICD-10-CM' are spelled differently from the concept table's
# vocabulary_id (e.g. 'ICD10CM'), so those rows cannot match here
cr = concept_relationship_stage[concept_relationship_stage.vocabulary_id_2 != 'HemOnc'].merge(concepts_with_ids, left_on=['concept_code_2', 'vocabulary_id_2'], right_on=['concept_code', 'vocabulary_id'], how='left')

In [14]:
# which non-HemOnc vocabularies need to be available for this to actually extend the CDM?
# (counts relationship rows per target vocabulary)
concept_relationship_stage.vocabulary_id_2.value_counts()

vocabulary_id_2
HemOnc              387300
RxNorm               17642
NDC                  14588
ICD-10-CM             1437
ICD-9-CM              1030
NCIT                   718
RxNorm Extension       645
HCPCS                  461
ICD-O-3                246
SEER Site Recode       164
OncoTree               140
Name: count, dtype: int64

In [15]:
# check whether any NDC concept_code in the concept table contains a hyphen
concepts_with_ids[(concepts_with_ids.vocabulary_id=='NDC')&(concepts_with_ids.concept_code.str.contains('-'))]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason


In [16]:
# inspect the staging relationship rows whose target vocabulary is NDC
concept_relationship_stage[concept_relationship_stage.vocabulary_id_2=='NDC']

Unnamed: 0_level_0,concept_code_1,concept_code_2,vocabulary_id_1,vocabulary_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1100235,4,00024483,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
285916,4,00024815,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
310209,4,00025337,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
410201,4,00026216,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
510173,6,00931125,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
...,...,...,...,...,...,...,...,...
1296910,93818,50242103,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1297010,93818,50242105,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396710,94609,25682022,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396810,94609,25682025,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,


In [17]:
# number of concept rows per vocabulary in the concept table
concepts_with_ids.vocabulary_id.value_counts()

vocabulary_id
RxNorm Extension        2145325
NDC                     1195711
RxNorm                   308709
OSM                      203339
ICD10CM                   99130
ICDO3                     64471
NAACCR                    34473
ICD9CM                    17564
HCPCS                     11948
HemOnc                     8028
Cancer Modifier            6043
OMOP Extension             1444
UCUM                       1120
CDM                        1060
OncoTree                    885
Relationship                718
Concept Class               423
UB04 Typ bill               298
SOPT                        168
Vocabulary                  148
Condition Type              118
Procedure Type               97
Type Concept                 80
Domain                       65
UB04 Pt dis status           55
Cost                         51
Observation Type             29
UB04 Point of Origin         23
Condition Status             22
Visit                        19
Visit Type                

In [18]:
# not sure how much this matters, but there seem to be some issues with the HemOnc mappings to NDC:
# the codes are a mish-mash, with hyphens dropped and leading 0s removed
# (counts relationship targets that found no concept row, per target vocabulary)
cr[cr.concept_id.isna()].vocabulary_id_2.value_counts()

vocabulary_id_2
NDC                 10615
ICD-10-CM            1437
ICD-9-CM             1030
NCIT                  718
ICD-O-3               246
SEER Site Recode      164
HCPCS                   4
RxNorm                  1
Name: count, dtype: int64

In [19]:
# attach concept_id / concept_name from the concept table to the staging concepts, matching on code and vocabulary;
# both frames have concept_name, so the result carries concept_name_x (staging) and concept_name_y (concept table)
code_mods = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name']
    ], 
    on=['concept_code', 'vocabulary_id'],
    how='left'
)

In [20]:
# there are some HemOnc components whose concept_code now carries a different name — is this backwards compatible?
# (rows that matched a concept row, are not in an excluded class, and whose two names differ)
code_mods[(code_mods.concept_name_x != code_mods.concept_name_y) & ~code_mods.concept_class_id.isin(non_omop_class) & code_mods.concept_name_y.notna()]

Unnamed: 0,concept_name_x,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id,concept_name_y
0,Eluvixtamab,drug,HemOnc,Component,,1,2019-05-27,2099-12-31,,35802855.0,AMG 330
55,BHQ-880,drug,HemOnc,Component,,57,2019-05-27,2099-12-31,,35802910.0,BHQ880
57,BCG vaccine,drug,HemOnc,Component,,59,2019-05-27,2099-12-31,,35802912.0,Bacillus Calmette-Guerin
208,Andexanet alfa,drug,HemOnc,Component,,211,2019-05-27,2099-12-31,,35803063.0,Factor Xa recombinant inactivated-zhzo
226,Leucovorin,drug,HemOnc,Component,,229,2019-05-27,2099-12-31,,35803081.0,Folinic acid
...,...,...,...,...,...,...,...,...,...,...,...
54790,Docetaxel and Ziv-aflibercept,regimen,HemOnc,Regimen,S,76409,2019-05-27,2099-12-31,,905812.0,Docetaxel and ziv-Aflibercept
60769,HPV Oropharyngeal cancer,condition,HemOnc,Condition,,83479,2019-07-22,2099-12-31,,42542448.0,HPV-positive Oropharyngeal cancer
67099,Nystatin,drug,HemOnc,Component,,91138,2019-07-22,2099-12-31,,35101066.0,Mycostatin
67791,Mycostatin,drug,HemOnc,Brand Name,,91912,2019-07-22,2099-12-31,,35101023.0,Nystatin


In [50]:
# vocabulary_id spellings used by the concept table, for comparison with the staging labels mapped in the next cell
concepts_with_ids.vocabulary_id.value_counts()

vocabulary_id
RxNorm Extension        2145325
NDC                     1195711
RxNorm                   308709
OSM                      203339
ICD10CM                   99130
ICDO3                     64471
NAACCR                    34473
ICD9CM                    17564
HCPCS                     11948
HemOnc                     8028
Cancer Modifier            6043
OMOP Extension             1444
UCUM                       1120
CDM                        1060
OncoTree                    885
Relationship                718
Concept Class               423
UB04 Typ bill               298
SOPT                        168
Vocabulary                  148
Condition Type              118
Procedure Type               97
Type Concept                 80
Domain                       65
UB04 Pt dis status           55
Cost                         51
Observation Type             29
UB04 Point of Origin         23
Condition Status             22
Visit                        19
Visit Type                

In [51]:
# staging vocabulary label -> vocabulary_id spelling used by the concept table
vocab_id_lookup = dict([
    ('ICD-10-CM', 'ICD10CM'),
    ('ICD-9-CM', 'ICD9CM'),
    ('ICD-O-3', 'ICDO3'),
])

# normalised copy of vocabulary_id_2; labels without an entry in the lookup pass through unchanged
concept_relationship_stage['vocabulary_id'] = concept_relationship_stage.vocabulary_id_2.map(
    lambda label: vocab_id_lookup.get(label, label)
)

In [52]:
# confirm the normalised vocabulary labels
concept_relationship_stage.vocabulary_id.value_counts()

vocabulary_id
HemOnc              387300
RxNorm               17642
NDC                  14588
ICD10CM               1437
ICD9CM                1030
NCIT                   718
RxNorm Extension       645
HCPCS                  461
ICDO3                  246
SEER Site Recode       164
OncoTree               140
Name: count, dtype: int64

In [54]:
# staging concepts with concept_id attached. No `on=` is given, so pandas joins on every shared column:
# concept_code, vocabulary_id AND concept_name.
concept_stage_id = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name'] # merging on concept_name shouldn't be required, but (see the code_mods check above) some HemOnc concept codes have changed over time
    ], 
    how='left'
)

# resolve the source side of each relationship: the rename makes the shared columns
# concept_code_1 / vocabulary_id_1, and concept_id arrives as concept_id_1
concept_relationship_stage_id = concept_relationship_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_1', 
                 'concept_id': 'concept_id_1', 
                 'vocabulary_id': 'vocabulary_id_1'}
        ), 
    how='left'
)

# resolve the target side: vocabulary_id is deliberately left un-renamed so the join uses the
# normalised `vocabulary_id` column on concept_relationship_stage together with concept_code_2
concept_relationship_stage_id = concept_relationship_stage_id.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_id': 'concept_id_2'}
        ), 
    how='left'
)

In [21]:
# reference tables from the vocabulary folder (tab-delimited)
vocabularies = pd.read_csv(CDM_PATH / 'VOCABULARY.csv', delimiter='\t', low_memory=False)
concept_class = pd.read_csv(CDM_PATH / 'CONCEPT_CLASS.csv', delimiter='\t', low_memory=False)
domain = pd.read_csv(CDM_PATH / 'DOMAIN.csv', delimiter='\t', low_memory=False)
relationship = pd.read_csv(CDM_PATH / 'RELATIONSHIP.csv', delimiter='\t', low_memory=False)

In [22]:
# Join each reference table to the concept row that represents it (via its *_concept_id column).
# The column filter drops the concept-side column that would collide with the reference table's own key.
vocab_concepts = vocabularies.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'vocabulary_id']], 
    left_on='vocabulary_concept_id', 
    right_on='concept_id'
)

# NOTE(review): the concept table shown above has no relationship_id column, so this filter removes nothing
rel_concepts = relationship.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'relationship_id']], 
    left_on='relationship_concept_id', 
    right_on='concept_id'
)

domain_concepts = domain.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'domain_id']], 
    left_on='domain_concept_id', 
    right_on='concept_id'
)

concept_class_concepts = concept_class.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'concept_class_id']], 
    left_on='concept_class_concept_id', 
    right_on='concept_id'
)

In [32]:
from omop_alchemy.model.vocabulary import Concept, Vocabulary, Domain, Concept_Class, Relationship, Concept_Relationship, Concept_Ancestor

def make_concept(concept_row):
    """Build a `Concept` ORM object from one row of the concept table.

    `concept_id` is cast to int, missing `standard_concept` / `invalid_reason`
    values become None, and the validity dates are parsed from their
    `%Y%m%d` representation.
    """
    def _none_if_missing(value):
        # pandas missing markers are stored as NULL
        return None if pd.isna(value) else value

    def _parse_day(raw):
        # the dates arrive as e.g. 20141111
        return datetime.strptime(str(raw), '%Y%m%d')

    return Concept(
        concept_id=int(concept_row.concept_id),
        concept_name=concept_row.concept_name,
        domain_id=concept_row.domain_id,
        vocabulary_id=concept_row.vocabulary_id,
        concept_class_id=concept_row.concept_class_id,
        standard_concept=_none_if_missing(concept_row.standard_concept),
        concept_code=concept_row.concept_code,
        valid_start_date=_parse_day(concept_row.valid_start_date),
        valid_end_date=_parse_day(concept_row.valid_end_date),
        invalid_reason=_none_if_missing(concept_row.invalid_reason),
    )

def make_vocab(vocab_row):
    """Build a `Vocabulary` ORM object from a row and link it to the row's `concept_object`."""
    columns = {
        'vocabulary_id': vocab_row.vocabulary_id,
        'vocabulary_name': vocab_row.vocabulary_name,
        'vocabulary_reference': vocab_row.vocabulary_reference,
        'vocabulary_concept_id': vocab_row.vocabulary_concept_id,
        'vocabulary_version': vocab_row.vocabulary_version,
    }
    vocabulary = Vocabulary(**columns)
    # attach the pre-built Concept object rather than relying on the id alone
    vocabulary.vocabulary_concept = vocab_row.concept_object
    return vocabulary
    
def make_domain(domain_row):
    """Build a `Domain` ORM object from a row and link it to the row's `concept_object`."""
    columns = {
        'domain_id': domain_row.domain_id,
        'domain_name': domain_row.domain_name,
        'domain_concept_id': domain_row.domain_concept_id,
    }
    domain_obj = Domain(**columns)
    # attach the pre-built Concept object rather than relying on the id alone
    domain_obj.domain_concept = domain_row.concept_object
    return domain_obj

def make_concept_class(cc_row):
    """Build a `Concept_Class` ORM object from a row and link it to the row's `concept_object`."""
    columns = {
        'concept_class_id': cc_row.concept_class_id,
        'concept_class_name': cc_row.concept_class_name,
        'concept_class_concept_id': cc_row.concept_class_concept_id,
    }
    concept_class_obj = Concept_Class(**columns)
    # attach the pre-built Concept object rather than relying on the id alone
    concept_class_obj.concept_class_concept = cc_row.concept_object
    return concept_class_obj


def make_rel(r_row):
    """Build a `Relationship` ORM object from a row and link it to the row's `concept_object`."""
    columns = {
        'relationship_id': r_row.relationship_id,
        'relationship_name': r_row.relationship_name,
        'is_hierarchical': r_row.is_hierarchical,
        'defines_ancestry': r_row.defines_ancestry,
        'reverse_relationship_id': r_row.reverse_relationship_id,
        'relationship_concept_id': r_row.relationship_concept_id,
    }
    relationship_obj = Relationship(**columns)
    # attach the pre-built Concept object rather than relying on the id alone
    relationship_obj.relationship_concept = r_row.concept_object
    return relationship_obj

In [None]:
# preserve referential integrity by adding all the concept class, vocabulary, domain and relationship concepts in one go
meta_concepts = pd.concat([concept_class_concepts.concept_id, domain_concepts.concept_id, vocab_concepts.concept_id, rel_concepts.concept_id])
# inner join on the shared concept_id column keeps only the concept rows referenced by a reference table
meta_concepts_df = concepts_with_ids.merge(meta_concepts)
meta_concepts_df['concept_object'] = meta_concepts_df.apply(make_concept, axis=1)


# attach the Concept object to each reference row. Both sides carry a concept_id column, so pandas
# suffixes them to concept_id_x / concept_id_y.
# NOTE(review): this overwrites the four frames, so the cell is not idempotent — re-running it without
# first re-running the cell that builds them will not reproduce the same columns
concept_class_concepts = concept_class_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='concept_class_concept_id', right_on='concept_id')
domain_concepts = domain_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='domain_concept_id', right_on='concept_id')
vocab_concepts = vocab_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='vocabulary_concept_id', right_on='concept_id')
rel_concepts = rel_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='relationship_concept_id', right_on='concept_id')

In [None]:
# TODO: find out why one of the vocabularies has a null vocabulary_id in the Athena export;
# until then the placeholder 'empty' is substituted for it
vocab_concepts.vocabulary_id = vocab_concepts.vocabulary_id.fillna('empty') 

In [86]:
# snapshot of what is already stored in the database: distinct vocabulary ids and concept ids of Concept rows
with so.Session(engine) as sess:
    existing_vocabs = pd.DataFrame(sess.query(Concept.vocabulary_id).distinct().all())
    existing_concepts = pd.DataFrame(sess.query(Concept.concept_id).distinct().all())

In [114]:
def make_ancestor(a_row):
    """Build a `Concept_Ancestor` ORM object from an ancestor/descendant id pair (both cast to int)."""
    ancestor_id = int(a_row.ancestor_concept_id)
    descendant_id = int(a_row.descendant_concept_id)
    # the levels of separation are currently not populated:
    #   min_levels_of_separation = a_row.min_levels_of_separation,
    #   max_levels_of_separation = a_row.max_levels_of_separation
    return Concept_Ancestor(ancestor_concept_id=ancestor_id,
                            descendant_concept_id=descendant_id)
                    

In [119]:
# persist every concept of the 'Cancer Modifier' vocabulary
with so.Session(engine) as sess:
    sess.add_all(concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'].apply(make_concept, axis=1))
    sess.commit()

In [None]:
# create metadata concepts & make objects for other reference tables

# one ORM object per reference-table row, stored in an 'ob' column
concept_class_concepts['ob'] = concept_class_concepts.apply(make_concept_class, axis=1)
domain_concepts['ob'] = domain_concepts.apply(make_domain, axis=1)
vocab_concepts['ob'] = vocab_concepts.apply(make_vocab, axis=1)
rel_concepts['ob'] = rel_concepts.apply(make_rel, axis=1)

# concept_id_x only exists after the earlier cell that re-merges these frames with meta_concepts_df
all_ob = pd.concat([concept_class_concepts[['concept_id_x', 'ob']], 
                    domain_concepts[['concept_id_x', 'ob']], 
                    vocab_concepts[['concept_id_x', 'ob']], 
                    rel_concepts[['concept_id_x', 'ob']]])

In [None]:

# persist the reference-table objects and their metadata concepts in a single commit
with so.Session(engine) as sess:
    sess.add_all(list(concept_class_concepts.ob))
    sess.add_all(list(domain_concepts.ob))
    sess.add_all(list(vocab_concepts.ob))
    sess.add_all(list(rel_concepts.ob))
    sess.add_all(list(meta_concepts_df.concept_object))
    sess.commit()

In [None]:
# staging concepts per concept class, leaving out the excluded classes and the four metadata vocabularies
# (the commented-out condition would additionally drop concepts already stored in the database)
concept_stage_id[~concept_stage_id.concept_class_id.isin(non_omop_class) & 
                 ~concept_stage_id.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship'])  
#                 &~concept_stage_id.concept_id.isin(existing_concepts.concept_id.unique())
                 ].concept_class_id.value_counts()

In [129]:
# check whether the concept table contains any 'Gender' vocabulary rows
len(concepts_with_ids[concepts_with_ids.vocabulary_id=='Gender'])

0

In [65]:
# full concept rows for every distinct relationship target (code + normalised vocabulary);
# the default inner join drops targets that have no row in the concept table
concepts_to_add = concept_relationship_stage_id[
    ['concept_code_2', 'vocabulary_id']
    ].drop_duplicates().merge(
        concepts_with_ids, 
        left_on=['concept_code_2', 'vocabulary_id'], 
        right_on=['concept_code', 'vocabulary_id']
    )

In [66]:
# how many relationship-target concepts are not stored in the database yet
len(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())])

1880

In [68]:
#missed = concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].apply(make_concept, axis=1)

In [None]:
# Concept objects for every HemOnc concept in the concept table ...
ho_conc = concepts_with_ids[concepts_with_ids.vocabulary_id=='HemOnc'].apply(make_concept, axis=1)
# ... and for relationship targets from other vocabularies; the metadata vocabularies and HemOnc are
# excluded here (presumably because they are inserted separately — TODO confirm)
non_ho_conc = concepts_to_add[~concepts_to_add.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship', 'HemOnc'])].apply(make_concept, axis=1)

In [69]:
# persist the HemOnc concepts and the non-HemOnc relationship targets in one commit
with so.Session(engine) as sess:
    sess.add_all(ho_conc)
    sess.add_all(non_ho_conc)
    sess.commit()

In [35]:
def make_relationship(rel_row):
    """Build a `Concept_Relationship` ORM object from the two concept ids and the relationship id of a row."""
    link = {
        'concept_id_1': rel_row.concept_id_1,
        'concept_id_2': rel_row.concept_id_2,
        'relationship_id': rel_row.relationship_id,
    }
    return Concept_Relationship(**link)


# relationship objects for rows where both ends resolved to a concept_id,
# de-duplicated on the (concept_id_1, concept_id_2, relationship_id) triple
ho_rels = concept_relationship_stage_id[
    concept_relationship_stage_id.concept_id_1.notna() &
    concept_relationship_stage_id.concept_id_2.notna()
    ].drop_duplicates(
        subset=['concept_id_1', 'concept_id_2', 'relationship_id']
        ).apply(make_relationship, axis=1)

In [74]:
# fully resolved relationships whose relationship_id contains 'map' (case-insensitive) and whose target
# concept is not stored in the database yet, counted per original target vocabulary label
concept_relationship_stage_id[
    concept_relationship_stage_id.concept_id_1.notna() &
    concept_relationship_stage_id.concept_id_2.notna() & 
    concept_relationship_stage_id.concept_id_2.isin(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].concept_id.unique()) & 
    concept_relationship_stage_id.relationship_id.str.contains('map', case=False)
    ].vocabulary_id_2.value_counts()

vocabulary_id_2
ICD-10-CM    1414
ICD-9-CM      876
ICD-O-3       246
Name: count, dtype: int64

In [77]:
# missed_rels = concept_relationship_stage_id[
#     concept_relationship_stage_id.concept_id_1.notna() &
#     concept_relationship_stage_id.concept_id_2.notna() & 
#     concept_relationship_stage_id.concept_id_2.isin(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].concept_id.unique()) & 
#     concept_relationship_stage_id.relationship_id.str.contains('map', case=False)
#     ].drop_duplicates(
#         subset=['concept_id_1', 'concept_id_2', 'relationship_id']
#         ).apply(make_relationship, axis=1)

In [115]:

# ancestor rows whose ancestor AND descendant concept are both already stored in the database
# (each merge is an inner join on the renamed id column)
# NOTE(review): the next cell assigns `ancestors` again, so this result is replaced before it is persisted
ancestors = concept_ancestors.merge(
    existing_concepts[
        ['concept_id']
        ].rename(columns={'concept_id': 'ancestor_concept_id'})
    ).merge(
        existing_concepts[
            ['concept_id']
        ].rename(columns={'concept_id': 'descendant_concept_id'})
    ).apply(make_ancestor, axis=1)  

In [127]:
# ancestor rows whose ancestor AND descendant concept both belong to the 'Cancer Modifier' vocabulary
# (each merge is an inner join on the renamed id column)
ancestors = concept_ancestors.merge(
    concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'][
        ['concept_id']
        ].rename(columns={'concept_id': 'ancestor_concept_id'})
    ).merge(
        concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'][
            ['concept_id']
        ].rename(columns={'concept_id': 'descendant_concept_id'})
    ).apply(make_ancestor, axis=1)  

In [128]:
# persist whichever `ancestors` series was built most recently
with so.Session(engine) as sess:
    sess.add_all(list(ancestors))
    sess.commit()

In [84]:
# persist the resolved HemOnc relationships
with so.Session(engine) as sess:
    sess.add_all(ho_rels)
    #sess.add_all(missed_rels)
    sess.commit()

In [None]:
# for vocabulary, vocab_concepts in concepts_with_ids.groupby('vocabulary_id'):
#     if vocabulary not in ['Concept Class', 'Domain', 'Vocabulary', 'Relationship'] + [e[0] for e in existing_vocabs]:
#         # should not have to filter on non-null concept names here?
#         vocab_concept_objects = vocab_concepts[~vocab_concepts.concept_name.isna()].apply(make_concept, axis=1)
#         print(vocabulary, len(vocab_concept_objects))
#         with so.Session(engine) as sess:
#             sess.add_all(vocab_concept_objects)
#             sess.commit()

In [None]:
# due to misalignment with the current Athena version, we have to accept gaps in concept_id coverage,
# even for concept classes that should have ids

concept_stage_id[concept_stage_id.concept_id.isna() & ~concept_stage_id.concept_class_id.isin(non_omop_class)].head()

In [None]:
# helper mapping data / merges within OMOP concepts

# 'Is a' relationships as (code, id) pairs for both ends
is_a = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Is a'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']]


# every Brand Name concept, with the 'Has brand name' relationship that points at it (matched on
# concept_code == concept_code_2); the left join keeps brand names that have no such relationship
brand_mappings = concept_stage_id[
    concept_stage_id.concept_class_id=='Brand Name'
    ].merge(
        concept_relationship_stage_id[
            concept_relationship_stage_id.relationship_id=='Has brand name'
        ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']], 
        left_on='concept_code', 
        right_on='concept_code_2', 
        how='left'
    )

# concepts of class Component or Procedure
component_df = concept_stage_id[
    concept_stage_id.concept_class_id.isin(['Component', 'Procedure'])
    ].copy()

# Component Class concepts, renamed so they can serve as the target (_2) side of a relationship join;
# note that concept_id keeps its original name here
component_class_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Component Class'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'concept_class_name'}
    )

# 'Has modality' relationships as (code, id) pairs for both ends.
# The target id column is concept_id_2, matching the is_a / brand_mappings selections; selecting
# concept_id_1 twice dropped the target id and produced a frame with a duplicated column label.
has_modality = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Has modality'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']]

# Modality concepts, renamed so they can serve as the target (_2) side of a relationship join
modality_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Modality'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'modality_name', 
                 'concept_id': 'concept_id_2'}
        )

In [None]:
# hand-maintained code-to-code lookup for routes
# NOTE(review): keys look like HemOnc concept codes and values like codes from an external vocabulary
# (possibly SNOMED) — the source and target vocabularies are not stated here; please document them
route_mappings = {'44954': '26643006', 
                  '44957': '47625008',
                  '44979': '37161004',
                  '44994': '6064005',
                  '45080': '78421000',
                  '45153': '34206005',
                  '45215': '372471009',
                  '45273': '420254004',
                  '45426': '372466002',
                  '45531': '46713006',
                  '45574': '72607000',
                  '45684': '58100008',
                  '45939': '447694001'}

In [None]:
# helper data cleaning / transformation functions

def get_enum(e, s):
    """Look up a member of enum `e` by a free-text label `s`.

    The label is lower-cased and stripped, and '-', ' ' and '/' are replaced
    by '_' before it is used as the member name.

    Returns the matching member, or None when the label is not a string
    (e.g. NaN / None) or no member has that name.
    """
    try:
        return e[s.lower().strip().replace('-', '_').replace(' ', '_').replace('/', '_')]
    except Exception:
        # best-effort lookup: unknown or missing labels map to None. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        return None

def get_date(y, m, d):
    """Build a calendar date from year / month / day values.

    Each part is coerced with int(), so numeric strings and whole floats are accepted.

    Returns a `datetime.date`, or None when any part is missing, non-numeric,
    or the three parts do not form a valid date.
    """
    try:
        # `datetime` is the class imported via `from datetime import datetime`; calling
        # `datetime.date(y, m, d)` on it raises TypeError, so the original always fell
        # through to None (and had no `return` on the success path either).
        return datetime(int(y), int(m), int(d)).date()
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
# functions to take dataframe rows and return database objects

def make_context(context_row):
    """Build a `Hemonc_Context` ORM object from one row of the context table.

    The categorical fields are converted to their enum members with `get_enum`,
    which yields None for missing or unrecognised labels.
    """
    return Hemonc_Context(context_code = context_row.contextRaw,
                          context_name = context_row.contextPretty,
                          intent = get_enum(Intent, context_row.intent),
                          # each enum is read from its own column; previously setting, risk and
                          # phenotype were all looked up from `intent` (copy-paste slip)
                          # NOTE(review): column names are assumed to mirror the model fields — confirm against context.table
                          setting = get_enum(Setting, context_row.setting),
                          risk_stratification = get_enum(Risk, context_row.risk_stratification),
                          phenotype = get_enum(Phenotype, context_row.phenotype),
                          prior_therapy = get_enum(PriorTherapy, context_row.prior_therapy),
                          date_added = context_row.date_added)


def make_study(study_row):
    """Build a `Hemonc_Study` ORM object from one row of the study table.

    Enrollment dates are assembled from the from_/to_ year, month and day columns
    via `get_date`; missing values in the optional fields are stored as None.
    """
    def _none_if_missing(value):
        # pandas missing markers are stored as NULL
        return None if pd.isna(value) else value

    enrolment_start = get_date(study_row.from_year, study_row.from_month, study_row.from_day)
    enrolment_end = get_date(study_row.to_year, study_row.to_month, study_row.to_day)

    study_fields = dict(
        study_code=study_row.study,
        registry=study_row.registry,
        trial_id=study_row.trial_id,
        condition_code=study_row.concept_code,
        enrollment_from=_none_if_missing(enrolment_start),
        enrollment_to=_none_if_missing(enrolment_end),
        phase=study_row.phase,
        study_design=get_enum(StudyDesign, study_row.study_design),
        study_design_imputed=study_row.study_design_imputed,
        sact=_none_if_missing(study_row.sact),
        protocol=study_row.protocol,
        fda_reg_study=study_row.fda_reg_study,
        fda_unreg_study=study_row.fda_unreg_study,
        start=_none_if_missing(study_row.start),
        end=_none_if_missing(study_row.end),
        study_group=study_row.study_group,
        sponsor=study_row.sponsor,
        date_added=_none_if_missing(study_row.date_added),
        date_modified=_none_if_missing(study_row.date_last_modified),
    )
    return Hemonc_Study(**study_fields)

def make_regimen(regimen_row):
    """Build a `Hemonc_Regimen` ORM object from a row's regimen CUI and name."""
    cui = regimen_row.regimen_cui
    name = regimen_row.regimen
    return Hemonc_Regimen(regimen_cui=cui, regimen_name=name)

def make_condition(condition_row):
    """Build a `Hemonc_Condition` ORM object; the concept id is an int, or None when missing."""
    concept_id = None
    if not pd.isna(condition_row.concept_id):
        concept_id = int(condition_row.concept_id)
    return Hemonc_Condition(
        condition_code=condition_row.concept_code,
        condition_name=condition_row.concept_name,
        condition_concept_id=concept_id,
    )


def make_component(component_class_row):
    """Build a `Hemonc_Component` ORM object; the concept id is an int, or None when missing."""
    concept_id = None
    if not pd.isna(component_class_row.concept_id):
        concept_id = int(component_class_row.concept_id)
    return Hemonc_Component(
        component_code=component_class_row.concept_code,
        component_name=component_class_row.concept_name,
        component_concept_id=concept_id,
    )

def make_component_role(role_row):
    """Build a `Hemonc_Component_Role` ORM object linking a regimen to a component or a component class.

    `concept_code_2` is stored as `component_code` when the row's `component_class` is
    'Component' or 'Procedure', and as `component_class_code` when it is 'Component Class';
    the field that does not apply is left as None.
    """
    component_code = None
    component_class_code = None
    if role_row.component_class in ('Component', 'Procedure'):
        component_code = role_row.concept_code_2
    if role_row.component_class == 'Component Class':
        component_class_code = role_row.concept_code_2
    return Hemonc_Component_Role(
        regimen_cui=role_row.concept_code_1,
        component_code=component_code,
        component_class_code=component_class_code,
        relationship_id=role_row.relationship_id,
    )


def make_component_class(component_class_row):
    """Build a `Hemonc_Component_Class` ORM object; the concept id is an int, or None when missing.

    Reads the renamed columns `concept_code_2` and `concept_class_name`.
    """
    concept_id = None
    if not pd.isna(component_class_row.concept_id):
        concept_id = int(component_class_row.concept_id)
    return Hemonc_Component_Class(
        component_class_code=component_class_row.concept_code_2,
        component_class_name=component_class_row.concept_class_name,
        component_class_concept_id=concept_id,
    )

def make_branch(branch_row):
    """Build a `Hemonc_Branch_Conditional` ORM object from a branch-rule row.

    A missing RULE_TYPE falls back to `BranchConditionalType.other`.
    """
    rule_type = branch_row.RULE_TYPE
    if pd.isna(rule_type):
        rule_type = BranchConditionalType.other
    return Hemonc_Branch_Conditional(
        branch_name=branch_row.original,
        branch_type=rule_type,
        numeric_min=branch_row.MIN_NUM,
        numeric_max=branch_row.MAX_NUM,
        value=branch_row.RULE_VALUE,
    )

def make_variant(variant_row):
    """Build a Hemonc_Variant from a row carrying variant_cui, variant and regimen_cui."""
    variant_fields = {
        'variant_cui': variant_row.variant_cui,
        'variant_name': variant_row.variant,
        'regimen_cui': variant_row.regimen_cui,
    }
    return Hemonc_Variant(**variant_fields)


def make_reg_part(reg_part_row):
    """Build a Hemonc_Regimen_Part from one regimen-part row.

    ``portion`` is converted to None when missing; every other field
    (including ``cyclesigs``, stored as ``cycle_sig_id``) is passed through as-is.
    """
    raw_portion = reg_part_row.portion
    portion = None if pd.isna(raw_portion) else raw_portion

    return Hemonc_Regimen_Part(
        variant_cui=reg_part_row.variant_cui,
        regimen_part_id=reg_part_row.regimen_part_id,
        portion=portion,
        cycle_sig_id=reg_part_row.cyclesigs,
        timing=reg_part_row.timing,
        timing_unit=reg_part_row.timing_unit,
    )

def assign_part_phase(reg_part_row):
    """Build one Part_Phase per phase listed on a regimen-part row.

    ``reg_part_row.phase`` is a '|'-delimited string; each element is passed
    to ``get_enum(Phase, ...)`` on its own. Previously the whole delimited
    string was passed for every element, so a multi-phase row produced
    identical objects built from the unsplit value.

    Returns:
        A list of Part_Phase objects, or None when the row has no phase.
    """
    if pd.isna(reg_part_row.phase):
        return None

    return [
        Part_Phase(
            regimen_part_id=reg_part_row.regimen_part_id,
            variant_cui=reg_part_row.variant_cui,
            # resolve each individual phase label, not the full delimited string
            phase=get_enum(Phase, phase),
        )
        for phase in reg_part_row.phase.split('|')
    ]

def make_cycle_sig(cs_row):
    """Build a Hemonc_Cycle_Sig from one parsed cycle-sig row.

    Duration / frequency / repeat fields are passed through unchanged. The
    cycle-length fields are normalised to None when missing. "Missing" covers
    both NaN and the empty string: the cycle-sig frame in this notebook is
    built with ``fillna('')``, so a NaN-only check never fires and ``''``
    would be stored instead of None.

    ``residual`` is the space-joined text of the tokens in ``cs_row.cs_residual``.
    """
    def _or_none(value):
        # pd.isna handles NaN/None; '' is what fillna('') leaves behind
        if pd.isna(value) or value == '':
            return None
        return value

    return Hemonc_Cycle_Sig(cycle_sig_id = cs_row.cyclesigs,
                            duration_min = cs_row.DUR_MIN,
                            duration_max = cs_row.DUR_MAX,
                            duration_units = cs_row.DUR_UNITS,
                            frequency_min = cs_row.FREQ_MIN,
                            frequency_max = cs_row.FREQ_MAX,
                            frequency_units = cs_row.FREQ_UNITS,
                            repeats_min = cs_row.REP_MIN,
                            repeats_max = cs_row.REP_MAX,
                            repeats_units = cs_row.REP_UNITS,
                            cycle_len_min = _or_none(cs_row.cycle_length_lb),
                            cycle_len_max = _or_none(cs_row.cycle_length_ub),
                            cycle_len_units = _or_none(cs_row.cycle_length_unit),
                            residual = ' '.join([t.text for t in cs_row.cs_residual]))


def make_modality(mod_row):
    """Build a Hemonc_Modality from one modality row.

    Reads ``concept_code_2``, ``modality_name`` and ``concept_id_2``.
    A missing ``concept_id_2`` becomes None; otherwise it is cast to int.
    """
    raw_concept_id = mod_row.concept_id_2
    concept_id = None if pd.isna(raw_concept_id) else int(raw_concept_id)
    return Hemonc_Modality(
        modality_code=mod_row.concept_code_2,
        modality_name=mod_row.modality_name,
        modality_concept_id=concept_id,
    )

def make_ref(ref_row):
    """Build a Hemonc_Ref from one reference row.

    ``doi`` and ``url`` are stored as None when they are the empty string;
    every other field is passed through unchanged. ``update`` is read with
    item access rather than attribute access.
    """
    doi = None if ref_row.doi == "" else ref_row.doi
    url = None if ref_row.url == "" else ref_row.url

    return Hemonc_Ref(
        reference=ref_row.reference,
        condition_code=ref_row.concept_code,
        pmid=ref_row.pmid,
        study=ref_row.study,
        title=ref_row.title,
        pmcid=ref_row.pmcid,
        doi=doi,
        url=url,
        journal=ref_row.journal,
        biblio=ref_row.biblio,
        pub_date=ref_row.pub_date,
        order=ref_row.order,
        update=ref_row['update'],
        ref_type=ref_row.ref_type,
    )

def make_sig(sig_row):
    """Build a Hemonc_Sig from one sig row.

    Identifier and descriptive fields are passed through unchanged
    (``component_role`` goes through ``get_enum(ComponentRole, ...)``).
    The dose / duration / frequency / sequence fields are converted to None
    when missing.
    """
    def _present(value):
        return None if pd.isna(value) else value

    # fields whose keyword name matches the row attribute and that need NaN -> None
    nullable_attrs = (
        'doseMinNum', 'doseMaxNum', 'doseUnit',
        'doseCapNum', 'doseCapUnit',
        'durationMinNum', 'durationMaxNum', 'durationUnit',
        'frequency', 'inParens', 'sequence',
    )
    nullable_fields = {attr: _present(getattr(sig_row, attr)) for attr in nullable_attrs}

    return Hemonc_Sig(
        regimen_part_id=sig_row.regimen_part_id,
        variant_cui=sig_row.variant_cui,
        sig_id=sig_row.sig_id,
        component_code=sig_row.concept_code,
        component_name=sig_row.component,
        component_role=get_enum(ComponentRole, sig_row.component_role),
        step_number=sig_row.step_number,
        component_class=sig_row['class'],
        tail=sig_row['tail'],
        route=sig_row.route,
        seq_rel=_present(sig_row['seq.rel']),
        seq_rel_what=_present(sig_row['seq.rel.what']),
        **nullable_fields,
    )


def make_sig_days(sd_row):
    """Build one Sig_Days per comma-separated entry in ``sd_row.allDays``.

    A missing ``allDays`` is treated as the single day '0'. Each day is passed
    on exactly as split (a string, not stripped or cast).
    """
    all_days = sd_row.allDays
    day_text = '0' if pd.isna(all_days) else all_days

    day_objects = []
    for day in day_text.split(','):
        day_objects.append(
            Sig_Days(
                regimen_part_id=sd_row.regimen_part_id,
                variant_cui=sd_row.variant_cui,
                sig_id=sd_row.sig_id,
                day=day,
            )
        )
    return day_objects


# spaCy matcher helper functions

# spelled-out small numbers recognised by safe_num (matched case-insensitively)
small_num_lookup = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

def safe_num(tok):
    """Best-effort conversion of a token's text to a number.

    Tries, in order: ``int(text)``, ``float(text)``, then a case-insensitive
    lookup of spelled-out numbers in ``small_num_lookup``.

    Args:
        tok: any object with a ``text`` attribute (e.g. a spaCy token).

    Returns:
        An int or float, or None when the text cannot be interpreted.
    """
    text = getattr(tok, 'text', None)

    # only conversion failures are caught, so KeyboardInterrupt and unrelated
    # errors are no longer swallowed as they were by the bare excepts
    for parse in (int, float):
        try:
            return parse(text)
        except (TypeError, ValueError):
            pass

    if isinstance(text, str):
        return small_num_lookup.get(text.lower())
    return None

def safe_age(numeric):
    """Return the numeric value of the first AGE-flagged token, or None if there is none.

    Every token in ``numeric`` whose ``_.AGE`` extension is truthy is converted
    with ``safe_num``; the first converted value is returned (it may itself be
    None if that token's text was not numeric).
    """
    age_values = []
    for tok in numeric:
        if tok._.AGE:
            age_values.append(safe_num(tok))

    if age_values:
        return age_values[0]
    return None

def get_full_stopping_condition(doc, conditions):
    """Return the slice of ``doc`` from the first condition's index ``i`` to the end.

    Returns None when ``conditions`` is empty.
    """
    if len(conditions) == 0:
        return None
    first_condition = conditions[0]
    return doc[first_condition.i:]

def get_units(tokens):
    """Return the lemma of the first token flagged ``_.TIMING_UNIT``, or None.

    The "no unit found" case is handled explicitly instead of via a bare
    ``except`` around an index error, so genuine errors (for example a missing
    token extension) are no longer silently turned into None.
    """
    return next((t.lemma_ for t in tokens if t._.TIMING_UNIT), None)

def get_min(group):
    """Return the smallest numeric value among the tokens in ``group``, or None.

    Each token is converted with ``safe_num``; tokens that do not convert
    (None) are ignored. A value of 0 is kept: the previous truthiness filter
    discarded it along with None. Returns None when no token converts.
    """
    values = [v for v in (safe_num(t) for t in group) if v is not None]
    # default=None covers the empty case that was previously caught by a bare except
    return min(values, default=None)

def get_max(group):
    """Return the largest numeric value among the tokens in ``group``.

    Returns -1 when any token is flagged ``_.GTE``, and None when no token
    converts to a number. Tokens are converted with ``safe_num``; None results
    are ignored, but 0 is kept (the previous truthiness filter discarded it).
    """
    if any(t._.GTE for t in group):
        return -1
    values = [v for v in (safe_num(t) for t in group) if v is not None]
    # default=None covers the empty case that was previously caught by a bare except
    return max(values, default=None)


In [None]:
# relationship ids selected from the relationship table below
role = [
    'Has cytotoxic chemo', 'Has targeted therapy', 'Has supportive med',
    'Has steroid tx', 'Has local therapy', 'Has immunosuppressor',
    'Has immunotherapy', 'Has endocrine tx', 'Has radiotherapy',
    'Has growth factor', 'Has AB-drug cjgt', 'Has radioconjugate',
    'Has antineoplastic', 'Has anticoag tx', 'Has pept-drug cjgt',
]

# role relationships, with the name/class of concept 1 (labelled regimen_*)
# and of concept 2 (labelled component_*) joined on from concept_stage_id
concept_reg_roles = (
    concept_relationship_stage_id
    .loc[
        concept_relationship_stage_id.relationship_id.isin(role),
        ['concept_code_1', 'concept_id_1', 'relationship_id', 'concept_code_2', 'concept_id_2'],
    ]
    .merge(
        concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']].rename(
            columns={'concept_code': 'concept_code_1',
                     'concept_name': 'regimen_name',
                     'concept_class_id': 'regimen_class'}
        ),
        on='concept_code_1',
    )
    .merge(
        concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']].rename(
            columns={'concept_code': 'concept_code_2',
                     'concept_name': 'component_name',
                     'concept_class_id': 'component_class'}
        ),
        on='concept_code_2',
    )
)

In [None]:
# preview the first five role relationships
concept_reg_roles.head(n=5)

In [None]:
# one Hemonc_Component_Role object per relationship row
concept_reg_roles['role_object'] = concept_reg_roles.apply(make_component_role, axis='columns')

In [None]:
# condition objects come straight from the 'Condition' concepts
conditions = (
    concept_stage_id
    .loc[concept_stage_id['concept_class_id'] == 'Condition']
    .apply(make_condition, axis='columns')
)
# component-class objects are kept alongside their source rows
component_class_df['component_class_object'] = component_class_df.apply(
    make_component_class, axis='columns'
)

In [None]:
component_df['component_object'] = component_df.apply(make_component, axis='columns')

# component -> (is_a) -> component class; the second merge joins on whatever
# columns the two frames share
components_with_class = (
    component_df
    .merge(is_a, left_on='concept_code', right_on='concept_code_1')
    .merge(component_class_df)
    .rename(columns={'concept_code_2': 'concept_class_code'})
)


In [None]:
# preview only: avoid dumping the whole component frame into the cell output
component_df.head()

In [None]:
modality_df['modality_object'] = modality_df.apply(make_modality, axis='columns')

# concepts with an empty invalid_reason, joined to their modality rows
regimens_with_modality = (
    concept_stage_id
    .loc[concept_stage_id['invalid_reason'] == '']
    .merge(has_modality, left_on='concept_code', right_on='concept_code_1')
    .merge(modality_df)
    .rename(columns={'concept_code_2': 'concept_class_code'})
    .drop_duplicates()
)


In [None]:
# rows that repeat an earlier (concept_name, modality_name) pair
regimens_with_modality.loc[
    regimens_with_modality.duplicated(subset=['concept_name', 'modality_name'])
]

In [None]:
# attach each component class to its component object; a plain loop is used
# because this is done purely for its side effect (apply would build and
# render a throwaway Series of None)
for component_obj, class_obj in zip(components_with_class.component_object,
                                    components_with_class.component_class_object):
    component_obj.component_classes.append(class_obj)

In [None]:
# attach the condition concept code to each reference (left join on condition name)
ref = ref.merge(
    concept_stage.loc[concept_stage['concept_class_id'] == 'Condition', ['concept_name', 'concept_code']],
    left_on='condition',
    right_on='concept_name',
    how='left',
)
ref['pub_date'] = pd.to_datetime(ref['pub.date'])
# only references whose condition resolved to a concept code become objects
ref_obj = ref[ref.concept_code.notna()].apply(make_ref, axis='columns')

In [None]:
# TODO: confirm what a single cui with more than one title means - is it a synonym?

# keep the first row seen for each regimen_cui
regimen_dedup = regimen.drop_duplicates(subset=['regimen_cui']).copy()

In [None]:
# parse date_added, then build one context object per row
context['date_added'] = pd.to_datetime(context['date_added'])
contexts = context.apply(make_context, axis='columns')

In [None]:
# prepare study objects

# unparseable dates become NaT (errors='coerce')
study['date_added'] = pd.to_datetime(study['date_added'], format='mixed', errors='coerce')
study['date_last_modified'] = pd.to_datetime(study['date_last_modified'], format='mixed', errors='coerce')

# enrollment is split on 'to' into its two halves, each split again on '-'
# into year / month / day parts; missing parts are filled with '01'
enrol_dates = study.enrollment.str.split('to', expand=True)
enrol_from = (
    enrol_dates[0].str.strip().str.split('-', expand=True)
    .rename(columns={0: 'from_year', 1: 'from_month', 2: 'from_day'})
    .fillna('01')
)
enrol_to = (
    enrol_dates[1].str.strip().str.split('-', expand=True)
    .rename(columns={0: 'to_year', 1: 'to_month', 2: 'to_day'})
    .fillna('01')
)
study = pd.concat([study, enrol_from, enrol_to], axis=1)

# attach the condition concept code (left join on condition name)
study = study.merge(
    concept_stage.loc[concept_stage['concept_class_id'] == 'Condition', ['concept_name', 'concept_code']],
    left_on='condition',
    right_on='concept_name',
    how='left',
)
study['start'] = pd.to_datetime(study['start'], format='mixed', errors='coerce')
study['end'] = pd.to_datetime(study['end'], format='mixed', errors='coerce')

In [None]:
# build the ORM objects row by row and keep them next to their source rows
study['study_object'] = study.apply(make_study, axis='columns')
regimen_dedup['regimen_object'] = regimen_dedup.apply(make_regimen, axis='columns')

In [None]:
# pair each regimen with its modality objects (matched on regimen name).
# NOTE(review): default inner join - regimens with no modality row drop out here,
# and regimens with several modalities appear once per modality; confirm intended.
regimen_dedup = regimen_dedup.merge(
    regimens_with_modality[['concept_name', 'modality_object']].rename(
        columns={'concept_name': 'regimen'}
    )
)

In [None]:
# attach each modality to its regimen object; a plain loop is used because
# this is done purely for its side effect (apply would build and render a
# throwaway Series of None)
for regimen_obj, modality_obj in zip(regimen_dedup.regimen_object,
                                     regimen_dedup.modality_object):
    regimen_obj.modalities.append(modality_obj)

In [None]:
# persist the reference-data objects in a single transaction
with so.Session(engine) as sess:
    for batch in (
        conditions,
        list(component_class_df.component_class_object),
        list(component_df.component_object),
        contexts,
        list(study.study_object),
        list(modality_df.modality_object),
        list(regimen_dedup.regimen_object),
        list(ref_obj),
        list(concept_reg_roles.role_object),
    ):
        sess.add_all(batch)
    sess.commit()

In [None]:
# we are not going to pull in out of date variant versions
# NOTE(review): assumes version == 1 marks the version to keep - confirm.
# The key list is de-duplicated so that a variant_cui appearing on several
# variant rows cannot multiply the sig rows in this filtering join.
sig_vars = sigs.merge(variant.loc[variant.version == 1, ['variant_cui']].drop_duplicates())

In [None]:
# exploration: stop at the first (regimen, component) group that has more than
# one row among sigs with no allDays whose step_number does not contain '1 of';
# r, c and dets are left bound to that group for inspection
for (r, c), dets in sig_vars.loc[
    ~sig_vars.step_number.str.contains('1 of') & sig_vars.allDays.isna()
].groupby(['regimen', 'component']):
    if dets.shape[0] > 1:
        break

In [None]:
# parsing out branch conditions with spacy matchers

branch_crit = sig_vars.branch.value_counts().reset_index()

all_branches = []

# each branch string is split on 'AND ' and then 'OR ' into atomic conditions
for crit in branch_crit.branch.unique():
    combined_branches = crit.split('AND ')
    for br in combined_branches:
        all_branches += [b.lower().strip() for b in br.split('OR ')]

# de-duplicate; sorted so that row order (and therefore the order in which
# branch objects are later created and persisted) is the same on every run -
# a bare set -> list conversion depends on string hash randomisation
all_branches = sorted(set(all_branches))

branch_details = pd.DataFrame({'original': all_branches, 'doc': [nlp(b) for b in all_branches]})

# for every rule label: run its matcher over each doc, then collect the tokens
# flagged with that label and record whether any were found
for label, config in rules.items():
    branch_details.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    branch_details[label] = branch_details.doc.apply(lambda d: [t for t in d if getattr(t._, label)])
    branch_details[f'has_{label}'] = (branch_details[label].apply(len)>0)

branch_details['MODIFIER_HEAD'] = branch_details.doc.apply(get_modifier_child)
branch_details['NOUNS'] = branch_details.doc.apply(get_nouns)
branch_details['NUMERIC'] = branch_details.doc.map(lambda doc: [tok for tok in doc if tok.like_num])

In [None]:
# parse numeric branch factors into constituent elements

# Rule type per branch. These run in order, so when a branch matches several
# labels the last matching assignment wins (AGE < SIZE < LAB < STAGE).
branch_details.loc[branch_details.has_AGE, 'RULE_TYPE'] = BranchConditionalType.age
branch_details.loc[branch_details.has_SIZE, 'RULE_TYPE'] = BranchConditionalType.size
branch_details.loc[branch_details.has_LAB, 'RULE_TYPE'] = BranchConditionalType.lab
branch_details.loc[branch_details.has_STAGE, 'RULE_TYPE'] = BranchConditionalType.stage

# For STAGE branches the rule value is the space-joined text of the STAGE tokens.
branch_details.loc[branch_details.has_STAGE, 'RULE_VALUE'] = branch_details.STAGE.apply(lambda x: ' '.join([tok.text for tok in x]) if len(x) > 0 else None)

# Candidate numbers per branch: the first numeric token (num and first_num are
# computed by the same expression), the first AGE-flagged numeric token, and the
# second numeric token.
num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
age = branch_details.NUMERIC.apply(lambda x: safe_age(x))
first_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
second_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[1]) if len(x) > 1 else None)

# Boolean masks: does the branch contain any token flagged with each comparator?
gt = (branch_details.GT.apply(len)>0)
gte = (branch_details.GTE.apply(len)>0)
lt = (branch_details.LT.apply(len)>0)
lte = (branch_details.LTE.apply(len)>0)

r = (branch_details.RANGE.apply(len)>0)

# AGE bounds. Strict comparisons are turned into inclusive bounds by stepping 1.
# Note that `if x` treats a value of 0 the same as a missing value.
branch_details.loc[branch_details.has_AGE & gt, 'MIN_NUM'] = age.apply(lambda x: x + 1 if x else None)
branch_details.loc[branch_details.has_AGE & gte, 'MIN_NUM'] = age
branch_details.loc[branch_details.has_AGE & lt, 'MAX_NUM'] = age.apply(lambda x: x - 1 if x else None)
branch_details.loc[branch_details.has_AGE & lte, 'MAX_NUM'] = age

# SIZE / LAB bounds use the first numeric token, with the same +/- 1 stepping.
# NOTE(review): stepping by 1 presumably assumes whole-number thresholds - confirm
# for fractional SIZE / LAB values.
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gt, 'MIN_NUM'] = num.apply(lambda x: x + 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gte, 'MIN_NUM'] = num
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lt, 'MAX_NUM'] = num.apply(lambda x: x - 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lte, 'MAX_NUM'] = num

# Ranges are applied last, so they overwrite any bound set above:
# first number -> MIN_NUM, second number -> MAX_NUM.
branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MIN_NUM'] = first_num
branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MAX_NUM'] = second_num

In [None]:
# A decent first pass: these value-extraction steps could very likely be improved
# with medspacy NER, better rules, or fuller parsing.

# where no rule value has been set yet, fall back to the space-joined text of the
# FACT_MODIFIER tokens followed by the FACT tokens
branch_details.loc[branch_details.RULE_VALUE.isna(), 'RULE_VALUE'] = branch_details.apply(
    lambda row: ' '.join(tok.text for tok in (row.FACT_MODIFIER + row.FACT)),
    axis='columns',
)

In [None]:
# one Hemonc_Branch_Conditional object per parsed branch row
branches = branch_details.apply(make_branch, axis='columns')

In [None]:
# persist the branch conditionals in their own transaction
with so.Session(engine) as sess:
    sess.add_all(list(branches))
    sess.commit()

In [None]:
# variants with their regimen_cui (left join to the de-duplicated regimen pointer),
# reduced to the distinct combinations of the columns needed below
variant_study = (
    variant
    .merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='left')
    .loc[:, ['regimen', 'study', 'variant_cui', 'variant', 'regimen_cui']]
    .drop_duplicates()
)
variant_study['variant_object'] = variant_study.apply(make_variant, axis='columns')

In [None]:
# study name -> study object (a later row wins if a name repeats)
study_object_lookup = dict(zip(study['study'], study['study_object']))

In [None]:
# link each variant to every study named in its '|'-delimited study field
for row in variant_study.itertuples():
    for study_name in row.study.split('|'):
        try:
            study_obj = study_object_lookup[study_name]
        except KeyError:
            # only an unknown study name is reported as missing; any other
            # failure (e.g. while appending) now surfaces instead of being
            # mislabelled by a bare except
            print(f'missing study: {study_name} ({row.regimen})')
            continue
        row.variant_object.studied_in.append(study_obj)

In [None]:
# distinct regimen-part level attributes found in the sig rows
regimen_part_sig = sig_vars.loc[
    :,
    ['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch',
     'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit'],
].drop_duplicates()

In [None]:
# spacy matcher parsing of cycle sigs

# distinct cycle sigs with their cycle-length columns; missing values become ''
# before the text is run through nlp
cycle_sigs_parsed = (
    regimen_part_sig[['cyclesigs', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']]
    .fillna('')
    .drop_duplicates()
    .copy()
)
cycle_sigs_parsed['doc'] = cycle_sigs_parsed.cyclesigs.map(nlp)

# for every rule label: run its matcher over each doc, then collect the tokens
# flagged with that label and record whether any were found
for label, config in rules.items():
    cycle_sigs_parsed.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    cycle_sigs_parsed[label] = cycle_sigs_parsed.doc.map(lambda d: [t for t in d if getattr(t._, label)])
    cycle_sigs_parsed[f'has_{label}'] = cycle_sigs_parsed[label].map(len) > 0

cycle_sigs_parsed['full_stopping'] = cycle_sigs_parsed.apply(
    lambda row: get_full_stopping_condition(row.doc, row.STOPPING_CONDITION),
    axis='columns',
)

# min / max / units for each of duration, frequency and repeats
for _prefix, _label in (('DUR', 'DURATION'), ('FREQ', 'FREQUENCY'), ('REP', 'REPEATS')):
    for _suffix, _extract in (('MIN', get_min), ('MAX', get_max), ('UNITS', get_units)):
        cycle_sigs_parsed[f'{_prefix}_{_suffix}'] = cycle_sigs_parsed[_label].apply(_extract)

In [None]:
# distinct regimen parts found in the sig rows
reg_parts = (
    sig_vars[['regimen', 'variant_cui', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'timing_unit']]
    .drop_duplicates()
    .copy()
)
# number the parts 0..n-1 within each variant, in sorted order; the result is
# assigned back by index alignment
reg_parts['regimen_part_id'] = (
    reg_parts
    .sort_values(by=['variant_cui', 'phase', 'portion', 'cyclesigs', 'timing', 'timing_unit'])
    .groupby('variant_cui')
    .cumcount()
)

In [None]:
# regimen parts whose cycle sig mentions '2 and' (case-insensitive; missing sigs excluded)
reg_parts.loc[reg_parts['cyclesigs'].str.contains('2 and', case=False, na=False)]

In [None]:
regimen_parts = reg_parts.apply(make_reg_part, axis='columns')
part_phase = reg_parts.apply(assign_part_phase, axis='columns')

# residual = tokens not claimed by FREQUENCY / REPEATS / DURATION / GTE,
# also leaving out the literal word 'for'
cycle_sigs_parsed['cs_residual'] = cycle_sigs_parsed.doc.apply(
    lambda d: [
        t for t in d
        if not (t._.FREQUENCY or t._.REPEATS or t._.DURATION or t._.GTE or t.text == 'for')
    ]
)

In [None]:
# cycle sigs that still have unparsed (residual) tokens
cycle_sigs_parsed.loc[cycle_sigs_parsed['cs_residual'].map(len) > 0]

In [None]:
# one Hemonc_Cycle_Sig object per parsed cycle-sig row
cs = cycle_sigs_parsed.apply(make_cycle_sig, axis='columns')

In [None]:
# persist variants, regimen parts and cycle sigs in one transaction.
# NOTE(review): part_phase is computed earlier in this notebook but is not added
# here - confirm whether that is intentional.
with so.Session(engine) as sess:
    for batch in (list(variant_study.variant_object), regimen_parts, cs):
        sess.add_all(batch)
    sess.commit()

In [None]:
# spot checks for one variant / regimen; as bare expressions, only the last of
# these three is rendered as the cell output
sig_vars.loc[sig_vars['variant_cui'] == 131430]

regimen_part_sig.loc[regimen_part_sig['regimen'] == 'D-FEC plus Bev']

reg_parts.loc[reg_parts['variant_cui'] == 131430]


In [None]:
# sig rows with their component concept code (left join on component name) and
# the id of the regimen part they belong to.
# The join to reg_parts includes timing and timing_unit: regimen parts are
# distinguished by those columns too, so leaving them out lets a sig row match
# every part that shares its variant / phase / portion / cyclesigs.
sig_vars_components = (
    sig_vars
    .merge(components_with_class[['concept_name', 'concept_code']],
           left_on='component', right_on='concept_name', how='left')
    .drop_duplicates()
    .merge(reg_parts[['variant_cui', 'phase', 'portion', 'cyclesigs', 'timing', 'timing_unit', 'regimen_part_id']])
)

# rows that repeat an earlier row on the identifying columns (inParens is shown
# but not part of the duplicate check)
sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'inParens', 'class', 'tail']][sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'class', 'tail']].duplicated()]

In [None]:
# number the sigs 0..n-1 within each (regimen, variant, regimen part), in current row order
sig_vars_components['sig_id'] = (
    sig_vars_components
    .groupby(by=['regimen', 'variant_cui', 'regimen_part_id'])
    .cumcount()
)

In [None]:
# order rows so that repeated steps of one component within a part sit next to each other
sig_vars_components = sig_vars_components.sort_values(
    by=['variant_cui', 'regimen_part_id', 'concept_name', 'step_number']
)

In [None]:
# if days are empty, but this component and regimen part are identical to the previous row
# when sorted in this fashion, fill these day details forward.
# The check covers variant and regimen part as well as the component: comparing
# the component name alone let days leak across parts / variants whenever the
# same component ended one part and started the next.
sig_vars_components.loc[
    sig_vars_components.allDays.isna()
    & (
        sig_vars_components[['variant_cui', 'regimen_part_id', 'concept_name']]
        == sig_vars_components[['variant_cui', 'regimen_part_id', 'concept_name']].shift(1)
    ).all(axis=1),
    'allDays',
] = sig_vars_components.allDays.shift(1)

In [None]:
# one Hemonc_Sig per row, plus a list of Sig_Days per row
sig_objects = sig_vars_components.apply(make_sig, axis='columns')
sd = sig_vars_components.apply(make_sig_days, axis='columns')

In [None]:
# longest component name, in characters
sig_vars_components['component'].map(len).max()

In [None]:
# persist the sigs and their per-day rows in one transaction
with so.Session(engine) as sess:
    sess.add_all(sig_objects)
    # sd holds one list of Sig_Days per sig row; flatten it in row order
    sess.add_all([day_obj for day_list in sd for day_obj in day_list])
    sess.commit()

In [None]:
# frequency of each 'seq.rel.what' value among the dose / timing detail columns
sig_vars_components.loc[
    :,
    ['component', 'doseMinNum', 'doseMaxNum', 'doseUnit', 'doseCapNum', 'doseCapUnit',
     'divided', 'durationMinNum', 'durationMaxNum', 'durationUnit', 'frequency',
     'inParens', 'sequence', 'seq.rel', 'seq.rel.what'],
]['seq.rel.what'].value_counts()

In [None]:
# first few regimen rows whose regimen_cui repeats an earlier row
regimen.loc[regimen['regimen_cui'].duplicated()].head()

In [None]:
### Issues
# (bare expressions below are exploratory; only the last one in the cell is rendered)

# Need a n:m context <-> status mapper

# single record with >1 setting - is this a variant?
context[context.setting.map(lambda x: get_enum(Setting, x)).isna()].setting.value_counts()

# list of regimens that have no associated variants

vv = variant.merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='outer')
vv[vv.variant.isna()].regimen.unique()

# and vice-versa
vv[vv.regimen_cui.isna()]


# some potential duplication across variant/sig rows
dup = sig_vars[sig_vars[['study',  'regimen', 'component', 'variant_cui', 'branch', 'timing', 'step_number', 'portion', 'class']].duplicated(keep=False)]

# the loop variable is named regimen_name so that it does not rebind the
# `regimen` DataFrame used by other cells to a plain string
for regimen_name, reg_dets in dup.groupby('regimen'):
    print(regimen_name)
    # print only the columns whose values differ between the duplicated rows
    for lab, d in reg_dets.to_dict().items():
        if len(set(d.values())) != 1:
            print('\t', lab, d)

# should these variant definitions have the same cyclesig?

distinct_sigs = sig_vars[~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False)][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch']].drop_duplicates()
distinct_sigs[distinct_sigs[['regimen', 'variant', 'phase', 'portion', 'timing', 'branch']].duplicated(keep=False)][['regimen', 'variant']].drop_duplicates()

sig_vars[(sig_vars.regimen=='TACE, then 5-FU') & (sig_vars.variant=='Variant #01')][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'component']]


# components in the variant file that aren't in the drug class - need to revisit to pull in procedures
sig_vars_components[sig_vars_components.concept_code.isna()]

In [None]:

# distinct part-level sig definitions, leaving out surgery / brachytherapy /
# radiotherapy components (phase is not part of the key here)
distinct_sigs = sig_vars.loc[
    ~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False),
    ['regimen', 'variant', 'portion', 'cyclesigs', 'timing', 'branch'],
].drop_duplicates()

# (regimen, variant) pairs where otherwise-identical parts differ only in cyclesigs
distinct_sigs.loc[
    distinct_sigs.duplicated(subset=['regimen', 'variant', 'portion', 'timing', 'branch'], keep=False),
    ['regimen', 'variant'],
].drop_duplicates()


In [None]:
# reference conditions that did not resolve to a concept code, with how often each occurs
ref.loc[ref['concept_code'].isna(), 'condition'].value_counts()