In [1]:
# if running for first time, you may need to install spacy model
# > python -m spacy download en_core_web_sm
import dotenv
dotenv.load_dotenv()


from pathlib import Path
from datetime import datetime
import pandas as pd
import pyreadr
import sqlalchemy as sa
import sqlalchemy.orm as so
from hemonc_alchemy.matchers.spacy_config import matchers, nlp, rules, match_entities, get_modifier_child, get_nouns  
from hemonc_alchemy.model.hemonc_model import Hemonc_Study, Hemonc_Condition, Hemonc_Component, Hemonc_Modality, Hemonc_Component_Class, Hemonc_Regimen, Hemonc_Variant, Hemonc_Regimen_Part, Part_Phase, Hemonc_Cycle_Sig, Hemonc_Sig, Sig_Days, Base, component_to_class_map, Hemonc_Ref, Hemonc_Component_Role, Hemonc_Context, Hemonc_Branch_Conditional
from hemonc_alchemy.model.hemonc_enums import Intent, Setting, Phase, Risk, Phenotype, PriorTherapy, Phase, StudyDesign, SponsorType, BranchConditionalType, ComponentRole

oa_configurator.config - DEBUG - Application config path set: /Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/oa_system_config.yaml
oa_configurator.config - DEBUG - Log path set: /Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/logs
oa_configurator.config - DEBUG - DB connection string: sqlite:////Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/data/dash.db


In [2]:
dotenv.find_dotenv()

'/Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/.env'

In [3]:
import os
os.getcwd()

'/Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/notebooks'

In [9]:
# load data objects

# download the updated hemonc tables folder from dropbox and unzip it here
TABLE_PATH = Path('..') / 'Tables'

# download the required athena vocab files and place them here
CDM_PATH = Path('..') / 'OHDSI_Vocabs' 

In [10]:
TABLE_PATH.absolute()

PosixPath('/Users/georginakennedy/cloudstor/CBDRH/ACDN/HemOnc_Alchemy/notebooks/../Tables')

In [22]:
concept_columns = ['concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code','valid_start_date', 'valid_end_date', 'invalid_reason']
concept_relationship_columns = ['concept_code_1', 'concept_code_2', 'vocabulary_id_1', 'vocabulary_id_2', 'relationship_id', 'valid_start_date', 'valid_end_date', 'invalid_reason']

sigs = pd.read_csv(TABLE_PATH / 'sigs.csv', low_memory=False)
omop = pyreadr.read_r(TABLE_PATH / 'omop.RData')
concept_stage = omop['concept_stage'][concept_columns].copy()
concept_relationship_stage = omop['concept_relationship_stage'][concept_relationship_columns].copy()
concept_synonym_stage = omop['concept_synonym_stage'].copy()

# using RData files where possible, to avoid versioning issues
context = pyreadr.read_r(TABLE_PATH / 'context.table.RData')['context.table']
page = pyreadr.read_r(TABLE_PATH / 'page.table.RData')['page.table']
pointer = pyreadr.read_r(TABLE_PATH / 'pointer.table.RData')['pointer.table']
ref = pyreadr.read_r(TABLE_PATH / 'ref.table.RData')['ref.table']
sequence = pyreadr.read_r(TABLE_PATH / 'sequence.table.RData')['sequence.table']
study = pyreadr.read_r(TABLE_PATH / 'study.table.RData')['study.table']

In [14]:
# author = pyreadr.read_r(TABLE_PATH / 'author.table.RData')['author.table']
# person = pyreadr.read_r(TABLE_PATH / 'person.table.RData')['person.table']
# variant = pyreadr.read_r(TABLE_PATH / 'variant.table.RData')['variant.table']

In [24]:

# following files had non-utf characters, requiring manual R-based conversion to csv
author = pd.read_csv(TABLE_PATH / 'author.table.csv', low_memory=False)
person = pd.read_csv(TABLE_PATH / 'person.table.csv', encoding="ISO-8859-1")
variant = pd.read_csv(TABLE_PATH / 'variant.table.csv', encoding="ISO-8859-1")

regimen = pointer[pointer.regimen_cui!='NOT YET ASSIGNED'][['regimen', 'regimen_cui']].drop_duplicates().copy()

In [25]:
# load minimal athena download files for cross-reference including concept IDs where available
concepts_with_ids = pd.read_csv(CDM_PATH / 'CONCEPT.csv', delimiter='\t', low_memory=False)
concept_relationships = pd.read_csv(CDM_PATH / 'CONCEPT_RELATIONSHIP.csv', delimiter='\t', low_memory=False)
concept_ancestors = pd.read_csv(CDM_PATH / 'CONCEPT_ANCESTOR.csv', delimiter='\t', low_memory=False)
vocabularies = pd.read_csv(CDM_PATH / 'VOCABULARY.csv', delimiter='\t', low_memory=False)
concept_class = pd.read_csv(CDM_PATH / 'CONCEPT_CLASS.csv', delimiter='\t', low_memory=False)
domain = pd.read_csv(CDM_PATH / 'DOMAIN.csv', delimiter='\t', low_memory=False)
relationship = pd.read_csv(CDM_PATH / 'RELATIONSHIP.csv', delimiter='\t', low_memory=False)

In [26]:
import omop_alchemy as oa

engine = oa.oa_config.engine
oa.Base.metadata.create_all(engine)

In [27]:
concepts_with_ids.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [28]:
non_omop_class = ['Author', 'Reference', 'Study', 'PubMedURL', 'ReferenceDOI', 'Clinical trial ID', 'Regimen Variant', 
                  'ReferenceURL', 'PubMedCentralURL', 'City', 'Cycle Sigs', 'Regimen Stub', 'Study Group', 'Journal', 
                  'Endpoint', 'Duration', 'Year', 'Numeric', 'Endpoint Type', 'Experimental design', 'Study Class']

cc = concept_stage[~concept_stage.concept_class_id.isin(non_omop_class)].merge(concepts_with_ids, on='concept_code', how='left')

In [29]:
len(cc[cc.concept_id.isna()])/len(cc)

0.12654120700843607

In [30]:
# which current hemonc concepts are missing IDs from Athena?

cc[cc.concept_id.isna()].concept_class_id_x.value_counts()

concept_class_id_x
Regimen            1223
Regimen Class       421
Brand Name          160
Component            67
Component Class      46
Unit                 12
Condition             8
Frequency             6
Context               4
Component Role        3
Name: count, dtype: int64

In [31]:
concept_relationship_stage.concept_code_2 = concept_relationship_stage.apply(lambda x: x.concept_code_2.replace('-', '') if x.vocabulary_id_2 == "NDC" else x.concept_code_2, axis=1)
cr = concept_relationship_stage[concept_relationship_stage.vocabulary_id_2 != 'HemOnc'].merge(concepts_with_ids, left_on=['concept_code_2', 'vocabulary_id_2'], right_on=['concept_code', 'vocabulary_id'], how='left')

In [32]:
# which non-hemonc vocabs do we need to have available for allowing this to actually extend the CDM?
concept_relationship_stage.vocabulary_id_2.value_counts()

vocabulary_id_2
HemOnc              409557
RxNorm               19074
NDC                  14588
ICD-O-3               1618
ICD-10-CM             1430
ICD-9-CM              1021
NCIT                   732
HCPCS                  461
RxNorm Extension       401
SEER Site Recode       166
OncoTree               140
Name: count, dtype: int64

In [33]:
concept_relationship_stage[concept_relationship_stage.vocabulary_id_2=='NDC']

Unnamed: 0_level_0,concept_code_1,concept_code_2,vocabulary_id_1,vocabulary_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1100235,4,00024483,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
285916,4,00024815,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
310209,4,00025337,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
410201,4,00026216,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
510173,6,00931125,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
...,...,...,...,...,...,...,...,...
1296910,93818,50242103,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1297010,93818,50242105,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396710,94609,25682022,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396810,94609,25682025,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,


In [34]:
concepts_with_ids.vocabulary_id.value_counts()

vocabulary_id
RxNorm Extension        2146945
SNOMED                  1084286
RxNorm                   309670
OSM                      203339
ICD10CM                   99421
ICDO3                     64471
NAACCR                    34473
ICD9CM                    17564
ICD10                     16638
HemOnc                     8028
Cancer Modifier            6043
ICD9Proc                   4657
OMOP Extension             1457
UCUM                       1127
CDM                        1060
Relationship                718
Concept Class               423
UB04 Typ bill               298
SOPT                        168
Vocabulary                  149
Medicare Specialty          120
Condition Type              118
ABMS                         98
Procedure Type               97
Type Concept                 80
Domain                       65
CMS Place of Service         63
UB04 Pt dis status           55
Race                         53
Cost                         51
Observation Type          

In [35]:
# not sure how much this matters, but there seems to be some issues with hemonc mappings to NDC - codes are a bit of a mish-mash with dropped hyphens and leading 0s removed
cr[cr.concept_id.isna()].vocabulary_id_2.value_counts()

vocabulary_id_2
NDC                 14588
ICD-O-3              1618
ICD-10-CM            1430
ICD-9-CM             1021
NCIT                  732
HCPCS                 461
SEER Site Recode      166
OncoTree              140
RxNorm                  7
Name: count, dtype: int64

In [36]:
code_mods = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name']
    ], 
    on=['concept_code', 'vocabulary_id'],
    how='left'
)

In [38]:
# there are some hemonc components that have changed concept_code? this will not be backwards compatible?
code_mods[(code_mods.concept_name_x != code_mods.concept_name_y) & ~code_mods.concept_class_id.isin(non_omop_class) & code_mods.concept_name_y.notna()].to_csv('reused_codes.csv', index=False)

In [40]:
vocab_id_lookup = {
    'ICD-10-CM': 'ICD10CM',
    'ICD-9-CM': 'ICD9CM',
    'ICD-O-3': 'ICDO3'
}

concept_relationship_stage['vocabulary_id'] = concept_relationship_stage.vocabulary_id_2.map(lambda x: vocab_id_lookup[x] if x in vocab_id_lookup else x)

In [41]:
concept_relationship_stage.vocabulary_id.value_counts()

vocabulary_id
HemOnc              409557
RxNorm               19074
NDC                  14588
ICDO3                 1618
ICD10CM               1430
ICD9CM                1021
NCIT                   732
HCPCS                  461
RxNorm Extension       401
SEER Site Recode       166
OncoTree               140
Name: count, dtype: int64

In [42]:
# get all the OMOP concept IDs that exist for the current OMOP-ready hemonc codes

concept_stage_id = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name'] # note that merging on concept_name shouldn't be required here, but as per above with code mods, some of the concept codes in hemonc have changed over time
    ], 
    how='left'
)

# also for the hemonc concepts in the relationship file

concept_relationship_stage_id = concept_relationship_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_1', 
                 'concept_id': 'concept_id_1', 
                 'vocabulary_id': 'vocabulary_id_1'}
        ), 
    how='left'
)

# and then the related concepts in the relationship file

concept_relationship_stage_id = concept_relationship_stage_id.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_id': 'concept_id_2'}
        ), 
    how='left'
)

In [43]:
# we need to grab all the concepts specifically for vocab, relationships, domains and concept classes so we can insert those as bulk initial load and avoid referential integrity issues

vocab_concepts = vocabularies.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'vocabulary_id']], 
    left_on='vocabulary_concept_id', 
    right_on='concept_id'
)

rel_concepts = relationship.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'relationship_id']], 
    left_on='relationship_concept_id', 
    right_on='concept_id'
)

domain_concepts = domain.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'domain_id']], 
    left_on='domain_concept_id', 
    right_on='concept_id'
)

concept_class_concepts = concept_class.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'concept_class_id']], 
    left_on='concept_class_concept_id', 
    right_on='concept_id'
)

In [44]:
# this is where we are building in the compatibility with the OMOP Alchemy classes

from omop_alchemy.model.vocabulary import Concept, Vocabulary, Domain, Concept_Class, Relationship, Concept_Relationship, Concept_Ancestor

def make_concept(concept_row):
    return Concept(concept_id = int(concept_row.concept_id),
                   concept_name = concept_row.concept_name,
                   domain_id = concept_row.domain_id,
                   vocabulary_id = concept_row.vocabulary_id,
                   concept_class_id = concept_row.concept_class_id,
                   standard_concept = concept_row.standard_concept if not pd.isna(concept_row.standard_concept) else None,
                   concept_code = concept_row.concept_code,
                   valid_start_date = datetime.strptime(str(concept_row.valid_start_date), '%Y%m%d'), 
                   valid_end_date = datetime.strptime(str(concept_row.valid_end_date), '%Y%m%d'), 
                   invalid_reason = concept_row.invalid_reason if not pd.isna(concept_row.invalid_reason) else None)

def make_vocab(vocab_row):
    v = Vocabulary(vocabulary_id = vocab_row.vocabulary_id,
                   vocabulary_name = vocab_row.vocabulary_name,
                   vocabulary_reference = vocab_row.vocabulary_reference,
                   vocabulary_concept_id = vocab_row.vocabulary_concept_id, 
                   vocabulary_version = vocab_row.vocabulary_version)
    v.vocabulary_concept = vocab_row.concept_object
    return v
    
def make_domain(domain_row):
    d = Domain(domain_id = domain_row.domain_id,
               domain_name = domain_row.domain_name,
               domain_concept_id = domain_row.domain_concept_id)
    d.domain_concept = domain_row.concept_object
    return d

def make_concept_class(cc_row):
    c = Concept_Class(concept_class_id = cc_row.concept_class_id,
                      concept_class_name = cc_row.concept_class_name,
                      concept_class_concept_id = cc_row.concept_class_concept_id)
    c.concept_class_concept = cc_row.concept_object
    return c


def make_rel(r_row):
    r = Relationship(relationship_id = r_row.relationship_id,
                      relationship_name = r_row.relationship_name,
                      is_hierarchical = r_row.is_hierarchical,
                      defines_ancestry = r_row.defines_ancestry,
                      reverse_relationship_id = r_row.reverse_relationship_id,
                      relationship_concept_id = r_row.relationship_concept_id)
    r.relationship_concept = r_row.concept_object
    return r

In [45]:
# preserve referential integrity by adding all the concept class, vocabulary and domain concepts in one go
meta_concepts = pd.concat([concept_class_concepts.concept_id, domain_concepts.concept_id, vocab_concepts.concept_id, rel_concepts.concept_id])
meta_concepts_df = concepts_with_ids.merge(meta_concepts)
meta_concepts_df['concept_object'] = meta_concepts_df.apply(make_concept, axis=1)


concept_class_concepts = concept_class_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='concept_class_concept_id', right_on='concept_id')
domain_concepts = domain_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='domain_concept_id', right_on='concept_id')
vocab_concepts = vocab_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='vocabulary_concept_id', right_on='concept_id')
rel_concepts = rel_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='relationship_concept_id', right_on='concept_id')

In [48]:
# full set of hemonc relationships - some obviously do not need to be pulled in - TBC how to cut down

concept_relationship_stage.relationship_id.value_counts()

relationship_id
Has middle author       106508
Is a                     62847
Was studied in           36946
Maps to                  20926
Has modality             11824
                         ...  
Has anticoag Rx             21
Was NMPA approved yr        19
Has radiotherapy Rx          9
Has pept-drug cjgt           1
Has PDC Rx                   1
Name: count, Length: 89, dtype: int64

In [52]:
# if you're pulling in only hemonc concepts that have a valid domain_id, then you can retain only relationships that have concepts in allocated domains on both sides
concept_stage_id.domain_id.value_counts()

domain_id
               93217
regimen         7963
drug            5892
measurement      778
condition        282
procedure        110
Name: count, dtype: int64

In [59]:
domain_lookup_by_code = {k:v for k, v in zip(concept_stage_id.concept_code, concept_stage_id.domain_id)}
domain_lookup_by_id = {k:v for k, v in zip(concepts_with_ids.concept_id, concepts_with_ids.domain_id)}

In [82]:
# try find all known domain_id values either from hemonc source or existing OMOP 
concept_rels_with_domains = concept_relationship_stage_id[concept_relationship_stage_id.invalid_reason != 'D'].copy()
concept_rels_with_domains['domain_id_1'] = concept_rels_with_domains.concept_code_1.map(domain_lookup_by_code)
concept_rels_with_domains['domain_id_2'] = concept_rels_with_domains.concept_code_2.map(domain_lookup_by_code)
concept_rels_with_domains.loc[concept_rels_with_domains['domain_id_1'].isna(), 'domain_id_1'] = concept_rels_with_domains.concept_id_1.map(domain_lookup_by_id)
concept_rels_with_domains.loc[concept_rels_with_domains['domain_id_2'].isna(), 'domain_id_2'] = concept_rels_with_domains.concept_id_2.map(domain_lookup_by_id)

In [83]:
# that would imply that these are the relationship types that you need to exist before import
concept_rels_with_domains[(concept_rels_with_domains.domain_id_1!='') & (concept_rels_with_domains.domain_id_2!='') ].relationship_id.value_counts()

relationship_id
Is a                   61061
Maps to                20593
Has cytotoxic chemo    10340
Has context             7100
Is current in adult     6574
                       ...  
Has anticoag Rx           14
Has antineopl Rx          14
Has radiotherapy Rx        8
Has pept-drug cjgt         1
Has PDC Rx                 1
Name: count, Length: 64, dtype: int64

In [84]:
target_rels = concept_rels_with_domains[(concept_rels_with_domains.domain_id_1!='') & (concept_rels_with_domains.domain_id_2!='') ].relationship_id.value_counts().reset_index()
target_rels = target_rels.merge(relationship, how='left')

In [86]:
# means these may be the new relationship types being sought - confirmation required with respect to the 'year' concept class?
target_rels[target_rels.relationship_name.isna()]

Unnamed: 0,relationship_id,count,relationship_name,is_hierarchical,defines_ancestry,reverse_relationship_id,relationship_concept_id
4,Is current in adult,6574,,,,,
8,Has been compared to,4247,,,,,
10,Has synthetic regimen,2791,,,,,
14,Can be preceded by,1079,,,,,
16,Can be followed by,985,,,,,
20,Is historical in adult,606,,,,,
21,Was FDA approved yr,544,,,,,
26,Has PMDA indication,339,,,,,
28,Was EMA approved yr,308,,,,,
32,Has minor class,226,,,,,


In [54]:
# query why one of the vocabularies has a null ID from athena?

vocab_concepts[vocab_concepts.vocabulary_id.isna()]

Unnamed: 0,vocabulary_id,vocabulary_name,vocabulary_reference,vocabulary_version,vocabulary_concept_id,concept_id_x,concept_name,domain_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id_y,concept_object
14,,OMOP Standardized Vocabularies,OMOP generated,v5.0 30-AUG-24,44819096,44819096,OMOP Standardized Vocabularies,Metadata,Vocabulary,,OMOP generated,19700101,20991231,,44819096,<Concept 44819096 - OMOP generated (OMOP Stand...


In [95]:
vocab_concepts.vocabulary_id = vocab_concepts.vocabulary_id.fillna('empty') 

In [97]:
def make_ancestor(a_row):
    return Concept_Ancestor(ancestor_concept_id = int(a_row.ancestor_concept_id),
                            descendant_concept_id = int(a_row.descendant_concept_id))
                            #min_levels_of_separation = a_row.min_levels_of_separation,
                            #max_levels_of_separation = a_row.max_levels_of_separation)
                    

In [None]:
# with so.Session(engine) as sess:
#     sess.add_all(concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'].apply(make_concept, axis=1))
#     sess.commit()

In [98]:
# create metadata concepts & make objects for other reference tables
concept_class_concepts['ob'] = concept_class_concepts.apply(make_concept_class, axis=1)
domain_concepts['ob'] = domain_concepts.apply(make_domain, axis=1)
vocab_concepts['ob'] = vocab_concepts.apply(make_vocab, axis=1)
rel_concepts['ob'] = rel_concepts.apply(make_rel, axis=1)

all_ob = pd.concat([concept_class_concepts[['concept_id_x', 'ob']], 
                    domain_concepts[['concept_id_x', 'ob']], 
                    vocab_concepts[['concept_id_x', 'ob']], 
                    rel_concepts[['concept_id_x', 'ob']]])

In [99]:

with so.Session(engine) as sess:
    sess.add_all(list(concept_class_concepts.ob))
    sess.add_all(list(domain_concepts.ob))
    sess.add_all(list(vocab_concepts.ob))
    sess.add_all(list(rel_concepts.ob))
    sess.add_all(list(meta_concepts_df.concept_object))
    sess.commit()

In [100]:
with so.Session(engine) as sess:
    existing_vocabs = pd.DataFrame(sess.query(Concept.vocabulary_id).distinct().all())
    existing_concepts = pd.DataFrame(sess.query(Concept.concept_id).distinct().all())

In [102]:
existing_vocabs.head()

Unnamed: 0,vocabulary_id
0,Domain
1,Concept Class
2,Relationship
3,Vocabulary


In [104]:
concept_stage_id[~concept_stage_id.concept_class_id.isin(non_omop_class) & 
                 ~concept_stage_id.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship'])  
#                 &~concept_stage_id.concept_id.isin(existing_concepts.concept_id.unique())
                 ].concept_class_id.value_counts()

concept_class_id
Regimen            5566
Brand Name         4586
Component           804
Regimen Class       752
Component Class     488
Condition           282
Procedure            63
Context              47
Unit                 37
Modality             23
Route                14
Frequency            14
Component Role        3
Null                  1
Name: count, dtype: int64

In [105]:
len(concepts_with_ids[concepts_with_ids.vocabulary_id=='Gender'])

5

In [106]:
concepts_to_add = concept_relationship_stage_id[
    ['concept_code_2', 'vocabulary_id']
    ].drop_duplicates().merge(
        concepts_with_ids, 
        left_on=['concept_code_2', 'vocabulary_id'], 
        right_on=['concept_code', 'vocabulary_id']
    )

In [107]:
len(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())])

9570

In [108]:
#missed = concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].apply(make_concept, axis=1)

In [109]:
# add in concepts both from within the HemOnc universe, as well as all available cross-mappings
ho_conc = concepts_with_ids[concepts_with_ids.vocabulary_id=='HemOnc'].apply(make_concept, axis=1)
non_ho_conc = concepts_to_add[~concepts_to_add.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship', 'HemOnc'])].apply(make_concept, axis=1)

In [110]:
with so.Session(engine) as sess:
    sess.add_all(ho_conc)
    sess.add_all(non_ho_conc)
    sess.commit()

In [111]:
def make_relationship(rel_row):
    return Concept_Relationship(concept_id_1 = rel_row.concept_id_1,
                         concept_id_2 = rel_row.concept_id_2,
                         relationship_id = rel_row.relationship_id)


ho_rels = concept_relationship_stage_id[
    concept_relationship_stage_id.concept_id_1.notna() &
    concept_relationship_stage_id.concept_id_2.notna()
    ].drop_duplicates(
        subset=['concept_id_1', 'concept_id_2', 'relationship_id']
        ).apply(make_relationship, axis=1)

In [112]:
concept_relationship_stage_id[
    concept_relationship_stage_id.concept_id_1.notna() &
    concept_relationship_stage_id.concept_id_2.notna() & 
    concept_relationship_stage_id.concept_id_2.isin(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].concept_id.unique()) & 
    concept_relationship_stage_id.relationship_id.str.contains('map', case=False)
    ].vocabulary_id_2.value_counts()

vocabulary_id_2
ICD-O-3             1512
ICD-10-CM           1416
ICD-9-CM             876
RxNorm               562
RxNorm Extension     113
Name: count, dtype: int64

In [113]:
# missed_rels = concept_relationship_stage_id[
#     concept_relationship_stage_id.concept_id_1.notna() &
#     concept_relationship_stage_id.concept_id_2.notna() & 
#     concept_relationship_stage_id.concept_id_2.isin(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())].concept_id.unique()) & 
#     concept_relationship_stage_id.relationship_id.str.contains('map', case=False)
#     ].drop_duplicates(
#         subset=['concept_id_1', 'concept_id_2', 'relationship_id']
#         ).apply(make_relationship, axis=1)

In [114]:
ancestors = concept_ancestors.merge(
    existing_concepts[
        ['concept_id']
        ].rename(columns={'concept_id': 'ancestor_concept_id'})
    ).merge(
        existing_concepts[
            ['concept_id']
        ].rename(columns={'concept_id': 'descendant_concept_id'})
    ).apply(make_ancestor, axis=1)  

In [115]:
ancestors = concept_ancestors.merge(
    concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'][
        ['concept_id']
        ].rename(columns={'concept_id': 'ancestor_concept_id'})
    ).merge(
        concepts_with_ids[concepts_with_ids.vocabulary_id=='Cancer Modifier'][
            ['concept_id']
        ].rename(columns={'concept_id': 'descendant_concept_id'})
    ).apply(make_ancestor, axis=1)  

In [116]:
with so.Session(engine) as sess:
    sess.add_all(list(ancestors))
    sess.commit()

In [117]:
with so.Session(engine) as sess:
    sess.add_all(ho_rels)
    #sess.add_all(missed_rels)
    sess.commit()

In [None]:
# for vocabulary, vocab_concepts in concepts_with_ids.groupby('vocabulary_id'):
#     if vocabulary not in ['Concept Class', 'Domain', 'Vocabulary', 'Relationship'] + [e[0] for e in existing_vocabs]:
#         # should not have to filter on non-null concept names here?
#         vocab_concept_objects = vocab_concepts[~vocab_concepts.concept_name.isna()].apply(make_concept, axis=1)
#         print(vocabulary, len(vocab_concept_objects))
#         with so.Session(engine) as sess:
#             sess.add_all(vocab_concept_objects)
#             sess.commit()

In [118]:
# due to misalignment with current athena version we have to accept gap in concept_id completeness even for classes that should have them

concept_stage_id[concept_stage_id.concept_id.isna() & ~concept_stage_id.concept_class_id.isin(non_omop_class)].head()

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id
0,Eluvixtamab,drug,HemOnc,Component,,1,2019-05-27,2099-12-31,,
55,BHQ-880,drug,HemOnc,Component,,57,2019-05-27,2099-12-31,,
57,BCG vaccine,drug,HemOnc,Component,,59,2019-05-27,2099-12-31,,
208,Andexanet alfa,drug,HemOnc,Component,,211,2019-05-27,2099-12-31,,
226,Leucovorin,drug,HemOnc,Component,,229,2019-05-27,2099-12-31,,


In [119]:
# helper mapping data / merges within OMOP concepts
is_a = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Is a'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']]


brand_mappings = concept_stage_id[
    concept_stage_id.concept_class_id=='Brand Name'
    ].merge(
        concept_relationship_stage_id[
            concept_relationship_stage_id.relationship_id=='Has brand name'
        ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']], 
        left_on='concept_code', 
        right_on='concept_code_2', 
        how='left'
    )

component_df = concept_stage_id[
    concept_stage_id.concept_class_id.isin(['Component', 'Procedure'])
    ].copy()

component_class_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Component Class'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'concept_class_name'}
    )

has_modality = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Has modality'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_1']]

modality_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Modality'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'modality_name', 
                 'concept_id': 'concept_id_2'}
        )

In [120]:
route_mappings = {'44954': '26643006', 
                  '44957': '47625008',
                  '44979': '37161004',
                  '44994': '6064005',
                  '45080': '78421000',
                  '45153': '34206005',
                  '45215': '372471009',
                  '45273': '420254004',
                  '45426': '372466002',
                  '45531': '46713006',
                  '45574': '72607000',
                  '45684': '58100008',
                  '45939': '447694001'}

In [121]:
# helper data cleaning / transformation functions

def get_enum(e, s):
    try:
        return e[s.lower().strip().replace('-', '_').replace(' ', '_').replace('/', '_')]
    except:
        return None

def get_date(y, m, d):
    try:    
        datetime.date(int(y), int(m), int(d))
    except:
        return None

In [122]:
# functions to take dataframe rows and return database objects

def make_context(context_row):
    return Hemonc_Context(context_code = context_row.contextRaw,
                          context_name = context_row.contextPretty,
                          intent = get_enum(Intent, context_row.intent),
                          setting = get_enum(Setting, context_row.intent),
                          risk_stratification = get_enum(Risk, context_row.intent),
                          phenotype = get_enum(Phenotype, context_row.intent),
                          prior_therapy = get_enum(PriorTherapy, context_row.prior_therapy),
                          date_added = context_row.date_added)


def make_study(study_row):
    enrollment_from = get_date(study_row.from_year, study_row.from_month, study_row.from_day)
    enrol_to = get_date(study_row.to_year, study_row.to_month, study_row.to_day)
    return Hemonc_Study(study_code = study_row.study,
                        registry = study_row.registry,
                        trial_id = study_row.trial_id,
                        condition_code = study_row.concept_code,
                        enrollment_from = enrollment_from if not pd.isna(enrollment_from) else None,
                        enrollment_to = enrol_to if not pd.isna(enrol_to) else None,
                        phase = study_row.phase,
                        study_design = get_enum(StudyDesign, study_row.study_design),
                        study_design_imputed = study_row.study_design_imputed,
                        sact = study_row.sact if not pd.isna(study_row.sact) else None,
                        protocol = study_row.protocol,
                        fda_reg_study = study_row.fda_reg_study,
                        fda_unreg_study = study_row.fda_unreg_study,
                        start = study_row.start  if not pd.isna(study_row.start) else None,
                        end = study_row.end  if not pd.isna(study_row.end) else None,
                        study_group = study_row.study_group,
                        sponsor = study_row.sponsor,
                        date_added = study_row.date_added if not pd.isna(study_row.date_added) else None, 
                        date_modified = study_row.date_last_modified if not pd.isna(study_row.date_last_modified) else None
                        )

def make_regimen(regimen_row):
    return Hemonc_Regimen(regimen_cui = regimen_row.regimen_cui,
                          regimen_name = regimen_row.regimen)

def make_condition(condition_row):
    return Hemonc_Condition(condition_code = condition_row.concept_code,
                            condition_name = condition_row.concept_name,
                            condition_concept_id = int(condition_row.concept_id) if not pd.isna(condition_row.concept_id) else None)


def make_component(component_class_row):
    return Hemonc_Component(component_code = component_class_row.concept_code,
                            component_name = component_class_row.concept_name,
                            component_concept_id = int(component_class_row.concept_id) if not pd.isna(component_class_row.concept_id) else None)

def make_component_role(role_row):
    return Hemonc_Component_Role(regimen_cui = role_row.concept_code_1,
                                 component_code = role_row.concept_code_2 if role_row.component_class in ['Component', 'Procedure'] else None,
                                 component_class_code = role_row.concept_code_2 if role_row.component_class == 'Component Class' else None,
                                 relationship_id = role_row.relationship_id)


def make_component_class(component_class_row):
    return Hemonc_Component_Class(component_class_code = component_class_row.concept_code_2,
                                  component_class_name = component_class_row.concept_class_name,
                                  component_class_concept_id = int(component_class_row.concept_id) if not pd.isna(component_class_row.concept_id) else None)

def make_branch(branch_row):
    return Hemonc_Branch_Conditional(branch_name = branch_row.original,
                                     branch_type = branch_row.RULE_TYPE if not pd.isna(branch_row.RULE_TYPE) else BranchConditionalType.other,
                                     numeric_min = branch_row.MIN_NUM,
                                     numeric_max = branch_row.MAX_NUM,
                                     value = branch_row.RULE_VALUE)

def make_variant(variant_row):
    return Hemonc_Variant(variant_cui = variant_row.variant_cui,
                         variant_name = variant_row.variant,
                         regimen_cui = variant_row.regimen_cui)


def make_reg_part(reg_part_row):
    return Hemonc_Regimen_Part(variant_cui = reg_part_row.variant_cui,
                               regimen_part_id = reg_part_row.regimen_part_id,
                               portion = reg_part_row.portion if not pd.isna(reg_part_row.portion) else None,
                               cycle_sig_id = reg_part_row.cyclesigs,
                               timing = reg_part_row.timing,
                               timing_unit = reg_part_row.timing_unit
                               )

def assign_part_phase(reg_part_row):
    if not pd.isna(reg_part_row.phase):
        phases = [Part_Phase(regimen_part_id = reg_part_row.regimen_part_id,
                             variant_cui = reg_part_row.variant_cui, 
                             phase = get_enum(Phase, reg_part_row.phase),
                             ) for phase in reg_part_row.phase.split('|')]
        return phases

def make_cycle_sig(cs_row):
    return Hemonc_Cycle_Sig(cycle_sig_id = cs_row.cyclesigs,
                            duration_min = cs_row.DUR_MIN, 
                            duration_max = cs_row.DUR_MAX,
                            duration_units = cs_row.DUR_UNITS,
                            frequency_min = cs_row.FREQ_MIN,
                            frequency_max = cs_row.FREQ_MAX,
                            frequency_units = cs_row.FREQ_UNITS,
                            repeats_min = cs_row.REP_MIN,
                            repeats_max = cs_row.REP_MAX,
                            repeats_units = cs_row.REP_UNITS, 
                            cycle_len_min = cs_row.cycle_length_lb if not pd.isna(cs_row.cycle_length_lb) else None,
                            cycle_len_max = cs_row.cycle_length_ub if not pd.isna(cs_row.cycle_length_ub) else None,
                            cycle_len_units = cs_row.cycle_length_unit if not pd.isna(cs_row.cycle_length_unit) else None,
                            residual = ' '.join([t.text for t in cs_row.cs_residual]))


def make_modality(mod_row):
    return Hemonc_Modality(modality_code = mod_row.concept_code_2,
                           modality_name = mod_row.modality_name,
                           modality_concept_id = int(mod_row.concept_id_2) if not pd.isna(mod_row.concept_id_2) else None)

def make_ref(ref_row):
    return Hemonc_Ref(reference = ref_row.reference,
                      condition_code = ref_row.concept_code,
                      pmid = ref_row.pmid,
                      study = ref_row.study,
                      title = ref_row.title,
                      pmcid = ref_row.pmcid,
                      doi = ref_row.doi if not ref_row.doi=="" else None,
                      url = ref_row.url if not ref_row.url=="" else None,
                      journal = ref_row.journal,
                      biblio = ref_row.biblio,
                      pub_date = ref_row.pub_date,
                      order = ref_row.order,
                      update = ref_row['update'],
                      ref_type = ref_row.ref_type)

def make_sig(sig_row):
    return Hemonc_Sig(regimen_part_id = sig_row.regimen_part_id,
                      variant_cui = sig_row.variant_cui,
                      sig_id = sig_row.sig_id,
                      component_code = sig_row.concept_code,
                      component_name = sig_row.component,
                      component_role = get_enum(ComponentRole, sig_row.component_role),
                      step_number = sig_row.step_number,
                      component_class = sig_row['class'],
                      tail = sig_row['tail'],
                      route = sig_row.route,
                      doseMinNum = sig_row.doseMinNum if not pd.isna(sig_row.doseMinNum) else None,
                      doseMaxNum = sig_row.doseMaxNum if not pd.isna(sig_row.doseMaxNum) else None,
                      doseUnit = sig_row.doseUnit if not pd.isna(sig_row.doseUnit) else None,
                      doseCapNum = sig_row.doseCapNum if not pd.isna(sig_row.doseCapNum) else None,
                      doseCapUnit = sig_row.doseCapUnit if not pd.isna(sig_row.doseCapUnit) else None,
                      durationMinNum = sig_row.durationMinNum if not pd.isna(sig_row.durationMinNum) else None,
                      durationMaxNum = sig_row.durationMaxNum if not pd.isna(sig_row.durationMaxNum) else None,
                      durationUnit = sig_row.durationUnit if not pd.isna(sig_row.durationUnit) else None,
                      frequency = sig_row.frequency if not pd.isna(sig_row.frequency) else None,
                      inParens = sig_row.inParens if not pd.isna(sig_row.inParens) else None,
                      sequence = sig_row.sequence if not pd.isna(sig_row.sequence) else None,
                      seq_rel = sig_row['seq.rel'] if not pd.isna(sig_row['seq.rel']) else None,
                      seq_rel_what = sig_row['seq.rel.what'] if not pd.isna(sig_row['seq.rel.what']) else None)            


def make_sig_days(sd_row):
    days = '0' if pd.isna(sd_row.allDays) else sd_row.allDays
    return [Sig_Days(regimen_part_id = sd_row.regimen_part_id,
                     variant_cui = sd_row.variant_cui,
                     sig_id = sd_row.sig_id,
                     day = d) for d in days.split(',')]


# spacer matcher helper functions

small_num_lookup = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

def safe_num(tok):
    try:
        return int(tok.text)
    except:
        try:
            return float(tok.text)
        except:
            try: 
                return small_num_lookup[tok.text.lower()]
            except:
                return None

def safe_age(numeric):
    ages = [safe_num(t) for t in numeric if t._.AGE]
    if len(ages) > 0:
        return ages[0]

def get_full_stopping_condition(doc, conditions):
    return doc[conditions[0].i:] if len(conditions) > 0 else None

def get_units(tokens):
    try:
        return [t.lemma_ for t in tokens if t._.TIMING_UNIT][0]
    except:
        return None

def get_min(group):
    try:
        return min([i for i in [safe_num(t) for t in group] if i])
    except:
        return None

def get_max(group): 
    try:
        if any([t._.GTE for t in group]):
            return -1
        return max([i for i in [safe_num(t) for t in group] if i])
    except:
        return None


In [123]:
role = ['Has cytotoxic chemo', 'Has targeted therapy', 'Has supportive med',
       'Has steroid tx', 'Has local therapy', 'Has immunosuppressor',
       'Has immunotherapy', 'Has endocrine tx', 'Has radiotherapy',
       'Has growth factor', 'Has AB-drug cjgt', 'Has radioconjugate',
       'Has antineoplastic', 'Has anticoag tx', 'Has pept-drug cjgt']

concept_reg_roles = (
    concept_relationship_stage_id[concept_relationship_stage_id.relationship_id.isin(role)][['concept_code_1', 'concept_id_1', 'relationship_id', 'concept_code_2', 'concept_id_2']]
        .merge(
                concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']]
                .rename(
                    columns={'concept_code': 'concept_code_1', 
                             'concept_name': 'regimen_name',
                             'concept_class_id': 'regimen_class'}
                )
        )
        .merge(
                concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']]
                .rename(
                    columns={'concept_code': 'concept_code_2', 
                             'concept_name': 'component_name',
                             'concept_class_id': 'component_class'}
                )
        )  
)

In [124]:
concept_reg_roles.head()

Unnamed: 0,concept_code_1,concept_id_1,relationship_id,concept_code_2,concept_id_2,regimen_name,regimen_class,component_name,component_class
0,795,35803428.0,Has immunosuppressor,122,35802975.0,Cyclophosphamide and Prednisolone,Regimen,Cyclophosphamide,Component
1,795,35803428.0,Has immunosuppressor,417,35803267.0,Cyclophosphamide and Prednisolone,Regimen,Prednisolone,Component
2,797,35803429.0,Has immunosuppressor,122,35802975.0,Cyclophosphamide and Prednisone,Regimen,Cyclophosphamide,Component
3,797,35803429.0,Has immunosuppressor,418,35803268.0,Cyclophosphamide and Prednisone,Regimen,Prednisone,Component
4,797,35803429.0,Has cytotoxic chemo,122,35802975.0,Cyclophosphamide and Prednisone,Regimen,Cyclophosphamide,Component


In [125]:
concept_reg_roles['role_object'] = concept_reg_roles.apply(make_component_role, axis=1)

In [126]:
conditions = concept_stage_id[concept_stage_id.concept_class_id=='Condition'].apply(make_condition, axis=1)
component_class_df['component_class_object'] = component_class_df.apply(make_component_class, axis=1)

In [127]:
component_df['component_object'] = component_df.apply(make_component, axis=1)
components_with_class = component_df.merge(is_a, 
                                           left_on='concept_code', 
                                           right_on='concept_code_1').merge(component_class_df
                                                                            ).rename(columns={'concept_code_2': 'concept_class_code'})


In [128]:
component_df

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id,component_object
0,Eluvixtamab,drug,HemOnc,Component,,1,2019-05-27,2099-12-31,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
1,Capivasertib,drug,HemOnc,Component,,2,2019-05-27,2099-12-31,,35802856.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
2,Abciximab,drug,HemOnc,Component,,3,2019-05-27,2099-12-31,,35802857.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
3,Abemaciclib,drug,HemOnc,Component,,4,2019-05-27,2099-12-31,,35802858.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
4,Abexinostat,drug,HemOnc,Component,,5,2019-05-27,2099-12-31,,35802859.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
...,...,...,...,...,...,...,...,...,...,...,...
105612,CM-313,drug,HemOnc,Component,,142243,2024-09-18,2099-12-31,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
105717,Efanesoctocog alfa,drug,HemOnc,Component,,142348,2024-09-18,2099-12-31,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
107180,Ivonescimab,drug,HemOnc,Component,,143813,2024-09-18,2099-12-31,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...
107382,Mavorixafor,drug,HemOnc,Component,,144015,2024-09-18,2099-12-31,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Comp...


In [129]:
modality_df['modality_object'] = modality_df.apply(make_modality, axis=1)

regimens_with_modality = concept_stage_id[
    concept_stage_id.invalid_reason == ''
    ].merge(
        has_modality, 
        left_on='concept_code', 
        right_on='concept_code_1'
    ).merge(
        modality_df
    ).rename(
            columns={'concept_code_2': 'concept_class_code'}
    ).drop_duplicates()


In [130]:
regimens_with_modality[regimens_with_modality[['concept_name', 'modality_name']].duplicated()]

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id,concept_code_1,concept_class_code,concept_id_1,concept_id_1.1,modality_name,concept_id_2,modality_object


In [131]:
components_with_class.apply(lambda x: x.component_object.component_classes.append(x.component_class_object), axis=1)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
49    None
50    None
51    None
52    None
53    None
54    None
55    None
56    None
57    None
dtype: object

In [132]:
ref = ref.merge(concept_stage[concept_stage.concept_class_id=='Condition'][['concept_name', 'concept_code']], left_on='condition', right_on='concept_name', how='left')
ref['pub_date'] = pd.to_datetime(ref['pub.date'])
ref_obj = ref[~ref.concept_code.isna()].apply(make_ref, axis=1)

In [133]:
# todo: confirm what is meant by a single cui with >1 title - is this a synonym?

regimen_dedup = regimen.drop_duplicates(subset='regimen_cui').copy()

In [134]:
context.date_added = pd.to_datetime(context.date_added)
contexts = context.apply(make_context, axis=1)

In [135]:
# prepare study objects
study.date_added = pd.to_datetime(study.date_added, format='mixed', errors='coerce')
study.date_last_modified = pd.to_datetime(study.date_last_modified, format='mixed', errors='coerce')
enrol_dates = study.enrollment.str.split('to', expand=True)
enrol_from = enrol_dates[0].str.strip().str.split('-', expand=True).rename(columns={0: 'from_year', 1:'from_month', 2: 'from_day'}).fillna('01')
enrol_to = enrol_dates[1].str.strip().str.split('-', expand=True).rename(columns={0: 'to_year', 1:'to_month', 2: 'to_day'}).fillna('01')
study = pd.concat([study, enrol_from, enrol_to], axis=1)
study = study.merge(concept_stage[concept_stage.concept_class_id=='Condition'][['concept_name', 'concept_code']], left_on='condition', right_on='concept_name', how='left')
study.start = pd.to_datetime(study.start, format='mixed', errors='coerce')
study.end = pd.to_datetime(study.end, format='mixed', errors='coerce')

In [136]:
study['study_object'] = study.apply(make_study, axis=1)
regimen_dedup['regimen_object'] = regimen_dedup.apply(make_regimen, axis=1)

In [137]:
regimen_dedup = regimen_dedup.merge(regimens_with_modality[['concept_name', 'modality_object']].rename(columns={'concept_name':'regimen'}))

In [138]:
regimen_dedup.apply(lambda x: x.regimen_object.modalities.append(x.modality_object), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
4175    None
4176    None
4177    None
4178    None
4179    None
Length: 4180, dtype: object

In [142]:
# new issue: null condition codes
study[study.concept_code.isna()]

Unnamed: 0,study,registry,trial_id,condition,enrollment,phase,study_design,study_design_imputed,sact,protocol,...,date_last_modified,from_year,from_month,from_day,to_year,to_month,to_day,concept_name,concept_code,study_object
315,AMD3100-3102,ClinicalTrials.gov,NCT00103662,Stem cell mobilization regimens,2005-02-04 to 2006-07-07,Phase 3,Escalation,False,False,False,...,NaT,2005.0,2,4,2006,7,7,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
381,BMT CTN 0501,ClinicalTrials.gov,NCT00412360,Allogeneic HSCT,,,CBD,False,,False,...,NaT,,1,1,1,1,1,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
1154,CD7-001,ClinicalTrials.gov,NCT04599556,Cellular therapy conditioning regimens,2021-11 to 2023-09,Phase 1,Non-randomized,False,True,False,...,NaT,2021.0,11,1,2023,9,1,,,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...


In [143]:
with so.Session(engine) as sess:
    sess.add_all(conditions)
    sess.add_all(list(component_class_df.component_class_object))
    sess.add_all(list(component_df.component_object))
    sess.add_all(contexts)
    sess.add_all(list(study[~study.concept_code.isna()].study_object))
    sess.add_all(list(modality_df.modality_object))
    sess.add_all(list(regimen_dedup.regimen_object))
    sess.add_all(list(ref_obj))
    sess.add_all(list(concept_reg_roles.role_object))
    sess.commit()

In [144]:
# we are not going to pull in out of date variant versions
sig_vars = sigs.merge(variant[variant.version==1][['variant_cui']])

In [145]:
for (r, c), dets in sig_vars[~sig_vars.step_number.str.contains('1 of') & sig_vars.allDays.isna()].groupby(['regimen', 'component']):
    if len(dets)>1:
        break

In [146]:
# parsing out branch conditions with spacy matchers

branch_crit = sig_vars.branch.value_counts().reset_index()

all_branches = []

for crit in branch_crit.branch.unique():
    combined_branches = crit.split('AND ')
    for br in combined_branches:
        all_branches += [b.lower().strip() for b in br.split('OR ')]

all_branches = list(set(all_branches))

branch_details = pd.DataFrame({'original': all_branches, 'doc': [nlp(b) for b in all_branches]})

for label, config in rules.items():
    branch_details.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    branch_details[label] = branch_details.doc.apply(lambda d: [t for t in d if getattr(t._, label)])
    branch_details[f'has_{label}'] = (branch_details[label].apply(len)>0)

branch_details['MODIFIER_HEAD'] = branch_details.doc.apply(get_modifier_child)
branch_details['NOUNS'] = branch_details.doc.apply(get_nouns)
branch_details['NUMERIC'] = branch_details.doc.map(lambda doc: [tok for tok in doc if tok.like_num])

In [147]:
# parse numeric branch factors into constituant elements

branch_details.loc[branch_details.has_AGE, 'RULE_TYPE'] = BranchConditionalType.age
branch_details.loc[branch_details.has_SIZE, 'RULE_TYPE'] = BranchConditionalType.size
branch_details.loc[branch_details.has_LAB, 'RULE_TYPE'] = BranchConditionalType.lab
branch_details.loc[branch_details.has_STAGE, 'RULE_TYPE'] = BranchConditionalType.stage

branch_details.loc[branch_details.has_STAGE, 'RULE_VALUE'] = branch_details.STAGE.apply(lambda x: ' '.join([tok.text for tok in x]) if len(x) > 0 else None)

num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
age = branch_details.NUMERIC.apply(lambda x: safe_age(x))
first_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
second_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[1]) if len(x) > 1 else None)

gt = (branch_details.GT.apply(len)>0)
gte = (branch_details.GTE.apply(len)>0)
lt = (branch_details.LT.apply(len)>0)
lte = (branch_details.LTE.apply(len)>0)

r = (branch_details.RANGE.apply(len)>0)

branch_details.loc[branch_details.has_AGE & gt, 'MIN_NUM'] = age.apply(lambda x: x + 1 if x else None)
branch_details.loc[branch_details.has_AGE & gte, 'MIN_NUM'] = age
branch_details.loc[branch_details.has_AGE & lt, 'MAX_NUM'] = age.apply(lambda x: x - 1 if x else None)
branch_details.loc[branch_details.has_AGE & lte, 'MAX_NUM'] = age

branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gt, 'MIN_NUM'] = num.apply(lambda x: x + 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gte, 'MIN_NUM'] = num
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lt, 'MAX_NUM'] = num.apply(lambda x: x - 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lte, 'MAX_NUM'] = num

branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MIN_NUM'] = first_num
branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MAX_NUM'] = second_num

In [148]:
# very likely to be able to improve these value extraction steps through either medspacy NER or better rules / more fullsome parsing, but a decent first pass

branch_details.loc[branch_details.RULE_VALUE.isna(), 'RULE_VALUE'] = branch_details.apply(lambda row: ' '.join([t.text for t in (row.FACT_MODIFIER + row.FACT)]), axis=1)

In [149]:
branches = branch_details.apply(make_branch, axis=1)

In [150]:
with so.Session(engine) as sess:
    sess.add_all(branches)
    sess.commit()

In [207]:
variant_study = variant.merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='left')[['regimen', 'study', 'variant_cui', 'variant', 'regimen_cui']].drop_duplicates()
variant_study['variant_object'] = variant_study.apply(make_variant, axis=1)

In [208]:
study_object_lookup = {s.study: s.study_object for s in study.itertuples()}

In [209]:
# new issue - some null studies in variant study map
for row in variant_study[~variant_study.study.isna()].itertuples():
    for study_name in row.study.split('|'):
        try:
            row.variant_object.studied_in.append(study_object_lookup[study_name])
        except:
            print(f'missing study: {study_name} ({row.regimen})')

missing study: No supporting study (Afatinib monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Axitinib and Avelumab)
missing study: CABL001J12301 (Bosutinib monotherapy)
missing study: No supporting study (Cabozantinib and Nivolumab)
missing study: No supporting study (Cabozantinib monotherapy)
missing study: No supporting study (Crizotinib monotherapy)
missing study: No supporting study (Crizotinib monotherapy)
missing study: CABL001J12301 (Dasatinib monotherapy)
missing study: CABL001J12301 (Imatinib monotherapy)
missing study: No supporting study (Ipilimumab and Nivolumab)
missing study: CABL001J12301 (Nilotinib monotherapy)
missing study: No supporting study (Nivolumab monotherapy)
missing study: No supporting study (Nivolumab monotherapy

In [159]:
regimen_part_sig = sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].drop_duplicates()

In [160]:
# spacy matcher parsing of cycle sigs

cycle_sigs_parsed = regimen_part_sig[['cyclesigs', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].fillna('').drop_duplicates().copy()
cycle_sigs_parsed['doc'] = cycle_sigs_parsed.cyclesigs.map(nlp)

for label, config in rules.items():
    cycle_sigs_parsed.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    cycle_sigs_parsed[label] = cycle_sigs_parsed.doc.apply(lambda d: [t for t in d if getattr(t._, label)])
    cycle_sigs_parsed[f'has_{label}'] = (cycle_sigs_parsed[label].apply(len)>0)

cycle_sigs_parsed['full_stopping'] = cycle_sigs_parsed.apply(lambda row: get_full_stopping_condition(row.doc, row.STOPPING_CONDITION), axis=1)


cycle_sigs_parsed['DUR_MIN'] = cycle_sigs_parsed.DURATION.apply(get_min)
cycle_sigs_parsed['DUR_MAX'] = cycle_sigs_parsed.DURATION.apply(get_max)
cycle_sigs_parsed['DUR_UNITS'] = cycle_sigs_parsed.DURATION.apply(get_units)
cycle_sigs_parsed['FREQ_MIN'] = cycle_sigs_parsed.FREQUENCY.apply(get_min)
cycle_sigs_parsed['FREQ_MAX'] = cycle_sigs_parsed.FREQUENCY.apply(get_max)
cycle_sigs_parsed['FREQ_UNITS'] = cycle_sigs_parsed.FREQUENCY.apply(get_units)
cycle_sigs_parsed['REP_MIN'] = cycle_sigs_parsed.REPEATS.apply(get_min)
cycle_sigs_parsed['REP_MAX'] = cycle_sigs_parsed.REPEATS.apply(get_max)
cycle_sigs_parsed['REP_UNITS'] = cycle_sigs_parsed.REPEATS.apply(get_units)

In [161]:
reg_parts = sig_vars[['regimen', 'variant_cui','variant','phase', 'portion', 'cyclesigs', 'timing', 'timing_unit']].drop_duplicates().copy()
reg_parts['regimen_part_id'] = reg_parts.sort_values(['variant_cui', 'phase', 'portion', 'cyclesigs', 'timing', 'timing_unit']).groupby('variant_cui').cumcount()

In [162]:
reg_parts[reg_parts.cyclesigs.str.contains('2 and', na=False, case=False)]

Unnamed: 0,regimen,variant_cui,variant,phase,portion,cyclesigs,timing,timing_unit,regimen_part_id
7013,Dasatinib and Blinatumomab,131515.0,Variant #01,Consolidation,-,42-day cycle for at least 2 and 5 cycles,,Cycle,0


In [163]:
regimen_parts = reg_parts.apply(make_reg_part, axis=1)
part_phase = reg_parts.apply(assign_part_phase, axis=1)
cycle_sigs_parsed['cs_residual'] = cycle_sigs_parsed.doc.apply(lambda d: [t for t in d if not t._.FREQUENCY and not t._.REPEATS and not t._.DURATION and not t._.GTE and t.text != 'for'])

In [164]:
cycle_sigs_parsed[cycle_sigs_parsed.cs_residual.apply(len) > 0]

Unnamed: 0,cyclesigs,cycle_length_lb,cycle_length_ub,cycle_length_unit,doc,AGE,has_AGE,STAGE,has_STAGE,RANGE,...,DUR_MIN,DUR_MAX,DUR_UNITS,FREQ_MIN,FREQ_MAX,FREQ_UNITS,REP_MIN,REP_MAX,REP_UNITS,cs_residual
193,Monthly cycles,1,1,month,"(Monthly, cycles)",[],False,[],False,[],...,,,,,,,,,,"[Monthly, cycles]"
653,Continued until achievement of CR or best resp...,1,1,week,"(Continued, until, achievement, of, CR, or, be...",[],False,[],False,[],...,,,,,,,,,,"[Continued, until, achievement, of, CR, or, be..."
793,Continued until CR or 60 days,c,60,day,"(Continued, until, CR, or, 60, days)",[],False,[],False,[],...,,,,,,,,,,"[Continued, until, CR, or, 60, days]"
794,Continued until CR,c,NUB,indeterminate,"(Continued, until, CR)",[],False,[],False,[],...,,,,,,,,,,"[Continued, until, CR]"
796,Monthly cycle for 6 cycles,1,1,month,"(Monthly, cycle, for, 6, cycles)",[],False,[],False,[],...,,,,,,,6.0,6.0,cycle,"[Monthly, cycle]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17446,Duration of treatment not specified,1,1,indeterminate,"(Duration, of, treatment, not, specified)",[],False,[],False,[],...,,,,,,,,,,"[Duration, of, treatment, not, specified]"
17482,14-day lead-in,14,14,day,"(14, -, day, lead, -, in)",[],False,[],False,[],...,,,,,,,,,,"[14, -, day, lead, -, in]"
17800,21-day cycles until progression or two cycles ...,21,21,day,"(21, -, day, cycles, until, progression, or, t...",[],False,[],False,[],...,,,,21.0,21.0,day,2.0,2.0,cycle,"[until, progression, or, past, documented, CR]"
17832,14-day cycle for 2 years or until disease prog...,14,14,day,"(14, -, day, cycle, for, 2, years, or, until, ...",[],False,[],False,[],...,,,,14.0,14.0,day,,,,"[2, years, or, until, disease, progression, or..."


In [165]:
cs = cycle_sigs_parsed.apply(make_cycle_sig, axis=1)

In [178]:
# new issue - variant_cui not unique in this file any more - need to resolve the study field to unique pairs?
variant_study[variant_study.variant_cui.duplicated(keep=False)]

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object
32,7+3d,Arlin et al. 1990|AZA-AML-001|CALGB 7421|CALGB...,129508,Variant #05,814,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
33,7+3d,Jin et al. 2013|Arlin et al. 1990|AZA-AML-001|...,129508,Variant #05,814,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
41,7+3d,AML-AZA|AML2003|AML2006|AZA-AML-001|BRIGHT AML...,129512,Variant #09,814,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
42,7+3d,AML-AZA|AML2003|AML2006|AZA-AML-001|CALGB 1020...,129512,Variant #09,814,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
43,7+3d,ECOG E1900|JHOC-J1101|SWOG S1203,129513,Variant #10,814,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
...,...,...,...,...,...,...
17594,TBI,McGovern et al. 1959|SWOG S9704,144873,Variant #01,4280,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
17595,TCH (Docetaxel),RESPECT,144874,Variant #05,9488,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
17962,Abiraterone and Olaparib,PROpel,144875,Variant #02,112451,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
17963,Carboplatin and Paclitaxel (CP),AtTEnd|NRG/GOG-0213,144876,Variant #82,2554,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...


In [179]:
# new issue - handling null condition codes
variant_study[variant_study.study.notna() & ~variant_study.study.isin(study[study.concept_code.isna()].study.unique())].drop_duplicates(subset='variant_cui')

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object
0,(90)YFC,MDACC ID01-233,129495,Variant #01,3071,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
2,"(90)YFC, then allo HSCT",MDACC ID01-233,129496,Variant #01,19726,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
5,131Iodine-Tositumomab monotherapy,CP-97-012|Kaminski et al. 1993|Kaminski et al....,129497,Variant #01,19847,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
8,5+2d,CALGB 7421|JHOC-J1101,129498,Variant #01,1002,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
10,5+2d,CALGB 7421,129499,Variant #02,1002,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
...,...,...,...,...,...,...
10569,TACE monotherapy,LCI-125-009,144898,Variant #02,22070,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
10570,TACE monotherapy,LCI-125-009,144899,Variant #03,22070,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
10571,Topotecan monotherapy,CheckMate 331|Eckardt et al. 2007|GSK 104864/478,144900,Variant #15,11058,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
10572,Vinorelbine monotherapy,HORG CT/03.07,144901,Variant #21,6944,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...


In [189]:
from collections import defaultdict
vs_lookup = defaultdict(list)

for variant_map in variant_study[['variant_cui', 'study']].fillna('').itertuples():
    vs_lookup[variant_map.variant_cui] += variant_map.study.split('|')

In [198]:
# new issue - variant_cui - study id is not unique in this file any more - need to resolve the study field to unique pairs?

dupe_vs_map = [k for k, v in vs_lookup.items() if len(v) != len(set(v))]

In [206]:
variant_study[variant_study.variant_cui == 129495]#.variant_object.iloc[0].__dict__

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object
0,(90)YFC,MDACC ID01-233,129495,Variant #01,3071,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...


In [213]:
with so.Session(engine) as sess:
    sess.add_all(list(variant_study[variant_study.study.notna() & 
                                    ~variant_study.study.isin(study[study.concept_code.isna()].study.unique()) & 
                                    ~variant_study.variant_cui.isin(dupe_vs_map)].drop_duplicates(subset='variant_cui').variant_object))
    sess.add_all(regimen_parts)
    sess.add_all(cs)
    sess.commit()

  sess.commit()


In [214]:
sig_vars[sig_vars.variant_cui==131430]#sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].duplicated()]

regimen_part_sig[regimen_part_sig.regimen=='D-FEC plus Bev']

reg_parts[reg_parts.variant_cui==131430]#sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].duplicated()]


Unnamed: 0,regimen,variant_cui,variant,phase,portion,cyclesigs,timing,timing_unit,regimen_part_id
6642,D-FEC+Bev,131430.0,Variant #01,,D portion,21-day cycle for 3 cycles,Cycles 1 to 3,Cycle,0
6644,D-FEC+Bev,131430.0,Variant #01,,FEC portion,21-day course,Cycle 4,Course,1
6645,D-FEC+Bev,131430.0,Variant #01,,FEC portion,21-day cycle for 3 cycles,Cycles 4 to 6,Cycle,2


In [215]:
sig_vars_components = sig_vars.merge(components_with_class[['concept_name', 'concept_code']], left_on='component', right_on='concept_name', how='left').drop_duplicates().merge(reg_parts[['variant_cui', 'phase', 'portion', 'cyclesigs', 'regimen_part_id']])
sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'inParens', 'class', 'tail']][sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'class', 'tail']].duplicated()]

Unnamed: 0,variant_cui,regimen_part_id,component,component_role,timing,step_number,inParens,class,tail
6854,136544.0,0,Cyclosporine,primary systemic,,1 of 1,,Non-canonical Sig,adjusted to maintain therapeutic cyclosporine ...
6949,131390.0,0,Cytarabine,primary systemic,,1 of 1,,IV intermittent canonical Sig,-
12406,136926.0,0,Hydroxyurea,primary systemic,,1 of 1,,Non-canonical Sig,-
12407,136926.0,0,Hydroxyurea,primary systemic,,1 of 1,,Non-canonical Sig,-
12408,136926.0,0,Hydroxyurea,primary systemic,,1 of 1,,Non-canonical Sig,-
12409,136926.0,0,Hydroxyurea,primary systemic,,1 of 1,,Non-canonical Sig,-
12410,136926.0,0,Hydroxyurea,primary systemic,,1 of 1,,Non-canonical Sig,-
14006,132911.0,0,Lenalidomide,primary systemic,,1 of 1,,Non-IV canonical Sig,-
14007,132911.0,0,Lenalidomide,primary systemic,,1 of 1,,Non-IV canonical Sig,-
14710,144895.0,0,Methotrexate,primary systemic,,1 of 1,,Non-IV canonical Sig,-


In [216]:
sig_vars_components['sig_id'] = sig_vars_components.groupby(['regimen', 'variant_cui', 'regimen_part_id']).cumcount()

In [217]:
sig_vars_components = sig_vars_components.sort_values(['variant_cui', 'regimen_part_id', 'concept_name', 'step_number'])

In [218]:
# if days are empty, but this component and regimen part are identical to the previous row when sorted in this fashion, filling these day details forward

sig_vars_components.loc[sig_vars_components.allDays.isna() & (sig_vars_components.concept_name.shift(1) ==  sig_vars_components.concept_name), 'allDays'] = sig_vars_components.allDays.shift(1)

In [219]:
sig_objects = sig_vars_components.apply(make_sig, axis=1)
sd = sig_vars_components.apply(make_sig_days, axis=1)

In [220]:
sig_vars_components.component.apply(len).max()

40

In [221]:
with so.Session(engine) as sess:
    sess.add_all(sig_objects)
    for s in sd:
        sess.add_all(s)
    sess.commit()

In [222]:
sig_vars_components[['component', 'doseMinNum', 'doseMaxNum', 'doseUnit', 'doseCapNum', 'doseCapUnit', 'divided', 'durationMinNum', 'durationMaxNum', 'durationUnit', 'frequency', 'inParens', 'sequence', 'seq.rel', 'seq.rel.what']]['seq.rel.what'].value_counts()

seq.rel.what
Elotuzumab           12
Ifosfamide            6
Fludarabine           5
Leucovorin            5
Irinotecan            3
Rituximab             3
Melphalan             2
Radiation therapy     2
HIPEC                 2
Etoposide             1
Teniposide            1
chemotherapy          1
Methotrexate          1
Radiotherapy          1
Mesna                 1
Levoleucovorin        1
Cyclophosphamide      1
Goserelin             1
Name: count, dtype: int64

In [223]:
regimen[regimen.regimen_cui.duplicated()].head()

Unnamed: 0_level_0,regimen,regimen_cui
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1
145769,"Busulfan and Fludarabine, then allo HSCT",1584
133438,CYVE,1105
133514,"Arsenic trioxide, then ATRA and Daunorubicin",2391
126628,FC,2913
133621,"Fludarabine, Busulfan, ATG, Ibritumomab tiuxetan",3015


In [224]:
### Issues

# Need a n:m context <-> status mapper

# single record with >1 setting - is this a variant?
context[context.setting.map(lambda x: get_enum(Setting, x)).isna()].setting.value_counts()

# list of regimens that have no associated variants

vv = variant.merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='outer')
vv[vv.variant.isna()].regimen.unique()

# and vice-versa
vv[vv.regimen_cui.isna()]


# some potential duplication across variant/sig rows
dup = sig_vars[sig_vars[['study',  'regimen', 'component', 'variant_cui', 'branch', 'timing', 'step_number', 'portion', 'class']].duplicated(keep=False)]

for regimen, reg_dets in dup.groupby('regimen'):
    print(regimen)
    for lab, d in reg_dets.to_dict().items():
        if len(set(d.values())) != 1:
            print('\t', lab, d)

# should these variant definitions have the same cyclesig?

distinct_sigs = sig_vars[~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False)][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch']].drop_duplicates()
distinct_sigs[distinct_sigs[['regimen', 'variant', 'phase', 'portion', 'timing', 'branch']].duplicated(keep=False)][['regimen', 'variant']].drop_duplicates()

sig_vars[(sig_vars.regimen=='TACE, then 5-FU') & (sig_vars.variant=='Variant #01')][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'component']]


# components in the variant file that aren't in the drug class - need to revisit to pull in procedures
sig_vars_components[sig_vars_components.concept_code.isna()]

BCG vaccine monotherapy
	 portion {1164: '-', 1165: '-', 1166: 'induction', 1167: 'induction', 1168: '-', 1169: '-'}
	 timing_sequence {1164: '1,8,15', 1165: '1,8,15', 1166: '1,8,15,22,29,36', 1167: '1,8,15,22,29,36', 1168: '1,8,15,22,29,36', 1169: '1,8,15,22,29,36'}
	 timing {1164: 'Days 1, 8, 15', 1165: 'Days 1, 8, 15', 1166: 'Days 1, 8, 15, 22, 29, 36', 1167: 'Days 1, 8, 15, 22, 29, 36', 1168: 'Days 1, 8, 15, 22, 29, 36', 1169: 'Days 1, 8, 15, 22, 29, 36'}
	 cyclesigs {1164: '6-week course, then proceed to maintenance therapy', 1165: '6-week course, then proceed to maintenance therapy', 1166: '6-week course, then proceed to maintenance therapy', 1167: '6-week course, then proceed to maintenance therapy', 1168: '6-week course', 1169: '6-week course'}
	 doseMinNum {1164: '0.5', 1165: '50', 1166: '0.5', 1167: '50', 1168: '0.5', 1169: '50'}
	 doseMaxNum {1164: '0.5', 1165: '50', 1166: '0.5', 1167: '50', 1168: '0.5', 1169: '50'}
	 doseUnit_cui {1164: nan, 1165: nan, 1166: nan, 1167: nan,

Unnamed: 0,study,regimen,regimen_cui,phase,portion,component,component_cui,component_role,cycle_length_lb,cycle_length_ub,...,seq.rel.what,tail,variant,variant_cui,temp,date_added,concept_name,concept_code,regimen_part_id,sig_id
0,MDACC ID01-233,(90)YFC,3071,,-,Cyclophosphamide,122,primary systemic,1,1,...,,-,Variant #01,129495.0,,2023-09-19,,,0,0
1,MDACC ID01-233,(90)YFC,3071,,-,Fludarabine,224,primary systemic,1,1,...,,-,Variant #01,129495.0,,2023-09-19,,,0,1
4,MDACC ID01-233,(90)YFC,3071,,-,Rituximab,446,primary systemic,1,1,...,,-,Variant #01,129495.0,,2023-09-19,,,0,4
2,MDACC ID01-233,(90)YFC,3071,,-,Ibritumomab tiuxetan,262,primary systemic,1,1,...,,-,Variant #01,129495.0,,2023-09-19,,,0,2
3,MDACC ID01-233,(90)YFC,3071,,-,Ibritumomab tiuxetan,262,primary systemic,1,1,...,,-,Variant #01,129495.0,,2023-09-19,,,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,LCI-125-009,TACE monotherapy,22070,Adjuvant,-,Doxorubicin,170,locoregional,1,1,...,,mg/m/li>,Variant #03,144899.0,,2023-09-19,,,0,0
18080,LCI-125-009,TACE monotherapy,22070,Adjuvant,-,TACE,22036,locoregional,1,1,...,,-,Variant #03,144899.0,,2023-09-19,,,0,1
18592,GFPC 01-2013,Topotecan monotherapy,11058,,-,Topotecan,503,primary systemic,21,21,...,,-,Variant #15,144900.0,,2024-10-13,,,0,0
19206,HORG CT/03.07,Vinorelbine monotherapy,6944,,-,Vinorelbine,540,primary systemic,21,21,...,,-,Variant #21,144901.0,,2024-10-13,,,0,0


In [225]:

distinct_sigs = sig_vars[~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False)][['regimen', 'variant', 'portion', 'cyclesigs', 'timing', 'branch']].drop_duplicates()
distinct_sigs[distinct_sigs[['regimen', 'variant', 'portion', 'timing', 'branch']].duplicated(keep=False)][['regimen', 'variant']].drop_duplicates()


Unnamed: 0,regimen,variant
2384,CapeOx and Pembrolizumab,Variant #01
3593,Chlorambucil and Prednisolone,Variant #01
3597,Chlorambucil and Prednisolone,Variant #02
5521,"Cobimetinib, Vemurafenib, Atezolizumab",Variant #01
8757,ECX,Variant #04
9648,FEC,Variant #34
10875,Fotemustine monotherapy,Variant #02
12194,Ipilimumab and Sargramostim,Variant #01
12669,"L-Asparaginase, Vincristine, Dexamethasone",Variant #01
12899,Lenalidomide monotherapy,Variant #17


In [226]:
ref[ref.concept_code.isna()].condition.value_counts()

condition
Allogeneic HSCT                           74
Autologous HSCT                           70
Stem cell mobilization regimens           16
Cellular therapy conditioning regimens     7
Name: count, dtype: int64