In [1]:
# if running for first time, you may need to install spacy model
# > python -m spacy download en_core_web_sm
import dotenv, pyreadr, os
dotenv.load_dotenv()

from pathlib import Path
from datetime import datetime
import pandas as pd
import sqlalchemy as sa
import sqlalchemy.orm as so

from hemonc_alchemy.matchers.spacy_config import matchers, nlp, rules, match_entities, get_modifier_child, get_nouns  
from hemonc_alchemy.model.hemonc_model import Hemonc_Study, Hemonc_Condition, Hemonc_Component, Hemonc_Modality, Hemonc_Component_Class, Hemonc_Regimen, Hemonc_Variant, Hemonc_Regimen_Part, Part_Phase, Hemonc_Cycle_Sig, Hemonc_Sig, Sig_Days, Base, component_to_class_map, Hemonc_Ref, Hemonc_Component_Role, Hemonc_Context, Hemonc_Branch_Conditional
from hemonc_alchemy.model.hemonc_enums import Intent, Setting, Phase, Risk, Phenotype, PriorTherapy, Phase, StudyDesign, SponsorType, BranchConditionalType, ComponentRole

In [2]:
# load data objects

data_path = Path('..') / 'data'

# download the updated hemonc tables folder from dropbox and unzip it here
TABLE_PATH = data_path / 'Tables'

# download the required athena vocab files and place them here
CDM_PATH = data_path / 'OHDSI_VOCABS' 

In [3]:
import omop_alchemy as oa

engine = oa.oa_config.engine
oa.Base.metadata.create_all(engine)

In [4]:
concept_columns = ['concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code','valid_start_date', 'valid_end_date', 'invalid_reason']
concept_relationship_columns = ['concept_code_1', 'concept_code_2', 'vocabulary_id_1', 'vocabulary_id_2', 'relationship_id', 'valid_start_date', 'valid_end_date', 'invalid_reason']

sigs = pd.read_csv(TABLE_PATH / 'sigs.csv', low_memory=False)
omop = pyreadr.read_r(TABLE_PATH / 'omop.RData')
concept_stage = omop['concept_stage'][concept_columns].copy()
concept_relationship_stage = omop['concept_relationship_stage'][concept_relationship_columns].copy()
concept_synonym_stage = omop['concept_synonym_stage'].copy()

# using RData files where possible, to avoid versioning issues
context = pyreadr.read_r(TABLE_PATH / 'context.table.RData')['context.table']
page = pyreadr.read_r(TABLE_PATH / 'page.table.RData')['page.table']
# Query: pointer, refs, studies table change name?
pointer = pyreadr.read_r(TABLE_PATH / 'pointers.RData')['pointers']
sequence = pyreadr.read_r(TABLE_PATH / 'sequence.table.RData')['sequence.table']


In [5]:
ref = pyreadr.read_r(TABLE_PATH / 'refs.RData')['refs']
study = pyreadr.read_r(TABLE_PATH / 'studies.RData')['studies']

In [6]:
# author = pyreadr.read_r(TABLE_PATH / 'author.table.RData')['author.table']
# person = pyreadr.read_r(TABLE_PATH / 'person.table.RData')['person.table']
# variant = pyreadr.read_r(TABLE_PATH / 'variant.table.RData')['variant.table']

In [7]:
# following files had non-utf characters, requiring manual R-based conversion to csv because this is not handled with pyreadr at this time
author = pd.read_csv(TABLE_PATH / 'authors.csv', low_memory=False)
person = pd.read_csv(TABLE_PATH / 'persons.csv', encoding="ISO-8859-1")
variant = pd.read_csv(TABLE_PATH / 'variants.csv', encoding="ISO-8859-1")

regimen = pointer[pointer.regimen_cui!='NOT YET ASSIGNED'][['regimen', 'regimen_cui']].drop_duplicates().copy()

In [8]:
# load minimal athena download files for cross-reference including concept IDs where available
concepts_with_ids = pd.read_csv(CDM_PATH / 'CONCEPT.csv', delimiter='\t', low_memory=False)
concept_relationships = pd.read_csv(CDM_PATH / 'CONCEPT_RELATIONSHIP.csv', delimiter='\t', low_memory=False)
concept_ancestors = pd.read_csv(CDM_PATH / 'CONCEPT_ANCESTOR.csv', delimiter='\t', low_memory=False)
vocabularies = pd.read_csv(CDM_PATH / 'VOCABULARY.csv', delimiter='\t', low_memory=False)
concept_class = pd.read_csv(CDM_PATH / 'CONCEPT_CLASS.csv', delimiter='\t', low_memory=False)
domain = pd.read_csv(CDM_PATH / 'DOMAIN.csv', delimiter='\t', low_memory=False)
relationship = pd.read_csv(CDM_PATH / 'RELATIONSHIP.csv', delimiter='\t', low_memory=False)

In [9]:
concepts_with_ids.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,21117371,10 ML Chloramphenicol 5 MG/ML Ophthalmic Solut...,Drug,RxNorm Extension,Marketed Product,S,OMOP310177,24-AUG-2017,31-DEC-2099,
1,36409728,1 ML Medroxyprogesterone 160 MG/ML Injectable ...,Drug,RxNorm Extension,Marketed Product,S,OMOP3101770,02-AUG-2017,31-DEC-2099,
2,36409730,1 ML pine bark extract 0.01 MG/ML Injectable S...,Drug,RxNorm Extension,Marketed Product,S,OMOP3101772,02-AUG-2017,31-DEC-2099,
3,36409731,1 ML pine bark extract 0.1 MG/ML Injectable So...,Drug,RxNorm Extension,Marketed Product,S,OMOP3101773,02-AUG-2017,31-DEC-2099,
4,36409732,1 ML pine bark extract 0.001 MG/ML Injectable ...,Drug,RxNorm Extension,Marketed Product,S,OMOP3101774,02-AUG-2017,31-DEC-2099,


In [285]:
# standard maps outside of souce domain? also, athena has at least some 'maps to' not in HemOnc source e.g. 9590/3-C72.9 Malignant lymphoma, NOS, of nervous system, NOS for HemOnc CNS lymphoma (576)
concept_relationship_stage[(concept_relationship_stage.concept_code_1 == '576') & (concept_relationship_stage.relationship_id=='Maps to')]

Unnamed: 0_level_0,concept_code_1,concept_code_2,vocabulary_id_1,vocabulary_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason,vocabulary_id
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1261004,576,33042,HemOnc,SEER Site Recode,Maps to,2020-12-07,2099-12-31,,SEER Site Recode
511041,576,C9301,HemOnc,NCIT,Maps to,2019-05-27,2099-12-31,,NCIT
1110174,576,C02.4,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
361223,576,C09.8,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
611040,576,C09.9,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
93842,576,C11.1,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
1181029,576,C14.2,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
288120,576,C37.9,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
382120,576,C42.0,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3
406119,576,C42.1,HemOnc,ICD-O-3,Maps to,2024-03-17,2099-12-31,,ICDO3


In [10]:
# at this stage, these are the classes that exist in HemOnc but aren't pulled into OMOP

non_omop_class = ['Author', 'Reference', 'Study', 'PubMedURL', 'ReferenceDOI', 'Clinical trial ID', 'Regimen Variant', 
                  'ReferenceURL', 'PubMedCentralURL', 'City', 'Cycle Sigs', 'Regimen Stub', 'Study Group', 'Journal', 
                  'Endpoint', 'Duration', 'Year', 'Numeric', 'Endpoint Type', 'Experimental design', 'Study Class']

cc = concept_stage[~concept_stage.concept_class_id.isin(non_omop_class)].merge(concepts_with_ids, on='concept_code', how='left')

In [11]:
# how many of the HemOnc concepts that are in OMOP classes do not currently have an assigned concept_id?

len(cc[cc.concept_id.isna()])/len(cc)

0.10531144282410504

In [12]:
# current hemonc concepts from which classes are missing IDs from Athena?

cc[cc.concept_id.isna()].concept_class_id_x.value_counts()

concept_class_id_x
PMID                 4819
PMCID                 667
Synthetic Regimen      13
Regimen                 5
Brand Name              2
Component               2
Regimen Class           1
Unit                    1
Name: count, dtype: int64

In [13]:
# trying to clean up NDC concept codes but it doesn't really work

concept_relationship_stage.concept_code_2 = (
    concept_relationship_stage.apply(
        lambda x: x.concept_code_2.replace('-', '') if x.vocabulary_id_2 == "NDC" else x.concept_code_2, 
        axis=1
        )
)

cr = (
    concept_relationship_stage[concept_relationship_stage.vocabulary_id_2 != 'HemOnc']
    .merge(concepts_with_ids, 
           left_on=['concept_code_2', 'vocabulary_id_2'], 
           right_on=['concept_code', 'vocabulary_id'], 
           how='left')
)

In [14]:
# which non-hemonc vocabs do we need to have available for allowing this to actually extend the CDM?
concept_relationship_stage.vocabulary_id_2.value_counts()

vocabulary_id_2
HemOnc                445819
RxNorm                 19977
NDC                    14588
ICD-O-3                 1656
ICD-10-CM               1446
ICD-9-CM                1036
NCIT                     808
ATC                      660
HCPCS                    530
RxNorm Extension         505
SNOMED                   293
OncoTree                 231
SEER Site Recode         207
ICD-O-3 morphology       168
Name: count, dtype: int64

In [15]:
concept_relationship_stage[concept_relationship_stage.vocabulary_id_2=='NDC']

Unnamed: 0_level_0,concept_code_1,concept_code_2,vocabulary_id_1,vocabulary_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1100235,4,00024483,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
285916,4,00024815,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
310209,4,00025337,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
410201,4,00026216,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
510173,6,00931125,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
...,...,...,...,...,...,...,...,...
1296910,93818,50242103,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1297010,93818,50242105,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396710,94609,25682022,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,
1396810,94609,25682025,HemOnc,NDC,Maps to,2022-02-05,2099-12-31,


In [16]:
concepts_with_ids.vocabulary_id.value_counts()

vocabulary_id
RxNorm Extension    2146945
NDC                 1254857
SNOMED              1089088
SPL                  808966
Nebraska Lexicon     465801
                     ...   
Ethnicity                 2
Metadata                  2
Language                  1
Specimen Type             1
Supplier                  1
Name: count, Length: 127, dtype: int64

In [17]:
# not sure how much this matters, but there seems to be some issues with hemonc mappings to NDC - codes are a bit of a mish-mash with dropped hyphens and leading 0s removed
cr[cr.concept_id.isna()].vocabulary_id_2.value_counts()

vocabulary_id_2
NDC                   10615
ICD-O-3                1656
ICD-10-CM              1446
ICD-9-CM               1036
NCIT                    808
SEER Site Recode        207
ICD-O-3 morphology      168
HCPCS                    31
RxNorm                   29
ATC                      21
SNOMED                    3
Name: count, dtype: int64

In [18]:
code_mods = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name']
    ], 
    on=['concept_code', 'vocabulary_id'],
    how='left'
)

In [19]:
# there are some hemonc components that have changed concept_code? this will not be backwards compatible?
code_mods[(code_mods.concept_name_x != code_mods.concept_name_y) & ~code_mods.concept_class_id.isin(non_omop_class) & code_mods.concept_name_y.notna()].to_csv('reused_codes.csv', index=False)

In [20]:
vocab_id_lookup = {
    'ICD-10-CM': 'ICD10CM',
    'ICD-9-CM': 'ICD9CM',
    'ICD-O-3': 'ICDO3'
}

concept_relationship_stage['vocabulary_id'] = concept_relationship_stage.vocabulary_id_2.map(lambda x: vocab_id_lookup[x] if x in vocab_id_lookup else x)

In [21]:
# get all the OMOP concept IDs that exist for the current OMOP-ready hemonc codes

concept_stage_id = concept_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id', 'concept_name'] # note that merging on concept_name shouldn't be required here, but as per above with code mods, some of the concept codes in hemonc have changed over time
    ], 
    how='left'
)

# also for the hemonc concepts in the relationship file

concept_relationship_stage_id = concept_relationship_stage.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_1', 
                 'concept_id': 'concept_id_1', 
                 'vocabulary_id': 'vocabulary_id_1'}
        ), 
    how='left'
)

# and then the related concepts in the relationship file

concept_relationship_stage_id = concept_relationship_stage_id.merge(
    concepts_with_ids[
        ['concept_code', 'concept_id', 'vocabulary_id']
    ].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_id': 'concept_id_2'}
        ), 
    how='left'
)

In [22]:
# we need to grab all the concepts specifically for vocab, relationships, domains and concept classes so we can insert those as bulk initial load and avoid referential integrity issues

vocab_concepts = vocabularies.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'vocabulary_id']], 
    left_on='vocabulary_concept_id', 
    right_on='concept_id'
)

rel_concepts = relationship.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'relationship_id']], 
    left_on='relationship_concept_id', 
    right_on='concept_id'
)

domain_concepts = domain.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'domain_id']], 
    left_on='domain_concept_id', 
    right_on='concept_id'
)

concept_class_concepts = concept_class.merge(
    concepts_with_ids[[c for c in concepts_with_ids.columns if c != 'concept_class_id']], 
    left_on='concept_class_concept_id', 
    right_on='concept_id'
)

In [23]:
# this is where we are building in the compatibility with the OMOP Alchemy classes

from omop_alchemy.model.vocabulary import Concept, Vocabulary, Domain, Concept_Class, Relationship, Concept_Relationship, Concept_Ancestor

def parse_date(date_str: str):
    formats = ["%Y%m%d", "%d-%b-%Y"]  # note: %b is locale-dependent (JAN, FEB, etc.)
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"Date '{date_str}' is not in an accepted format: {formats}")


def make_concept(concept_row):
    return Concept(concept_id = int(concept_row.concept_id),
                   concept_name = concept_row.concept_name if not pd.isna(concept_row.concept_name) else "",
                   domain_id = concept_row.domain_id,
                   vocabulary_id = concept_row.vocabulary_id,
                   concept_class_id = concept_row.concept_class_id,
                   standard_concept = concept_row.standard_concept if not pd.isna(concept_row.standard_concept) else None,
                   concept_code = concept_row.concept_code,
                   valid_start_date = parse_date(str(concept_row.valid_start_date)),#datetime.strptime(str(concept_row.valid_start_date), '%Y%m%d'), 
                   valid_end_date = parse_date(str(concept_row.valid_end_date)), #datetime.strptime(str(concept_row.valid_end_date), '%Y%m%d'), 
                   invalid_reason = concept_row.invalid_reason if not pd.isna(concept_row.invalid_reason) else None)

def make_vocab(vocab_row):
    v = Vocabulary(vocabulary_id = vocab_row.vocabulary_id,
                   vocabulary_name = vocab_row.vocabulary_name,
                   vocabulary_reference = vocab_row.vocabulary_reference,
                   vocabulary_concept_id = vocab_row.vocabulary_concept_id, 
                   vocabulary_version = vocab_row.vocabulary_version)
    v.vocabulary_concept = vocab_row.concept_object
    return v
    
def make_domain(domain_row):
    d = Domain(domain_id = domain_row.domain_id,
               domain_name = domain_row.domain_name,
               domain_concept_id = domain_row.domain_concept_id)
    d.domain_concept = domain_row.concept_object
    return d

def make_concept_class(cc_row):
    c = Concept_Class(concept_class_id = cc_row.concept_class_id,
                      concept_class_name = cc_row.concept_class_name,
                      concept_class_concept_id = cc_row.concept_class_concept_id)
    c.concept_class_concept = cc_row.concept_object
    return c


def make_rel(r_row):
    r = Relationship(relationship_id = r_row.relationship_id,
                      relationship_name = r_row.relationship_name,
                      is_hierarchical = r_row.is_hierarchical,
                      defines_ancestry = r_row.defines_ancestry,
                      reverse_relationship_id = r_row.reverse_relationship_id,
                      relationship_concept_id = r_row.relationship_concept_id)
    r.relationship_concept = r_row.concept_object
    return r

In [24]:
# preserve referential integrity by adding all the concept class, vocabulary and domain concepts in one go
meta_concepts = pd.concat([concept_class_concepts.concept_id, domain_concepts.concept_id, vocab_concepts.concept_id, rel_concepts.concept_id])
meta_concepts_df = concepts_with_ids.merge(meta_concepts)
meta_concepts_df['concept_object'] = meta_concepts_df.apply(make_concept, axis=1)


concept_class_concepts = concept_class_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='concept_class_concept_id', right_on='concept_id')
domain_concepts = domain_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='domain_concept_id', right_on='concept_id')
vocab_concepts = vocab_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='vocabulary_concept_id', right_on='concept_id')
rel_concepts = rel_concepts.merge(meta_concepts_df[['concept_id', 'concept_object']], left_on='relationship_concept_id', right_on='concept_id')

In [25]:
# full set of hemonc relationships - some obviously do not need to be pulled in - TBC how to cut down

concept_relationship_stage.relationship_id.value_counts()

relationship_id
Has middle author       121441
Is a                     67250
Was studied in           38934
Maps to                  22455
Has modality             12664
                         ...  
Was MFDS approved yr         5
Has MFDS indication          2
Has restorative tx           2
Has pept-drug cjgt           1
Has PDC Rx                   1
Name: count, Length: 94, dtype: int64

In [26]:
# if you're pulling in only hemonc concepts that have a valid domain_id, then you can retain only relationships that have concepts in allocated domains on both sides
# this is kind of circular logic, but an ok place to start

concept_stage_id.domain_id.value_counts()

domain_id
               108454
regimen          8462
drug             6447
measurement       872
condition         297
procedure         120
Name: count, dtype: int64

In [27]:
domain_lookup_by_code = {k:v for k, v in zip(concept_stage_id.concept_code, concept_stage_id.domain_id)}
domain_lookup_by_id = {k:v for k, v in zip(concepts_with_ids.concept_id, concepts_with_ids.domain_id)}

In [28]:
# try find all known domain_id values either from hemonc source or existing OMOP 

concept_rels_with_domains = concept_relationship_stage_id[concept_relationship_stage_id.invalid_reason != 'D'].copy()
concept_rels_with_domains['domain_id_1'] = concept_rels_with_domains.concept_code_1.map(domain_lookup_by_code)
concept_rels_with_domains['domain_id_2'] = concept_rels_with_domains.concept_code_2.map(domain_lookup_by_code)
concept_rels_with_domains.loc[concept_rels_with_domains['domain_id_1'].isna(), 'domain_id_1'] = concept_rels_with_domains.concept_id_1.map(domain_lookup_by_id)
concept_rels_with_domains.loc[concept_rels_with_domains['domain_id_2'].isna(), 'domain_id_2'] = concept_rels_with_domains.concept_id_2.map(domain_lookup_by_id)

In [29]:
# that would imply that these are the relationship types that you need to exist before import
concept_rels_with_domains[
    (concept_rels_with_domains.domain_id_1!='') & 
    (concept_rels_with_domains.domain_id_2!='') 
    ].relationship_id.value_counts()

relationship_id
Is a                    64669
Maps to                 21816
Has cytotoxic chemo     10761
Has context              7638
Is current in adult      6981
                        ...  
Was MFDS approved yr        5
Has MFDS indication         2
Has restorative tx          2
Has pept-drug cjgt          1
Has PDC Rx                  1
Name: count, Length: 69, dtype: int64

In [30]:
target_rels = concept_rels_with_domains[(concept_rels_with_domains.domain_id_1!='') & (concept_rels_with_domains.domain_id_2!='') ].relationship_id.value_counts().reset_index()
target_rels = target_rels.merge(relationship, how='left')

In [31]:
# means these may be the new relationship types being sought - confirmation required with respect to the 'year' concept class?
target_rels[target_rels.relationship_name.isna()]

Unnamed: 0,relationship_id,count,relationship_name,is_hierarchical,defines_ancestry,reverse_relationship_id,relationship_concept_id
4,Is current in adult,6981,,,,,
8,Has been compared to,4428,,,,,
10,Has synthetic regimen,3107,,,,,
14,Can be preceded by,1116,,,,,
16,Can be followed by,1015,,,,,
19,Is historical in adult,717,,,,,
21,Was FDA approved yr,584,,,,,
24,Has major class,449,,,,,
26,Has minor class,388,,,,,
28,Was EMA approved yr,313,,,,,


In [32]:
# query why one of the vocabularies has a null ID from athena?

vocab_concepts[vocab_concepts.vocabulary_id.isna()]

Unnamed: 0,vocabulary_id,vocabulary_name,vocabulary_reference,vocabulary_version,vocabulary_concept_id,concept_id_x,concept_name,domain_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id_y,concept_object
116,,OMOP Standardized Vocabularies,OMOP generated,v5.0 30-AUG-24,44819096,44819096,OMOP Standardized Vocabularies,Metadata,Vocabulary,,OMOP generated,01-JAN-1970,31-DEC-2099,,44819096,<Concept 44819096 - OMOP generated (OMOP Stand...


In [33]:
vocab_concepts.vocabulary_id = vocab_concepts.vocabulary_id.fillna('empty') 

In [34]:
def make_ancestor(a_row):
    return Concept_Ancestor(ancestor_concept_id = int(a_row.ancestor_concept_id),
                            descendant_concept_id = int(a_row.descendant_concept_id))
                            #min_levels_of_separation = a_row.min_levels_of_separation,
                            #max_levels_of_separation = a_row.max_levels_of_separation)
                    

In [35]:
# create metadata concepts & make objects for other reference tables
concept_class_concepts['ob'] = concept_class_concepts.apply(make_concept_class, axis=1)
domain_concepts['ob'] = domain_concepts.apply(make_domain, axis=1)
vocab_concepts['ob'] = vocab_concepts.apply(make_vocab, axis=1)
rel_concepts['ob'] = rel_concepts.apply(make_rel, axis=1)

all_ob = pd.concat([concept_class_concepts[['concept_id_x', 'ob']], 
                    domain_concepts[['concept_id_x', 'ob']], 
                    vocab_concepts[['concept_id_x', 'ob']], 
                    rel_concepts[['concept_id_x', 'ob']]])

In [36]:

with so.Session(engine) as sess:
    sess.add_all(list(concept_class_concepts.ob))
    sess.add_all(list(domain_concepts.ob))
    sess.add_all(list(vocab_concepts.ob))
    sess.add_all(list(rel_concepts.ob))
    sess.add_all(list(meta_concepts_df.concept_object))
    sess.commit()

In [37]:
with so.Session(engine) as sess:
    existing_vocabs = pd.DataFrame(sess.query(Concept.vocabulary_id).distinct().all())
    existing_concepts = pd.DataFrame(sess.query(Concept.concept_id).distinct().all())

existing_vocabs.head()

Unnamed: 0,vocabulary_id
0,Concept Class
1,Domain
2,Vocabulary
3,Relationship


In [38]:
concept_stage_id[~concept_stage_id.concept_class_id.isin(non_omop_class) & 
                 ~concept_stage_id.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship'])  
                 &~concept_stage_id.concept_id.isin(existing_concepts.concept_id.unique())
                 ].concept_class_id.value_counts()

concept_class_id
PMID                 9779
Brand Name           5006
Synthetic Regimen    3169
Regimen              2792
PMCID                2732
Component             860
Regimen Class         824
Component Class       567
Condition             297
Procedure              64
Context                56
Unit                   50
Modality               23
Frequency              18
Route                  14
Component Role          3
Null                    1
Name: count, dtype: int64

In [39]:
concepts_to_add = concept_relationship_stage_id[
    ['concept_code_2', 'vocabulary_id']
    ].drop_duplicates().merge(
        concepts_with_ids, 
        left_on=['concept_code_2', 'vocabulary_id'], 
        right_on=['concept_code', 'vocabulary_id']
    )

In [40]:
len(concepts_to_add[~concepts_to_add.concept_id.isin(existing_concepts.concept_id.unique())])

20127

In [41]:
# add in concepts both from within the HemOnc universe, as well as all available cross-mappings
ho_conc = concepts_with_ids[concepts_with_ids.vocabulary_id=='HemOnc'].apply(make_concept, axis=1)
non_ho_conc = concepts_to_add[~concepts_to_add.vocabulary_id.isin(['Concept Class', 'Domain', 'Vocabulary', 'Relationship', 'HemOnc'])].apply(make_concept, axis=1)

In [42]:
with so.Session(engine) as sess:
    sess.add_all(ho_conc)
    sess.add_all(non_ho_conc)
    sess.commit()

In [43]:
# target_vocabularies = ['Cancer Modifier', 'OMOP Extension', 'NAACCR', 'Gender', 'Episode Type', 
#                        'Visit', 'Condition Type', 'Procedure Type', 'Type Concept', 'Observation Type',
#                        'Visit Type', 'Drug Type', 'Death Type', 'Meas Type', 'Note Type',
#                        'Cost Type', 'Obs Period Type', 'Episode Type', 'Device Type', 'Domain', 
#                        'Ethnicity', 'Language', 'Relationship', 'Race', 'CDM', 'Concept Class',
#                        'RxNorm', 'ICD10CM', 'ICD03', 'Cancer Modifier', 'ICDO3', 'SNOMED']

# len(concepts_with_ids[concepts_with_ids.vocabulary_id.isin(target_vocabularies)])#.vocabulary_id.value_counts()[:50]

In [44]:
# uncomment this section only if you want to pull in extended vocab, not just those pointed to by hemonc relationships

# use this one if you want completeness for vocabs that are involved in hemonc relationships, even the concepts that aren't pointed 
# to directly by those relationships
# target_vocabularies = concept_relationship_stage.vocabulary_id.unique()

# use this one if you want useful oncology-extension relevant vocabs
target_vocabularies = ['Cancer Modifier', 'OMOP Extension', 'NAACCR', 'Gender', 'Episode Type', 
                       'Visit', 'Condition Type', 'Procedure Type', 'Type Concept', 'Observation Type',
                       'Visit Type', 'Drug Type', 'Death Type', 'Meas Type', 'Note Type',
                       'Cost Type', 'Obs Period Type', 'Episode Type', 'Device Type', 'Domain', 
                       'Ethnicity', 'Language', 'Relationship', 'Race', 'CDM', 'Concept Class',
                       'RxNorm', 'ICD10CM', 'ICD03', 'Cancer Modifier']

# you can handle a larger chunksize if using non-sqlite backend - 50k is large but reasonable on a local device if sqlite

chunksize = 50000

for vocab in target_vocabularies:
    v = concepts_with_ids[concepts_with_ids.vocabulary_id==vocab]
    if len(v) > 0:
        with so.Session(engine) as sess:
            existing = pd.DataFrame(sess.query(Concept.concept_id).filter(Concept.vocabulary_id==vocab))
            if len(existing) > 0:
                v = v.merge(existing, how="left", indicator=True)
                v = v[v._merge=='left_only']
            for i in range(0, len(v), chunksize):
                to_add_objects = v[i:i+chunksize].apply(make_concept, axis=1)
                sess.add_all(to_add_objects)
                sess.commit()
    print(vocab, len(v), len(existing), len(v))


Cancer Modifier 6043 0 6043
OMOP Extension 1459 0 1459
NAACCR 34473 0 34473
Gender 5 0 5
Episode Type 5 0 5
Visit 20 0 20
Condition Type 118 0 118
Procedure Type 97 0 97
Type Concept 80 0 80
Observation Type 29 0 29
Visit Type 18 0 18
Drug Type 16 0 16
Death Type 14 0 14
Meas Type 12 0 12
Note Type 10 0 10
Cost Type 8 0 8
Obs Period Type 6 0 6
Episode Type 0 5 0
Device Type 4 0 4
Domain 15 50 15
Ethnicity 2 0 2
Language 1 0 1
Relationship 22 722 22
Race 53 0 53
CDM 1061 0 1061
Concept Class 0 433 0
RxNorm 310733 599 310733
ICD10CM 98412 1009 98412
ICD03 0 1009 0
Cancer Modifier 0 6043 0


In [45]:
with so.Session(engine) as sess:
    existing_vocabs = pd.DataFrame(sess.query(Concept.vocabulary_id).distinct().all())
    existing_concepts = pd.DataFrame(sess.query(Concept.concept_id).distinct().all())

existing_vocabs.head()

Unnamed: 0,vocabulary_id
0,Concept Class
1,Domain
2,Vocabulary
3,Relationship
4,HemOnc


In [46]:
def make_relationship(rel_row):
    return Concept_Relationship(concept_id_1 = rel_row.concept_id_1,
                         concept_id_2 = rel_row.concept_id_2,
                         relationship_id = rel_row.relationship_id)

# we cannot set relationships for any new concepts that do not have concept_id assigned, so note that these will be missing

ho_rels = concept_relationship_stage_id[
    concept_relationship_stage_id.concept_id_1.notna() &
    concept_relationship_stage_id.concept_id_2.notna()
    ].drop_duplicates(
        subset=['concept_id_1', 'concept_id_2', 'relationship_id']
        ).apply(make_relationship, axis=1)

In [47]:
ancestors = concept_ancestors.merge(
    existing_concepts[
        ['concept_id']
        ].rename(columns={'concept_id': 'ancestor_concept_id'})
    ).merge(
        existing_concepts[
            ['concept_id']
        ].rename(columns={'concept_id': 'descendant_concept_id'})
    ).apply(make_ancestor, axis=1)  

In [48]:
len(ancestors), len(ho_rels)

(1553491, 174882)

In [49]:
with so.Session(engine) as sess:
    for i in range(0, len(ancestors), 50000):
        sess.add_all(list(ancestors[i:i+50000]))
        sess.commit()

In [50]:
with so.Session(engine) as sess:
    sess.add_all(ho_rels)
    sess.commit()

In [51]:
# uncomment this section if you want to add extended relationships that involve only non-hemonc concepts for specific extra vocabs
# don't need to do the same for ancestry, as we don't filter that by vocab - it's done by existence of concepts already above, so if you pulled
# them in, the ancestry will be there

chunksize = 50000

# get only relationships where both concepts have been imported

target_relationships = (
    concept_relationships
    .merge(existing_concepts.rename(columns={'concept_id': 'concept_id_1'}))
    .merge(existing_concepts.rename(columns={'concept_id': 'concept_id_2'}))
)

with so.Session(engine) as sess:
    existing_relationships = pd.DataFrame(
        sess.query(
            Concept_Relationship.concept_id_1, Concept_Relationship.concept_id_2, Concept_Relationship.relationship_id
            )
    )

relationships_to_add = target_relationships.merge(existing_relationships, how='left', indicator=True)
relationships_to_add = relationships_to_add[relationships_to_add._merge=='left_only']

with so.Session(engine) as sess:
    for i in range(0, len(relationships_to_add), chunksize):
        to_add = relationships_to_add[i:i+chunksize].apply(make_relationship, axis=1)
        sess.add_all(list(to_add))
        sess.commit()

In [52]:
# due to misalignment with current athena version we have to accept gap in concept_id completeness even for classes that should have them

concept_stage_id[concept_stage_id.concept_id.isna() & ~concept_stage_id.concept_class_id.isin(non_omop_class)].head()

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id
393,Zinpentraxin alfa,drug,HemOnc,Component,,398,2019-05-27,2099-12-31,,
616,MSI-H or dMMR Malignant solid neoplasm,condition,HemOnc,Condition,,624,2019-05-27,2099-12-31,,
657,Sickle cell disease,condition,HemOnc,Condition,,665,2019-05-27,2099-12-31,,
871,PMC4480917,,HemOnc,PMCID,,917,2019-05-27,2099-12-31,,
878,PMC4504945,,HemOnc,PMCID,,928,2019-05-27,2099-12-31,,


In [53]:
# helper mapping data / merges within OMOP concepts
is_a = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Is a'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']]


brand_mappings = concept_stage_id[
    concept_stage_id.concept_class_id=='Brand Name'
    ].merge(
        concept_relationship_stage_id[
            concept_relationship_stage_id.relationship_id=='Has brand name'
        ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_2']], 
        left_on='concept_code', 
        right_on='concept_code_2', 
        how='left'
    )

component_df = concept_stage_id[
    concept_stage_id.concept_class_id.isin(['Component', 'Procedure'])
    ].copy()

component_class_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Component Class'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'concept_class_name'}
    )

has_modality = concept_relationship_stage_id[
    concept_relationship_stage_id.relationship_id=='Has modality'
    ][['concept_code_1', 'concept_code_2', 'concept_id_1', 'concept_id_1']]

modality_df = concept_stage_id[
    concept_stage_id.concept_class_id=='Modality'
    ][['concept_name','concept_code','concept_id']].rename(
        columns={'concept_code': 'concept_code_2', 
                 'concept_name': 'modality_name', 
                 'concept_id': 'concept_id_2'}
        )

In [54]:
route_mappings = {'44954': '26643006', 
                  '44957': '47625008',
                  '44979': '37161004',
                  '44994': '6064005',
                  '45080': '78421000',
                  '45153': '34206005',
                  '45215': '372471009',
                  '45273': '420254004',
                  '45426': '372466002',
                  '45531': '46713006',
                  '45574': '72607000',
                  '45684': '58100008',
                  '45939': '447694001'}

In [55]:
# helper data cleaning / transformation functions

def get_enum(e, s):
    try:
        return e[s.lower().strip().replace('-', '_').replace(' ', '_').replace('/', '_')]
    except:
        return None

def get_date(y, m, d):
    try:    
        datetime.date(int(y), int(m), int(d))
    except:
        return None

In [271]:
# functions to take dataframe rows and return database objects

def make_context(context_row):
    return Hemonc_Context(context_code = context_row.contextRaw,
                          context_name = context_row.contextPretty,
                          intent = get_enum(Intent, context_row.intent),
                          setting = get_enum(Setting, context_row.intent),
                          risk_stratification = get_enum(Risk, context_row.intent),
                          phenotype = get_enum(Phenotype, context_row.intent),
                          prior_therapy = get_enum(PriorTherapy, context_row.prior_therapy),
                          date_added = context_row.date_added)


def make_regimen(regimen_row):
    return Hemonc_Regimen(regimen_cui = regimen_row.regimen_cui,
                          regimen_name = regimen_row.regimen,
                          regimen_concept_id = int(regimen_row.concept_id) if not pd.isna(regimen_row.concept_id) else None)

def make_condition(condition_row):
    return Hemonc_Condition(condition_code = condition_row.concept_code,
                            condition_name = condition_row.concept_name,
                            condition_concept_id = int(condition_row.concept_id) if not pd.isna(condition_row.concept_id) else None)


def make_component(component_class_row):
    return Hemonc_Component(component_code = component_class_row.concept_code,
                            component_name = component_class_row.concept_name,
                            component_concept_id = int(component_class_row.concept_id) if not pd.isna(component_class_row.concept_id) else None)

def make_component_role(role_row):
    return Hemonc_Component_Role(regimen_cui = role_row.concept_code_1,
                                 component_code = role_row.concept_code_2 if role_row.component_class in ['Component', 'Procedure'] else None,
                                 component_class_code = role_row.concept_code_2 if role_row.component_class == 'Component Class' else None,
                                 relationship_id = role_row.relationship_id)


def make_component_class(component_class_row):
    return Hemonc_Component_Class(component_class_code = component_class_row.concept_code_2,
                                  component_class_name = component_class_row.concept_class_name,
                                  component_class_concept_id = int(component_class_row.concept_id) if not pd.isna(component_class_row.concept_id) else None)

def make_branch(branch_row):
    return Hemonc_Branch_Conditional(branch_name = branch_row.original,
                                     branch_type = branch_row.RULE_TYPE if not pd.isna(branch_row.RULE_TYPE) else BranchConditionalType.other,
                                     numeric_min = branch_row.MIN_NUM,
                                     numeric_max = branch_row.MAX_NUM,
                                     value = branch_row.RULE_VALUE)

def make_variant(variant_row):
    return Hemonc_Variant(variant_cui = variant_row.variant_cui,
                         variant_name = variant_row.variant,
                         regimen_cui = variant_row.regimen_cui)


def make_reg_part(reg_part_row):
    return Hemonc_Regimen_Part(variant_cui = reg_part_row.variant_cui,
                               regimen_part_cui = reg_part_row.regimen_part_cui,
                               regimen_part_id = reg_part_row.regimen_part_id,
                               portion = reg_part_row.portion if not pd.isna(reg_part_row.portion) else None,
                               #cycle_sig_id = reg_part_row.cyclesigs,
                               cycle_sig_cui = reg_part_row.cycle_sig_cui,
                               timing = reg_part_row.timing,
                               timing_unit = reg_part_row.timing_unit
                               )

def assign_part_phase(reg_part_row):
    if not pd.isna(reg_part_row.phase):
        phases = [Part_Phase(regimen_part_cui = reg_part_row.regimen_part_cui,
                             variant_cui = reg_part_row.variant_cui, 
                             phase = get_enum(Phase, reg_part_row.phase),
                             ) for phase in reg_part_row.phase.split('|')]
        return phases

def make_cycle_sig(cs_row):
    return Hemonc_Cycle_Sig(cycle_sig_id = cs_row.cyclesigs,
                            duration_min = cs_row.DUR_MIN, 
                            duration_max = cs_row.DUR_MAX,
                            duration_units = cs_row.DUR_UNITS,
                            frequency_min = cs_row.FREQ_MIN,
                            frequency_max = cs_row.FREQ_MAX,
                            frequency_units = cs_row.FREQ_UNITS,
                            repeats_min = cs_row.REP_MIN,
                            repeats_max = cs_row.REP_MAX,
                            repeats_units = cs_row.REP_UNITS, 
                            cycle_len_min = cs_row.cycle_length_lb if not pd.isna(cs_row.cycle_length_lb) else None,
                            cycle_len_max = cs_row.cycle_length_ub if not pd.isna(cs_row.cycle_length_ub) else None,
                            cycle_len_units = cs_row.cycle_length_unit if not pd.isna(cs_row.cycle_length_unit) else None,
                            residual = ' '.join([t.text for t in cs_row.cs_residual]))


def make_modality(mod_row):
    return Hemonc_Modality(modality_code = mod_row.concept_code_2,
                           modality_name = mod_row.modality_name,
                           modality_concept_id = int(mod_row.concept_id_2) if not pd.isna(mod_row.concept_id_2) else None)


def make_sig(sig_row):
    return Hemonc_Sig(regimen_part_id = sig_row.regimen_part_id,
                      regimen_part_cui = sig_row.regimen_part_cui,
                      variant_cui = sig_row.variant_cui,
                      sig_cui = sig_row.sig_cui,
                      sig_id = sig_row.sig_id,
                      component_code = sig_row.concept_code,
                      component_name = sig_row.component,
                      component_role = get_enum(ComponentRole, sig_row.component_role),
                      step_number = sig_row.step_number,
                      component_class = sig_row['class'],
                      tail = sig_row['tail'],
                      route = sig_row.route,
                      doseminnum = sig_row.doseMinNum if not pd.isna(sig_row.doseMinNum) else None,
                      dosemaxnum = sig_row.doseMaxNum if not pd.isna(sig_row.doseMaxNum) else None,
                      doseunit = sig_row.doseUnit if not pd.isna(sig_row.doseUnit) else None,
                      dosecapnum = sig_row.doseCapNum if not pd.isna(sig_row.doseCapNum) else None,
                      dosecapunit = sig_row.doseCapUnit if not pd.isna(sig_row.doseCapUnit) else None,
                      durationminnum = sig_row.durationMinNum if not pd.isna(sig_row.durationMinNum) else None,
                      durationmaxnum = sig_row.durationMaxNum if not pd.isna(sig_row.durationMaxNum) else None,
                      durationunit = sig_row.durationUnit if not pd.isna(sig_row.durationUnit) else None,
                      frequency = sig_row.frequency if not pd.isna(sig_row.frequency) else None,
                      inparens = sig_row.inParens if not pd.isna(sig_row.inParens) else None,
                      sequence = sig_row.sequence if not pd.isna(sig_row.sequence) else None,
                      seq_rel = sig_row['seq.rel'] if not pd.isna(sig_row['seq.rel']) else None,
                      seq_rel_what = sig_row['seq.rel.what'] if not pd.isna(sig_row['seq.rel.what']) else None)            


def make_sig_days(sd_row):
    days = '0' if pd.isna(sd_row.allDays) else sd_row.allDays
    return [Sig_Days(regimen_part_id = sd_row.regimen_part_id,
                     variant_cui = sd_row.variant_cui,
                     regimen_part_cui = sd_row.regimen_part_cui,
                     sig_cui = sd_row.sig_cui,
                     sig_id = sd_row.sig_id,
                     day = d) for d in set(days.split(','))]



# spacer matcher helper functions

small_num_lookup = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

def safe_num(tok):
    try:
        return int(tok.text)
    except:
        try:
            return float(tok.text)
        except:
            try: 
                return small_num_lookup[tok.text.lower()]
            except:
                return None

def safe_age(numeric):
    ages = [safe_num(t) for t in numeric if t._.AGE]
    if len(ages) > 0:
        return ages[0]

def get_full_stopping_condition(doc, conditions):
    return doc[conditions[0].i:] if len(conditions) > 0 else None

def get_units(tokens):
    try:
        return [t.lemma_ for t in tokens if t._.TIMING_UNIT][0]
    except:
        return None

def get_min(group):
    try:
        return min([i for i in [safe_num(t) for t in group] if i])
    except:
        return None

def get_max(group): 
    try:
        if any([t._.GTE for t in group]):
            return -1
        return max([i for i in [safe_num(t) for t in group] if i])
    except:
        return None


In [57]:
role = ['Has cytotoxic chemo', 'Has targeted therapy', 'Has supportive med',
       'Has steroid tx', 'Has local therapy', 'Has immunosuppressor',
       'Has immunotherapy', 'Has endocrine tx', 'Has radiotherapy',
       'Has growth factor', 'Has AB-drug cjgt', 'Has radioconjugate',
       'Has antineoplastic', 'Has anticoag tx', 'Has pept-drug cjgt']

concept_reg_roles = (
    concept_relationship_stage_id[concept_relationship_stage_id.relationship_id.isin(role)][['concept_code_1', 'concept_id_1', 'relationship_id', 'concept_code_2', 'concept_id_2']]
        .merge(
                concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']]
                .rename(
                    columns={'concept_code': 'concept_code_1', 
                             'concept_name': 'regimen_name',
                             'concept_class_id': 'regimen_class'}
                )
        )
        .merge(
                concept_stage_id[['concept_code', 'concept_name', 'concept_class_id']]
                .rename(
                    columns={'concept_code': 'concept_code_2', 
                             'concept_name': 'component_name',
                             'concept_class_id': 'component_class'}
                )
        )  
)

In [58]:
concept_reg_roles.head()

Unnamed: 0,concept_code_1,concept_id_1,relationship_id,concept_code_2,concept_id_2,regimen_name,regimen_class,component_name,component_class
0,795,35803428.0,Has immunosuppressor,122,35802975.0,Cyclophosphamide and Prednisolone,Regimen,Cyclophosphamide,Component
1,795,35803428.0,Has immunosuppressor,417,35803267.0,Cyclophosphamide and Prednisolone,Regimen,Prednisolone,Component
2,797,35803429.0,Has immunosuppressor,122,35802975.0,Cyclophosphamide and Prednisone,Regimen,Cyclophosphamide,Component
3,797,35803429.0,Has immunosuppressor,418,35803268.0,Cyclophosphamide and Prednisone,Regimen,Prednisone,Component
4,797,35803429.0,Has cytotoxic chemo,122,35802975.0,Cyclophosphamide and Prednisone,Regimen,Cyclophosphamide,Component


In [59]:
concept_reg_roles['role_object'] = concept_reg_roles.apply(make_component_role, axis=1)

In [60]:
conditions = concept_stage_id[concept_stage_id.concept_class_id=='Condition'].apply(make_condition, axis=1)
component_class_df['component_class_object'] = component_class_df.apply(make_component_class, axis=1)

In [61]:
component_df['component_object'] = component_df.apply(make_component, axis=1)
components_with_class = component_df.merge(is_a, 
                                           left_on='concept_code', 
                                           right_on='concept_code_1').merge(component_class_df
                                                                            ).rename(columns={'concept_code_2': 'concept_class_code'})

In [62]:
concept_stage_id[(concept_stage_id.concept_class_id=='Condition') & (concept_stage_id.duplicated(['concept_code']))]

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id


In [63]:
modality_df['modality_object'] = modality_df.apply(make_modality, axis=1)

regimens_with_modality = concept_stage_id[
    concept_stage_id.invalid_reason == ''
    ].merge(
        has_modality, 
        left_on='concept_code', 
        right_on='concept_code_1'
    ).merge(
        modality_df
    ).rename(
            columns={'concept_code_2': 'concept_class_code'}
    ).drop_duplicates()


In [64]:
regimens_with_modality[regimens_with_modality[['concept_name', 'modality_name']].duplicated()]

Unnamed: 0,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason,concept_id,concept_code_1,concept_class_code,concept_id_1,concept_id_1.1,modality_name,concept_id_2,modality_object


In [65]:
components_with_class.apply(lambda x: x.component_object.component_classes.append(x.component_class_object), axis=1)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
dtype: object

In [66]:
ref = ref.merge(concept_stage[concept_stage.concept_class_id=='Condition'][['concept_name', 'concept_code']], left_on='condition', right_on='concept_name', how='left')
ref['pub_date'] = pd.to_datetime(ref['pub.date'])

In [67]:
# query: the 'order' and 'update' fields have been removed from the refs file
def make_ref(ref_row):
    return Hemonc_Ref(reference = ref_row.reference,
                      condition_code = ref_row.concept_code,
                      pmid = ref_row.pmid,
                      study = ref_row.study,
                      title = ref_row.title,
                      pmcid = ref_row.pmcid,
                      doi = ref_row.doi if not ref_row.doi=="" else None,
                      url = ref_row.url if not ref_row.url=="" else None,
                      journal = ref_row.journal,
                      biblio = ref_row.biblio,
                      pub_date = ref_row.pub_date,
#                      order = ref_row['order'],
                      #update = ref_row['update'],
                      biomarker = ref_row.biomarker,
                      ref_type = ref_row.ref_type)

In [68]:
# hemonc_ref.reference, hemonc_ref.condition_code, hemonc_ref.pmid

ref[ref.duplicated(subset=['reference', 'concept_code', 'pmid', 'biomarker', 'condition'], keep=False)].sort_values('reference')

Unnamed: 0,study,reference,title,pmid,doi,url,pmcid,condition,biomarker,journal,...,pub.date,ref_type,errata,citations,citations_as_of,date_added,temp,concept_name,concept_code,pub_date
4549,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
4550,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
7475,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02
7476,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02
6965,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13
6966,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13


In [69]:
ref[ref.concept_code.notna() & ref.duplicated(subset=['reference', 'concept_code', 'pmid', 'biomarker'], keep=False)].sort_values('reference')

Unnamed: 0,study,reference,title,pmid,doi,url,pmcid,condition,biomarker,journal,...,pub.date,ref_type,errata,citations,citations_as_of,date_added,temp,concept_name,concept_code,pub_date
4549,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
4550,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
7475,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02
7476,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02
6965,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13
6966,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13


In [None]:
# query some full dupes?
ref[~ref.concept_code.isna() & ~ref.pub_date.isna() & ref.duplicated(keep=False)]

Unnamed: 0,study,reference,title,pmid,doi,url,pmcid,condition,biomarker,journal,...,pub.date,ref_type,errata,citations,citations_as_of,date_added,temp,concept_name,concept_code,pub_date
4549,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
4550,,Fayers et al. 2011::00,Thalidomide for previously untreated elderly p...,21670471,10.1182/blood-2011-03-341669,,No PMCID,Multiple myeloma,,Blood,...,2011-06-13,Unassigned,False,168,2022-04-08,2025-10-08,,Multiple myeloma,633,2011-06-13
6965,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13
6966,,Yang et al. 2016::00,"Multicentre, randomised phase III study of the...",27734464,10.1111/bjh.14380,,No PMCID,Immune thrombocytopenia,,Br J Haematol,...,2016-10-13,Unassigned,False,41,2022-12-07,2025-10-08,,Immune thrombocytopenia,617,2016-10-13
7475,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02
7476,,Newland et al. 2017::00,Fostamatinib for persistent/chronic adult immu...,28967793,10.2217/imt-2017-0097,,No PMCID,Immune thrombocytopenia,,Immunotherapy,...,2017-10-02,Unassigned,False,30,2021-12-04,2025-10-08,,Immune thrombocytopenia,617,2017-10-02


In [None]:
# query - pub date no longer mandatory?
ref_obj = ref[~ref.concept_code.isna() & ~ref.pub_date.isna()].drop_duplicates().apply(make_ref, axis=1)

In [72]:
# todo: confirm what is meant by a single cui with >1 title - is this a synonym?
# probably just different titles in different pages, as this is sourced from the pointer file originally.

regimen_dedup = regimen.drop_duplicates(subset='regimen_cui').copy()

In [73]:
context.date_added = pd.to_datetime(context.date_added)
contexts = context.apply(make_context, axis=1)

In [74]:
# new issue: null condition codes - consider the fact that some of the 'conditions' being mapped to here are now actually regimen classes

# # workaround mapping
# study_condition_workaround = {
#     'Stem cell mobilization regimens': 'Stem cell mobilization regimen', 
#     'Allogeneic HSCT' : 'Allogeneic HSCT conditioning regimen',  
#     'Cellular therapy conditioning regimens' : 'Cellular therapy conditioning regimen'
# }

# study.condition = study.condition.map(lambda x: study_condition_workaround[x] if x in study_condition_workaround else x)

In [75]:
# prepare study objects
study.date_added = pd.to_datetime(study.date_added, format='mixed', errors='coerce')
# query: date last modified has been removed
# study.date_last_modified = pd.to_datetime(study.date_last_modified, format='mixed', errors='coerce')
enrol_dates = study.enrollment.str.split('to', expand=True)
enrol_from = enrol_dates[0].str.strip().str.split('-', expand=True).rename(columns={0: 'from_year', 1:'from_month', 2: 'from_day'}).fillna('01')
enrol_to = enrol_dates[1].str.strip().str.split('-', expand=True).rename(columns={0: 'to_year', 1:'to_month', 2: 'to_day'}).fillna('01')
study = pd.concat([study, enrol_from, enrol_to], axis=1)
study = study.merge(concept_stage[concept_stage.concept_class_id.isin(['Condition', 'Regimen Class'])][['concept_name', 'concept_code']], left_on='condition', right_on='concept_name', how='left')
study.start = pd.to_datetime(study.start, format='mixed', errors='coerce')
study.end = pd.to_datetime(study.end, format='mixed', errors='coerce')

In [76]:
regimen_dedup = regimen_dedup.merge(regimens_with_modality[['concept_code', 'concept_id']].drop_duplicates(), left_on='regimen_cui', right_on='concept_code')

In [77]:
# query: date last modified, fda_reg_study and fda_unreg_study have been removed?
# query: protocol is not no longer required?
def make_study(study_row):
    enrollment_from = get_date(study_row.from_year, study_row.from_month, study_row.from_day)
    enrol_to = get_date(study_row.to_year, study_row.to_month, study_row.to_day)
    return Hemonc_Study(study_code = study_row.study,
                        registry = study_row.registry,
                        trial_id = study_row.trial_id,
                        condition_code = study_row.concept_code,
                        enrollment_from = enrollment_from if not pd.isna(enrollment_from) else None,
                        enrollment_to = enrol_to if not pd.isna(enrol_to) else None,
                        phase = study_row.phase,
                        study_design = get_enum(StudyDesign, study_row.study_design),
                        study_design_imputed = study_row.study_design_imputed,
                        sact = study_row.sact if not pd.isna(study_row.sact) else None,
                        protocol = study_row.protocol if not pd.isna(study_row.protocol) else False,
                        #fda_reg_study = study_row.fda_reg_study,
                        #fda_unreg_study = study_row.fda_unreg_study,
                        start = study_row.start  if not pd.isna(study_row.start) else None,
                        end = study_row.end  if not pd.isna(study_row.end) else None,
                        study_group = study_row.study_group,
                        sponsor = study_row.sponsor,
                        date_added = study_row.date_added if not pd.isna(study_row.date_added) else None, 
                        #date_modified = study_row.date_last_modified if not pd.isna(study_row.date_last_modified) else None
                        )

In [78]:
study['study_object'] = study.apply(make_study, axis=1)
regimen_dedup['regimen_object'] = regimen_dedup.apply(make_regimen, axis=1)

In [79]:
def append_modality(regimen_obj, modality_obj):
    if not pd.isna(modality_obj):
        regimen_obj.modalities.append(modality_obj)

In [80]:
regimen_dedup[regimen_dedup.regimen_cui.duplicated(keep=False)]

Unnamed: 0,regimen,regimen_cui,concept_code,concept_id,regimen_object


In [91]:
with so.Session(engine) as sess:
    sess.add_all(conditions)
    sess.add_all(list(component_class_df.component_class_object))
    sess.add_all(list(component_df.component_object))
    sess.add_all(contexts)
    sess.add_all(list(study[~study.concept_code.isna()].study_object))
    sess.add_all(list(modality_df.modality_object))
    sess.add_all(list(regimen_dedup.regimen_object))
    sess.add_all(list(ref_obj))
    sess.add_all(list(concept_reg_roles.role_object))
    sess.commit()

In [92]:
study[study.protocol.isna()]

Unnamed: 0,study,registry,trial_id,condition,condition_cui,biomarker,enrollment,phase,study_design,study_design_imputed,...,date_added,from_year,from_month,from_day,to_year,to_month,to_day,concept_name,concept_code,study_object
21,5F9003,ClinicalTrials.gov,NCT02953509,Diffuse large B-cell lymphoma,589,,,Phase 1b,Non-randomized,False,...,2025-05-30,,01,01,01,01,01,Diffuse large B-cell lymphoma,589,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
22,5F9003,ClinicalTrials.gov,NCT02953509,Follicular lymphoma,599,,,Phase 1b,Non-randomized,False,...,2025-05-30,,01,01,01,01,01,Follicular lymphoma,599,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
103,acelERA,ClinicalTrials.gov,NCT04576455,Breast cancer,572,HR-positive,,Randomized phase 2,In-class switch,False,...,2025-05-30,,01,01,01,01,01,Breast cancer,572,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
217,AK105-304,ClinicalTrials.gov,NCT04974398,Nasopharyngeal carcinoma,639,,,Phase 3,Escalation,False,...,2025-06-20,,01,01,01,01,01,Nasopharyngeal carcinoma,639,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
334,Amgen 20050103,ClinicalTrials.gov,NCT00321620,Prostate cancer,658,,,Phase 3,Out-of-class switch,False,...,2025-05-30,,01,01,01,01,01,Prostate cancer,658,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6772,SWOG S0910,ClinicalTrials.gov,NCT00945815,Acute lymphoblastic leukemia,24309,,,Phase 2,Non-randomized,False,...,2025-05-30,,01,01,01,01,01,Acute lymphoblastic leukemia,24309,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
6776,SWOG S1108,ClinicalTrials.gov,NCT01466881,Peripheral T-cell lymphoma,652,,,Phase 2,Non-randomized,False,...,2025-05-30,,01,01,01,01,01,Peripheral T-cell lymphoma,652,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
7006,Tian et al. 2024,ChiCTR,CTR20170221,Chronic myeloid leukemia,582,,,Phase 3,Escalation,False,...,2025-05-30,,01,01,01,01,01,Chronic myeloid leukemia,582,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...
7134,TvT CAR7,ClinicalTrials.gov,NCT05397184,T-cell acute lymphoblastic leukemia,670,,,Phase 1,Non-randomized,False,...,2025-05-30,,01,01,01,01,01,T-cell acute lymphoblastic leukemia,670,<hemonc_alchemy.model.hemonc_model.Hemonc_Stud...


In [93]:
with so.Session(engine) as sess:
    regimen_objects = pd.DataFrame(sess.query(Hemonc_Regimen), columns=['regimen_object'])
    regimen_objects['regimen_cui'] = regimen_objects.regimen_object.map(lambda x: str(x.regimen_cui))
    modality_objects = pd.DataFrame(sess.query(Hemonc_Modality), columns=['modality_object'])
    modality_objects['modality_code'] = modality_objects.modality_object.map(lambda x: str(x.modality_code))
    modality_lookup = regimens_with_modality[['concept_code', 'concept_class_code']].drop_duplicates()
    modality_lookup = (
        modality_lookup
        .merge(regimen_objects, left_on='concept_code', right_on='regimen_cui')
        .merge(modality_objects, left_on='concept_class_code', right_on='modality_code')
    )
    modality_lookup.apply(lambda row: row.regimen_object.modalities.append(row.modality_object), axis=1)
    sess.commit()

In [94]:
# we are not going to pull in out of date variant versions
sig_vars = sigs.merge(variant[variant.version==1][['variant_cui']])

In [95]:
for (r, c), dets in sig_vars[~sig_vars.step_number.str.contains('1 of') & sig_vars.allDays.isna()].groupby(['regimen', 'component']):
    if len(dets)>1:
        break

In [96]:
# parsing out branch conditions with spacy matchers

branch_crit = sig_vars.branch.value_counts().reset_index()

all_branches = []

for crit in branch_crit.branch.unique():
    combined_branches = crit.split('AND ')
    for br in combined_branches:
        all_branches += [b.lower().strip() for b in br.split('OR ')]

all_branches = list(set(all_branches))

branch_details = pd.DataFrame({'original': all_branches, 'doc': [nlp(b) for b in all_branches]})

for label, config in rules.items():
    branch_details.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    branch_details[label] = branch_details.doc.apply(lambda d: [t for t in d if getattr(t._, label)])
    branch_details[f'has_{label}'] = (branch_details[label].apply(len)>0)

branch_details['MODIFIER_HEAD'] = branch_details.doc.apply(get_modifier_child)
branch_details['NOUNS'] = branch_details.doc.apply(get_nouns)
branch_details['NUMERIC'] = branch_details.doc.map(lambda doc: [tok for tok in doc if tok.like_num])

In [97]:
# parse numeric branch factors into constituant elements

branch_details.loc[branch_details.has_AGE, 'RULE_TYPE'] = BranchConditionalType.age
branch_details.loc[branch_details.has_SIZE, 'RULE_TYPE'] = BranchConditionalType.size
branch_details.loc[branch_details.has_LAB, 'RULE_TYPE'] = BranchConditionalType.lab
branch_details.loc[branch_details.has_STAGE, 'RULE_TYPE'] = BranchConditionalType.stage

branch_details.loc[branch_details.has_STAGE, 'RULE_VALUE'] = branch_details.STAGE.apply(lambda x: ' '.join([tok.text for tok in x]) if len(x) > 0 else None)

num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
age = branch_details.NUMERIC.apply(lambda x: safe_age(x))
first_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[0]) if len(x) > 0 else None)
second_num = branch_details.NUMERIC.apply(lambda x: safe_num(x[1]) if len(x) > 1 else None)

gt = (branch_details.GT.apply(len)>0)
gte = (branch_details.GTE.apply(len)>0)
lt = (branch_details.LT.apply(len)>0)
lte = (branch_details.LTE.apply(len)>0)

r = (branch_details.RANGE.apply(len)>0)

branch_details.loc[branch_details.has_AGE & gt, 'MIN_NUM'] = age.apply(lambda x: x + 1 if x else None)
branch_details.loc[branch_details.has_AGE & gte, 'MIN_NUM'] = age
branch_details.loc[branch_details.has_AGE & lt, 'MAX_NUM'] = age.apply(lambda x: x - 1 if x else None)
branch_details.loc[branch_details.has_AGE & lte, 'MAX_NUM'] = age

branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gt, 'MIN_NUM'] = num.apply(lambda x: x + 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & gte, 'MIN_NUM'] = num
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lt, 'MAX_NUM'] = num.apply(lambda x: x - 1 if x else None)
branch_details.loc[(branch_details.has_SIZE | branch_details.has_LAB) & lte, 'MAX_NUM'] = num

branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MIN_NUM'] = first_num
branch_details.loc[(branch_details.has_AGE | branch_details.has_SIZE | branch_details.has_LAB) & r, 'MAX_NUM'] = second_num

In [98]:
# very likely to be able to improve these value extraction steps through either medspacy NER or better rules / more fullsome parsing, but a decent first pass

branch_details.loc[branch_details.RULE_VALUE.isna(), 'RULE_VALUE'] = branch_details.apply(lambda row: ' '.join([t.text for t in (row.FACT_MODIFIER + row.FACT)]), axis=1)

In [99]:
branches = branch_details.apply(make_branch, axis=1)

In [100]:
with so.Session(engine) as sess:
    sess.add_all(branches)
    sess.commit()

In [101]:
pointer.regimen_cui = pointer.regimen_cui.astype(int)

In [102]:
variant_study = variant.merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='left')[['regimen', 'study', 'variant_cui', 'variant', 'regimen_cui']].drop_duplicates()
variant_study['variant_object'] = variant_study.apply(make_variant, axis=1)

In [103]:
study_object_lookup = {s.study: s.study_object for s in study.itertuples()}

In [104]:
#variant_study[variant_study.variant_cui.notna() & variant_study.study.fillna('').map(lambda x: len(x.split('|')) != len(set(x.split('|'))))]

In [105]:
# new issue - some null studies in variant study map
variant[variant.study.isna()]

for row in variant_study[~variant_study.study.isna()].itertuples():
    for study_name in set(row.study.split('|')):
        try:
            row.variant_object.studied_in.append(study_object_lookup[study_name])
        except:
            print(f'missing study: {study_name} ({row.regimen})')

missing study: No supporting study (Afatinib monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Atezolizumab monotherapy)
missing study: No supporting study (Axitinib and Avelumab)
missing study: No supporting study (Cabozantinib and Nivolumab)
missing study: No supporting study (Cabozantinib monotherapy)
missing study: No supporting study (Crizotinib monotherapy)
missing study: No supporting study (Crizotinib monotherapy)
missing study: No supporting study (Ipilimumab and Nivolumab)
missing study: No supporting study (Nivolumab monotherapy)
missing study: No supporting study (Nivolumab monotherapy)
missing study: KEYNOTE-427 (Pembrolizumab monotherapy)
missing study: No supporting study (Pembrolizumab monotherapy)
missing study: No supporting study (Pembrolizumab monotherapy)
missing study: No supporting

In [106]:
regimen_part_sig = sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].drop_duplicates()

In [107]:
# spacy matcher parsing of cycle sigs

cycle_sigs_parsed = regimen_part_sig[['cyclesigs', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].fillna('').drop_duplicates().copy()
cycle_sigs_parsed['doc'] = cycle_sigs_parsed.cyclesigs.map(nlp)

for label, config in rules.items():
    cycle_sigs_parsed.doc.apply(lambda d: match_entities(matchers[label], d, label, config["MERGE"]))
    cycle_sigs_parsed[label] = cycle_sigs_parsed.doc.apply(lambda d: [t for t in d if getattr(t._, label)])
    cycle_sigs_parsed[f'has_{label}'] = (cycle_sigs_parsed[label].apply(len)>0)

cycle_sigs_parsed['full_stopping'] = cycle_sigs_parsed.apply(lambda row: get_full_stopping_condition(row.doc, row.STOPPING_CONDITION), axis=1)


cycle_sigs_parsed['DUR_MIN'] = cycle_sigs_parsed.DURATION.apply(get_min)
cycle_sigs_parsed['DUR_MAX'] = cycle_sigs_parsed.DURATION.apply(get_max)
cycle_sigs_parsed['DUR_UNITS'] = cycle_sigs_parsed.DURATION.apply(get_units)
cycle_sigs_parsed['FREQ_MIN'] = cycle_sigs_parsed.FREQUENCY.apply(get_min)
cycle_sigs_parsed['FREQ_MAX'] = cycle_sigs_parsed.FREQUENCY.apply(get_max)
cycle_sigs_parsed['FREQ_UNITS'] = cycle_sigs_parsed.FREQUENCY.apply(get_units)
cycle_sigs_parsed['REP_MIN'] = cycle_sigs_parsed.REPEATS.apply(get_min)
cycle_sigs_parsed['REP_MAX'] = cycle_sigs_parsed.REPEATS.apply(get_max)
cycle_sigs_parsed['REP_UNITS'] = cycle_sigs_parsed.REPEATS.apply(get_units)

In [108]:
sig_vars.columns

Index(['study', 'regimen', 'regimen_cui', 'phase', 'portion', 'component',
       'component_cui', 'component_role', 'cycle_length_lb', 'cycle_length_ub',
       'cycle_length_unit', 'timing_sequence', 'timing', 'branch',
       'branch_type', 'cyclesigs', 'cyclesigs_note', 'step_number', 'class',
       'doseMinNum', 'doseMaxNum', 'doseUnit', 'doseUnit_cui', 'divided',
       'doseCapNum', 'doseCapUnit', 'doseCapUnit_cui', 'targetLevel',
       'targetLevelUnit', 'targetLevelUnit_cui', 'targetLevelType', 'route',
       'route_cui', 'allDays', 'durationMinNum', 'durationMaxNum',
       'durationUnit', 'durationUnit_cui', 'frequency', 'frequency_cui',
       'inParens', 'sequence', 'seq.rel.when', 'seq.rel.when.unit', 'seq.rel',
       'seq.rel.what', 'tail', 'variant', 'variant_cui', 'temp', 'date_added'],
      dtype='object')

In [109]:
# reg_parts = sig_vars[['regimen', 'variant_cui','variant','phase', 'portion', 'cyclesigs', 'timing', 'timing_unit']].drop_duplicates().copy()
# reg_parts['regimen_part_id'] = reg_parts.sort_values(['variant_cui', 'phase', 'portion', 'cyclesigs', 'timing', 'timing_unit']).groupby('variant_cui').cumcount()
# query: timing unit has been removed?
reg_parts = sig_vars[['regimen', 'variant_cui','variant','phase', 'portion', 'cyclesigs', 'timing']].drop_duplicates().copy()
reg_parts['regimen_part_cui'] = reg_parts.sort_values(['variant_cui', 'phase', 'portion', 'cyclesigs', 'timing']).groupby('variant_cui').cumcount()
reg_parts['timing_unit'] = ''

In [155]:
reg_parts['regimen_part_id'] = range(1, len(reg_parts)+1)

In [156]:
part_phase = reg_parts.apply(assign_part_phase, axis=1)
cycle_sigs_parsed['cs_residual'] = cycle_sigs_parsed.doc.apply(lambda d: [t for t in d if not t._.FREQUENCY and not t._.REPEATS and not t._.DURATION and not t._.GTE and t.text != 'for'])

In [157]:
cycle_sigs_parsed[cycle_sigs_parsed.cs_residual.apply(len) > 0]

Unnamed: 0,cyclesigs,cycle_length_lb,cycle_length_ub,cycle_length_unit,doc,AGE,has_AGE,STAGE,has_STAGE,RANGE,...,DUR_MIN,DUR_MAX,DUR_UNITS,FREQ_MIN,FREQ_MAX,FREQ_UNITS,REP_MIN,REP_MAX,REP_UNITS,cs_residual
198,Monthly cycles,1.0,1,month,"(Monthly, cycles)",[],False,[],False,[],...,,,,,,,,,,"[Monthly, cycles]"
634,Continued indefinitely,1.0,NUB,indeterminate,"(Continued, indefinitely)",[],False,[],False,[],...,,,,,,,,,,"[Continued, indefinitely]"
638,Continued indefinitely unless radiotherapy pla...,,,,"(Continued, indefinitely, unless, radiotherapy...",[],False,[],False,[],...,,,,,,,,,,"[Continued, indefinitely, unless, radiotherapy..."
882,Monthly cycle for 6 cycles,1.0,1,month,"(Monthly, cycle, for, 6, cycles)",[],False,[],False,[],...,,,,,,,6.0,6.0,cycle,"[Monthly, cycle]"
932,Two or more years,2.0,NUB,year,"(Two, or, more, years)",[],False,[],False,[],...,,,,,,,,,,"[Two, years]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18679,21-day cycles until maximum response plus 1 cycle,21.0,21,day,"(21, -, day, cycles, until, maximum, response,...",[],False,[],False,[],...,,,,21.0,21.0,day,1.0,1.0,cycle,"[until, maximum, response, plus]"
18839,14-day lead-in,14.0,14,day,"(14, -, day, lead, -, in)",[],False,[],False,[],...,,,,,,,,,,"[14, -, day, lead, -, in]"
19162,21-day cycles until progression or two cycles ...,21.0,21,day,"(21, -, day, cycles, until, progression, or, t...",[],False,[],False,[],...,,,,21.0,21.0,day,2.0,2.0,cycle,"[until, progression, or, past, documented, CR]"
19194,14-day cycle for 2 years or until disease prog...,14.0,14,day,"(14, -, day, cycle, for, 2, years, or, until, ...",[],False,[],False,[],...,,,,14.0,14.0,day,,,,"[2, years, or, until, disease, progression, or..."


In [158]:
cs = cycle_sigs_parsed.apply(make_cycle_sig, axis=1)

In [159]:
with so.Session(engine) as sess:
    sess.add_all(cs)
    sess.commit()

In [160]:
with so.Session(engine) as sess:
    cs_cui_lookup = {r[0]: r[1] for r in sess.query(Hemonc_Cycle_Sig.cycle_sig_id, Hemonc_Cycle_Sig.cycle_sig_cui).all()}


In [208]:
# new issue - variant_cui not unique in this file any more - need to resolve the study field to unique pairs?
variant_study[variant_study.variant_cui.duplicated(keep=False)]

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object
52,7+3d,ALFA-0701|LAM-2001|PALG AML1/1999|PALG AML1/20...,129507,Variant #04,814.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
55,7+3d,Arlin et al. 1990|AZA-AML-001|CALGB 7421|CALGB...,129508,Variant #05,814.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
58,7+3d,Jin et al. 2013|Arlin et al. 1990|AZA-AML-001|...,129508,Variant #05,814.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
66,7+3d,ACCEDE|ADcomparison|AML 8B|CALGB 8321|CALGB 92...,129511,Variant #08,814.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
70,7+3d,AML-AZA|AML2003|AML2006|AZA-AML-001|BRIGHT AML...,129512,Variant #09,814.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
...,...,...,...,...,...,...
25371,FOLFIRI and Cetuximab,CRYSTAL|FIRE-3|KEYNOTE-177,132228,Variant #02,15194.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25382,"MVAC, dose-dense",EORTC 30924,133256,Variant #02,5330.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25385,Cisplatin and RT,CCTG HN.6|Chitapanarux et al. 2007|Fountzilas ...,130941,Variant #01,5259.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25386,Cisplatin and RT,ARTSCAN III|DAHANCA19,130953,Variant #13,5259.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...


In [162]:
# RESOLVED
# # new issue - handling null condition codes
# variant_study[variant_study.study.notna() & ~variant_study.study.isin(study[study.concept_code.isna()].study.unique())].drop_duplicates(subset='variant_cui')

In [163]:
from collections import defaultdict
vs_lookup = defaultdict(list)

for variant_map in variant_study[['variant_cui', 'study']].fillna('').itertuples():
    vs_lookup[variant_map.variant_cui] += list(set(variant_map.study.split('|')))

In [164]:
# new issue - variant_cui - study id is not unique in this file any more - need to resolve the study field to unique pairs?

dupe_vs_map = [k for k, v in vs_lookup.items() if len(v) != len(set(v))]

In [165]:
variant_study[variant_study.variant_cui == 129495]#.variant_object.iloc[0].__dict__

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object


In [166]:
reg_parts['cycle_sig_cui'] = reg_parts.cyclesigs.map(cs_cui_lookup)

In [167]:
reg_parts[reg_parts.duplicated(subset=['variant_cui', 'regimen_part_cui'], keep=False)]

Unnamed: 0,regimen,variant_cui,variant,phase,portion,cyclesigs,timing,regimen_part_cui,timing_unit,cycle_sig_cui,regimen_part_id


In [168]:
regimen_parts = reg_parts.apply(make_reg_part, axis=1)

In [169]:
regimen_parts[0]

<hemonc_alchemy.model.hemonc_model.Hemonc_Regimen_Part at 0x179701590>

In [170]:
variant_objects = list(variant_study[variant_study.study.notna() & 
                                    ~variant_study.study.isin(study[study.concept_code.isna()].study.unique()) & 
                                    ~variant_study.variant_cui.isin(dupe_vs_map)].drop_duplicates(subset='variant_cui').variant_object)

In [171]:
vc = variant_study[variant_study.study.notna() & 
                                    ~variant_study.study.isin(study[study.concept_code.isna()].study.unique()) & 
                                    ~variant_study.variant_cui.isin(dupe_vs_map)].drop_duplicates(subset='variant_cui').variant_cui.tolist()

In [172]:
variant_study[variant_study.study.notna() & 
            ~variant_study.study.isin(study[study.concept_code.isna()].study.unique()) & 
            ~variant_study.variant_cui.isin(dupe_vs_map)].drop_duplicates(subset='variant_cui')

Unnamed: 0,regimen,study,variant_cui,variant,regimen_cui,variant_object
0,"(90)YFC, then allo HSCT",MDACC ID01-233,129496,Variant #01,19726.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
5,131Iodine-Tositumomab monotherapy,CP-97-012|Kaminski et al. 1993|Kaminski et al....,129497,Variant #01,19847.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
10,131Iodine-Tositumomab monotherapy,CP-97-012|Kaminski et al. 1993|Kaminski et al....,144834,Variant #02,19847.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
12,4d+7,ALFA 9803,150895,Variant #01,110391.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
13,4i+7,ALFA 9803,150896,Variant #01,110392.0,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
...,...,...,...,...,...,...
25447,Hydroxyurea and Plicamycin,Koller and Miller 1986,161371,Variant #06,,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25448,Iobenguane I 131 monotherapy,MIP-IB12B,161372,Variant #05,,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25449,R-Hyper-CVAD/R-MA,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,161373,Variant #10,,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...
25450,R-Hyper-CVAD/R-MA,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,161374,Variant #08,,<hemonc_alchemy.model.hemonc_model.Hemonc_Vari...


In [173]:
l = []
for ll in variant_objects:
    if ll.variant_cui not in l:
        l.append(ll.variant_cui)
    else:
        print('boo')
        break

In [174]:
variant_objects[5]#.__dict__['studied_in'][0].__dict__

<hemonc_alchemy.model.hemonc_model.Hemonc_Variant at 0x179f16010>

In [175]:
with so.Session(engine) as sess:
    sess.add_all(list(variant_study[variant_study.study.notna() & 
                                    ~variant_study.study.isin(study[study.concept_code.isna()].study.unique()) & 
                                    ~variant_study.variant_cui.isin(dupe_vs_map)].drop_duplicates(subset='variant_cui').variant_object))
    #
    sess.add_all(variant_objects)
    sess.add_all(regimen_parts)
    sess.commit()

  sess.commit()
  sess.commit()
  sess.commit()
  sess.commit()
  sess.commit()
  sess.commit()
  sess.commit()
  sess.commit()


In [176]:
variant_objects[5].__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x1590d7530>}

In [177]:
sig_vars[sig_vars.variant_cui==131430]#sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].duplicated()]

regimen_part_sig[regimen_part_sig.regimen=='D-FEC plus Bev']

reg_parts[reg_parts.variant_cui==131430]#sig_vars[['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'cycle_length_lb', 'cycle_length_ub', 'cycle_length_unit']].duplicated()]


Unnamed: 0,regimen,variant_cui,variant,phase,portion,cyclesigs,timing,regimen_part_cui,timing_unit,cycle_sig_cui,regimen_part_id
7121,D-FEC+Bev,131430.0,Variant #01,,D portion,21-day cycle for 3 cycles,Cycles 1 to 3,0,,1014.0,3422
7123,D-FEC+Bev,131430.0,Variant #01,,FEC portion,21-day course,Cycle 4,1,,696.0,3423
7124,D-FEC+Bev,131430.0,Variant #01,,FEC portion,21-day cycle for 3 cycles,Cycles 4 to 6,2,,1014.0,3424


In [178]:
sig_vars_components = sig_vars.merge(component_df[['concept_name', 'concept_code']], left_on='component', right_on='concept_name', how='left').drop_duplicates().merge(reg_parts[['variant_cui', 'phase', 'portion', 'cyclesigs', 'regimen_part_id']])
sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'inParens', 'class', 'tail']][sig_vars_components[['variant_cui', 'regimen_part_id', 'component', 'component_role', 'timing', 'step_number', 'class', 'tail']].duplicated()]

Unnamed: 0,variant_cui,regimen_part_id,component,component_role,timing,step_number,inParens,class,tail
3890,144847.0,1889,Bilateral orchiectomy,primary systemic,,1 of 1,,Non-canonical Sig,-
14954,132911.0,7042,Lenalidomide,primary systemic,,1 of 1,,Non-IV canonical Sig,-
14956,132911.0,7042,Lenalidomide,primary systemic,,1 of 1,,Non-IV canonical Sig,-
15698,144895.0,7418,Methotrexate,primary systemic,,1 of 1,,Non-IV canonical Sig,-
15700,144895.0,7418,Methotrexate,primary systemic,,1 of 1,,Non-IV canonical Sig,-
15701,144895.0,7418,Methotrexate,primary systemic,,1 of 1,,Non-IV canonical Sig,-
16556,133298.0,7726,Bilateral orchiectomy,primary systemic,,1 of 1,,Non-canonical Sig,-


In [179]:
sig_vars_components['sig_id'] = sig_vars_components.groupby(['regimen', 'variant_cui', 'regimen_part_id']).cumcount()

In [180]:
sig_vars_components = sig_vars_components.sort_values(['variant_cui', 'regimen_part_id', 'concept_name', 'step_number'])

In [181]:
# if days are empty, but this component and regimen part are identical to the previous row when sorted in this fashion, filling these day details forward

sig_vars_components.loc[sig_vars_components.allDays.isna() & (sig_vars_components.concept_name.shift(1) ==  sig_vars_components.concept_name), 'allDays'] = sig_vars_components.allDays.shift(1)

In [256]:
sig_vars_components['sig_cui'] = list(range(1, len(sig_vars_components)+1))

In [272]:
sig_objects = sig_vars_components[sig_vars_components.regimen_cui.notna()].merge(reg_parts[['variant_cui', 'regimen_part_id', 'regimen_part_cui']]).apply(make_sig, axis=1)
sd = sig_vars_components[sig_vars_components.regimen_cui.notna()].merge(reg_parts[['variant_cui', 'regimen_part_id', 'regimen_part_cui']]).apply(make_sig_days, axis=1)

In [260]:
# def make_sig_days(sd_row):
#     days = '0' if pd.isna(sd_row.allDays) else sd_row.allDays
#     return [Sig_Days(regimen_part_id = sd_row.regimen_part_id,
#                      variant_cui = sd_row.variant_cui,
#                      sig_cui = sd_row.sig_cui,
#                      regimen_part_cui = sd_row.regimen_part_cui,
#                      sig_id = sd_row.sig_id,
#                      day = d) for d in days.split(',')]

In [261]:
sig_vars_components

Unnamed: 0,study,regimen,regimen_cui,phase,portion,component,component_cui,component_role,cycle_length_lb,cycle_length_ub,...,tail,variant,variant_cui,temp,date_added,concept_name,concept_code,regimen_part_id,sig_id,sig_cui
0,MDACC ID01-233,"(90)YFC, then allo HSCT",19726.0,Consolidation,-,Allogeneic stem cells,38881,primary systemic,1.0,1,...,-,Variant #01,129496.0,,2023-09-19,Allogeneic stem cells,38881,1,0,1
1,MDACC ID01-233,"(90)YFC, then allo HSCT",19726.0,Consolidation,-,Cyclophosphamide,122,primary systemic,1.0,1,...,-,Variant #01,129496.0,,2023-09-19,Cyclophosphamide,122,1,1,2
2,MDACC ID01-233,"(90)YFC, then allo HSCT",19726.0,Consolidation,-,Fludarabine,224,primary systemic,1.0,1,...,-,Variant #01,129496.0,,2023-09-19,Fludarabine,224,1,2,3
3,MDACC ID01-233,"(90)YFC, then allo HSCT",19726.0,Consolidation,-,Ibritumomab tiuxetan,262,primary systemic,1.0,1,...,-,Variant #01,129496.0,,2023-09-19,Ibritumomab tiuxetan,262,1,3,4
4,MDACC ID01-233,"(90)YFC, then allo HSCT",19726.0,Consolidation,-,Ibritumomab tiuxetan,262,primary systemic,1.0,1,...,-,Variant #01,129496.0,,2023-09-19,Ibritumomab tiuxetan,262,1,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18440,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,R-Hyper-CVAD/R-MA,4573.0,,Hyper-CVAD portion,Doxorubicin,170,primary systemic,21.0,21,...,-,Variant #09,161375.0,,2025-11-11,Doxorubicin,170,8566,2,20801
18441,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,R-Hyper-CVAD/R-MA,4573.0,,Hyper-CVAD portion,Vincristine,536,primary systemic,21.0,21,...,on day 5,Variant #09,161375.0,,2025-11-11,Vincristine,536,8566,3,20802
18442,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,R-Hyper-CVAD/R-MA,4573.0,,MA portion,Cytarabine,126,primary systemic,21.0,21,...,-,Variant #09,161375.0,,2025-11-11,Cytarabine,126,8567,0,20803
18443,Merli et al. 2012|Romaguera et al. 2005|SWOG S...,R-Hyper-CVAD/R-MA,4573.0,,MA portion,Methotrexate,329,primary systemic,21.0,21,...,-,Variant #09,161375.0,,2025-11-11,Methotrexate,329,8567,1,20804


In [262]:
# query - missing regimen cui mappings?
sig_vars_components[sig_vars_components.regimen_cui.isna()]

Unnamed: 0,study,regimen,regimen_cui,phase,portion,component,component_cui,component_role,cycle_length_lb,cycle_length_ub,...,tail,variant,variant_cui,temp,date_added,concept_name,concept_code,regimen_part_id,sig_id,sig_cui
11627,UK MRC AML15,FLAG-Ida (Lenograstim),,Induction,-,Cytarabine,126,primary systemic,7.0,7,...,-,Variant #01,132096.0,,2023-09-19,Cytarabine,126,5424,0,7526
11628,UK MRC AML15,FLAG-Ida (Lenograstim),,Induction,-,Fludarabine,224,primary systemic,7.0,7,...,-,Variant #01,132096.0,,2023-09-19,Fludarabine,224,5424,1,7527
11629,UK MRC AML15,FLAG-Ida (Lenograstim),,Induction,-,Idarubicin,265,primary systemic,7.0,7,...,-,Variant #01,132096.0,,2023-09-19,Idarubicin,265,5424,2,7528
11630,UK MRC AML15,FLAG-Ida (Lenograstim),,Induction,-,Lenograstim,300,primary systemic,7.0,7,...,-,Variant #01,132096.0,,2023-09-19,Lenograstim,300,5424,3,7529
18132,LMBA-02,R-COPADM (Prednisolone),,,-,Cyclophosphamide,122,primary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Cyclophosphamide,122,8461,0,12300
18133,LMBA-02,R-COPADM (Prednisolone),,,-,Doxorubicin,170,primary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Doxorubicin,170,8461,1,12301
18134,LMBA-02,R-COPADM (Prednisolone),,,-,Leucovorin,229,secondary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Leucovorin,229,8461,2,12302
18135,LMBA-02,R-COPADM (Prednisolone),,,-,Methotrexate,329,primary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Methotrexate,329,8461,3,12303
18136,LMBA-02,R-COPADM (Prednisolone),,,-,Prednisolone,417,primary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Prednisolone,417,8461,4,12304
18137,LMBA-02,R-COPADM (Prednisolone),,,-,Rituximab,446,primary systemic,1.0,1,...,-,Variant #01,133735.0,,2023-11-17,Rituximab,446,8461,5,12305


In [273]:
with so.Session(engine) as sess:
    sess.add_all(sig_objects)
    for s in sd:
        sess.add_all(s)
    sess.commit()

In [274]:
sig_vars_components[['component', 'doseMinNum', 'doseMaxNum', 'doseUnit', 'doseCapNum', 'doseCapUnit', 'divided', 'durationMinNum', 'durationMaxNum', 'durationUnit', 'frequency', 'inParens', 'sequence', 'seq.rel', 'seq.rel.what']]['seq.rel.what'].value_counts()

seq.rel.what
Elotuzumab                    48
Leucovorin                     8
Ifosfamide                     6
Fludarabine                    6
Irinotecan                     5
Rituximab                      3
radiation                      2
Radiation therapy              2
HIPEC                          2
Melphalan                      2
Ultraviolet A                  1
Pemetrexed                     1
External beam radiotherapy     1
Goserelin                      1
Teniposide                     1
chemotherapy                   1
Etoposide                      1
Methotrexate                   1
Mesna                          1
Radiotherapy                   1
Levoleucovorin                 1
Cyclophosphamide               1
Tositumomab and I-131          1
Oxaliplatin                    1
Name: count, dtype: int64

In [275]:
regimen[regimen.regimen_cui.duplicated()].head()

Unnamed: 0,regimen,regimen_cui
191,"Busulfan and Fludarabine, then allo HSCT",1584
265,CYVE,1105
630,"Arsenic trioxide, then ATRA and Daunorubicin",2391
823,FC,2913
858,"Fludarabine, Busulfan, ATG, Ibritumomab tiuxetan",3015


In [276]:
### Issues

# Need a n:m context <-> status mapper

# single record with >1 setting - is this a variant?
context[context.setting.map(lambda x: get_enum(Setting, x)).isna()].setting.value_counts()

# list of regimens that have no associated variants

vv = variant.merge(pointer[['regimen', 'regimen_cui']].drop_duplicates(), how='outer')
vv[vv.variant.isna()].regimen.unique()

# and vice-versa
vv[vv.regimen_cui.isna()]


# some potential duplication across variant/sig rows
dup = sig_vars[sig_vars[['study',  'regimen', 'component', 'variant_cui', 'branch', 'timing', 'step_number', 'portion', 'class']].duplicated(keep=False)]

for regimen, reg_dets in dup.groupby('regimen'):
    print(regimen)
    for lab, d in reg_dets.to_dict().items():
        if len(set(d.values())) != 1:
            print('\t', lab, d)

# should these variant definitions have the same cyclesig?

distinct_sigs = sig_vars[~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False)][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch']].drop_duplicates()
distinct_sigs[distinct_sigs[['regimen', 'variant', 'phase', 'portion', 'timing', 'branch']].duplicated(keep=False)][['regimen', 'variant']].drop_duplicates()

sig_vars[(sig_vars.regimen=='TACE, then 5-FU') & (sig_vars.variant=='Variant #01')][['regimen', 'variant', 'phase', 'portion', 'cyclesigs', 'timing', 'branch', 'component']]


# components in the variant file that aren't in the drug class - need to revisit to pull in procedures
sig_vars_components[sig_vars_components.concept_code.isna()]

BCG vaccine monotherapy
	 cycle_length_lb {1297: 3.0, 1298: 6.0, 1299: 3.0, 1300: 6.0}
	 cycle_length_ub {1297: '3', 1298: '6', 1299: '3', 1300: '6'}
	 timing_sequence {1297: '2', 1298: '3,4,5,6,7,8', 1299: '2', 1300: '3,4,5,6,7,8'}
	 cyclesigs {1297: '3-month course', 1298: '6-month cycle for 6 cycles', 1299: '3-month course', 1300: '6-month cycle for 6 cycles'}
	 step_number {1297: '1 of 2', 1298: '1 of 2', 1299: '2 of 2', 1300: '2 of 2'}
	 doseMinNum {1297: '50', 1298: '50', 1299: '0.5', 1300: '0.5'}
	 doseMaxNum {1297: '50', 1298: '50', 1299: '0.5', 1300: '0.5'}
	 doseCapNum {1297: nan, 1298: nan, 1299: nan, 1300: nan}
	 doseCapUnit_cui {1297: nan, 1298: nan, 1299: nan, 1300: nan}
	 targetLevelUnit_cui {1297: nan, 1298: nan, 1299: nan, 1300: nan}
	 route {1297: 'intravesicularly', 1298: 'intravesicularly', 1299: 'SC', 1300: 'SC'}
	 route_cui {1297: 45215.0, 1298: 45215.0, 1299: 45153.0, 1300: 45153.0}
	 durationMinNum {1297: nan, 1298: nan, 1299: nan, 1300: nan}
	 durationMaxNum {1

Unnamed: 0,study,regimen,regimen_cui,phase,portion,component,component_cui,component_role,cycle_length_lb,cycle_length_ub,...,tail,variant,variant_cui,temp,date_added,concept_name,concept_code,regimen_part_id,sig_id,sig_cui
254,monarchE,Abemaciclib and ET,31832.0,Adjuvant,-,Antiestrogen,918,primary systemic,28.0,28,...,-,Variant #01,129553.0,,2023-09-19,,,98,0,186
1348,TRANSFORMER,BAT,32337.0,,-,GnRH agonist,9460,primary systemic,28.0,28,...,-,Variant #01,129836.0,,2023-09-19,,,646,0,941
6641,CALGB 40603,CP-ddAC,121944.0,,ddAC portion,Granulocyte colony-stimulating factor,1001,secondary systemic,21.0,21,...,-,Variant #01,131237.0,,2024-10-14,,,3065,2,4696
7305,R-GVHD,"Cyclosporine, Corticosteroids, Rituximab",21048.0,,-,Steroid,45523,secondary systemic,8.0,8,...,"of <a href=""Prednisone_.html"" title=""Prednison...",Variant #01,131374.0,,2023-09-19,,,3314,2,5148
8884,Dreyling et al. 2004,DexaBEAM and G-CSF,24488.0,,-,Granulocyte colony-stimulating factor,1001,primary systemic,1.0,1,...,-,Variant #01,131613.0,,2023-09-19,,,4059,4,6137
10019,Hokusai-VTE,Edoxaban monotherapy,38307.0,,-,Low molecular weight heparin,16190,primary systemic,3.0,12,...,-,Variant #01,131874.0,,2023-09-19,,,4647,1,6860
10209,HE 10/00,EP-ddCMF,120170.0,,ddCMF portion,Granulocyte colony-stimulating factor,1001,secondary systemic,14.0,14,...,-,Variant #01,131916.0,,2023-11-17,,,4730,2,7019
13758,CML-Study IV,Imatinib and Interferon alfa,13162.0,,-,Interferon alfa,76030,primary systemic,7.0,7,...,-,Variant #01,132620.0,,2023-09-19,,,6378,1,9111
13759,CML-Study IV,Imatinib and Interferon alfa,13162.0,,-,Interferon alfa,76030,primary systemic,7.0,7,...,-,Variant #01,132620.0,,2023-09-19,,,6379,1,9113
13760,CML-Study IV,Imatinib and Interferon alfa,13162.0,,-,Interferon alfa,76030,primary systemic,7.0,7,...,-,Variant #01,132620.0,,2023-09-19,,,6380,0,9114


In [277]:

distinct_sigs = sig_vars[~sig_vars.component.str.contains('surg|brachy|radiotherapy', case=False)][['regimen', 'variant', 'portion', 'cyclesigs', 'timing', 'branch']].drop_duplicates()
distinct_sigs[distinct_sigs[['regimen', 'variant', 'portion', 'timing', 'branch']].duplicated(keep=False)][['regimen', 'variant']].drop_duplicates()


Unnamed: 0,regimen,variant
650,ADT and Darolutamide,Variant #01
679,"ADT, Enzalutamide, Talazoparib",Variant #01
1297,BCG vaccine monotherapy,Variant #03
2543,CapeOx and Pembrolizumab,Variant #01
3204,Carboplatin and Paclitaxel (CP) and Nivolumab,Variant #05
3208,Carboplatin and Paclitaxel (CP) and Nivolumab,Variant #06
3212,Carboplatin and Paclitaxel (CP) and Nivolumab,Variant #07
3216,Carboplatin and Paclitaxel (CP) and Nivolumab,Variant #08
3283,Carboplatin and Paclitaxel (CP) and Tislelizumab,Variant #02
3577,"Carboplatin, Pemetrexed, Tislelizumab",Variant #02


In [278]:
ref[ref.concept_code.isna()].condition.value_counts()

condition
Autologous HSCT conditioning regimen                                128
Allogeneic HSCT conditioning regimen                                 80
NCCN guidelines                                                      34
ASCO guidelines                                                      27
ESMO guidelines                                                      25
Antiemesis                                                           22
Stem cell mobilization regimen                                       16
Immunotherapy toxicity management                                    14
Cellular therapy conditioning regimen                                11
Immune effector cells toxicity management                             7
ONS guidelines                                                        6
Palliative Care                                                       6
Basics of Pain Management                                             5
Neutropenia & leukopenia                              