In [1]:
from datetime import datetime
import pandas as pd
pd.options.display.max_columns = 30
import sqlalchemy as sa
import sqlalchemy.orm as so
import numpy as np
import matplotlib.pyplot as plt
from hemonc_alchemy.model.hemonc_model import Hemonc_Study, Hemonc_Modality, Hemonc_Condition, Hemonc_Component_Role, Hemonc_Ref, Hemonc_Component, Hemonc_Component_Class, Hemonc_Context, Hemonc_Regimen, Hemonc_Branch_Conditional, Hemonc_Variant, Hemonc_Regimen_Part, Part_Phase, Hemonc_Cycle_Sig, Hemonc_Sig, Sig_Days, Base, component_to_class_map, variant_study_map, regimen_to_modality_map
from pathlib import Path
from datetime import date
import re
from omop_alchemy.model.vocabulary import Concept, Concept_Relationship

In [2]:
import omop_alchemy as oa
engine = oa.oa_config.engine

In [3]:
hemonc_concept = so.aliased(Concept, name='hemonc_concept')
standard_relationship = so.aliased(Concept_Relationship, name='standard_relationship')

standard_concept_subquery = (
    sa.select(
        Concept_Relationship.concept_id_1,
        Concept_Relationship.relationship_id,
        *Concept.__table__.columns
    )
    .join(Concept, Concept_Relationship.concept_id_2 == Concept.concept_id)
    .where(
        sa.and_(
            Concept_Relationship.relationship_id == 'Maps to',
            Concept.domain_id == 'Condition',
            Concept.standard_concept == 'S'
        )
    )
    .subquery()
)

with so.Session(engine) as session:
    concept_mappings = pd.DataFrame(
        session.query(
            Hemonc_Condition.condition_name,
            Hemonc_Condition.condition_code,
            Hemonc_Condition.condition_concept_id,
            standard_concept_subquery.c.relationship_id,
            standard_concept_subquery.c.concept_name,
            standard_concept_subquery.c.concept_code,
            standard_concept_subquery.c.vocabulary_id,
            standard_concept_subquery.c.standard_concept
        )
        .join(standard_concept_subquery, Hemonc_Condition.condition_concept_id == standard_concept_subquery.c.concept_id_1, isouter=True)
        .join(hemonc_concept, sa.and_(Hemonc_Condition.condition_concept_id == hemonc_concept.concept_id, hemonc_concept.invalid_reason.is_(None)), isouter=True)
    )


standard_component_subquery = (
    sa.select(
        Concept_Relationship.concept_id_1,
        Concept_Relationship.relationship_id,
        *Concept.__table__.columns
    )
    .join(Concept, Concept_Relationship.concept_id_2 == Concept.concept_id)
    .where(
        sa.and_(
            Concept_Relationship.relationship_id == 'Maps to',
            Concept.domain_id == 'Drug'
        )
    )
    .subquery()
)

with so.Session(engine) as session:
    component_mappings = pd.DataFrame(
        session.query(
            Hemonc_Component.component_name,
            Hemonc_Component.component_code,
            Hemonc_Component.component_concept_id,
            standard_component_subquery.c.relationship_id,
            standard_component_subquery.c.concept_name,
            standard_component_subquery.c.concept_code,
            standard_component_subquery.c.vocabulary_id,
            standard_component_subquery.c.standard_concept
        )
        .join(hemonc_concept, sa.and_(Hemonc_Component.component_concept_id == hemonc_concept.concept_id, hemonc_concept.invalid_reason.is_(None)), isouter=True)
        .join(standard_component_subquery, Hemonc_Component.component_concept_id == standard_component_subquery.c.concept_id_1, isouter=True)
    )

In [4]:
# a few missing mappings for conditions that do have concept IDs
concept_mappings[concept_mappings.condition_concept_id.notna() & concept_mappings.concept_name.isna()].head()

Unnamed: 0,condition_name,condition_code,condition_concept_id,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
8,Anaplastic large cell lymphoma,560,42542134.0,,,,,
20,CNS carcinoma,574,42542147.0,,,,,
22,CNS lymphoma,576,42542149.0,,,,,
23,CNS melanoma,577,42542150.0,,,,,
26,Cholangiocarcinoma,580,42542153.0,,,,,


In [5]:
# other ones are new concepts without any mapping at all so not much to be done there at this stage
concept_mappings[concept_mappings.condition_concept_id.isna()].head()

Unnamed: 0,condition_name,condition_code,condition_concept_id,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
67,MSI-H or dMMR Malignant solid neoplasm,624,,,,,,
108,Sickle cell disease,665,,,,,,
127,FLT3-positive Acute myeloid leukemia,1891,,,,,,
128,IDH-mutated Acute myeloid leukemia,1919,,,,,,
130,Ph-positive B-cell acute lymphoblastic leukemia,4770,,,,,,


In [6]:
# todo: component to class mappings not inserted properly - revisit if it becomes important, but not high priority at present
# as per conditions - some have no concept ID assigned yet
component_mappings[component_mappings.component_concept_id.isna()].head()

Unnamed: 0,component_name,component_code,component_concept_id,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
0,Iodine I 131 apamistamab,148916,,,,,,
1,Zenocutuzumab,150749,,,,,,
2,Datopotamab deruxtecan,148417,,,,,,
3,Remestemcel-L,150098,,,,,,
4,Telisotuzumab vedotin,158894,,,,,,


In [7]:
# a few missing mappings for conditions that do have concept IDs
component_mappings[component_mappings.component_concept_id.notna() & component_mappings.concept_name.isna()]#.head()

Unnamed: 0,component_name,component_code,component_concept_id,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
765,BHQ-880,57,35802910.0,,,,,
766,BL22 immunotoxin,58,35802911.0,,,,,
772,Belagenpumatucel-L,62,35802915.0,,,,,
1235,Cyclosporine modified,123,35802976.0,,,,,
1236,Cyclosporine non-modified,124,35802977.0,,,,,
...,...,...,...,...,...,...,...,...
6011,CARv3-TEAM-E T-cells,139877,37561012.0,,,,,
6015,Mid-luteal phase bilateral oophorectomy,140981,37561148.0,,,,,
6018,Ovarian irradiation,141186,37561160.0,,,,,
6026,CM-313,142243,37561328.0,,,,,


In [8]:
with so.Session(engine) as session:
    reg_study_condition = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_cui, 
            Hemonc_Regimen.regimen_name,
            Hemonc_Variant.variant_name,
            Hemonc_Variant.variant_cui,
            Hemonc_Study.study_code,
            Hemonc_Study.start,
            Hemonc_Study.end,
            Hemonc_Study.sponsor_type,
            Hemonc_Study.enrollment_from,
            Hemonc_Study.enrollment_to, 
            Hemonc_Ref.title,
            Hemonc_Ref.pub_date,            
            Hemonc_Condition.condition_name,
            Hemonc_Condition.condition_code,
            Hemonc_Condition.condition_concept_id
        )
        .join(Hemonc_Variant, Hemonc_Variant.regimen_cui == Hemonc_Regimen.regimen_cui)
        .join(variant_study_map, variant_study_map.c.variant_cui==Hemonc_Variant.variant_cui, isouter=True)
        .join(Hemonc_Study, Hemonc_Study.study_cui == variant_study_map.c.study_cui, isouter=True)
        .join(Hemonc_Ref, Hemonc_Study.study_cui == Hemonc_Ref.study, isouter=True)
        .join(Hemonc_Condition, Hemonc_Condition.condition_code == Hemonc_Study.condition_code, isouter=True)
)


In [9]:
# we should do this so that we can ignore non-canonical sigs that are very old and not being used any more
reg_study_condition.start = pd.to_datetime(reg_study_condition.start)
reg_study_condition.end = pd.to_datetime(reg_study_condition.end)
reg_study_condition.pub_date = pd.to_datetime(reg_study_condition.pub_date)

# study linkages allow us to explore date of first availability for regimens - this is imperfect in the source, but we resolve this by creating a fallback option
reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').end.min().reset_index().rename(columns={'end': 'earliest_end'}), on='variant_cui', how='left')
reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').pub_date.min().reset_index().rename(columns={'pub_date': 'earliest_pub'}), on='variant_cui', how='left')
reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').start.min().reset_index().rename(columns={'start': 'earliest_start'}), on='variant_cui', how='left')


reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').end.max().reset_index().rename(columns={'end': 'latest_end'}), on='variant_cui', how='left')
reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').pub_date.max().reset_index().rename(columns={'pub_date': 'latest_pub'}), on='variant_cui', how='left')
reg_study_condition = reg_study_condition.merge(reg_study_condition.groupby('variant_cui').start.max().reset_index().rename(columns={'start': 'latest_start'}), on='variant_cui', how='left')

reg_study_condition['earliest_reference_date'] = reg_study_condition.earliest_end.combine_first(reg_study_condition.earliest_start).combine_first(reg_study_condition.earliest_pub)
reg_study_condition['latest_reference_date'] = reg_study_condition.latest_end.combine_first(reg_study_condition.latest_start).combine_first(reg_study_condition.latest_pub)

# even with all fallbacks, some still are undated - this is correct per source
reg_study_condition_with_date = reg_study_condition.dropna(subset='earliest_reference_date').copy()

In [10]:
reg_study_condition['extract_year'] = reg_study_condition['study_code'].str.extract(r'(?<!\d)(19[5-9]\d|20[0-4]\d)(?!\d)')

In [11]:
# years as available for a final fallback
reg_study_condition[reg_study_condition.earliest_reference_date.isna() & reg_study_condition.extract_year.notna()].head()

Unnamed: 0,regimen_cui,regimen_name,variant_name,variant_cui,study_code,start,end,sponsor_type,enrollment_from,enrollment_to,title,pub_date,condition_name,condition_code,condition_concept_id,earliest_end,earliest_pub,earliest_start,latest_end,latest_pub,latest_start,earliest_reference_date,latest_reference_date,extract_year
20,814,7+3d,Variant #07,129510,Masaoka et al. 1996,NaT,NaT,,,,,NaT,Acute myeloid leukemia,552,42542126.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1996
27,1095,7+3i,Variant #03,129524,Masaoka et al. 1996,NaT,NaT,,,,,NaT,Acute myeloid leukemia,552,42542126.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1996
53,5611,Abarelix monotherapy,Variant #01,129546,Koch et al. 2003,NaT,NaT,,,,,NaT,Prostate cancer,658,42542227.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2003
188,12485,Alemtuzumab and Rituximab,Variant #01,129638,Faderl et al. 2003,NaT,NaT,,,,,NaT,Chronic lymphocytic leukemia,581,42542154.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2003
191,2712,Alemtuzumab monotherapy,Variant #03,129641,Wagner et al. 2009,NaT,NaT,,,,,NaT,Hypereosinophilic syndrome,616,42542186.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2009


In [12]:
reg_study_condition[reg_study_condition.earliest_reference_date.isna() & reg_study_condition.extract_year.isna()].study_code.value_counts().head()

study_code
CHAMPION 302    6
CHAMPION 301    6
KEYNOTE-689     4
MK-3475A-D77    3
panobidara      2
Name: count, dtype: int64

In [13]:
# if they were still being used as a baseline comparison in any study after 2010, we could assume they are potentially relevant to our data sources?
relevant_variants = list(set(reg_study_condition[reg_study_condition.latest_reference_date.dt.year >= 2010].variant_cui))

In [14]:
# ~40% of regimen/condition combinations have not been studied since before 2010
100*round(len(reg_study_condition[reg_study_condition.latest_reference_date.dt.year < 2010])/len(reg_study_condition), 2)

40.0

In [15]:
regimens_with_standard_conditions = reg_study_condition.merge(concept_mappings, how='left', left_on=['condition_name', 'condition_code'], right_on=['condition_name', 'condition_code'])

In [16]:
# unmapped conditions that actually have regimens associated (note that some of these do have Athena standard maps, but these are not in the HemOnc source data)
regimens_with_standard_conditions[regimens_with_standard_conditions.concept_name.isna()][['condition_name', 'condition_code']].drop_duplicates().head()

Unnamed: 0,condition_name,condition_code
172,,
209,MALT lymphoma,13940.0
259,Anaplastic large cell lymphoma pediatric,25958.0
288,NK- and T-cell lymphoma,637.0
493,CNS lymphoma,576.0


In [17]:
# regimens of interest for lung cancer - have confirmed this filter is not missing any relevant regimens, despite being a bit dumb
regimens_with_standard_conditions[regimens_with_standard_conditions.condition_name.str.contains('lung', case=False, na=False)].head()

Unnamed: 0,regimen_cui,regimen_name,variant_name,variant_cui,study_code,start,end,sponsor_type,enrollment_from,enrollment_to,title,pub_date,condition_name,condition_code,condition_concept_id_x,earliest_end,earliest_pub,earliest_start,latest_end,latest_pub,latest_start,earliest_reference_date,latest_reference_date,extract_year,condition_concept_id_y,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
55,29978,ABCP,Variant #02,129549,IMpower150,2015-03-15,2016-12-15,,,,,NaT,Non-small cell lung cancer nonsquamous,25309,912088.0,2016-12-15,NaT,2015-03-15,2016-12-15,NaT,2015-03-15,2016-12-15,2016-12-15,,912088.0,Maps to,Nonsquamous nonsmall cell neoplasm of lung,440173001,SNOMED,S
56,29978,ABCP,Variant #03,129550,IMpower150,2015-03-15,2016-12-15,,,,,NaT,Non-small cell lung cancer nonsquamous,25309,912088.0,2016-12-15,NaT,2015-03-15,2016-12-15,NaT,2015-03-15,2016-12-15,2016-12-15,,912088.0,Maps to,Nonsquamous nonsmall cell neoplasm of lung,440173001,SNOMED,S
57,29978,ABCP,Variant #04,129551,IMpower150,2015-03-15,2016-12-15,,,,,NaT,Non-small cell lung cancer nonsquamous,25309,912088.0,2016-12-15,NaT,2015-03-15,2016-12-15,NaT,2015-03-15,2016-12-15,2016-12-15,,912088.0,Maps to,Nonsquamous nonsmall cell neoplasm of lung,440173001,SNOMED,S
162,24534,Afatinib and Bevacizumab,Variant #01,129625,ABC Study,2014-01-01,2017-01-01,,,,,NaT,Non-small cell lung cancer,642,42542211.0,2017-01-01,NaT,2014-01-01,2017-01-01,NaT,2014-01-01,2017-01-01,2017-01-01,,42542211.0,Maps to,Non-small cell lung cancer,254637007,SNOMED,S
163,24580,Afatinib and Cetuximab,Variant #01,129626,BI 1200.71,2010-01-01,2013-01-01,,,,,NaT,Non-small cell lung cancer,642,42542211.0,2013-01-01,NaT,2010-01-01,2013-01-01,NaT,2010-01-01,2013-01-01,2013-01-01,,42542211.0,Maps to,Non-small cell lung cancer,254637007,SNOMED,S


In [18]:
regimens_with_standard_conditions[
    regimens_with_standard_conditions.regimen_name.str.contains('pembrolizumab', case=False, na=False) & 
    regimens_with_standard_conditions.regimen_name.str.contains('carboplatin', case=False, na=False) & 
    regimens_with_standard_conditions.condition_name.str.contains('lung', case=False, na=False)
].head()

Unnamed: 0,regimen_cui,regimen_name,variant_name,variant_cui,study_code,start,end,sponsor_type,enrollment_from,enrollment_to,title,pub_date,condition_name,condition_code,condition_concept_id_x,earliest_end,earliest_pub,earliest_start,latest_end,latest_pub,latest_start,earliest_reference_date,latest_reference_date,extract_year,condition_concept_id_y,relationship_id,concept_name,concept_code,vocabulary_id,standard_concept
1022,14166,Carboplatin and Etoposide (CE) and Pembrolizumab,Variant #01,130287,KEYNOTE-604,2017-05-15,2018-07-30,,,,,NaT,Small cell lung cancer,666,42542234.0,2018-07-30,NaT,2017-05-15,2018-07-30,NaT,2017-05-15,2018-07-30,2018-07-30,,42542234.0,Maps to,Small cell carcinoma of lung,254632001,SNOMED,S
6672,30172,Carboplatin and Paclitaxel (CP) and Pembrolizumab,Variant #04,134523,CANOPY-1,2019-06-25,2020-01-17,,,,,NaT,Non-small cell lung cancer squamous,25316,912089.0,2020-01-17,NaT,2019-06-25,2020-01-17,NaT,2019-06-25,2020-01-17,2020-01-17,,912089.0,Maps to,Squamous non-small cell lung cancer,723301009,SNOMED,S
6674,30166,Carboplatin and nab-Paclitaxel (CnP) and Pembr...,Variant #02,134525,CANOPY-1,2019-06-25,2020-01-17,,,,,NaT,Non-small cell lung cancer squamous,25316,912089.0,2020-01-17,NaT,2019-06-25,2020-01-17,NaT,2019-06-25,2020-01-17,2020-01-17,,912089.0,Maps to,Squamous non-small cell lung cancer,723301009,SNOMED,S
7213,29572,"Carboplatin, Pemetrexed, Pembrolizumab",Variant #04,136677,LIBRETTO-431,2020-03-15,2022-08-15,,,,,NaT,Non-small cell lung cancer,642,42542211.0,2022-08-15,NaT,2020-03-15,2022-08-15,NaT,2020-03-15,2022-08-15,2022-08-15,,42542211.0,Maps to,Non-small cell lung cancer,254637007,SNOMED,S
8478,30172,Carboplatin and Paclitaxel (CP) and Pembrolizumab,Variant #06,161278,MK-3475A-D77,NaT,NaT,,,,,NaT,Non-small cell lung cancer squamous,25316,912089.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,912089.0,Maps to,Squamous non-small cell lung cancer,723301009,SNOMED,S


In [19]:
# todo: not populated phase info 

with so.Session(engine) as session:
    variants = pd.DataFrame(
        session.query(
            Hemonc_Variant.regimen_cui,
            Hemonc_Variant.variant_name,
            Hemonc_Regimen_Part.regimen_part_id,
            Hemonc_Regimen_Part.variant_cui,
            Hemonc_Regimen_Part.timing_unit,
            Hemonc_Regimen_Part.timing,
            Hemonc_Regimen_Part.portion,
            Hemonc_Cycle_Sig.cycle_sig_cui,
            Hemonc_Cycle_Sig.cycle_sig_id,
            Hemonc_Cycle_Sig.cycle_len_min,
            Hemonc_Cycle_Sig.cycle_len_max,
            Hemonc_Cycle_Sig.cycle_len_units,
            Hemonc_Cycle_Sig.duration_min,
            Hemonc_Cycle_Sig.duration_max,
            Hemonc_Cycle_Sig.duration_units,
            Hemonc_Cycle_Sig.frequency_min,
            Hemonc_Cycle_Sig.frequency_max,
            Hemonc_Cycle_Sig.frequency_units,
            Hemonc_Cycle_Sig.repeats_min,
            Hemonc_Cycle_Sig.repeats_max,
            Hemonc_Cycle_Sig.repeats_units,
            Hemonc_Cycle_Sig.residual
        )
        .join(Hemonc_Regimen_Part, Hemonc_Regimen_Part.variant_cui == Hemonc_Variant.variant_cui)
        .join(Hemonc_Cycle_Sig, Hemonc_Cycle_Sig.cycle_sig_cui == Hemonc_Regimen_Part.cycle_sig_cui)
        .filter(Hemonc_Variant.variant_cui.in_(relevant_variants))
)

In [20]:
variants.cycle_sig_id.value_counts()

cycle_sig_id
28-day cycles                                                                                                                                                                  553
21-day cycles                                                                                                                                                                  435
21-day cycle for 4 cycles                                                                                                                                                      420
21-day course                                                                                                                                                                  369
21-day cycle for 3 cycles                                                                                                                                                      246
                                                                                            

In [21]:
variants[variants.regimen_cui == 29572].sort_values('variant_name')

Unnamed: 0,regimen_cui,variant_name,regimen_part_id,variant_cui,timing_unit,timing,portion,cycle_sig_cui,cycle_sig_id,cycle_len_min,cycle_len_max,cycle_len_units,duration_min,duration_max,duration_units,frequency_min,frequency_max,frequency_units,repeats_min,repeats_max,repeats_units,residual
997,29572,Variant #02,1815,136675,,Cycles 1 to 6,-,713,21-day cycle for 6 cycles,21.0,21,day,,,,21.0,21.0,day,6.0,6.0,cycle,
998,29572,Variant #02,1816,136675,,,-,752,21-day cycle for 35 cycles,21.0,21,day,,,,21.0,21.0,day,35.0,35.0,cycle,
999,29572,Variant #03,1817,136676,,Cycles 1 to 6,-,713,21-day cycle for 6 cycles,21.0,21,day,,,,21.0,21.0,day,6.0,6.0,cycle,
1000,29572,Variant #03,1818,136676,,,-,752,21-day cycle for 35 cycles,21.0,21,day,,,,21.0,21.0,day,35.0,35.0,cycle,
1001,29572,Variant #04,1819,136677,,Cycles 1 to 4,-,705,21-day cycle for 4 cycles,21.0,21,day,,,,21.0,21.0,day,4.0,4.0,cycle,
1002,29572,Variant #04,1820,136677,,Cycles 1 to 35,-,752,21-day cycle for 35 cycles,21.0,21,day,,,,21.0,21.0,day,35.0,35.0,cycle,
1003,29572,Variant #04,1821,136677,,,-,953,21-day cycles,28.0,28,day,,,,21.0,21.0,day,,,,


In [22]:
with so.Session(engine) as session:
    days = pd.DataFrame(
        session.query(
            Hemonc_Sig.sig_cui,
            Hemonc_Sig.sig_id,
            Hemonc_Sig.regimen_part_cui,
            Hemonc_Sig.regimen_part_id,
            Hemonc_Sig.variant_cui,
            Hemonc_Sig.component_code,
            Hemonc_Sig.component_name,
            Hemonc_Sig.component_class,
            Hemonc_Sig.step_number,
            Hemonc_Sig.tail,
            Hemonc_Sig.route,
            Hemonc_Sig.doseminnum,
            Hemonc_Sig.dosemaxnum,
            Hemonc_Sig.doseunit,
            Hemonc_Sig.dosecapnum,
            Hemonc_Sig.dosecapunit,
            Hemonc_Sig.durationminnum,
            Hemonc_Sig.durationmaxnum,
            Hemonc_Sig.durationunit,
            Hemonc_Sig.frequency,
            Hemonc_Sig.inparens,
            Hemonc_Sig.sequence,
            Hemonc_Sig.seq_rel,
            Hemonc_Sig.seq_rel_what,            
            Sig_Days.day
        )
        .join(Sig_Days, sa.and_(Sig_Days.sig_cui == Hemonc_Sig.sig_cui, Sig_Days.sig_id == Hemonc_Sig.sig_id), isouter=True)
        .filter(Hemonc_Sig.variant_cui.in_(relevant_variants))
)

In [23]:
variants[variants.variant_cui == 136675].sort_values('cycle_sig_cui')

Unnamed: 0,regimen_cui,variant_name,regimen_part_id,variant_cui,timing_unit,timing,portion,cycle_sig_cui,cycle_sig_id,cycle_len_min,cycle_len_max,cycle_len_units,duration_min,duration_max,duration_units,frequency_min,frequency_max,frequency_units,repeats_min,repeats_max,repeats_units,residual
997,29572,Variant #02,1815,136675,,Cycles 1 to 6,-,713,21-day cycle for 6 cycles,21.0,21,day,,,,21.0,21.0,day,6.0,6.0,cycle,
998,29572,Variant #02,1816,136675,,,-,752,21-day cycle for 35 cycles,21.0,21,day,,,,21.0,21.0,day,35.0,35.0,cycle,


In [24]:
route_mapper = {
    'IV': 'Onsite',  'IM': 'Onsite', 'SC': 'Onsite', 
    'IT': 'Onsite', 'Intravenous': 'Onsite', 'Subcutaneous': 'Onsite', 
    'intravesicularly': 'Onsite', 'Intramuscular': 'Onsite', 'IA': 'Onsite', 
    'IP': 'Onsite', 'Intracavitary': 'Onsite', 'by scarification': 'Onsite', 
    'Intravesical': 'Onsite', 'Intra-arterial': 'Onsite',
    'Oral': 'Home', 'PO': 'Home', 'Topical': 'Home'
}

days['administration_site'] = days.route.map(route_mapper)

In [25]:
days.administration_site.value_counts()

administration_site
Onsite    12592
Home       6554
Name: count, dtype: int64

In [26]:
days.columns

Index(['sig_cui', 'sig_id', 'regimen_part_cui', 'regimen_part_id',
       'variant_cui', 'component_code', 'component_name', 'component_class',
       'step_number', 'tail', 'route', 'doseminnum', 'dosemaxnum', 'doseunit',
       'dosecapnum', 'dosecapunit', 'durationminnum', 'durationmaxnum',
       'durationunit', 'frequency', 'inparens', 'sequence', 'seq_rel',
       'seq_rel_what', 'day', 'administration_site'],
      dtype='object')

In [27]:
disease_restricted_drugs = '|'.join(['ketoconazole','kacrolimus','megestrol',
                            'estradiol','octreotide','everolimus','valproate',
                            'goserelin','quinine','sirolimus'])

bad_drugs = '|'.join(['Dexamethasone','Prednisone','Prednisolone','Methylprednisolone',
             'Filgrastim','Folinic Acid','Mesna','Mycophenolate mofetil',
             'Folinic acid', 'nan'])

In [28]:
days.head()

Unnamed: 0,sig_cui,sig_id,regimen_part_cui,regimen_part_id,variant_cui,component_code,component_name,component_class,step_number,tail,route,doseminnum,dosemaxnum,doseunit,dosecapnum,dosecapunit,durationminnum,durationmaxnum,durationunit,frequency,inparens,sequence,seq_rel,seq_rel_what,day,administration_site
0,9,0,0,8,129498,126,Cytarabine,IV continuous canonical Sig,1 of 1,-,IV,100,100,mg/m^2/day,,,120.0,120.0,hour,continuous,(total dose: 500 mg/m^2),,,,1,Onsite
1,10,1,0,8,129498,143,Daunorubicin,IV intermittent canonical Sig,1 of 1,-,IV,45,45,mg/m^2,,,,,,once per day,,,,,1,Onsite
2,10,1,0,8,129498,143,Daunorubicin,IV intermittent canonical Sig,1 of 1,-,IV,45,45,mg/m^2,,,,,,once per day,,,,,2,Onsite
3,13,0,0,10,129500,126,Cytarabine,IV continuous canonical Sig,1 of 1,-,IV,100,100,mg/m^2/day,,,120.0,120.0,hour,continuous,(total dose: 500 mg/m^2),,,,1,Onsite
4,14,1,0,10,129500,143,Daunorubicin,IV intermittent canonical Sig,1 of 1,-,IV,60,60,mg/m^2,,,,,,once per day,,,,,1,Onsite


In [29]:
print(len(days[days.component_class=='Non-canonical Sig'])/len(days))

days.component_class.value_counts()

0.06003743104806935


component_class
IV intermittent canonical Sig    10373
Non-IV canonical Sig              7447
Non-canonical Sig                 1219
Rad Sig                            936
IV continuous canonical Sig        328
Name: count, dtype: int64

In [30]:
# we are fully ignoring non-canonical sigs for the moment - to revisit
non_canonical = days[(days.component_class=='Non-canonical Sig') & ~days.component_name.str.contains(bad_drugs)].copy()
# split events by administration location for special handling of drugs that may not appear in the source system
onsite_events = days[(days.component_class!='Non-canonical Sig') & ~days.component_name.str.contains(bad_drugs) & (days.administration_site == 'Onsite')].copy()
home_events = days[(days.component_class!='Non-canonical Sig') & ~days.component_name.str.contains(bad_drugs) & (days.administration_site == 'Home')].copy()

In [31]:
# IV intermittent canonical Sig is a SIG for an intravenous medication administered on a single calendar day that has at a minimum a value for each of the following variables: [Component] [Dose] [Dose Unit] [Route] [Frequency] [Schedule].
# Example: Cisplatin 70 mg/m^2 IV once on day 1
# Note that these SIGs can have additional information; this is the minimum requirement. A more complicated example that is also canonical is as follows:
# Example: Cisplatin 70 mg/m^2 (maximum dose of 140 mg) IV over 30 to 60 minutes once on day 1, given second, 30 minutes after pemetrexed
# Example structure: [Component] [Dose] [Dose Unit] ([Dose Cap] [Dose Cap Unit]) [Route] [Duration Min] [Duration Max] [Duration Unit] [Frequency] [Schedule] [Sequence] [seq.rel.when] [seq.rel.when.unit] [seq.rel] [seq.rel.what]

canonical_cols = ['sig_cui', 'sig_id', 'regimen_part_cui', 'regimen_part_id',
                  'variant_cui', 'component_code', 'component_name', 'step_number',
                  'route', 'doseminnum', 'dosemaxnum', 'doseunit', 'dosecapnum',
                  'frequency', 'day']

#------
# IV continuous canonical Sig is a SIG for an intravenous medication that is infused over a protracted period of time, crossing multiple calendar days. These have a more complicated structure and require a duration in order to be canonical, as follows:
# [Component] [Dose] [Dose Unit]/day [Route] continuous infusion over [Duration] [Duration Unit], started on [Schedule] (total dose: [Dose] [Dose Unit])
# Example: Cytarabine 200 mg/m2/day IV continuous infusion over 7 days, started on day 1 (total dose: 1400 mg/m2)
# Note that while the information in parentheses is technically redundant, it is included both on HemOnc.org and in the HemOncKB as an extra check for accuracy, given the complexity of this dosing format. If the continuous SIG is repeated more than once, the total dose per cycle is given:
# Example: Doxorubicin 37.5 mg/m2/day IV continuous infusion over 48 hours, started on day 1 (total dose per cycle: 75 mg/m2)

continuous_cols = ['sig_cui', 'sig_id', 'regimen_part_cui', 'regimen_part_id',
                  'variant_cui', 'component_code', 'component_name', 'step_number',
                  'route', 'doseminnum', 'dosemaxnum', 'doseunit', 'dosecapnum', 
                  'durationminnum', 'durationmaxnum', 'durationunit',
                  'frequency', 'day']

#------
# Non-IV canonical Sig is a SIG for any medication other than those given by the intravenous route that has at least the following information (the same requirements as IV intermittent canonical Sig):  [Component] [Dose] [Dose Unit] [Route] [Frequency] [Schedule].
# Example: Capecitabine 1000 mg/m^2 PO twice per day on days 1 to 14
# -----
# Non-canonical Sig is a medication SIG that does not meet the above criteria, for any number of reasons. Most commonly, these SIGs are missing one or more required components, such as dose, schedule, and/or route. 
# Our goal is to minimize the number of such non-canonical SIGs, although they are unavoidable as some references do not provide enough specifics to fully define SIGs.
# -----
# Rad Sig is a radiation therapy prescription. These are not further specified into canonical or non-canonical forms, currently.

In [32]:
canonical_iv = onsite_events[onsite_events.component_class=='IV intermittent canonical Sig'][canonical_cols].copy()

In [33]:
canonical_iv[canonical_iv.day=='-2']

Unnamed: 0,sig_cui,sig_id,regimen_part_cui,regimen_part_id,variant_cui,component_code,component_name,step_number,route,doseminnum,dosemaxnum,doseunit,dosecapnum,frequency,day
82,120,0,0,58,129539,17,Alemtuzumab,1 of 3,IV,3,3,mg,,once,-2
86,124,0,1,59,129539,17,Alemtuzumab,1 of 3,IV,3,3,mg,,once,-2
997,1043,2,0,685,129861,126,Cytarabine,1 of 1,IV,200,200,mg/m^2,,every 12 hours,-2
1000,1044,3,0,685,129861,201,Etoposide,1 of 1,IV,200,200,mg/m^2,,once per day,-2
1006,1063,2,0,689,129865,126,Cytarabine,1 of 1,IV,400,400,mg/m^2,,once per day,-2
1009,1064,3,0,689,129865,201,Etoposide,1 of 1,IV,200,200,mg/m^2,,once per day,-2
1021,1070,4,0,690,129866,325,Melphalan,1 of 1,IV,140,140,mg/m^2,,once,-2
1026,1093,2,0,696,129872,126,Cytarabine,1 of 1,IV,400,400,mg/m^2,,once per day,-2
1029,1094,3,0,696,129872,201,Etoposide,1 of 1,IV,200,200,mg/m^2,,once per day,-2
1688,1413,2,0,900,129999,325,Melphalan,1 of 1,IV,70,70,mg/m^2,,once per day,-2


In [35]:
variants.cycle_len_min = variants.cycle_len_min.replace('', 0).astype(float).astype(int)
canonical_iv.day = canonical_iv.day.map(lambda x: x.strip('()')).astype(int)

unit_to_days = {
    "day":   1,
    "week":  7,
    "month": 30,   
    "year":  365,
    "indeterminate": 730 # 2 years if we don't know? it's not common...
}

variants['cycle_len_days'] = (
    variants['cycle_len_min'].astype(float)
      * variants['cycle_len_units'].map(unit_to_days).fillna(730)
).astype(int)

In [36]:
canonical_iv_dose_matrices = {}

for sc, dets in canonical_iv.groupby('regimen_part_id'):
    df = dets.merge(variants[['variant_cui', 'cycle_len_days', 'cycle_len_units']])
    min_day = df['day'].min()
    max_day = int(df['cycle_len_days'].iloc[0])
    drug_mat = pd.DataFrame({'day': range(min_day, max_day + 1)})

    dose_matrix = (
        df.assign(value=1)
        .pivot_table(
            index='day',
            columns='component_name',
            values='value',
            aggfunc='max',
            fill_value=0
        )
        .reindex(drug_mat['day'], fill_value=0)    # extend to full cycle length
        .reset_index()
    )
    canonical_iv_dose_matrices[sc] = dose_matrix

In [37]:
canonical_iv.regimen_part_id.value_counts()

regimen_part_id
3292    22
444     20
1715    20
1755    18
69      18
        ..
4183     1
4187     1
4203     1
4204     1
7803     1
Name: count, Length: 3622, dtype: int64

In [38]:
canonical_iv_dose_matrices[3292]

component_name,day,Asparaginase,Cyclophosphamide,Daunorubicin,Rituximab,Vincristine
0,1,0,1,1,1,1
1,2,0,0,0,0,0
2,3,0,0,1,0,0
3,4,0,0,0,0,0
4,5,0,0,0,0,0
5,6,0,0,0,1,0
6,7,0,0,0,1,1
7,8,1,0,0,0,0
8,9,0,0,0,0,0
9,10,1,0,0,0,0


In [39]:
# consider populating matrix with 2 / 3 / 4 ? 
canonical_iv.frequency.value_counts()

frequency
once per day      5351
once              4085
every 12 hours     187
twice per day       32
daily NOS           18
every 6 hours        3
every 8 hours        1
Name: count, dtype: int64

In [40]:
home_events.component_class.value_counts()

component_class
Non-IV canonical Sig    3998
Name: count, dtype: int64

In [None]:
onsite_events[(onsite_events.component_name=='Cyclophosphamide') & (onsite_events.component_class!='Non-canonical Sig')]

In [None]:
onsite_events[onsite_events.component_class=='Non-canonical Sig'].component_name.value_counts()[:50]

In [None]:
days[days.component_name.str.contains('paclitaxel', case=False, na=False)].head()

In [None]:
days[days.component_name.str.contains('goserelin', case=False, na=False)].head()

In [None]:
days.columns