In [None]:
from datetime import date
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sqlalchemy as sa
import sqlalchemy.orm as so

from hemonc_alchemy.model.hemonc_model import Hemonc_Study, Hemonc_Modality, Hemonc_Condition, Hemonc_Component_Role, Hemonc_Ref, Hemonc_Component, Hemonc_Component_Class, Hemonc_Context, Hemonc_Regimen, Hemonc_Branch_Conditional, Hemonc_Variant, Hemonc_Regimen_Part, Part_Phase, Hemonc_Cycle_Sig, Hemonc_Sig, Sig_Days, Base, component_to_class_map, variant_study_map, regimen_to_modality_map

# This notebook demonstrates how to explore the richer functionality of the full HemOnc data model, beyond what is available in OMOP alone.

In [None]:
import omop_alchemy as oa
# engine exposed by omop_alchemy's oa_config; the sessions in the cells below are opened on it
# NOTE(review): presumably a SQLAlchemy Engine, since it is passed to so.Session - confirm
engine = oa.oa_config.engine

In [None]:
# Study records give a finer-grained link between regimens and conditions: each
# regimen is reached through its variants and the condition recorded on the study
# in which that variant was investigated.

with so.Session(engine) as session:
    reg_study_condition = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_cui,
            Hemonc_Regimen.regimen_name,
            Hemonc_Variant.variant_name,
            Hemonc_Variant.variant_cui,
            Hemonc_Study.study_code,
            Hemonc_Study.start,
            Hemonc_Study.end,
            Hemonc_Study.sponsor_type,
            Hemonc_Study.enrollment_from,
            Hemonc_Study.enrollment_to,
            Hemonc_Ref.title,
            Hemonc_Ref.pub_date,
            Hemonc_Condition.condition_name,
        )
        # every hop is a LEFT OUTER JOIN, so a regimen still yields a row (with
        # nulls) when it has no variant, study, reference or condition attached
        .outerjoin(Hemonc_Variant, Hemonc_Variant.regimen_cui == Hemonc_Regimen.regimen_cui)
        .outerjoin(variant_study_map, variant_study_map.c.variant_cui == Hemonc_Variant.variant_cui)
        .outerjoin(Hemonc_Study, Hemonc_Study.study_code == variant_study_map.c.study_code)
        .outerjoin(Hemonc_Ref, Hemonc_Study.study_code == Hemonc_Ref.study)
        .outerjoin(Hemonc_Condition, Hemonc_Condition.condition_code == Hemonc_Study.condition_code)
        .all()
    )

In [None]:
# convert the study start/end and publication date columns to pandas datetimes
for date_column in ('start', 'end', 'pub_date'):
    reg_study_condition[date_column] = pd.to_datetime(reg_study_condition[date_column])

In [None]:
# Study linkage lets us estimate when each regimen variant first became available.
# The source dates are incomplete, so per variant we take the earliest study end,
# falling back to the earliest study start and then the earliest publication date.

# Drop any previously derived columns first so that re-running this cell is safe;
# otherwise the merge below would produce *_x / *_y suffixed duplicates.
reg_study_condition = reg_study_condition.drop(
    columns=['earliest_end', 'earliest_pub', 'earliest_start', 'reference_date'],
    errors='ignore',
)

# One grouped pass yields all three per-variant minima. Rows whose variant_cui is
# null have no group, so after the left merge their earliest_* values stay missing.
reg_study_condition = reg_study_condition.merge(
    reg_study_condition.groupby('variant_cui')
    .agg(
        earliest_end=('end', 'min'),
        earliest_pub=('pub_date', 'min'),
        earliest_start=('start', 'min'),
    )
    .reset_index(),
    on='variant_cui',
    how='left',
)

# fallback order: earliest end -> earliest start -> earliest publication
reg_study_condition['reference_date'] = (
    reg_study_condition['earliest_end']
    .combine_first(reg_study_condition['earliest_start'])
    .combine_first(reg_study_condition['earliest_pub'])
)

In [None]:
# Even with all fallbacks applied, some rows remain undated - this is correct per source.
# Keep only the dated rows for the time-based analysis; .copy() so that columns
# added later do not write into a view of reg_study_condition.
reg_study_condition_with_date = reg_study_condition.dropna(subset=['reference_date']).copy()

# Studies behind the undated rows. This must be the last expression in the cell,
# otherwise the notebook computes the summary without ever displaying it.
reg_study_condition[reg_study_condition.reference_date.isna()].study_code.value_counts()

In [None]:
# preview the first rows of the dated subset
reg_study_condition_with_date.head()

In [None]:
# The OMOP vocabularies do not carry the full cycle and component sigs, and arguably
# could not carry them in enough detail to compare the prescribed baseline regimen
# precisely against the drug exposure events as they were actually delivered.

with so.Session(engine) as session:
    var_component_days = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_name,
            Hemonc_Regimen.regimen_cui,
            Hemonc_Variant.variant_name,
            Hemonc_Variant.variant_cui,
            Hemonc_Cycle_Sig.cycle_len_min,
            Hemonc_Cycle_Sig.cycle_len_max,
            Hemonc_Cycle_Sig.cycle_len_units,
            Hemonc_Cycle_Sig.frequency_min,
            Hemonc_Cycle_Sig.frequency_max,
            Hemonc_Cycle_Sig.frequency_units,
            Hemonc_Cycle_Sig.residual,
            Hemonc_Regimen_Part.regimen_part_id,
            Hemonc_Regimen_Part.timing,
            Hemonc_Regimen_Part.timing_unit,
            Hemonc_Regimen_Part.portion,
            Hemonc_Sig.frequency,
            Hemonc_Sig.component_name,
            Hemonc_Sig.component_role,
            Hemonc_Sig.step_number,
            Hemonc_Sig.route,
            Hemonc_Sig.doseMinNum,
            Hemonc_Sig.doseMaxNum,
            Hemonc_Sig.component_class,
            Hemonc_Sig.tail,
            Sig_Days.sig_id,
            Sig_Days.day,
        )
        # regimen -> variant -> regimen part -> cycle sig / component sig (outer joins)
        .outerjoin(Hemonc_Variant, Hemonc_Variant.regimen_cui == Hemonc_Regimen.regimen_cui)
        .outerjoin(Hemonc_Regimen_Part, Hemonc_Regimen_Part.variant_cui == Hemonc_Variant.variant_cui)
        .outerjoin(Hemonc_Cycle_Sig, Hemonc_Cycle_Sig.cycle_sig_id == Hemonc_Regimen_Part.cycle_sig_id)
        .outerjoin(
            Hemonc_Sig,
            sa.and_(
                Hemonc_Sig.variant_cui == Hemonc_Regimen_Part.variant_cui,
                Hemonc_Sig.regimen_part_id == Hemonc_Regimen_Part.regimen_part_id,
            ),
        )
        # NOTE(review): unlike the joins above, this one is an INNER join, so only
        # sigs that have matching Sig_Days rows are returned - confirm this is intended
        .join(
            Sig_Days,
            sa.and_(
                Sig_Days.variant_cui == Hemonc_Sig.variant_cui,
                Sig_Days.regimen_part_id == Hemonc_Sig.regimen_part_id,
                Sig_Days.sig_id == Hemonc_Sig.sig_id,
            ),
        )
        .all()
    )

In [None]:
# Modalities attached to each regimen through the regimen -> modality map.
# Both joins are inner joins, so regimens with no mapped modality are not returned.
with so.Session(engine) as session:
    reg_modalities = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_name,
            Hemonc_Regimen.regimen_cui,
            Hemonc_Modality.modality_name,
        )
        .join(regimen_to_modality_map, Hemonc_Regimen.regimen_cui == regimen_to_modality_map.c.regimen_cui)
        .join(Hemonc_Modality, Hemonc_Modality.modality_code == regimen_to_modality_map.c.modality_code)
        .all()
    )
                                             

In [None]:
# Express every reference date as whole calendar years / months elapsed since the
# earliest reference date in the dated subset (day of month is ignored).
first_reg = reg_study_condition_with_date['reference_date'].min()
reg_study_condition_with_date['elapsed_years'] = (
    reg_study_condition_with_date['reference_date'].dt.year - first_reg.year
)
reg_study_condition_with_date['elapsed_months'] = (
    12 * reg_study_condition_with_date['elapsed_years']
    + reg_study_condition_with_date['reference_date'].dt.month
    - first_reg.month
)


In [None]:
# Attach modalities to the dated regimen rows (left merge on the columns the two
# frames share), then keep one row per distinct combination of the plotting columns.
reg_study_modalities = reg_study_condition_with_date.merge(reg_modalities, how='left')
reg_study_modalities = reg_study_modalities[
    ['regimen_cui', 'reference_date', 'condition_name', 'modality_name', 'elapsed_months']
].drop_duplicates()

In [None]:
# Running count of rows per modality in (reference_date, regimen_cui) order.
# cumcount() numbers rows from 0, so add 1 to get a true cumulative count; a value
# of 0 would also be impossible to draw on the log-scaled y axis used for plotting.
# The result is aligned back onto the unsorted frame by index.
reg_study_modalities['cc'] = (
    reg_study_modalities
    .sort_values(['reference_date', 'regimen_cui'])
    .groupby('modality_name')
    .regimen_cui.cumcount()
    + 1
)

In [None]:
# Visualisation of how regimen availability has grown over time for each modality.
# (plotly is imported in the notebook's import cell.)

modalities_of_interest = ['Chemotherapy', 'Immunotherapy', 'Endocrine therapy', 'Targeted therapy']
regimen_modalities = (
    reg_study_modalities[reg_study_modalities.modality_name.isin(modalities_of_interest)]
    .sort_values(['reference_date', 'regimen_cui'])
)

# Question: instead of plotting regimens alone, should we be plotting variants? Or regimen/condition pairs?

# The x axis is in elapsed months, so label it with calendar years at 5-year steps.
earliest_reference = regimen_modalities.reference_date.min()
tick_text = list(range(earliest_reference.year, date.today().year, 5))
# elapsed_months value for January of the earliest plotted year. Anchoring ticks on
# the raw minimum instead would shift every year label by the month of the earliest
# date (up to 11 months away from the year boundary it claims to mark).
january_offset = regimen_modalities.elapsed_months.min() - (earliest_reference.month - 1)
tick_vals = [january_offset + 12 * (year - earliest_reference.year) for year in tick_text]

fig = px.line(
    regimen_modalities,
    x='elapsed_months',
    y='cc',
    log_y=True,
    color='modality_name',
    labels={
        'elapsed_months': 'Date',
        'cc': 'Cumulative Regimen Count (log)',
        'modality_name': 'Regimen Modality',
    },
    title='Availability of New Regimens by Date and Modality',
)

fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=tick_vals,
        ticktext=tick_text,
    )
)
fig.show()

In [None]:
# Components of each regimen by role. The component class is resolved from the
# class code stored on the role row itself.
with so.Session(engine) as session:
    component_roles = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_cui,
            Hemonc_Regimen.regimen_name,
            Hemonc_Component_Role.relationship_id,
            Hemonc_Component.component_code,
            Hemonc_Component.component_name,
            Hemonc_Component_Class.component_class_code,
            Hemonc_Component_Class.component_class_name,
        )
        # inner join: only regimens that have at least one component role row
        .join(Hemonc_Component_Role, Hemonc_Regimen.regimen_cui == Hemonc_Component_Role.regimen_cui)
        .outerjoin(Hemonc_Component, Hemonc_Component.component_code == Hemonc_Component_Role.component_code)
        .outerjoin(Hemonc_Component_Class, Hemonc_Component_Class.component_class_code == Hemonc_Component_Role.component_class_code)
        .all()
    )
    
# Same regimen/component selection, but here the component class is reached through
# the component -> class map rather than the class code on the role row.
with so.Session(engine) as session:
    component_classes = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_cui,
            Hemonc_Regimen.regimen_name,
            Hemonc_Component_Role.relationship_id,
            Hemonc_Component.component_code,
            Hemonc_Component.component_name,
            Hemonc_Component_Class.component_class_code,
            Hemonc_Component_Class.component_class_name,
        )
        # inner join: only regimens that have at least one component role row
        .join(Hemonc_Component_Role, Hemonc_Regimen.regimen_cui == Hemonc_Component_Role.regimen_cui)
        .outerjoin(Hemonc_Component, Hemonc_Component.component_code == Hemonc_Component_Role.component_code)
        .outerjoin(component_to_class_map, Hemonc_Component.component_code == component_to_class_map.c.component_code)
        .outerjoin(Hemonc_Component_Class, Hemonc_Component_Class.component_class_code == component_to_class_map.c.component_class_code)
        .all()
    )

In [None]:
# Attach reference dates to the component roles. The merge uses the default inner
# join on the shared column(s), so regimens without a dated row drop out.
component_roles_with_dates = component_roles.merge(
    reg_study_condition_with_date[['regimen_cui', 'reference_date', 'elapsed_months']].drop_duplicates()
)

In [None]:
# Number of distinct chemotherapy components per regimen.
chemo_agent_counts = (
    component_roles_with_dates[
        # na=False: a missing relationship_id is treated as "not a chemo role";
        # without it a null would end up in the mask and the selection would raise
        component_roles_with_dates.relationship_id.str.contains('chemo', na=False)
    ]
    .groupby('regimen_cui')
    .component_name.nunique()
    .reset_index()
    .rename(columns={'component_name': 'chemo_n'})
)

# One row per distinct dated regimen/condition combination, annotated with its
# agent count (inner merge: regimens without any chemo component are left out).
chemo_count = (
    reg_study_condition_with_date[['regimen_cui', 'regimen_name', 'reference_date', 'condition_name', 'elapsed_months']]
    .drop_duplicates()
    .merge(chemo_agent_counts, on='regimen_cui')
)

# Running count of rows per agent-count group in date order. cumcount() starts at 0,
# so add 1 to get a true cumulative count that can be drawn on a log-scaled axis.
chemo_count['cc'] = (
    chemo_count
    .sort_values(['reference_date', 'regimen_cui'])
    .groupby('chemo_n')
    .regimen_cui.cumcount()
    + 1
)

In [None]:
# Visualisation of the number of agents over time, as a proxy measure for regimen complexity.

# The x axis is in elapsed months, so label it with calendar years at 5-year steps.
earliest_reference = chemo_count.reference_date.min()
tick_text = list(range(earliest_reference.year, date.today().year, 5))
# elapsed_months value for January of the earliest plotted year. Anchoring ticks on
# the raw minimum instead would shift every year label by the month of the earliest
# date (up to 11 months away from the year boundary it claims to mark).
january_offset = chemo_count.elapsed_months.min() - (earliest_reference.month - 1)
tick_vals = [january_offset + 12 * (year - earliest_reference.year) for year in tick_text]

fig = px.line(
    chemo_count.sort_values(['reference_date', 'regimen_cui']),
    x='elapsed_months',
    y='cc',
    log_y=True,
    color='chemo_n',
    labels={
        'elapsed_months': 'Date',
        'cc': 'Cumulative Regimen Count (log)',
        'chemo_n': '# Chemo Agents',
    },
    title='Availability of New Regimens by Number of Chemotherapy Agents',
)

fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=tick_vals,
        ticktext=tick_text,
    )
)
fig.show()

In [None]:
# Re-run the regimen -> variant -> study -> condition query, this time also selecting
# the condition code. This rebinds reg_study_condition, so the derived date columns
# added to the earlier frame are not present on this fresh result.
with so.Session(engine) as session:
    reg_study_condition = pd.DataFrame(
        session.query(
            Hemonc_Regimen.regimen_cui,
            Hemonc_Regimen.regimen_name,
            Hemonc_Variant.variant_name,
            Hemonc_Variant.variant_cui,
            Hemonc_Study.study_code,
            Hemonc_Study.start,
            Hemonc_Study.end,
            Hemonc_Study.sponsor_type,
            Hemonc_Study.enrollment_from,
            Hemonc_Study.enrollment_to,
            Hemonc_Ref.title,
            Hemonc_Ref.pub_date,
            Hemonc_Condition.condition_code,
            Hemonc_Condition.condition_name,
        )
        # all LEFT OUTER JOINs: regimens are kept even when a later hop has no match
        .outerjoin(Hemonc_Variant, Hemonc_Variant.regimen_cui == Hemonc_Regimen.regimen_cui)
        .outerjoin(variant_study_map, variant_study_map.c.variant_cui == Hemonc_Variant.variant_cui)
        .outerjoin(Hemonc_Study, Hemonc_Study.study_code == variant_study_map.c.study_code)
        .outerjoin(Hemonc_Ref, Hemonc_Study.study_code == Hemonc_Ref.study)
        .outerjoin(Hemonc_Condition, Hemonc_Condition.condition_code == Hemonc_Study.condition_code)
        .all()
    )