In [1]:
# Standard-library and pandas imports.
import os
import sys
import pandas as pd
# Remember the directory the kernel was started in and put its sibling
# `../src` directory on the import path (presumably where the `utils`
# package imported below lives).
base_path = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(base_path, '../src')))
# Project-local helper modules.
from utils import importer as imp
from utils import helper as hp
from utils import exporter as ex
# Plotting libraries.
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Switch the working directory to the parent of the directory the notebook
# was started in. The parent is derived from `base_path` (captured in the
# import cell) rather than from the current working directory, so re-running
# this cell does not climb one directory further up on every execution.
parent_dir = os.path.dirname(base_path)
os.chdir(parent_dir)

# GMDS related stuff

In [None]:
# Sunburst Plot - Disease Dynamics
# data
# NOTE: the WHERE clause filters on c2.concept_code, so episodes without a
# matching source concept are dropped even though c2 is LEFT JOINed.
sql = """select e.*, c.concept_name, c2.concept_name as episode_source_concept_name, c2.vocabulary_id, c2.concept_code
            from cdm.episode e 
            inner join cdm.concept c 
            on episode_concept_id = c.concept_id  
            and c.concept_class_id = 'Disease Dynamic'
            left join cdm.concept c2
            on c2.concept_id = e.episode_source_concept_id
            where c2.concept_code like '%C50%';"""
disease_dynamic = ex.execute_sql(sql)

# formatting
disease_dynamic['episode_start_date'] = pd.to_datetime(disease_dynamic['episode_start_date'])

# sorting: order each person's episodes by start date so the joined paths
# below follow that order
df_sorted = disease_dynamic.sort_values(by=['person_id', 'episode_start_date', 'episode_source_concept_id'])

# dynamics path per (person, source concept), joined in sorted order
paths = (
    df_sorted
    .groupby(['person_id', 'episode_source_concept_id'])['concept_name']
    .apply(lambda names: ' -> '.join(names))
    .reset_index(name='concept_path')
)

# Split every path into its steps once; the split is reused for the path
# length, the level columns and the tree, so all three use the same separator.
path_steps = paths['concept_path'].str.split(' -> ')
paths['path_length'] = path_steps.str.len()
max_depth = int(paths['path_length'].max())

# one column per path position; None where a path is shorter than max_depth
for depth in range(max_depth):
    paths[f'level_{depth + 1}'] = path_steps.apply(
        lambda steps, depth=depth: steps[depth] if depth < len(steps) else None
    )
print("Paths with levels:")
print(paths)

# 2. tree structure: one record per (path, prefix). A node id is the prefix of
# the path up to that step, its parent is the prefix one step shorter
# ('' for the first step).
node_records = [
    {
        'id': ' - '.join(steps[:depth + 1]),
        'parent': ' - '.join(steps[:depth]),
        'label': step,
    }
    for steps in path_steps
    for depth, step in enumerate(steps)
]

# 3. Count how many paths run through each node and keep the frequent ones.
n = 1000  # minimum number of paths a node needs to be plotted
df_nodes = (
    pd.DataFrame(node_records, columns=['id', 'parent', 'label'])
    .value_counts()
    .reset_index(name='count')
)
# .copy() so the column assignments below operate on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df_nodes = df_nodes[df_nodes['count'] > n].copy()
print(f"Filtered nodes with count > {n}:")
print(df_nodes)

# color formatting
color_map = {
    'Stable Disease': 'rgba(153, 217, 190, 0.5)',
    'Complete Remission': 'rgba(158, 202, 225, 0.5)',
    'Partial Remission': 'rgba(141, 160, 203, 0.5)',
    'Progression': 'rgba(244, 182, 216, 0.5)'
}
# labels without an entry in color_map fall back to a translucent grey
df_nodes['color'] = df_nodes['label'].map(color_map).fillna('#D3D3D380')
df_nodes['label_count'] = df_nodes['label'] + ' (' + df_nodes['count'].astype(str) + ')'

# 4. Plot the sunburst
fig = px.sunburst(
    df_nodes,
    ids='id',
    names='label_count',
    parents='parent',
    color='label',
    color_discrete_map=color_map,
    values='count',
    # 'count' already includes every path that continues into a child node,
    # so the values are totals, not remainders.
    branchvalues='total',
    # the threshold n applies to the node count, not to the path length
    title=f"Patientenpfade der concept_name (Anzahl > {n}, zeitlich sortiert)",
    width=1200,
    height=1200
)

fig.update_traces(
    insidetextfont=dict(size=18),
    outsidetextfont=dict(size=14)
)

fig.update_layout(
    uniformtext=dict(minsize=11.5, mode='show'),
    title_font_size=24,
    font=dict(size=16)
)
fig.show()
fig.write_html("sunburst.html")



In [None]:
# Sunburst Plot - Regimen
# data
# pat_filter selects persons with a first episode (episode_number = 1) of
# 'Complete Remission' whose source concept code contains 'C50'; the outer
# query returns the HemOnc episodes of those persons.

sql2 = """with pat_filter as(
    select distinct ep.person_id
    from cdm.episode ep
    inner join cdm.concept c
    on ep.episode_concept_id = c.concept_id
    and c.concept_name = 'Complete Remission'
    inner join cdm.concept c2
            on c2.concept_id = ep.episode_source_concept_id
            where c2.concept_code like '%C50%'
            and ep.episode_number=1
    )
    select e.*, c.concept_name
            from cdm.episode e 
            inner join cdm.concept c 
            on episode_object_concept_id = c.concept_id  
            and c.vocabulary_id = 'HemOnc'
            inner join pat_filter p
            on p.person_id = e.person_id;"""
            
regimen = ex.execute_sql(sql2)

# formatting
regimen['episode_start_date'] = pd.to_datetime(regimen['episode_start_date'])

# sorting: order each person's episodes by start date so the joined paths
# below follow that order
df_sorted = regimen.sort_values(by=['person_id', 'episode_start_date'])

# regimen path per person, joined in sorted order
paths = (
    df_sorted
    .groupby(['person_id'])['concept_name']
    .apply(lambda names: ' -> '.join(names))
    .reset_index(name='concept_path')
)

# Split every path into its steps once; the split is reused for the path
# length, the level columns and the tree, so all three use the same separator.
path_steps = paths['concept_path'].str.split(' -> ')
paths['path_length'] = path_steps.str.len()
max_depth = int(paths['path_length'].max())

# one column per path position; None where a path is shorter than max_depth
for depth in range(max_depth):
    paths[f'level_{depth + 1}'] = path_steps.apply(
        lambda steps, depth=depth: steps[depth] if depth < len(steps) else None
    )

# keep only paths whose first regimen occurs as a first step at least n times
level1_counts = paths['level_1'].value_counts()
n = 1000
valid_level1 = level1_counts[level1_counts >= n].index
keep_path = paths['level_1'].isin(valid_level1)
paths = paths[keep_path]
path_steps = path_steps[keep_path]
print("Paths with levels:")
print(paths)

# 2. tree structure: one record per (path, prefix). A node id is the prefix of
# the path up to that step, its parent is the prefix one step shorter
# ('' for the first step).
node_records = [
    {
        'id': ' - '.join(steps[:depth + 1]),
        'parent': ' - '.join(steps[:depth]),
        'label': step,
    }
    for steps in path_steps
    for depth, step in enumerate(steps)
]

# 3. Count how many paths run through each node and keep the frequent ones.
n2 = 200  # minimum number of paths a node needs to be plotted
df_nodes = (
    pd.DataFrame(node_records, columns=['id', 'parent', 'label'])
    .value_counts()
    .reset_index(name='count')
)
# .copy() so the column assignments below operate on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df_nodes = df_nodes[df_nodes['count'] > n2].copy()
print(f"Filtered nodes with count > {n2}:")
print(df_nodes)

# color formatting with matplotlib
# unique labels of df_nodes
unique_labels = df_nodes['label'].unique()

# Colormap from matplotlib, resampled to one entry per label.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
base_cmap = plt.get_cmap('Pastel1', len(unique_labels))

# RGBA floats (0..1) -> CSS rgba() strings with a fixed alpha of 0.5
color_map = {
    label: f'rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, 0.5)'
    for label, (r, g, b, _) in zip(unique_labels, base_cmap(range(len(unique_labels))))
}
df_nodes['color'] = df_nodes['label'].map(color_map).fillna('#D3D3D380')
df_nodes['label_count'] = df_nodes['label'] + ' (' + df_nodes['count'].astype(str) + ')'

# 4. Plot the sunburst
fig = px.sunburst(
    df_nodes,
    ids='id',
    names='label_count',
    parents='parent',
    color='label',
    color_discrete_map=color_map,
    values='count',
    # 'count' already includes every path that continues into a child node,
    # so the values are totals, not remainders.
    branchvalues='total',
    width=1200,
    height=1200
)

fig.update_traces(
    insidetextfont=dict(size=18),
    outsidetextfont=dict(size=14)
)

fig.update_layout(
    uniformtext=dict(minsize=14, mode='show'),
    title_font_size=24,
    font=dict(size=16)
)
fig.show()
# Separate file name so this export does not overwrite the "sunburst.html"
# written by the disease-dynamics cell.
fig.write_html("sunburst_regimen.html")



In [None]:
# Gantt diagram - episodes of a single patient
# `label` is the episode concept name (used for the legend/colors);
# `concept_name` is the same name, extended by the object concept name for
# 'Cancer Drug Treatment' and 'Treatment Regimen' episodes.
sql3 = """select e.*,c.concept_name as label, case when c.concept_name = 'Cancer Drug Treatment' then 'Drug Treatment ' || cc.concept_name
                            when c.concept_name = 'Treatment Regimen' then 'Treatment Regimen ' || cc.concept_name
else c.concept_name end as concept_name from cdm.episode e inner join cdm.concept c on episode_concept_id = c.concept_id inner join cdm.concept cc on e.episode_object_concept_id = cc.concept_id;"""
episode = ex.execute_sql(sql3)


patient_id = 245
df = episode[episode["person_id"] == patient_id].copy()
if df.empty:
    # Fail with a clear message instead of the NaT.strftime error further down.
    raise ValueError(f"No episodes found for person_id {patient_id}")

# Parse the dates first, then impute missing end dates with today. Imputing
# after parsing guarantees that values coerced to NaT are filled as well, so
# the column that is actually plotted never contains NaT.
df["episode_start_date"] = pd.to_datetime(df["episode_start_date"], errors="coerce")
df["episode_end_date"] = (
    pd.to_datetime(df["episode_end_date"], errors="coerce")
    .fillna(pd.Timestamp.today())
)
# Kept for backward compatibility; identical to episode_end_date after the
# imputation above.
df["episode_end_date_filled"] = df["episode_end_date"]

# y-axis formatting
df["episode_label"] = (
    "Episode " + df["episode_id"].astype(str) + ": " + df["concept_name"]
)
# NOTE(review): Plotly text normally needs '<br>' for a line break; the '\n'
# below probably does not render as a new line - confirm the intended layout.
df["text_label"] = (
    df["concept_name"] + ":\nStart: " +
    df["episode_start_date"].dt.strftime('%d.%m.%Y') + " - End: " +
    df["episode_end_date"].dt.strftime('%d.%m.%Y')
)


color_map = {
    "Disease Episode": "#B3C7E6",
    "Progression": "#E6A9C3",
    "Metastatic Disease": "#D3B7E5",
    "Cancer Drug Treatment": "#A9D6A5",
    "Treatment Regimen": "#F5BFA3"
}

# Plotly Gantt
fig = px.timeline(
    df,
    x_start="episode_start_date",
    x_end="episode_end_date",
    y="episode_label",
    color="label",  # legend plotting
    labels={"label": "Episode"},  # legend formatting
    color_discrete_map=color_map,
    text="text_label",
)

# axis formatting
fig.update_traces(
    insidetextanchor="middle",
    textposition="outside",
    textfont=dict(size=14),
)
fig.update_yaxes(autorange="reversed", title=None, showticklabels=False)
fig.update_layout(
    height=800,
    xaxis=dict(
        # extend the axis 280 days past the last end date
        range=[
            df["episode_start_date"].min().strftime('%Y-%m-%d'),
            (df["episode_end_date"].max() + pd.Timedelta(days=280)).strftime('%Y-%m-%d')
        ]
    ),
    legend=dict(font=dict(size=16)),
    title=dict(
        text=f"Patient Journey: {patient_id}",
        font=dict(size=24, weight="bold")
    )
)
fig.show()
                  


In [None]:
# Gantt diagram - regimens of a single patient
# HemOnc episodes (object concept), excluding the concept named 'ACP'.
sql4 = "select e.episode_id,e.person_id,e.episode_start_date,e.episode_end_date, c.concept_name from cdm.episode e inner join cdm.concept c on c.concept_id = e.episode_object_concept_id and c.vocabulary_id = 'HemOnc' and concept_name <> 'ACP'"
regimen_pat = ex.execute_sql(sql4)

patient_id = 525306
df = regimen_pat[regimen_pat["person_id"] == patient_id].copy()
if df.empty:
    # Fail with a clear message instead of the NaT.strftime error further down.
    raise ValueError(f"No regimen episodes found for person_id {patient_id}")

# Parse the dates first, then impute missing end dates with today. Imputing
# after parsing guarantees that values coerced to NaT are filled as well, so
# the column that is actually plotted never contains NaT.
df["episode_start_date"] = pd.to_datetime(df["episode_start_date"], errors="coerce")
df["episode_end_date"] = (
    pd.to_datetime(df["episode_end_date"], errors="coerce")
    .fillna(pd.Timestamp.today())
)
# Kept for backward compatibility; identical to episode_end_date after the
# imputation above.
df["episode_end_date_filled"] = df["episode_end_date"]

# y-axis formatting
df["episode_label"] = (
    "Episode " + df["episode_id"].astype(str) + ": " + df["concept_name"]
)
df["text_label"] = (
    df["concept_name"] + ": Start: " +
    df["episode_start_date"].dt.strftime('%d.%m.%Y') + " - End: " +
    df["episode_end_date"].dt.strftime('%d.%m.%Y')
)
color_seq = [
    "#B3C7E6",
    "#A9D6A5",
    "#F5BFA3",
    "#E6A9C3",
    "#D3B7E5"
]

# Plotly Gantt
fig = px.timeline(
    df,
    x_start="episode_start_date",
    x_end="episode_end_date",
    y="episode_label",
    color="concept_name",
    labels={"concept_name": "Regimen"},
    color_discrete_sequence=color_seq,
    text="text_label",
)

# axis formatting
fig.update_traces(
    insidetextanchor="middle",
    textposition="outside",
    textfont=dict(size=14),
)
fig.update_yaxes(autorange="reversed", title=None, showticklabels=False)
fig.update_layout(
    height=800,
    xaxis=dict(
        # extend the axis 160 days past the last end date
        range=[
            df["episode_start_date"].min().strftime('%Y-%m-%d'),
            (df["episode_end_date"].max() + pd.Timedelta(days=160)).strftime('%Y-%m-%d')
        ]
    ),
    legend=dict(font=dict(size=16)),
    title=dict(
        text=f"Patient Journey: {patient_id}",
        font=dict(size=24, weight="bold")
    )
)
fig.show()