# Obtaining Concepts for All Patients

In [None]:
import duckdb
import pandas as pd
import numpy as np
import json

import ipywidgets as widgets
from IPython.display import display, Markdown

# timeoutput
import datetime

# regex
import re

# plots
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

from collections import Counter 
from dateutil.relativedelta import relativedelta

from sklearn.preprocessing import normalize

In [None]:
#! change the base_path to the IC data location in Wynton


# Functions for easy pulling of CDW data

def file_path_parquet(filename, datatype):
    """Return the glob pattern that matches every parquet shard of one table.

    The pattern has the form ``path/to/ic/data/<datatype>/<filename>/*.parquet``:
    `datatype` names the sub-directory under the data root and `filename`
    names the table directory inside it.
    """
    return "path/to/ic/data/{}/{}/*.parquet".format(datatype, filename)

def rtime():
    """Display a bold "Ran: <timestamp>" stamp, coloured by day of the week.

    The timestamp is the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``; it is rendered as Markdown in the notebook output.
    """
    # One colour per weekday, indexed by datetime.weekday() (0=Monday ... 6=Sunday)
    weekday_colors = ('red', 'orange', 'green', 'blue', 'purple', 'brown', 'gray')

    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    color = weekday_colors[now.weekday()]

    display(Markdown(f"\n<b><span style='color:{color}'>Ran: {stamp}</span></b>\n"))


rtime()

In [None]:
#! change the path to scratch and the username


# Replace 'name' with your actual Wynton username; it is interpolated into
# the scratch path below
username = 'name'

# Spill data that doesn't fit into memory into Wynton Scratch storage (BeeGFS)
# Use at most 12 threads and 150 GB of memory so as not to overwhelm the system
# Recommendation: ~12 GB of memory for each thread
# Reduce these values if there are other system limitations in place
config_query = f"""
    SET temp_directory = 'path/to/scratch/{username}/duckdb_dir';
    SET preserve_insertion_order = false;
    SET memory_limit = '150GB';
    SET threads TO 12;
"""

# Create an in-memory connection and apply the settings above to it
con = duckdb.connect()
con_info = con.execute(config_query)  # Apply configuration settings

display(con_info)
rtime()

# Data

In [None]:
# Matched cohort table; later cells rely on its `patientepicid` column
cohort = pd.read_csv("matched25_cohort.csv")


rtime()

### CDW Data

In [None]:
# concepts extracted from notes, keyed by deid_note_key (includes negation flags)
note_concepts = con.read_parquet(file_path_parquet('note_concepts', 'DEID_CDW'))

# linker between patientepicid, encounterkey, and deid_note_key
note_metadata = con.read_parquet(file_path_parquet('note_metadata', 'DEID_CDW'))

# note text - only deid_note_key and note_text
note_text = con.read_parquet(file_path_parquet('note_text', 'DEID_CDW'))

# diagnosis event fact
diag_fact = con.read_parquet(file_path_parquet('diagnosiseventfact', 'DEID_CDW'))

# patdurabledim
patdurabledim = con.read_parquet(file_path_parquet('patdurabledim', 'DEID_CDW'))


rtime()

# Queries

Only run this once and save the output! It takes a while to complete and the data can be reused. 

Skip this section in subsequent runs and proceed to the next section.

In [None]:
# Register the unique cohort patient IDs under the name `temp_patients`
# so the SQL below can filter against them
temp_df = pd.DataFrame(cohort['patientepicid'].unique(), columns=['patientepicid'])
con.register('temp_patients', temp_df)

# Drop the table from any previous run so the CREATE below does not fail
con.query("DROP TABLE IF EXISTS note_key_table")

# Keep one row per note-metadata record that (a) belongs to a cohort patient
# and (b) has a service date between 1930-01-01 and 2027-01-01 inclusive.
# Rows with a NULL service date fail both comparisons and are excluded.
query_note_key_table = """
CREATE TABLE note_key_table AS
    SELECT patientepicid,
        encounterkey,
        deid_note_key,
        deid_service_date
    FROM note_metadata n
    WHERE deid_service_date >= DATE '1930-01-01'
        AND deid_service_date <= DATE '2027-01-01'
        AND EXISTS (
            SELECT 1 
            FROM temp_patients t 
            WHERE t.patientepicid = n.patientepicid
        )
"""

con.query(query_note_key_table)

# Index the new table on the note key, which is the join column used against
# note_concepts later on
con.query("CREATE INDEX idx_note_key ON note_key_table(deid_note_key)")

rtime()

In [None]:
# # Sanity check to be sure everything is all there
# tmp = con.query("SELECT * FROM note_key_table").df()
# np.sum(tmp['deid_service_date'].isna()) / tmp.shape[0]
# del tmp

In [None]:
# Number of distinct notes captured for the cohort
con.query("SELECT COUNT(DISTINCT(deid_note_key)) AS count FROM note_key_table")

**Save Intermediate**

In [None]:
# Persist the patient/encounter/note linker so later runs can skip the query above
parquet_query = """
SELECT *
FROM note_key_table
"""

# Written to the current working directory; read back in the "Reformat concepts" section
con.execute(f"COPY ({parquet_query}) TO 'cohort_note_key_table.parquet' (FORMAT PARQUET)")

rtime()

In [None]:
# Attach every extracted concept to its note's patient, encounter and service date.
# Inside the SQL, `con` is only a table alias for note_concepts; it is unrelated
# to the Python connection object of the same name.
# The inner JOIN drops notes that have no extracted concepts.
query_allnote_dates = f"""
SELECT tbl.patientepicid,
    tbl.encounterkey,
    tbl.deid_note_key,
    tbl.deid_service_date,
    con.canon_text,
    con.vocab,
    con.vocab_term_id,
    con.cui,
    con.negated,
    con.history,
    con.family_history
FROM note_key_table tbl
JOIN note_concepts con
    ON tbl.deid_note_key = con.deid_note_key
"""

# Kept as a relation object; it is written to parquet in the next cell
note_all_result = con.query(query_allnote_dates)

**Save Complete**

In [None]:
# Persist the joined concept table.
# NOTE(review): `note_all_result` is the Python variable from the previous cell;
# presumably DuckDB resolves it by name from the notebook namespace - confirm.
parquet_query = """
SELECT *
FROM note_all_result
"""

# Written to the current working directory; read back in the "Reformat concepts" section
con.execute(f"COPY ({parquet_query}) TO 'cohort_note_concepts.parquet' (FORMAT PARQUET)")

rtime()

# Reformat concepts

In [None]:
# all concepts within the cohort, loaded from the saved parquet into a pandas DataFrame
cohort_con = con.read_parquet("cohort_note_concepts.parquet").df()

In [None]:
# all note keys and linkers (patient, encounter, note key, service date) as a pandas DataFrame
cohort_note_key = con.read_parquet("cohort_note_key_table.parquet").df()

In [None]:
# (rows, columns) of the concept table
cohort_con.shape
# ~ 94M (size noted by the original author)

In [None]:
# (rows, columns) of the note-key linker table
cohort_note_key.shape
# ~ 2.4M (size noted by the original author)

In [None]:
# some patients might have been dropped due to there being no extracted concepts in the notes
# keep track of these: cohort patient IDs that never appear in the concept table
diff_pats = set(cohort['patientepicid']).difference(set(cohort_con['patientepicid']))
# NOTE(review): this is not the last expression in the cell, so the count is not displayed
len(diff_pats)

diff_pats_df = pd.DataFrame(list(diff_pats), columns=['non-matching_pats'])

# Saved so the "Continue" section can reload the dropped IDs; the data/cohort/ directory must already exist
diff_pats_df.to_csv('data/cohort/non-matching_pats.csv', index=False)
rtime()

### Optional - Check PSM again

Given that we lost some patients, ask: "if we had pulled only the remaining patients from the pool directly, would the match still be good?"

In [None]:
# Pre-matching patient pool; the balance check below uses its covariate columns
unmatched = pd.read_parquet('data/unmatched_cohort_250319.parquet')

In [None]:
# Preview the pool
unmatched

In [None]:
# Pool rows belonging to patients in the matched cohort
cohort_prop = unmatched[unmatched['patientepicid'].isin(cohort['patientepicid'])]

In [None]:
# Remove the patients that had no extracted concepts.
# NOTE(review): this expects `diff_pats` to be the set of IDs built earlier; a later
# cell rebinds `diff_pats` to a DataFrame, so re-running this cell after that point
# would presumably no longer filter by patient ID - confirm before re-running out of order.
cohort_prop_diff = cohort_prop[~cohort_prop['patientepicid'].isin(diff_pats)]

In [None]:
# Preview the remaining matched patients
cohort_prop_diff

In [None]:
# Summary table
def summary_stats_table(data, covariates, categorical, treatment_col):
    """Build a covariate-balance table comparing treated and control rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `treatment_col` and every column named in `covariates`
        and `categorical`.
    covariates : list of str
        Numeric columns, summarised by group mean and standard deviation.
    categorical : list of str
        Columns summarised per category as the percentage of each group.
    treatment_col : str
        Column whose value is 1 for treated rows and 0 for control rows.

    Returns
    -------
    pandas.DataFrame
        One row per numeric covariate and one row per (categorical covariate,
        category observed in the treated group), with columns ``Covariate``,
        ``Mean_Treated``, ``Std_Treated``, ``Mean_Control``, ``Std_Control``
        and ``SMD``. For categorical rows the "mean" columns hold percentages
        and the "std" columns are NaN.

    Notes
    -----
    Categories that occur only in the control group get no row, because rows
    are generated from the treated group's categories.
    """
    columns = ['Covariate', 'Mean_Treated', 'Std_Treated',
               'Mean_Control', 'Std_Control', 'SMD']

    # Split once instead of re-filtering the frame for every statistic
    treated = data[data[treatment_col] == 1]
    control = data[data[treatment_col] == 0]

    rows = []

    # numeric covariates
    for covariate in covariates:
        smd = calculate_smd(treated, control, [covariate])[covariate]

        rows.append({
            'Covariate': covariate,
            'Mean_Treated': treated[covariate].mean(),
            'Std_Treated': treated[covariate].std(),
            'Mean_Control': control[covariate].mean(),
            'Std_Control': control[covariate].std(),
            'SMD': smd
        })

    # categorical covariates
    for covariate in categorical:
        prop_treated = treated[covariate].value_counts(normalize=True)
        prop_control = control[covariate].value_counts(normalize=True)

        smds = calculate_smd_categorical(prop_treated, prop_control, covariate)

        for category in prop_treated.index:
            rows.append({
                'Covariate': f"{covariate}_{category}",
                'Mean_Treated': prop_treated[category] * 100,
                'Std_Treated': np.nan,  # Standard deviation is not applicable for proportions
                # A category absent from the control group counts as 0 %, matching
                # how calculate_smd_categorical treats it (previously a KeyError)
                'Mean_Control': prop_control.get(category, 0) * 100,
                'Std_Control': np.nan,  # Standard deviation is not applicable for proportions
                'SMD': smds[category]
            })

    # Explicit columns keep the table schema stable even when there are no rows
    return pd.DataFrame(rows, columns=columns)


def calculate_smd(group1, group2, var_list):
    """Return the absolute standardized mean difference for each variable.

    For every name in `var_list` the difference between the two groups' means
    is divided by the square root of the average of their variances (computed
    with the pandas default for ``.std()``). The result is a dict mapping
    variable name to SMD.
    """
    def _smd(var):
        first, second = group1[var], group2[var]
        mean_gap = abs(first.mean() - second.mean())
        pooled_sd = np.sqrt((first.std() ** 2 + second.std() ** 2) / 2)
        return mean_gap / pooled_sd

    return {var: _smd(var) for var in var_list}


def calculate_smd_categorical(prop_treated, prop_control, covariate):
    """Return the absolute standardized difference in proportion per category.

    `prop_treated` and `prop_control` map category -> proportion. Only the
    categories present in `prop_treated` are evaluated; a category missing
    from either mapping is looked up with a default of 0. Each difference is
    scaled by the square root of the average of the two binomial variances
    p * (1 - p). `covariate` is accepted but not used.
    """
    def _smd(category):
        p_treated = prop_treated.get(category, 0)
        p_control = prop_control.get(category, 0)
        pooled_var = (p_treated * (1 - p_treated) + p_control * (1 - p_control)) / 2
        return abs(p_treated - p_control) / np.sqrt(pooled_var)

    return {category: _smd(category) for category in prop_treated.index}


def plot_covariate_balance(data, covariates, treatment_col):
    """Show one density plot per covariate, overlaying treated and control rows.

    Rows with `treatment_col` equal to 1 are drawn as 'Treated' and rows equal
    to 0 as 'Control'. Each covariate gets its own 8x4 figure, shown immediately.
    """
    groups = (('Treated', 1), ('Control', 0))

    for covariate in covariates:
        plt.figure(figsize=(8, 4))
        for label, flag in groups:
            values = data[data[treatment_col] == flag][covariate]
            sns.kdeplot(values, label=label)
        plt.title(f'Distribution of {covariate} by Treatment Status')
        plt.xlabel(covariate)
        plt.ylabel('Density')
        plt.legend()
        plt.show()

In [None]:
# Numeric covariates are compared by group mean / standard deviation
covariates = ['follow_up', 'n_notes', 'age_at_first_visit']
categorical = ['gender_concept_id', 'race_concept_id']   # numerics that should be treated as categoricals
# Balance table for the patients that remain, with `is_ms` as the treatment flag
summary_stats = summary_stats_table(cohort_prop_diff, covariates, categorical, 'is_ms')
# Round for display only; summary_stats itself keeps full precision
np.round(summary_stats, 3)

**OK, we are still good**

### Continue

The rest of the script filters out the dropped patients and then creates three separate cohorts based on their pre-exposure time.

In [None]:
# Reload the IDs of patients that had no extracted concepts (saved earlier)
diff_pats = pd.read_csv('data/cohort/non-matching_pats.csv')
# Patient -> MS label lookup used when saving the pivoted concept tables.
# Built from `cohort` rather than `cohort_t`: `cohort_t` is only created in the
# next cell, so referencing it here fails on a fresh top-to-bottom run, and its
# `is_ms` column comes from `cohort` anyway.
isms_key = cohort[['patientepicid', 'is_ms']]

In [None]:
# redefine cohort here: drop the patients recorded as having no extracted concepts
cohort = cohort[~cohort['patientepicid'].isin(diff_pats['non-matching_pats'])]
# add time in: earliest note service date per patient
pat_note_times = cohort_note_key[['patientepicid', 'deid_service_date']].groupby('patientepicid').min().reset_index()
# Inner merge keeps only patients with at least one row in cohort_note_key.
# No `on=` is given, so pandas joins on every column name the two frames share.
cohort_t = pd.merge(cohort, pat_note_times, how='inner')

rtime()

In [None]:
# some patients might be lost in the merge above; count the remaining rows flagged is_ms == 1
sum(cohort_t['is_ms']==1)

In [None]:
def create_labels(x):
    """Map a follow-up value to its exposure group.

    Returns 5 for values of at least 5, 3 for values in [3, 5), 1 for values
    in [1, 3), and NaN for anything else (values below 1, or NaN itself).
    """
    # Check the largest threshold first so each value lands in the highest group it reaches
    if x >= 5:
        return 5
    if x >= 3:
        return 3
    if x >= 1:
        return 1
    return np.nan

# Bucket each row's follow_up into exposure group 1, 3 or 5 (NaN when it is below 1 or missing)
cohort_t['exposure_group'] = cohort_t['follow_up'].apply(create_labels)

In [None]:
# create the exposure periods [p1, p3, p5]
def create_exposure_periods(df):
    """Add exposure-period flags and end dates for the 1, 3 and 5 year periods.

    For each period length N the frame gains two columns:
    ``pN`` (1 when ``exposure_group`` is at least N, else 0) and
    ``pN_exposure_end`` (``deid_service_date`` plus N years for flagged rows,
    NaN otherwise). The columns are added to `df` in place, and the same
    frame is returned.
    """
    def _window_end(row, span):
        # Only rows that qualify for the period get an end date
        if row['exposure_group'] >= span:
            return row['deid_service_date'] + relativedelta(years=span)
        return np.nan

    for years in (1, 3, 5):
        flag_col = f'p{years}'
        qualifies = df['exposure_group'] >= years
        df[flag_col] = qualifies.astype(int)
        df[f'{flag_col}_exposure_end'] = df.apply(
            lambda row: _window_end(row, years), axis=1
        )
    return df

# Adds p1/p3/p5 flags and their *_exposure_end dates to cohort_t
cohort_t = create_exposure_periods(cohort_t)

In [None]:
def valid_note_keys(note_keys, pats, period):
    """Return the notes that fall inside a patient's exposure window.

    Parameters
    ----------
    note_keys : pandas.DataFrame
        One row per note; needs `patientepicid` and `deid_service_date`.
    pats : pandas.DataFrame
        One row per patient; needs `patientepicid`, the 0/1 flag column named
        `period`, and the matching ``<period>_exposure_end`` column.
    period : str
        Name of the period flag column, e.g. ``'p1'``.

    Returns
    -------
    pandas.DataFrame
        `note_keys` joined to the patients flagged for `period`, restricted to
        notes dated strictly before the patient's exposure end.

    Notes
    -----
    `pats` is not modified. The previous implementation dropped
    `deid_service_date` from the caller's frame in place.
    """
    # The patient-level date would collide with the note-level column of the
    # same name in the merge, so leave it out - on a new frame, not the caller's
    if 'deid_service_date' in pats.columns:
        pats = pats.drop(columns=['deid_service_date'])

    pat_subset = pats[pats[period] == 1]
    df_merge = pd.merge(note_keys, pat_subset, on='patientepicid', how='inner')
    df_merge = df_merge[df_merge['deid_service_date'] < df_merge[f'{period}_exposure_end']]
    return df_merge

**Separate out to three different cohort processes**

In [None]:
import pandas as pd
from collections import defaultdict

def reverse_dict(original):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    When the same value is listed under several keys, the key that comes last
    in iteration order wins.
    """
    return {
        member: group
        for group, members in original.items()
        for member in members
    }

def combine_columns(df, mapping):
    """Collapse columns of `df` that map to the same target name by summing them.

    Columns whose name is not a key of `mapping` are discarded. The remaining
    columns are grouped by ``mapping[column]`` and each group is summed
    row-wise into a single column named after the target. Progress counts are
    printed. Returns a new DataFrame; `df` is left untouched.
    """
    # keep only the columns the mapping knows about
    kept = [name for name in df.columns if name in mapping]
    subset = df[kept]

    print(f"Total columns in original dataframe: {len(df.columns)}")
    print(f"Columns found in mapping: {len(kept)}")

    # target name -> source columns, in first-seen order
    buckets = {}
    for name in subset.columns:
        buckets.setdefault(mapping[name], []).append(name)

    # build every summed column first, then assemble the frame in one go
    new_df = pd.DataFrame({
        target: subset[sources].sum(axis=1)
        for target, sources in buckets.items()
    })

    print(f"\nCombined {len(subset.columns)} columns to {len(new_df.columns)}")
    return new_df

def normalize_non_zero_rows(df):
    """Scale each row to sum to 1, leaving rows whose sum is 0 unchanged.

    Zero-sum rows are expected: a lot of the CUI information is lost when
    moving into SPOKE, so some rows end up with no counts at all. Works on a
    float copy of `df`, prints how many rows were and were not normalized,
    and returns the copy.
    """
    scaled = df.astype(float)
    totals = scaled.sum(axis=1)
    has_mass = totals.ne(0)

    # divide only the rows that have something to normalize
    live_rows = scaled.loc[has_mass]
    scaled.loc[has_mass] = live_rows.div(totals.loc[has_mass], axis=0)

    n_zero = (~has_mass).sum()
    n_live = has_mass.sum()
    print(f"Total rows: {len(df)}")
    print(f"Rows with zero sums: {n_zero}")
    print(f"Rows normalized: {n_live}")

    return scaled

In [None]:
# This mapping comes from the overlap between SPOKE embeddings and CUIs (via BioPortal),
# or from a copy saved for this cohort.
# Load it from JSON; reverse_dict below expects each key to map to a list of values.
with open('data/mapping/spoke_mappings.json', 'r') as json_file:
    spoke_mappings = json.load(json_file)

In [None]:
# Reverse the dictionary so each listed value points back to its key
# (this is the column -> combined-column lookup that combine_columns expects)
spoke_mappings_rev = reverse_dict(spoke_mappings)
print("Reversed spoke mapping")

In [None]:
def pivot_concepts(valid_note_keys, cohort_note_key, cohort_con, isms_key, periods=None, spoke_mappings_rev=None):
    """Build and save one normalized patient-by-concept table per exposure period.

    For each period the function selects the notes inside that period's
    exposure window, counts concept mentions per patient and CUI, optionally
    collapses the CUI columns through `spoke_mappings_rev`, row-normalizes the
    counts, attaches the `is_ms` label and writes the result to parquet.

    Parameters
    ----------
    valid_note_keys : callable
        Called as ``valid_note_keys(cohort_note_key, cohort_t, period)``; must
        return a frame with a `deid_note_key` column.
    cohort_note_key : pandas.DataFrame
        Note linker table passed through to `valid_note_keys`.
    cohort_con : pandas.DataFrame
        Concept rows with `deid_note_key`, `patientepicid` and `cui` columns.
    isms_key : pandas.DataFrame
        `patientepicid` / `is_ms` lookup merged onto each table.
    periods : sequence of str, optional
        Period labels to process, e.g. ``["p1", "p3", "p5"]``. Required in
        practice: an empty or missing value raises.
    spoke_mappings_rev : dict, optional
        Column -> combined-column mapping. When given, CUI columns are combined
        with `combine_columns` and the output file gets a ``_spoke`` suffix.

    Raises
    ------
    ValueError
        If `periods` is empty or not provided.

    Notes
    -----
    Reads the module-level `cohort_t` frame for the patient list. Writes
    ``<period>_cohort_spoke.parquet`` or ``<period>_cohort.parquet`` to the
    current working directory and returns nothing.
    """
    if periods is None or len(periods) == 0:
        # Previously the exception was constructed but never raised
        raise ValueError("Must provide some periods to iterate across")

    # The patient set does not change between periods, so count it once
    n_cohort_pats = len(set(cohort_t['patientepicid']))
    file_suffix = "_cohort_spoke" if spoke_mappings_rev is not None else "_cohort"

    for p in periods:
        p_notes = valid_note_keys(cohort_note_key, cohort_t, p)
        print(f'period {p}:', p_notes.shape)

        p_ms_con = cohort_con[cohort_con['deid_note_key'].isin(p_notes['deid_note_key'])]
        print(f'period {p} concepts:', p_ms_con.shape)

        max_cui_vocab = set(p_ms_con['cui'])
        print("Max number of unique CUI concepts:", len(max_cui_vocab))

        pats_lost = n_cohort_pats - len(set(p_ms_con['patientepicid']))
        print(f"{pats_lost} period {p} patients were lost due to note density and follow_up time differences")

        # Count concept mentions: one row per patient, one column per CUI
        pivot_p = pd.pivot_table(
            p_ms_con,
            values='deid_note_key',
            index='patientepicid',
            columns='cui',
            aggfunc='count',
            fill_value=0
        )

        if spoke_mappings_rev is not None:
            pivot_p = combine_columns(pivot_p, spoke_mappings_rev)

        # Normalize by dividing each row by the number of concepts for that patient
        pivot_p = normalize_non_zero_rows(pivot_p)
        pivot_p = pd.merge(pivot_p, isms_key, left_index=True, right_on='patientepicid')
        print("Patinets:", sum(pivot_p['is_ms']))

        # Save the merged table (the original referenced an undefined `pivot_p_out`)
        pivot_p.to_parquet(f"{p}{file_suffix}.parquet")
        

In [None]:
# this can take a while
# Builds and saves one SPOKE-combined concept table for each of the p1, p3 and p5 periods
pivot_concepts(valid_note_keys, cohort_note_key, cohort_con, 
               isms_key, periods=["p1", "p3", "p5"], 
               spoke_mappings_rev=spoke_mappings_rev)

rtime()