# Obtaining MS Patients and All Note Encounters

In [None]:
import duckdb
import pandas as pd
import numpy as np
import json

import ipywidgets as widgets
from IPython.display import display, Markdown

# timeoutput
import datetime

# regex
import re

# plots
import matplotlib.pyplot as plt

In [None]:
#! change the base_path to the IC data location in Wynton


# Functions for easy pulling of CDW data

def file_path_parquet(filename, datatype):
    """Return the glob pattern that matches every parquet file of one table.

    Args:
        filename: Name of the table directory (e.g. 'person').
        datatype: Name of the dataset directory under the base path
            (e.g. 'DEID_OMOP').

    Returns:
        A string of the form '<base_path><filename>/*.parquet'.
    """
    # base_path is the site-specific data root; edit it to point at the data.
    base_path = "path/to/ic/data/{}/".format(datatype)
    return "{}{}/*.parquet".format(base_path, filename)

def rtime():
    """Display a bold, colored 'Ran: <timestamp>' line as Markdown.

    The text color is picked from the current day of the week and the
    timestamp is formatted as YYYY-MM-DD HH:MM:SS.
    """
    # One color per weekday, in datetime.weekday() order (0 = Monday).
    palette = (
        'red',       # Monday
        'orange',    # Tuesday
        'green',     # Wednesday
        'blue',      # Thursday
        'purple',    # Friday
        'brown',     # Saturday
        'gray',      # Sunday
    )
    now = datetime.datetime.now()
    # Fall back to black if the weekday index is ever outside the palette.
    color = dict(enumerate(palette)).get(now.weekday(), 'black')
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    display(Markdown(f"\n<b><span style='color:{color}'>Ran: {stamp}</span></b>\n"))
    
rtime()

In [None]:
#! Before running: change the scratch path and the username below.


# Replace 'name' with your actual Wynton username; it is interpolated into
# the temp_directory path inside config_query.
username = 'name'

# Spill data that doesn't fit into memory into Wynton Scratch storage (BeeGFS).
# Use at most 12 threads and 150 GB of memory so as not to overwhelm the system.
# Recommendation: ~12 GB of memory for each thread.
# Reduce these values if other system limitations are in place.
config_query = f"""
    SET temp_directory = 'path/to/scratch/{username}/duckdb_dir';
    SET preserve_insertion_order = false;
    SET memory_limit = '150GB';
    SET threads TO 12;
"""

# Create a connection and apply the settings above; `con` is the connection
# every later cell in this notebook queries through.
con = duckdb.connect()
con_info = con.execute(config_query)  # Apply configuration settings

# Show what execute() returned, then stamp the run time.
display(con_info)
rtime()

# Data

In [None]:
# List of MS OMOP diagnostic codes; matched against
# condition_occurrence.condition_concept_id when the cohort tables are built.
ms_list = [374919, 4178929, 4145049, 4137855, 37110514]

# person_source_value entries to exclude when building patient_table
# (presumably placeholder / unspecified patient IDs - confirm with the data owners).
bad_pats = ['-1', '*Unspecified']

rtime()

### OMOP

In [None]:
# Each variable below is referenced BY NAME as a table inside the SQL strings
# later in this notebook, so renaming a variable requires updating those queries.

# condition_occurrence
condition_occurrence_ucsf = con.read_parquet(file_path_parquet('condition_occurrence', 'DEID_OMOP'))

# person demographics
person_ucsf = con.read_parquet(file_path_parquet('person', 'DEID_OMOP'))

# person linkage OMOP - CDW
person_extension_ucsf = con.read_parquet(file_path_parquet('person_extension', 'DEID_OMOP'))

# visit_occurrence
visit_occurrence_ucsf = con.read_parquet(file_path_parquet('visit_occurrence', 'DEID_OMOP'))

# condition occurrence to link to CDW
condition_occurrence_extension_ucsf = con.read_parquet(file_path_parquet('condition_occurrence_extension', 'DEID_OMOP'))


rtime()

### CDW Data

In [None]:
# Each variable below is referenced BY NAME as a table inside the SQL strings
# later in this notebook, so renaming a variable requires updating those queries.

# deid_note_key and negation terms
note_concepts = con.read_parquet(file_path_parquet('note_concepts', 'DEID_CDW'))

# linker between patientdurablekey, encounterkey, and deid_note_key
note_metadata = con.read_parquet(file_path_parquet('note_metadata', 'DEID_CDW'))

# note text - only deid_note_key and note_text
note_text = con.read_parquet(file_path_parquet('note_text', 'DEID_CDW'))

# diagnosis event fact
diag_fact = con.read_parquet(file_path_parquet('diagnosiseventfact', 'DEID_CDW'))

# patdurabledim
patdurabledim = con.read_parquet(file_path_parquet('patdurabledim', 'DEID_CDW'))


rtime()

In [None]:
# Query for a specific patient: look up the source_key_value linked to one
# OMOP person_id in person_extension_ucsf.
# 'KEY' is a placeholder - replace it with the person_id of interest before running.
specific_query = """
SELECT person_id,
    source_key_value
FROM person_extension_ucsf
WHERE person_id = 'KEY'
"""
con.query(specific_query)

# Cohort

**MS patients -> with notes -> notes with N ctakes terms -> MS diagnosis limitations**

### OMOP <-> CDW: patient table

In [None]:
# Drop the existing table, if any, so this cell can be re-run.
con.query("DROP TABLE IF EXISTS patient_table")

# Create patient_table: one row per (person_id, patientepicid, patientdurablekey)
# for persons who
#   * have condition_occurrence rows whose condition_concept_id is in ms_list,
#   * on at least 5 distinct visit_occurrence_id values (the HAVING clause), and
#   * whose person_source_value is not one of bad_pats.
# person_source_value is matched to patdurabledim.patientepicid to pick up the
# CDW patientdurablekey.
# The IN (...) lists are rendered by interpolating Python tuples into the SQL.
# NOTE(review): a one-element list would render as "(x,)" - confirm DuckDB
# accepts that form before shrinking ms_list or bad_pats to a single entry.
query_create_table = f"""
CREATE TABLE patient_table AS
    SELECT
        co.person_id,
        p.person_source_value AS patientepicid,
        pd.patientdurablekey
    FROM condition_occurrence_ucsf co
    JOIN person_ucsf p
        ON co.person_id = p.person_id
    JOIN patdurabledim pd
        ON p.person_source_value = pd.patientepicid
    WHERE p.person_source_value NOT IN {tuple(bad_pats)}
        AND co.condition_concept_id IN {tuple(ms_list)}
    GROUP BY co.person_id, p.person_source_value, pd.patientdurablekey
    HAVING COUNT(DISTINCT co.visit_occurrence_id) >= 5
"""

con.query(query_create_table)

# Index the three key columns of the new table; later cells join on them.
con.query("CREATE INDEX idx_person_id ON patient_table(person_id)")
con.query("CREATE INDEX idx_patientepicid ON patient_table(patientepicid)")
con.query("CREATE INDEX idx_patientdurablekey ON patient_table(patientdurablekey)")

rtime()

### OMOP <-> CDW: condition and encounter linker

In [None]:
# Drop the existing table, if any, so this cell can be re-run.
con.query("DROP TABLE IF EXISTS conlink_table")

# Create conlink_table: for every MS condition_occurrence row (concept in
# ms_list) of a patient in patient_table, attach the CDW diagnosiseventkey.
# The key is the source_key_value of the condition_occurrence_extension row
# whose source_table_name is 'DiagnosisEventFact'; because this is an inner
# join, condition rows without such an extension row are dropped.
query_create_table = f"""
CREATE TABLE conlink_table AS
    SELECT
        pt.person_id,
        pt.patientepicid,
        co.condition_occurrence_id,
        co.condition_concept_id,
        exco.diagnosiseventkey
    FROM condition_occurrence_ucsf co
    JOIN patient_table pt
        ON co.person_id = pt.person_id
    JOIN (
        SELECT condition_occurrence_id,
            source_key_value AS diagnosiseventkey
        FROM condition_occurrence_extension_ucsf
        WHERE source_table_name = 'DiagnosisEventFact'
    ) exco
        ON exco.condition_occurrence_id = co.condition_occurrence_id
    WHERE co.condition_concept_id IN {tuple(ms_list)}
"""
con.query(query_create_table)

# Index the new table. Only diagnosiseventkey is indexed (the join key used to
# build enc_table). The disabled statements below are kept for reference; note
# that conlink_table has no patientdurablekey column, and the first two index
# names are already used on patient_table.
# con.query("CREATE INDEX idx_person_id ON conlink_table(person_id)")
# con.query("CREATE INDEX idx_patientepicid ON conlink_table(patientepicid)")
# con.query("CREATE INDEX idx_patientdurablekey ON conlink_table(patientdurablekey)")
con.query("CREATE INDEX idx_diagnosiseventkey ON conlink_table(diagnosiseventkey)")

rtime()

### CDW: patient encounters

In [None]:
# Drop the existing table, if any, so this cell can be re-run.
con.query("DROP TABLE IF EXISTS enc_table")

# Create enc_table: one row per MS diagnosis event of a cohort patient.
# The subquery restricts diag_fact to patients in patient_table (matched on
# patientdurablekey); it is then joined to conlink_table on diagnosiseventkey
# to attach encounterkey, startdatekeyvalue and diagnosisname to each linked
# condition_occurrence_id.
query_enc_table = f"""
CREATE TABLE enc_table AS
    SELECT 
        co.person_id,
        co.patientepicid,
        enc.patientdurablekey,
        co.condition_occurrence_id,
        enc.encounterkey,
        enc.startdatekeyvalue,
        enc.diagnosisname
    FROM conlink_table co
    JOIN (
        SELECT
            pt.patientdurablekey,
            df.diagnosiseventkey,
            df.diagnosisname,
            df.encounterkey,
            df.startdatekeyvalue
        FROM diag_fact df
        JOIN patient_table pt
            ON pt.patientdurablekey = df.patientdurablekey
    ) AS enc
        ON co.diagnosiseventkey = enc.diagnosiseventkey
"""
con.query(query_enc_table)

# Index the new table. The "_2" suffix keeps these index names distinct from
# the ones created on patient_table.
con.query("CREATE INDEX idx_person_id_2 ON enc_table(person_id)")
con.query("CREATE INDEX idx_patientepicid_2 ON enc_table(patientepicid)")
con.query("CREATE INDEX idx_patientdurablekey_2 ON enc_table(patientdurablekey)")
con.query("CREATE INDEX idx_encounterkey_2 ON enc_table(encounterkey)")

rtime()

### Pre-exposure years and MS years

In [None]:
# MS-note span per patient.
# The note_dates CTE keeps the notes whose encounterkey appears in enc_table
# (i.e. notes attached to an MS-diagnosis encounter) and computes per
# patientepicid:
#   ms_note_fdate / ms_note_ldate - earliest / latest deid_service_date
#   ms_note_years                 - days between them divided by 365.25
#   ms_note_count                 - number of distinct deid_note_key values
# Only service dates between 1930-01-01 and 2027-01-01 are considered
# (presumably to discard implausible dates - TODO confirm).
# ms_note_years is computed from the ms_note_fdate / ms_note_ldate aliases
# defined in the same SELECT list.
# The outer query joins these per-patient values back onto every enc_table row
# and keeps patients with >= 1 year of MS notes and >= 5 distinct MS notes, so
# the result still has one row per enc_table row, not one per patient.
query_msnote_dates = f"""
WITH note_dates AS (
    SELECT enc.patientepicid,
        MIN(note.deid_service_date) AS ms_note_fdate,
        MAX(note.deid_service_date) AS ms_note_ldate,
        CAST(DATEDIFF('day', ms_note_fdate, ms_note_ldate) / 365.25 AS FLOAT) AS ms_note_years,
        COUNT(DISTINCT(note.deid_note_key)) AS ms_note_count
    FROM note_metadata note
    JOIN enc_table enc ON enc.encounterkey = note.encounterkey
    WHERE note.deid_service_date >= DATE '1930-01-01'
        AND note.deid_service_date <= DATE '2027-01-01'
    GROUP BY enc.patientepicid
)
SELECT enc.*,
    note.ms_note_fdate,
    note.ms_note_ldate,
    note.ms_note_years,
    note.ms_note_count,
FROM note_dates note
JOIN enc_table enc
ON note.patientepicid = enc.patientepicid
WHERE ms_note_years >= 1
    AND ms_note_count >= 5
"""
# note_result is referenced by name as a table in the next query.
note_result = con.query(query_msnote_dates)

In [None]:
# Collapse note_result to one row per (person_id, patientepicid,
# patientdurablekey). The ms_note_* columns come from a per-patientepicid
# aggregate that was joined back onto enc_table, so they are the same on every
# row of a patient; MIN(...) just picks that shared value while grouping.
query_msnote_people = f"""
SELECT person_id,
    patientepicid, 
    patientdurablekey,
    MIN(ms_note_fdate) AS ms_note_fdate,
    MIN(ms_note_ldate) AS ms_note_ldate,
    MIN(ms_note_years) AS ms_note_years,
    MIN(ms_note_count) AS ms_note_count,
FROM note_result
GROUP BY person_id, patientepicid, patientdurablekey,
"""
# note_people_result is referenced by name as a table in the all-notes query.
note_people_result = con.query(query_msnote_people)

**Merge with all notes to get counts and statistics for these**

In [None]:
# All-note statistics per patient.
# The note_dates CTE takes every note in note_metadata (not just those on MS
# encounters) for patients present in both patient_table and
# note_people_result, and computes per patientepicid:
#   note_fdate / note_ldate - earliest / latest deid_service_date
#   note_years              - days between them divided by 365.25
#   note_count              - number of distinct deid_note_key values
#   preex_note_count        - distinct notes dated strictly before the
#                             patient's first MS note (ms_note_fdate)
# Only service dates between 1930-01-01 and 2027-01-01 are considered.
# The outer query attaches these to the per-patient MS-note row and derives:
#   non_ms_note_count = note_count - ms_note_count
#   preex_years       = years from the first note of any kind to the first MS note
# Only patients with preex_years >= 1 are kept.
query_allnote_dates = f"""
WITH note_dates AS (
    SELECT pt.patientepicid,
        MIN(note.deid_service_date) AS note_fdate,
        MAX(note.deid_service_date) AS note_ldate,
        CAST(DATEDIFF('day', note_fdate, note_ldate) / 365.25 AS FLOAT) AS note_years,
        COUNT(DISTINCT(note.deid_note_key)) AS note_count,
        COUNT(DISTINCT CASE 
            WHEN note.deid_service_date < npt.ms_note_fdate 
            THEN note.deid_note_key 
            END) AS preex_note_count
    FROM note_metadata note
    JOIN patient_table pt ON pt.patientepicid = note.patientepicid
    JOIN note_people_result npt ON note.patientepicid = npt.patientepicid
    WHERE note.deid_service_date >= DATE '1930-01-01'
        AND note.deid_service_date <= DATE '2027-01-01'
    GROUP BY pt.patientepicid
)
SELECT enc.*,
    note.note_fdate,
    note.note_ldate,
    note.note_years,
    note.note_count,
    note.preex_note_count,
    note.note_count - enc.ms_note_count AS non_ms_note_count,
    CAST(DATEDIFF('day', note.note_fdate, enc.ms_note_fdate) / 365.25 AS FLOAT) AS preex_years
FROM note_dates note
JOIN note_people_result enc
ON note.patientepicid = enc.patientepicid
WHERE preex_years >= 1
"""
note_all_result = con.query(query_allnote_dates)

Run the cell below only once and save the output.

In [None]:
# note_all_result_df = note_all_result.df()
# note_all_result_df.to_csv("ms_cohort.csv", index=False)
# print("Number of patients:", len(note_all_result_df))
# rtime()

# Plots

In [None]:
# Load the saved cohort. "ms_cohort.csv" is the file written by the
# commented-out export cell, so that cell must have been run at least once.
note_all_result_df = pd.read_csv("ms_cohort.csv")

In [None]:
# Create the label column
def create_labels(x):
    """Map a number of years to its bucket label.

    Buckets are half-open: [1, 3) -> '1-3', [3, 5) -> '3-5', [5, inf) -> '5+'.
    Anything else (values below 1, or NaN) is labelled 'Other'.
    """
    # Check thresholds from the top down; each guard implies the upper bound
    # of the bucket because the larger thresholds were already ruled out.
    if x >= 5:
        return '5+'
    if x >= 3:
        return '3-5'
    if x >= 1:
        return '1-3'
    return 'Other'

# Work on a copy so the loaded DataFrame is not modified (and to avoid the
# SettingWithCopyWarning), then bucket each patient's preex_years into a label.
ms_notes = note_all_result_df.copy()
ms_notes['label'] = ms_notes['preex_years'].apply(create_labels)

In [None]:
# Histogram of preex_years, drawn as one colored series per label group.
plt.figure(figsize=(10, 6))
# Half-year-wide bins covering 0 to 40 years.
bins = np.arange(0, 40.5, 0.5)

groups = ['1-3', '3-5', '5+']
colors = ['#264653', '#2A9D8F', '#E9C46A']
# offset_coef and counter are used to offset the labels on the plot:
# each successive group's label is placed offset_coef y-units lower.
offset_coef = 90
counter = 1

for group, color in zip(groups, colors):
    data = ms_notes[ms_notes['label'] == group]['preex_years']
    plt.hist(data, bins=bins, alpha=1, label=f'{group} years', color=color, 
             histtype='bar', rwidth=0.8)
    
    count = len(data)
    # Annotate the group with its size. x is the group mean shifted right by
    # 0.5; y is the current top of the y-axis minus offset_coef * (counter - 1).
    # The y-limit is read after this group's histogram is drawn, so it can
    # differ between iterations.
    plt.text(data.mean()+0.5, plt.gca().get_ylim()[1]+offset_coef-(offset_coef*counter), 
             f'[{group})\nn={count}', 
             ha='center', 
             va='bottom')
    counter += 1

plt.xlabel('Prodromal Years with Notes')
plt.ylabel('Frequency')
# The number in the title is the total row count of ms_notes (all labels).
plt.title(f'Distribution of Prodromal Period by Group ({len(ms_notes)})')
plt.legend()
plt.tight_layout()
plt.show()
