# Modeling Each Cohort and Model Variation

In [None]:
import duckdb
import pandas as pd
import numpy as np
import json
import os

# display and widgets
import ipywidgets as widgets
from IPython.display import display, Markdown

# time
import datetime
from dateutil.relativedelta import relativedelta

# strings
import re

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# data
from collections import Counter

# ML
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    auc
)
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
#! change the base_path to the IC data location in Wynton


# Functions for easy pulling of CDW data

def file_path_parquet(filename, datatype):
    """Build the glob pattern that matches every parquet part of one table.

    Args:
        filename: Name of the table directory inside the data-type folder.
        datatype: Name of the data-type folder under the data root.

    Returns:
        A string of the form ``path/to/ic/data/<datatype>/<filename>/*.parquet``.
    """
    return f"path/to/ic/data/{datatype}/{filename}/*.parquet"

def rtime():
    """Display a bold "Ran: <timestamp>" line coloured by the current weekday.

    The timestamp is the local time at the moment of the call, formatted as
    ``YYYY-MM-DD HH:MM:SS``. The colour depends on the weekday (Monday=red
    through Sunday=gray), so runs from different days are easy to tell apart.
    """
    # Position in the tuple == datetime.weekday() value (0=Monday ... 6=Sunday)
    weekday_colors = ('red', 'orange', 'green', 'blue', 'purple', 'brown', 'gray')

    now = datetime.datetime.now()
    weekday = now.weekday()

    # Fall back to black if the weekday is ever outside the known range
    if 0 <= weekday < len(weekday_colors):
        color = weekday_colors[weekday]
    else:
        color = 'black'

    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    display(Markdown(f"\n<b><span style='color:{color}'>Ran: {stamp}</span></b>\n"))
    
# Stamp this cell's output with the time it was executed
rtime()

In [None]:
#! change the path to scratch and the username


# Replace 'name' with your actual Wynton username; it is interpolated into the
# DuckDB temp directory path below.
username = 'name'

# Spill data that doesn't fit into memory into Wynton Scratch storage (BeeGFS)
# Increase up to 12 threads and 150 GB of memory to not overwhelm the system
# Recommendation: ~12 GB of memory for each thread
# reduce if there are other system limitations in place
config_query = f"""
    SET temp_directory = 'path/to/scratch/{username}/duckdb_dir';
    SET preserve_insertion_order = false;
    SET memory_limit = '150GB';
    SET threads TO 12;
"""

# Create an in-memory DuckDB connection and apply the settings above to it
con = duckdb.connect()
con_info = con.execute(config_query)  # Apply configuration settings

# NOTE(review): execute() presumably returns a connection/result handle, so
# this shows that object rather than the applied settings — confirm.
display(con_info)
rtime()

# Data

In [None]:
#! load whichever is relevant: uncomment exactly ONE read_parquet line below.
# The next cell reads `p_cohort`, so leaving every line commented out makes
# that cell fail with a NameError.
# NOTE(review): each pair below lists "p5_cohort_spoke.parquet" as its second
# file; for the p1 and p3 pairs this looks like a copy-paste slip (presumably
# p1_cohort_spoke / p3_cohort_spoke) — confirm the file names before running.

# p_cohort = pd.read_parquet("p1_cohort.parquet")
# p_cohort = pd.read_parquet("p5_cohort_spoke.parquet")

# p_cohort = pd.read_parquet("p3_cohort.parquet")
# p_cohort = pd.read_parquet("p5_cohort_spoke.parquet")

# p_cohort = pd.read_parquet("p5_cohort.parquet")
# p_cohort = pd.read_parquet("p5_cohort_spoke.parquet")

rtime()

In [None]:
# Work on a copy so p_cohort itself is left untouched, then move the patient
# identifier column out of the table and keep it as a plain list.
df = p_cohort.copy()
person_id_index = df.pop('patientepicid').to_list()

rtime()

In [None]:
# 'is_ms' is the outcome column; every remaining column is a predictor.
y = df['is_ms']
X = df.drop(columns='is_ms')

rtime()

In [None]:
# Names of all predictor columns, in the column order of X
cuis = X.columns.tolist()

# Models

In [None]:
def _resolve_feature_names(X, columns, psev_in):
    """Return one feature name per column of X as a pandas Index, or None if unknown."""
    if columns is not None:
        names = pd.Index(list(columns))
    elif hasattr(X, 'columns'):
        names = X.columns
    elif psev_in:
        # Bare matrix without labels: fall back to positional placeholder names
        names = pd.Index([f"feature_{i}" for i in range(np.shape(X)[1])])
    else:
        return None

    n_columns = np.shape(X)[1]
    if len(names) != n_columns:
        raise ValueError(
            f"Got {len(names)} feature names for a matrix with {n_columns} columns"
        )
    return names


def model_eval_metrics(X, y, model, scale=False, select=False, psev_in=False, columns=None):
    """Fit `model` on an 80/20 split of (X, y) and print/plot evaluation metrics.

    Steps, in order: train/test split (test_size=0.2, random_state=42),
    optional standard scaling (fit on the training part only), optional
    random-forest-based feature selection (SelectFromModel, at most 2000
    features), model fitting, then a classification report, ROC AUC, the
    largest/smallest coefficients (only for models exposing `coef_`),
    accuracy, and a confusion-matrix heatmap.

    Args:
        X: 2-D feature table; a DataFrame or a bare array.
        y: Labels, one per row of X.
        model: Unfitted estimator providing fit, predict and predict_proba.
        scale: If True, standardise the features with StandardScaler.
        select: If True, keep only the features chosen by SelectFromModel.
        psev_in: Set to True when X is a bare matrix without column labels.
            Feature names are then taken from `columns`, or positional
            placeholders ("feature_0", ...) are used when it is not given.
        columns: Optional sequence of feature names, one per column of X.
            Takes precedence over `X.columns`.

    Returns:
        The predicted probability of the positive class for each test row,
        or None (after printing a message) when no feature names can be
        determined. That check happens before any fitting.

    Raises:
        ValueError: If the number of feature names differs from the number
            of columns of X.
    """
    # Resolve names first: previously the psev_in path read an undefined
    # `columns` variable, and failures only surfaced after the expensive
    # random-forest fit.
    feature_names = _resolve_feature_names(X, columns, psev_in)
    if feature_names is None:
        print("The data does not have the correct format to interpret coefficients")
        return None

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale if requested (statistics come from the training part only)
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("Data scaled")

    # Feature selection
    if select:
        pre_selector = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=40)
        selector = SelectFromModel(pre_selector, prefit=False, max_features=2000)
        X_train = selector.fit_transform(X_train, y_train)
        X_test = selector.transform(X_test)
        # Keep the names aligned with the columns that survived selection
        feature_names = feature_names[selector.get_support()]
        print("Features selected")

    # Fit model and make predictions
    print("Fitting model...")
    model.fit(X_train, y_train)
    print("Model fit")
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Print evaluation metrics
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

    # Feature importance (coefficient-based models only)
    if hasattr(model, 'coef_'):
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': model.coef_[0]
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False)
        print("\nTop 10 Most Important Features:")
        print(feature_importance.head(10))
        print("\nBottom 10 Most Important Features:")
        print(feature_importance.sort_values('importance', ascending=True).head(10))

    # Accuracy and confusion matrix
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    return y_pred_proba

### Determining a good cutoff

In [None]:
# X_train / y_train / pre_selector only exist as locals inside
# model_eval_metrics(), so recreate them here with the same settings
# (same split seed, same forest) to analyse the importances it selects on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pre_selector = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=40)
pre_selector.fit(X_train, y_train)

# One importance score per feature, ranked from most to least important
importances = pre_selector.feature_importances_
feature_imp = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})
feature_imp = feature_imp.sort_values('importance', ascending=False)
importance_scores = feature_imp['importance']


plt.figure(figsize=(12, 6))

# Distribution plot
plt.subplot(1, 2, 1)
sns.histplot(importance_scores, bins=50)
plt.title('Distribution of Feature Importance Scores')
plt.xlabel('Importance Score')

# Cumulative importance plot (in ranked order)
plt.subplot(1, 2, 2)
cumulative_importance = np.cumsum(importance_scores)
plt.plot(range(len(cumulative_importance)), cumulative_importance)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.tight_layout()
plt.show()

# Elbow plot
plt.figure(figsize=(10, 6))
plt.plot(range(len(importance_scores)), importance_scores)
plt.title('Feature Importance Scores (Elbow Plot)')
plt.xlabel('Feature Rank')
plt.ylabel('Importance Score')
plt.show()

# summary statistics
print("\nImportance Score Statistics:")
print(importance_scores.describe())

# potential cutoff points
mean_importance = importance_scores.mean()
median_importance = importance_scores.median()
percentile_75 = importance_scores.quantile(0.75)

print("\nPotential Cutoff Points:")
print(f"Mean importance: {mean_importance:.6f}")
print(f"Median importance: {median_importance:.6f}")
print(f"75th percentile: {percentile_75:.6f}")

# features above different thresholds
print("\nNumber of features above thresholds:")
thresholds = [mean_importance, median_importance, percentile_75]
for threshold in thresholds:
    # Vectorised count instead of a Python-level sum over the boolean Series
    n_features = int((importance_scores > threshold).sum())
    print(f"Threshold {threshold:.6f}: {n_features} features")

# top features and their importance scores
print("\nTop 20 features and their importance scores:")
print(feature_imp.head(20))


In [None]:
# top features (up to 100) to see where importance drops significantly.
# Clamp to the number of available features so x and y always have the same
# length, and slice by position since feature_imp keeps its pre-sort index.
top_n = min(100, len(feature_imp))
top_importances = feature_imp['importance'].iloc[:top_n]

plt.figure(figsize=(12, 6))
plt.plot(range(top_n), top_importances)
plt.title(f'Top {top_n} Feature Importance Scores')
plt.xlabel('Feature Rank')
plt.ylabel('Importance Score')
plt.show()

# top 20 importance values to see the pattern
print("\nTop 20 importance values:")
print(feature_imp['importance'].iloc[:20])

In [None]:
# number of features needed for 80% of importance.
# feature_imp is ranked by descending importance, so the running total is
# non-decreasing. searchsorted gives the first position whose running total
# reaches 0.8; +1 turns that position into a feature count. (Counting the
# entries <= 0.8 instead stops one feature short of actually reaching 80%.)
cumsum = np.cumsum(feature_imp['importance'])
first_reaching = int(np.searchsorted(cumsum.to_numpy(), 0.8, side='left'))
n_features_80 = min(first_reaching + 1, len(cumsum))
print(f"Features needed for 80% importance: {n_features_80}")

## Models

In [None]:
# Baseline: all columns of X, with the function's defaults (no scaling, no
# feature selection). Returns the test-set probabilities of the positive class.
logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_baseline = model_eval_metrics(X, y, logreg)

In [None]:
# Same model with feature selection switched on (still unscaled).
# A fresh estimator is created so nothing carries over from the previous fit.
logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select = model_eval_metrics(X, y, logreg, select=True)

In [None]:
# Feature selection plus standard scaling of the features.
logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select_scale = model_eval_metrics(X, y, logreg, select=True, scale=True)

In [None]:
#! be sure to load in the p_cohort that has the new SPOKE mappings and make this the new X and y
# The call below is the same as in the previous cell; the only difference is
# the data, so X and y must be rebuilt from the SPOKE cohort before running it.

logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select_scale_spoke = model_eval_metrics(X, y, logreg, select=True, scale=True)

**SPOKE Model**

In [None]:
def load_psev_matrices(directory):
    """Load every PSEV matrix block in `directory` and join them column-wise.

    A matrix file is any entry whose name starts with
    ``filtered_patient_psevs_`` and does not contain ``columns``. For each
    one, the column names are read from the sibling file obtained by
    replacing that prefix with ``filtered_patient_psevs_columns_``. Blocks
    are processed in sorted file-name order and stacked horizontally, so all
    blocks must have the same number of rows. Patient ids are read from
    ``person_id_index.npy`` in the same directory.

    Args:
        directory: Path of the folder holding the ``.npy`` files.

    Returns:
        Tuple ``(combined_matrix, column_names, person_ids)``: the stacked
        matrix, a flat list of column names in the same order as the matrix
        columns, and the array loaded from ``person_id_index.npy``.

    Raises:
        ValueError: If the directory contains no matrix files.
    """
    matrix_prefix = 'filtered_patient_psevs_'
    columns_prefix = 'filtered_patient_psevs_columns_'

    # 'person_id_index.npy' can never match the prefix, so no separate
    # exclusion is needed for it.
    psev_files = sorted(
        f for f in os.listdir(directory)
        if f.startswith(matrix_prefix) and 'columns' not in f
    )
    if not psev_files:
        # Fail with an actionable message instead of np.hstack's generic
        # "need at least one array to concatenate"
        raise ValueError(
            f"No '{matrix_prefix}*' matrix files found in directory: {directory}"
        )

    matrices = []
    column_names = []

    print("Loading matrices:")
    for psev_file in psev_files:
        matrix = np.load(os.path.join(directory, psev_file))

        # Load corresponding column names.
        # NOTE: allow_pickle=True can execute arbitrary code stored in the
        # file — only point this function at trusted directories.
        col_file = psev_file.replace(matrix_prefix, columns_prefix)
        cols = np.load(os.path.join(directory, col_file), allow_pickle=True)

        print(f"{psev_file}: shape {matrix.shape}, {len(cols)} columns")

        matrices.append(matrix)
        column_names.extend(cols)

    # concatenate horizontally
    combined_matrix = np.hstack(matrices)

    # load person ids (same trusted-input caveat as above)
    person_ids = np.load(os.path.join(directory, 'person_id_index.npy'), allow_pickle=True)

    print(f"\nFinal matrix shape: {combined_matrix.shape}")
    print(f"Number of columns: {len(column_names)}")
    print(f"Number of patients: {len(person_ids)}")

    return combined_matrix, column_names, person_ids

In [None]:
# Folder with the PSEV blocks, their column-name files and person_id_index.npy
directory = 'data/psev/p5'
# psev_mat: stacked matrix, psev_cols: column names, psev_pats: patient ids
psev_mat, psev_cols, psev_pats = load_psev_matrices(directory)

In [None]:
#! be sure to load in the p_cohort that has the new SPOKE mappings and make this the new X and y
# NOTE(review): the rows of psev_mat must be in the same patient order as y
# (compare psev_pats with person_id_index) — confirm before trusting the metrics.

# Attach the PSEV column names to the matrix so model_eval_metrics can read
# the feature names from X.columns. Passing the bare array with psev_in=True
# relied on a global `columns` variable that is never defined.
X_psev = pd.DataFrame(psev_mat, columns=psev_cols)

logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
# Stored under its own name so the SPOKE-CUI result in y_select_scale_spoke
# is not overwritten.
y_psev = model_eval_metrics(X_psev, y, logreg, select=True, scale=True)

## Interpretation

In [None]:
def calculate_95_ci(data):
    """Return the normal-approximation 95% confidence interval of the mean.

    Args:
        data: Sequence of numeric observations.

    Returns:
        Tuple ``(lower, upper)`` equal to mean -/+ 1.96 standard errors,
        where the standard error uses the sample standard deviation (ddof=1).
    """
    z_critical = 1.96  # two-sided 95% critical value of the normal distribution
    sample_size = len(data)

    center = np.mean(data)
    standard_error = np.std(data, ddof=1) / np.sqrt(sample_size)
    half_width = z_critical * standard_error

    return (center - half_width, center + half_width)


# ci = calculate_95_ci(roc_scores)
# print(f"95% CI: ({ci[0]:.4f}, {ci[1]:.4f})")

In [None]:
# Plot a figure of all the AUC ROCs

# Set the background colours BEFORE creating the figure: rcParams are read
# when a figure is created, so changing 'figure.facecolor' afterwards has no
# effect on a figure that already exists.
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.figure(figsize=(6, 4), dpi=300)



#! Below are all the lines for different y_pred_proba for each model
# NOTE: model_eval_metrics() does not return y_test. Recreate it with the
# same split it uses, e.g.
#   _, _, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# using the X / y that were active when the respective model was run.
# NOTE(review): the *_y_pred_proba names below are placeholders; replace them
# with the variables returned by the model cells above.


# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_select_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#4B8BCB', lw=2.5,  
#          label=f'All CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_scale_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#EED91F', lw=2.5,  
#          label=f'Select CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_spoke_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#FF8C42', lw=2.5,  
#          label=f'SPOKE CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, psev_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# # NOTE(review): this label reports np.mean(roc_scores) instead of the
# # roc_auc computed just above, and roc_scores is not defined in this
# # notebook — confirm which value is intended.
# plt.plot(fpr, tpr, color='#FF3C38', lw=2.5,  
#          label=f'PSEV (AUC = {np.mean(roc_scores):.4f})')


# Diagonal reference line: the expected curve of a random classifier
plt.plot([0, 1], [0, 1], color='#7B7D7D', linestyle='--', lw=1.5,  
         label='Random (AUC = 0.5)')

# Slight head-room above 1.0 so curves touching the top edge stay visible
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=14, labelpad=10)
plt.title('Model Performance: ROC Curve', fontsize=16, pad=20)
plt.legend(loc='lower right', fontsize=12, framealpha=0.9)

plt.grid(True, linestyle='--', alpha=0.3)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()