# Concept Mapping and Generation of PSEVs for Each Cohort

In [None]:
import duckdb
import pandas as pd
import numpy as np
import json
import os

# display and widgets
import ipywidgets as widgets
from IPython.display import display, Markdown

# time
import datetime
from dateutil.relativedelta import relativedelta

# strings
import re

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# data
from collections import Counter

# ML
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    auc
)
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


import requests
from typing import Dict, List, Set
import time
from collections import defaultdict
import concurrent.futures
import statistics
from tqdm import tqdm

In [None]:
#! change the base_path to the IC data location in Wynton


# Functions for easy pulling of CDW data

def file_path_parquet(filename, datatype):
    """Build the glob pattern matching every parquet part of one table.

    Args:
        filename: Name of the table directory under the datatype folder.
        datatype: Sub-folder of the data root that holds the table.

    Returns:
        A string of the form ``path/to/ic/data/<datatype>/<filename>/*.parquet``.
    """
    # The data root is a placeholder; edit it to point at the real location.
    table_dir = f"path/to/ic/data/{datatype}/" + f"{filename}"
    return table_dir + "/*.parquet"

def rtime():
    """Display a bold "Ran: <timestamp>" marker coloured by day of the week.

    The timestamp is the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``; it is rendered through IPython's Markdown display.
    """
    now = datetime.datetime.now()

    # One colour per weekday, ordered Monday (0) through Sunday (6) to match
    # the numbering returned by datetime.weekday().
    weekday_colors = ('red', 'orange', 'green', 'blue', 'purple', 'brown', 'gray')
    text_color = weekday_colors[now.weekday()]

    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    display(Markdown(f"\n<b><span style='color:{text_color}'>Ran: {stamp}</span></b>\n"))
    
rtime()

In [None]:
#! change the path to scratch and the username


# Replace 'name' with your actual Wynton username; it is interpolated into
# the scratch path used for DuckDB's temp directory below.
username = 'name'

# Spill data that doesn't fit into memory into Wynton Scratch storage (BeeGFS).
# Use at most 12 threads and 150 GB of memory so the system is not overwhelmed.
# Recommendation: ~12 GB of memory for each thread;
# reduce both if there are other system limitations in place.
config_query = f"""
    SET temp_directory = 'path/to/scratch/{username}/duckdb_dir';
    SET preserve_insertion_order = false;
    SET memory_limit = '150GB';
    SET threads TO 12;
"""

# Create a connection (no database file is given) and apply the settings above
con = duckdb.connect()
con_info = con.execute(config_query)  # Apply configuration settings

# Show the object returned by execute() and stamp the cell with the run time
display(con_info)
rtime()

# Data

In [None]:
#! load whichever is relevant

# p_cohort = pd.read_parquet("p1_cohort.parquet")

# p_cohort = pd.read_parquet("p3_cohort.parquet")

# p_cohort = pd.read_parquet("p5_cohort.parquet")

rtime()

In [None]:
df = p_cohort.copy()
person_id_index = df['patientepicid'].to_list()
df.drop('patientepicid', axis=1, inplace=True)

rtime()

In [None]:
X = df.drop('is_ms', axis=1)
y = df['is_ms']

rtime()

In [None]:
cuis = X.columns.tolist()

# Mapping

In [None]:
# BioPortalMapper (BPM) from CUI to any other ontology selected in the BioPortal interface
class BMP_from_CUI:
    """Look up, for each UMLS CUI, the matching classes in BioPortal ontologies.

    Each CUI is sent to the BioPortal ``/search`` endpoint. Results whose own
    ``cui`` list contains the queried CUI are grouped by ontology acronym.

    Args:
        api_key: BioPortal API key, sent in the ``Authorization`` header.
        max_concurrent: Number of worker threads used by ``batch_process_cuis``.
        monitor_performance: When true, the duration of each successful
            request is appended to ``response_times``.
        timeout: Seconds to wait on each HTTP request before it is abandoned.
            Without a timeout a stalled connection would block its worker
            thread indefinitely.
    """

    def __init__(self, api_key: str, max_concurrent: int = 10, monitor_performance: bool = True,
                 timeout: float = 30.0):
        self.api_key = api_key
        # NOTE(review): with a plain http:// URL the API key in the header is
        # sent unencrypted. Confirm whether the https:// endpoint can be used.
        self.base_url = "http://data.bioontology.org"
        self.headers = {
            "Authorization": f"apikey token={api_key}",
            "Accept": "application/json"
        }
        self.max_concurrent = max_concurrent
        self.response_times = []
        self.monitor_performance = monitor_performance
        self.timeout = timeout

    def extract_id_from_url(self, url: str, ontology: str) -> str:
        """Return the last path segment of ``url`` ("" for an empty URL).

        ``ontology`` is accepted for interface compatibility but is not used.
        """
        return url.split('/')[-1] if url else ""

    def get_mappings_for_cui(self, cui: str) -> Dict:
        """Get mappings for a single CUI.

        Returns:
            ``{ontology_acronym: {"terms": [{"id": ..., "label": ...}, ...]}}``.
            An empty dict is returned when the request fails (the error is
            printed) or when no result lists ``cui`` among its CUIs.
        """
        mappings = defaultdict(set)

        start_time = time.time()
        try:
            response = requests.get(
                f"{self.base_url}/search",
                headers=self.headers,
                params={
                    "q": cui,
                    "require_exact_match": "false",
                    "pagesize": 50,
                    "include": "prefLabel,cui",
                    "display_context": "false",
                    "display_links": "true"
                },
                # A timeout raises requests.exceptions.Timeout, which is a
                # RequestException and is therefore handled below.
                timeout=self.timeout,
            )
            response.raise_for_status()

            if self.monitor_performance:
                self.response_times.append(time.time() - start_time)

            data = response.json()

            for result in data.get("collection", []):
                links = result.get("links", {})
                ont_url = links.get("ontology", "")
                ont_acronym = ont_url.split("/")[-1] if ont_url else ""

                class_url = result.get("@id", "")
                pref_label = result.get("prefLabel", "")
                cui_list = result.get("cui", [])

                class_id = self.extract_id_from_url(class_url, ont_acronym)

                # The search is not an exact match, so keep only results that
                # explicitly list the queried CUI.
                if ont_acronym and class_id and cui in cui_list:
                    mappings[ont_acronym].add((class_id, pref_label))

        except requests.exceptions.RequestException as e:
            print(f"Error processing CUI {cui}: {str(e)}")
            return {}

        return {
            acronym: {"terms": [{"id": term_id, "label": label} for term_id, label in terms]}
            for acronym, terms in mappings.items()
        }

    def process_cui_batch(self, cuis: List[str]) -> Dict[str, Dict]:
        """Process a batch of CUIs sequentially, pausing between requests."""
        results = {}
        for cui in cuis:
            results[cui] = self.get_mappings_for_cui(cui)
            time.sleep(0.07)  # rate limit (<15 requests/second)
        return results

    def batch_process_cuis(self, cui_list: List[str], batch_size: int = 100) -> Dict[str, Dict]:
        """
        Process CUIs in batches using concurrent processing.

        Args:
            cui_list: CUIs to look up.
            batch_size: Number of CUIs handled by one worker task.

        Returns:
            Dict from CUI to the result of ``get_mappings_for_cui``. CUIs from
            a batch that raised are missing (the error is printed).
        """
        all_mappings = {}
        total_batches = (len(cui_list) + batch_size - 1) // batch_size

        # Split CUIs into batches
        cui_batches = [cui_list[i:i + batch_size] for i in range(0, len(cui_list), batch_size)]

        print(f"\nProcessing {len(cui_list)} CUIs in {total_batches} batches")

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:

            future_to_batch = {
                executor.submit(self.process_cui_batch, batch): i
                for i, batch in enumerate(cui_batches)
            }

            with tqdm(total=len(cui_batches), desc="Processing batches") as pbar:
                for future in concurrent.futures.as_completed(future_to_batch):
                    batch_num = future_to_batch[future]
                    try:
                        batch_results = future.result()
                        all_mappings.update(batch_results)

                        if self.monitor_performance and self.response_times:
                            avg_response = statistics.mean(self.response_times[-100:])
                            pbar.set_postfix({
                                'avg_response': f'{avg_response:.2f}s',
                                'batch': batch_num
                            })

                            # NOTE: the executor above was already created
                            # with the earlier max_concurrent, so this only
                            # changes the pool size of later calls.
                            if avg_response > 2.0:
                                self.max_concurrent = max(1, self.max_concurrent - 1)
                            elif avg_response < 1.0:
                                self.max_concurrent = min(15, self.max_concurrent + 1)

                    except Exception as e:
                        # Best effort: report the failed batch and keep going.
                        print(f"\nError processing batch {batch_num}: {str(e)}")

                    pbar.update(1)

        return all_mappings
    
    

# BioPortalMapper (BPM) from SPOKE embedding identifiers to CUIs; the resulting mapping can be reversed (CUI -> SPOKE identifier) with reverse_dict below
class BPM_SPOKE_to_CUI:
    """Resolve terms (CUIs, ``DOID:`` identifiers or other strings) to UMLS CUIs.

    Lookups go through the BioPortal REST API.

    Args:
        api_key: BioPortal API key, sent in the ``Authorization`` header.
        max_concurrent: Number of worker threads used by ``batch_process_cuis``.
        monitor_performance: When true, the duration of every ``get_cuis``
            call is appended to ``response_times``.
        timeout: Seconds to wait on each HTTP request before it is abandoned.
            Without a timeout a stalled connection would block its worker
            thread indefinitely.
    """

    def __init__(self, api_key: str, max_concurrent: int = 10, monitor_performance: bool = True,
                 timeout: float = 30.0):
        self.api_key = api_key
        # NOTE(review): with a plain http:// URL the API key in the header is
        # sent unencrypted. Confirm whether the https:// endpoint can be used.
        self.base_url = "http://data.bioontology.org"
        self.headers = {
            "Authorization": f"apikey token={api_key}",
            "Accept": "application/json"
        }
        self.max_concurrent = max_concurrent
        self.response_times = []
        self.monitor_performance = monitor_performance
        self.timeout = timeout

    def get_cuis(self, term: str) -> List[str]:
        """Get all CUIs for a given term.

        Returns:
            The CUIs found, in no particular order (they are collected in a
            set). Request errors are printed and whatever was collected before
            the error is returned.
        """
        start_time = time.time()
        cuis = set()

        try:
            # A term shaped like a CUI ("C" followed by digits) counts as its own CUI
            if term.startswith('C') and term[1:].isdigit():
                cuis.add(term)

            # If DOID, get CUIs from database_cross_reference
            elif term.startswith('DOID:'):
                doid_num = term.replace('DOID:', '')
                response = requests.get(
                    f"{self.base_url}/ontologies/DOID/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FDOID_{doid_num}",
                    headers=self.headers,
                    params={"include": "properties"},
                    timeout=self.timeout,
                )

                if response.status_code == 200:
                    data = response.json()
                    if 'properties' in data:
                        # Get CUIs from hasDbXref
                        xrefs = data['properties'].get('http://www.geneontology.org/formats/oboInOwl#hasDbXref', [])
                        for ref in xrefs:
                            if ref.startswith('UMLS_CUI:'):
                                cuis.add(ref.replace('UMLS_CUI:', ''))

            # Search for every non-DOID term, and for DOID terms whose
            # cross-references yielded nothing.
            # NOTE(review): CUI-shaped terms are searched too, although they
            # were already added above. Confirm whether that extra request
            # (and any additional CUIs it returns) is intended.
            if not cuis or not term.startswith('DOID:'):
                response = requests.get(
                    f"{self.base_url}/search",
                    headers=self.headers,
                    params={
                        "q": term,
                        "require_exact_match": "true",
                        "include": "cui",
                        "display_context": "false",
                        "pagesize": 100,
                        "ontologies": "UMLS"  # target UMLS ontology
                    },
                    timeout=self.timeout,
                )

                if response.status_code == 200:
                    data = response.json()
                    for result in data.get("collection", []):
                        # The "cui" field is handled both as a list and as a
                        # single string.
                        found = result.get("cui")
                        if isinstance(found, list):
                            cuis.update(found)
                        elif isinstance(found, str):
                            cuis.add(found)

        except requests.exceptions.RequestException as e:
            # Includes timeouts; keep whatever was collected so far.
            print(f"Error processing term {term}: {str(e)}")
        finally:
            if self.monitor_performance:
                self.response_times.append(time.time() - start_time)

        return list(cuis)

    def process_cui_batch(self, terms: List[str]) -> Dict[str, List[str]]:
        """Process a batch of terms and return their CUIs.

        Terms for which no CUI was found are left out of the result.
        """
        results = {}
        for term in terms:
            cuis = self.get_cuis(term)
            if cuis:
                results[term] = cuis
            time.sleep(0.07)  # rate limit
        return results

    def batch_process_cuis(self, term_list: List[str], batch_size: int = 100) -> Dict[str, List[str]]:
        """Process terms in batches using concurrent processing.

        Args:
            term_list: Terms to resolve.
            batch_size: Number of terms handled by one worker task.

        Returns:
            Dict from term to its list of CUIs. Terms without CUIs, and terms
            from a batch that raised (the error is printed), are missing.
        """
        all_mappings = {}
        total_batches = (len(term_list) + batch_size - 1) // batch_size

        term_batches = [term_list[i:i + batch_size] for i in range(0, len(term_list), batch_size)]

        print(f"\nProcessing {len(term_list)} terms in {total_batches} batches")

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
            future_to_batch = {
                executor.submit(self.process_cui_batch, batch): i
                for i, batch in enumerate(term_batches)
            }

            with tqdm(total=len(term_batches), desc="Processing batches") as pbar:
                for future in concurrent.futures.as_completed(future_to_batch):
                    batch_num = future_to_batch[future]
                    try:
                        batch_results = future.result()
                        all_mappings.update(batch_results)

                        if self.monitor_performance and self.response_times:
                            avg_response = statistics.mean(self.response_times[-100:])
                            pbar.set_postfix({
                                'avg_response': f'{avg_response:.2f}s',
                                'batch': batch_num
                            })
                    except Exception as e:
                        # Best effort: report the failed batch and keep going.
                        print(f"\nError processing batch {batch_num}: {str(e)}")

                    pbar.update(1)

        return all_mappings

## Mapping AWAY from CUI


Maps AWAY from CUIs to the designated ontologies in BioPortal. The set of designated ontologies can be chosen by the user in the BioPortal web interface; limiting it gives faster response times.

In [None]:
API_KEY = "your-API-key"

mapper = BMP_from_CUI(API_KEY, max_concurrent=5, monitor_performance=True)

mappings = mapper.batch_process_cuis(cuis, batch_size=20)

print(f"\nProcessed {len(mappings)} CUIs")
print(f"Average response time: {statistics.mean(mapper.response_times):.2f} seconds")

In [None]:
# save dictionary to a JSON file
with open('mappings.json', 'w') as json_file:
    json.dump(mappings, json_file)

## Mapping TO CUI from SPOKE

In [None]:
spoke = np.load('PSEV_matrix')
sep = np.load('PSEV_SEP_map')
spoke_node = np.load('PSEV_SPOKE_node_map')

spoke = pd.DataFrame(spoke, columns=spoke_node)
spoke.index = sep
spoke.index = spoke.index.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
spoke.columns = spoke.columns.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

In [None]:
# get the indices
spoke_ind = list(spoke.index)

In [None]:
API_KEY = "your-API-key"

mapper = BPM_SPOKE_to_CUI(API_KEY, max_concurrent=5, monitor_performance=True)

spoke_ind_mini = spoke_ind

spoke_mappings = mapper.batch_process_cuis(spoke_ind_mini, batch_size=5)

print(f"\nProcessed {len(spoke_mappings)} CUIs")
print(f"Average response time: {statistics.mean(mapper.response_times):.2f} seconds")

In [None]:
# save dictionary to a JSON file
with open('spoke_mappings.json', 'w') as json_file:
    json.dump(spoke_mappings, json_file)

In [None]:
len(spoke_mappings)

In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Function to reverse the dictionary
def reverse_dict(original):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    When the same value is listed under several keys, the key that comes
    last in iteration order wins.
    """
    return {
        member: owner
        for owner, members in original.items()
        for member in members
    }

# Function to combine columns and rename them based on the mapping
def combine_columns(df, mapping):
    """Collapse columns of ``df`` that share a target name in ``mapping``.

    Columns missing from ``mapping`` are dropped. The remaining columns are
    grouped by ``mapping[column]`` and each group is summed row-wise into a
    single column named after the target.

    Args:
        df: DataFrame whose column labels are looked up in ``mapping``.
        mapping: Dict from original column label to new column label.

    Returns:
        A new DataFrame with one column per distinct target label.
    """
    # Keep only the columns the mapping knows about
    kept = df[[label for label in df.columns if label in mapping]]

    # target label -> source columns that feed into it
    members_by_target = defaultdict(list)
    for label in kept.columns:
        members_by_target[mapping[label]].append(label)

    # Sum each group row-wise, then build the frame in one go
    summed = {}
    for target, members in members_by_target.items():
        summed[target] = kept[members].sum(axis=1)

    return pd.DataFrame(summed)

# Reverse the dictionary
spoke_mappings_rev = reverse_dict(spoke_mappings)
print("Reversed spoke mapping")

In [None]:
#! Apply the transformation
#! This is important to accurately merge the CUI data with SPOKE data

X_rev = combine_columns(X, spoke_mappings_rev)

# Create Embeddings

In [None]:
def chunk_dot_product(df, matrix, num_chunks):
    """Compute ``df.values @ matrix.values`` one block of rows at a time.

    Args:
        df: Left operand; its rows are split into ``num_chunks`` consecutive
            blocks.
        matrix: Right operand; used whole for every block.
        num_chunks: Number of row blocks. Trailing blocks may be empty when
            it exceeds the number of rows.

    Returns:
        The per-block products stacked vertically into one array.
    """
    total_rows = df.shape[0]
    # Ceiling division so that num_chunks blocks cover every row
    rows_per_chunk = (total_rows + num_chunks - 1) // num_chunks

    partial_products = [
        np.dot(
            df.iloc[k * rows_per_chunk:min((k + 1) * rows_per_chunk, total_rows)].values,
            matrix.values,
        )
        for k in range(num_chunks)
    ]

    return np.vstack(partial_products)

In [None]:
#! Remember to load the X for the proper cohort above in the `Data` section
#! Then, obtain the combined version of the data in the `Mapping TO CUI from SPOKE` section to get X_rev

concept_list = list(X_rev.columns)

In [None]:
#Remove SEPs that are not in our cohort
spoke_filt = spoke.loc[spoke.index.isin(concept_list)]

In [None]:
# PSEVs for each patient.
# Use the SPOKE-mapped matrix X_rev (not the raw CUI matrix X): its columns
# are the SEP identifiers that index the rows of spoke_filt. The columns are
# reordered to follow spoke_filt.index so that column j of the left operand
# lines up with row j of the right operand.
patient_psevs = np.dot(X_rev.loc[:, spoke_filt.index].values, spoke_filt.values)

In [None]:
patient_psevs.shape

In [None]:
#! save the data
#! do this for each cohort!

np.save('p_cohort.npy', patient_psevs)

## Segment SPOKE top 30% variance

In [None]:
node_type = np.load('node_type_list.npy')
node_type = [x.decode('utf-8') if isinstance(x, bytes) else x for x in node_type]
node_type = pd.DataFrame({
    'node': spoke.columns,
    'type': node_type
})

In [None]:
# Directory that receives the per-node-type outputs of this cell.
# Change it for each cohort. It is created if it does not exist yet,
# since np.save fails on a missing directory.
psev_out_dir = 'p5'
os.makedirs(psev_out_dir, exist_ok=True)

unique_node_types = node_type['type'].unique()

for nt in unique_node_types:
    # Boolean mask over SPOKE nodes of this type. Row i of node_type describes
    # column i of patient_psevs, because both follow spoke.columns.
    type_mask = (node_type['type'] == nt).to_numpy()
    nt_patient_psevs = patient_psevs[:, type_mask]
    nodes = node_type.loc[type_mask, 'node'].to_numpy()

    # Step 1: Calculate variance for each column, in chunks of columns
    chunk_size = 1000
    num_columns = nt_patient_psevs.shape[1]
    variances = []

    for start in range(0, num_columns, chunk_size):
        end = min(start + chunk_size, num_columns)
        chunk = nt_patient_psevs[:, start:end]
        chunk_variances = np.var(chunk, axis=0)
        variances.extend(chunk_variances)

    variances = np.array(variances)

    # Step 2: Determine the threshold for the top 30%
    threshold = np.percentile(variances, 70)  # 70th percentile

    # Step 3: Find the columns with variance strictly above the threshold
    selected_columns = variances > threshold

    # Step 4: Filter the array (and the node labels) to retain only these columns
    filtered_array = nt_patient_psevs[:, selected_columns]
    nodes = nodes[selected_columns]

    # Save the node-type-specific matrix and its column labels
    np.save(os.path.join(psev_out_dir, f'filtered_patient_psevs_{nt}.npy'), filtered_array)
    np.save(os.path.join(psev_out_dir, f'filtered_patient_psevs_columns_{nt}.npy'), nodes)

# Patient ids in the same row order as the saved matrices
np.save(os.path.join(psev_out_dir, 'person_id_index.npy'), np.array(person_id_index))

In [None]:
logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select = model_eval_metrics(X, y, logreg, select=True)

In [None]:
logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select_scale = model_eval_metrics(X, y, logreg, select=True, scale=True)

In [None]:
#! be sure to load in the p_cohort that has the new SPOKE mappings and make this the new X and y

logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select_scale_spoke = model_eval_metrics(X, y, logreg, select=True, scale=True)

**SPOKE Model**

In [None]:
def load_psev_matrices(directory):
    """Load every per-node-type PSEV matrix in ``directory`` and join them.

    Value files are named ``filtered_patient_psevs_<type>.npy``; each has a
    companion ``filtered_patient_psevs_columns_<type>.npy`` holding its column
    labels. Files are processed in sorted name order.

    Args:
        directory: Folder containing the ``.npy`` files and
            ``person_id_index.npy``.

    Returns:
        Tuple ``(combined_matrix, column_names, person_ids)``: the matrices
        concatenated column-wise, the flat list of column labels in the same
        order, and the array loaded from ``person_id_index.npy``.
    """
    # Value files carry the prefix but not the word 'columns'. The prefix
    # test already excludes 'person_id_index.npy'.
    value_files = sorted(
        name for name in os.listdir(directory)
        if name.startswith('filtered_patient_psevs_') and 'columns' not in name
    )

    matrices = []
    column_names = []

    print("Loading matrices:")
    for value_file in value_files:
        values = np.load(os.path.join(directory, value_file))

        # The label file has the same name with 'columns_' inserted after the
        # prefix. allow_pickle=True is passed so that label arrays stored with
        # pickling can be read; only load files you trust.
        label_file = value_file.replace('filtered_patient_psevs_',
                                        'filtered_patient_psevs_columns_')
        labels = np.load(os.path.join(directory, label_file), allow_pickle=True)

        print(f"{value_file}: shape {values.shape}, {len(labels)} columns")

        matrices.append(values)
        column_names.extend(labels)

    # concatenate horizontally
    combined_matrix = np.hstack(matrices)

    # load person ids
    person_ids = np.load(os.path.join(directory, 'person_id_index.npy'), allow_pickle=True)

    print(f"\nFinal matrix shape: {combined_matrix.shape}")
    print(f"Number of columns: {len(column_names)}")
    print(f"Number of patients: {len(person_ids)}")

    return combined_matrix, column_names, person_ids

In [None]:
directory = 'data/psev/p5'
psev_mat, psev_cols, psev_pats = load_psev_matrices(directory)

In [None]:
#! be sure to load in the p_cohort that has the new SPOKE mappings and make this the new X and y

logreg = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=40, solver='saga')
y_select_scale_spoke = model_eval_metrics(psev_mat, y, logreg, select=True, scale=True, psev_in=True)

## Interpretation

In [None]:
def calculate_95_ci(data):
    """Return the 95% confidence interval of the mean of ``data``.

    The interval is ``mean ± 1.96 * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (``ddof=1``) and ``n`` is ``len(data)``.

    Returns:
        Tuple ``(lower, upper)``.
    """
    center = np.mean(data)
    sample_std = np.std(data, ddof=1)  # ddof=1 for sample standard deviation

    # 1.96 is the critical value used for a 95% interval
    half_width = 1.96 * (sample_std / np.sqrt(len(data)))

    return (center - half_width, center + half_width)


# ci = calculate_95_ci(roc_scores)
# print(f"95% CI: ({ci[0]:.4f}, {ci[1]:.4f})")

In [None]:
# Plot a figure of all the AUC ROCs

# Set the rcParams before creating the figure: a figure takes its default
# face colour from rcParams when it is created, so setting them afterwards
# would not affect this figure.
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.figure(figsize=(6, 4), dpi=300)



#! Below are all the lines for different y_pred_proba for each model


# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_select_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#4B8BCB', lw=2.5,  
#          label=f'All CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_scale_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#EED91F', lw=2.5,  
#          label=f'Select CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, no_spoke_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#FF8C42', lw=2.5,  
#          label=f'SPOKE CUIs (AUC = {roc_auc:.4f})')

# # Calculate ROC curve
# fpr, tpr, _ = roc_curve(y_test, psev_y_pred_proba)
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve with more prominent colors
# plt.plot(fpr, tpr, color='#FF3C38', lw=2.5,  
#          label=f'PSEV (AUC = {np.mean(roc_scores):.4f})')


plt.plot([0, 1], [0, 1], color='#7B7D7D', linestyle='--', lw=1.5,  
         label='Random (AUC = 0.5)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=14, labelpad=10)
plt.title('Model Performance: ROC Curve', fontsize=16, pad=20)
plt.legend(loc='lower right', fontsize=12, framealpha=0.9)

plt.grid(True, linestyle='--', alpha=0.3)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()