# Before generating SPOKEsigs you will need EHR data...

This code used EHR data from PatientExplorer RDB (tables: Diagnosis, Medications, and Labs)<br/>

# Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import requests
from requests.auth import HTTPBasicAuth
from retrying import retry
import warnings
from config import read_config
warnings.filterwarnings("ignore")

# Initialize file paths

In [12]:
data_path = os.path.join(os.path.dirname(os.getcwd()),'data')
omop_to_spoke_cdw_file = os.path.join(data_path, 'omop_to_spoke_cdw.tsv')
cohort_patient_spokesigs_file = os.path.join(data_path, 'cohort_patient_spokesigs.npy')
cohort_patient_info_file = os.path.join(data_path, 'cohort_patient_info.tsv')

# Files from patient explorer

In [13]:
diag_filename = os.path.join(data_path, 'breast_colon_ibd_conditions.csv')
med_filename = os.path.join(data_path, 'breast_colon_ibd_drugs.csv')
lab_filename = os.path.join(data_path, 'breast_colon_ibd_measurements_w_ab.csv')
new_patient_info_filename = os.path.join(data_path, 'random_patient_info.tsv')
new_patient_spokesig_filename = os.path.join(data_path, 'random_patient_spokesigs.npy')

if not os.path.exists(diag_filename):
    print('diag_filename is not saved. save it before proceeding.')
if not os.path.exists(med_filename):
    print('med_filename is not saved. save it before proceeding.')
if not os.path.exists(lab_filename):
    print('lab_filename is not saved. save it before proceeding.')
if not os.path.exists(new_patient_info_filename):
    print('new_patient_info_filename is not saved. save it before proceeding.')
if not os.path.exists(new_patient_spokesig_filename):
    print('new_patient_spokesig_filename is not saved. save it before proceeding.')

diag_filename is not saved. save it before proceeding.
med_filename is not saved. save it before proceeding.
lab_filename is not saved. save it before proceeding.
new_patient_info_filename is not saved. save it before proceeding.
new_patient_spokesig_filename is not saved. save it before proceeding.


# Custom functions

In [8]:
@retry(stop_max_attempt_number=50, wait_fixed=10000)
def get_api_resp(clinical_variables):
    return requests.get(url, params={'src':clinical_variables}, auth = HTTPBasicAuth(usr, psw), verify=False)

def make_spoke_sigs(all_concept_df):
    patients_seen, spoke_sigs = [], []
    for patient, clinical_variables in all_concept_df[['Patient_Index', 'concept_id']].values:
        result = get_api_resp(clinical_variables)
        if result.status_code == 200:
            if len(result.json()['spoke_sig']) >0:
                print("API call was successfull, spoke signature for patient %s is available in 'spoke_sig' variable" % patient)
                patients_seen.append(patient)
                spoke_sigs.append(result.json()['spoke_sig'])
            else:
                print('API call could not create a spoke signature for patient %s' % patient)
        else:
            print('API call was not successfull for patient %s, returned %s HTTP status code' % (patient, str(result.status_code)))    
    return np.array(patients_seen), np.array(spoke_sigs)

def load_and_filter_rdb_data(diag_filename, med_filename, lab_filename, omop_to_spoke):
    file_list = [diag_filename, med_filename, lab_filename]
    col_change = [{"icd10_code":'ICD10CM',"icd9_code":'ICD9CM'}, {'medication_id':'MED'}, {'lab_component_id':'LAB'}]
    domain_list = ['Condition', 'Drug', 'Measurement']
    all_rdb_db = pd.DataFrame(columns=['patient_id','rdb_id'])
    for i, (filename, col_change_dict) in enumerate(zip(file_list, col_change)):
        df = pd.read_csv(filename, header=0, index_col=False).rename(columns=col_change_dict)
        if domain_list[i] == 'Measurement':
            df = df[df.lab_result_abnormal=='Yes'].drop(['lab_result_abnormal'],axis=1)
        for col in list(col_change_dict.values()):
            df.loc[:,col] = ['%s:%s' % (col, rdb) for rdb in df[col].values]
            all_rdb_db = pd.concat((all_rdb_db, df[['patient_id', col]].rename(index=str, columns={col:'rdb_id'})),axis=0)
        del df
    all_rdb_db = pd.merge(all_rdb_db.rename(columns={'rdb_id':'Node_ID'}), omop_to_spoke[['Node_ID', 'concept_id', 'SPOKE_ID']], on='Node_ID')
    all_rdb_db.loc[:, 'concept_id'] = ['OMOP:%s' % concept_id for concept_id in all_rdb_db.concept_id.values]
    all_rdb_db = all_rdb_db.drop(['Node_ID'], axis=1).drop_duplicates()
    return all_rdb_db

def filter_by_sep_count(all_rdb_db, max_seps = 200):
    sep_count = all_rdb_db[['patient_id', 'SPOKE_ID']].drop_duplicates().groupby('patient_id').count().reset_index()
    print(sep_count.patient_id.unique().shape)
    sep_count = sep_count[sep_count.SPOKE_ID<=max_seps]
    print(sep_count.patient_id.unique().shape)
    all_rdb_db = pd.merge(all_rdb_db, sep_count[['patient_id']], on='patient_id')
    del sep_count
    return all_rdb_db

# API specifications

In [2]:
base_url = 'https://spokeapi.ucsf.edu'
end_point = '/v1/spoke_sig'
c = dict(read_config().items('API'))
usr = c['user']
psw = c['psw']
url = base_url + end_point

# load OMOP+RDB to SPOKE

In [10]:
omop_to_spoke = pd.read_csv(omop_to_spoke_cdw_file, sep='\t', header=0, index_col=False)
omop_to_spoke.head()

Unnamed: 0,OMOP_ID,Node_ID,SPOKE_ID,Node_type,Node_name,concept_id,map_id
0,OMOP:8690,ICD10CM:M76.9,DOID:204,Disease,enthesopathy,8690,M76.9
1,OMOP:1571038,ICD10CM:M76,DOID:204,Disease,enthesopathy,1571038,M76
2,OMOP:1571047,ICD10CM:M76.8,DOID:204,Disease,enthesopathy,1571047,M76.8
3,OMOP:35208994,ICD10CM:M76.89,DOID:204,Disease,enthesopathy,35208994,M76.89
4,OMOP:45567656,ICD10CM:M76.891,DOID:204,Disease,enthesopathy,45567656,M76.891


# Load and filter patient data

In [3]:
all_rdb_db = load_and_filter_rdb_data(diag_filename, med_filename, lab_filename, omop_to_spoke)
all_rdb_db.head()

Unnamed: 0,patient_id,concept_id,SPOKE_ID
0,D00040B4CC68B5,OMOP:35207155,DOID:1470
1,D00040B4CC68B5,OMOP:35207155,DOID:1596
6,D004F2E125F11F,OMOP:35207155,DOID:1470
7,D004F2E125F11F,OMOP:35207155,DOID:1596
8,D005B407B3BE66,OMOP:35207155,DOID:1470


# Filter by SEP Count (workshop ONLY)

In [4]:
all_rdb_db = filter_by_sep_count(all_rdb_db, max_seps = 200)
all_rdb_db.head()

(9506,)
(8515,)


Unnamed: 0,patient_id,concept_id,SPOKE_ID
0,D00040B4CC68B5,OMOP:35207155,DOID:1470
1,D00040B4CC68B5,OMOP:35207155,DOID:1596
2,D00040B4CC68B5,OMOP:35211261,C0039231
3,D00040B4CC68B5,OMOP:45567469,DOID:8398
4,D00040B4CC68B5,OMOP:35211262,C0428977


# Make Disease DF

In [5]:
disease_list = ['DOID:1612','DOID:219','DOID:9778']
# make df with patients and target diseases
disease_df = all_rdb_db[['patient_id', 'SPOKE_ID']][all_rdb_db.SPOKE_ID.isin(disease_list)].drop_duplicates().rename(columns={'SPOKE_ID':'Disease'})
# drop patients with more than one disease
disease_df = disease_df.drop_duplicates(subset=['patient_id'], keep=False)
disease_df.head()

Unnamed: 0,patient_id,Disease
139,D00040B4CC68B5,DOID:9778
331,D005B407B3BE66,DOID:1612
826,D0169ED985F916,DOID:9778
1127,D0193B605CBD87,DOID:1612
1201,D01AC0A9CBD63C,DOID:9778


# Convert concept_id into list

In [6]:
all_rdb_db = all_rdb_db[['patient_id', 'concept_id']].groupby('patient_id')['concept_id'].apply(list).reset_index()
all_rdb_db.head()

Unnamed: 0,patient_id,concept_id
0,D00040B4CC68B5,"[OMOP:35207155, OMOP:35207155, OMOP:35211261, ..."
1,D000468A469EFC,"[OMOP:45556996, OMOP:45556996, OMOP:35209392, ..."
2,D0006B6F5F988B,"[OMOP:45582718, OMOP:45562457, OMOP:45562457, ..."
3,D00091B0E81964,"[OMOP:45558454, OMOP:45558454, OMOP:35211268, ..."
4,D002F6662170CA,"[OMOP:45600511, OMOP:45600511, OMOP:45587508, ..."


# Filter for patients in cohort

In [7]:
cohort_list = ['DCE5223B53DC0D', 'D38AED218510BA', 'D22F698B840EFE', 'D2FD78AD500C21', 'DB6650E66595E6', 'D6D3EFE8304EE2', 'DDD4DF543E4F15', 'DBAA51EF364B2A', 'D060061FF9F4FD', 'D0BF19F4BF830A', 'DB757350D863FA']
filtered_concept_df = all_rdb_db[all_rdb_db.patient_id.isin(cohort_list)]
filtered_concept_df.loc[:,'Patient_Index'] = np.arange(len(filtered_concept_df))
filtered_concept_df.head()

Unnamed: 0,patient_id,concept_id,Patient_Index
185,D060061FF9F4FD,"[OMOP:35207155, OMOP:35207155, OMOP:45534443, ...",0
379,D0BF19F4BF830A,"[OMOP:35208190, OMOP:45542912, OMOP:45542912, ...",1
1182,D22F698B840EFE,"[OMOP:35207155, OMOP:35207155, OMOP:35211262, ...",2
1609,D2FD78AD500C21,"[OMOP:35211261, OMOP:45582718, OMOP:35208968, ...",3
1858,D38AED218510BA,"[OMOP:45558454, OMOP:45558454, OMOP:45534435, ...",4


# Make SPOKEsigs

In [8]:
patients_seen, spoke_sigs = make_spoke_sigs(filtered_concept_df)

API call was successfull, spoke signature for patient 0 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 1 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 2 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 3 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 4 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 5 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 6 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 7 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 8 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 9 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 10 is available 

# Save SPOKEsigs and cohort information

In [9]:
# add Diseases to cohort info 
filtered_concept_df = pd.merge(filtered_concept_df, disease_df, on='patient_id').sort_values('Patient_Index')
print(filtered_concept_df.head())

# filter patients w/o SPOKEsigs
if len(filtered_concept_df) > len(patients_seen):
    filtered_concept_df = filtered_concept_df[filtered_concept_df.Patient_Index.isin(patients_seen)]
    filtered_concept_df.loc[:,'Patient_Index'] = np.arange(len(filtered_concept_df))

# save patient info and SPOKEsigs
np.save(cohort_patient_spokesigs_file, spoke_sigs, allow_pickle=False)
filtered_concept_df.to_csv(cohort_patient_info_file, sep='\t', header=True, index=False)

       patient_id                                         concept_id  \
0  D060061FF9F4FD  [OMOP:35207155, OMOP:35207155, OMOP:45534443, ...   
1  D0BF19F4BF830A  [OMOP:35208190, OMOP:45542912, OMOP:45542912, ...   
2  D22F698B840EFE  [OMOP:35207155, OMOP:35207155, OMOP:35211262, ...   
3  D2FD78AD500C21  [OMOP:35211261, OMOP:45582718, OMOP:35208968, ...   
4  D38AED218510BA  [OMOP:45558454, OMOP:45558454, OMOP:45534435, ...   

   Patient_Index    Disease  
0              0  DOID:1612  
1              1  DOID:1612  
2              2  DOID:9778  
3              3  DOID:9778  
4              4  DOID:9778  


# Make SPOKEsigs for random cohort

In [10]:
n_per_disease=5
random_patients = np.concatenate([np.random.choice(disease_df[disease_df.Disease==d].patient_id.unique(), n_per_disease, replace=False) for d in disease_df.Disease.unique()])
random_patient_concepts = pd.merge(all_rdb_db[all_rdb_db.patient_id.isin(random_patients)], disease_df, on='patient_id')
random_patient_concepts.loc[:,'Patient_Index'] = np.arange(len(random_patient_concepts))
random_patient_concepts

Unnamed: 0,patient_id,concept_id,Disease,Patient_Index
0,D17520B5D8BA4E,"[OMOP:37200312, OMOP:37200312, OMOP:35209281, ...",DOID:9778,0
1,D217E4FA3B5A83,"[OMOP:45600511, OMOP:45600511, OMOP:920170, OM...",DOID:1612,1
2,D346AA72E51811,"[OMOP:45582718, OMOP:45534458, OMOP:45534458, ...",DOID:9778,2
3,D471E6957130F9,"[OMOP:45556996, OMOP:45556996, OMOP:35209392, ...",DOID:1612,3
4,D628A303CFA65B,"[OMOP:35207668, OMOP:35207668, OMOP:35207668, ...",DOID:219,4
5,D6C3373D09F8E8,"[OMOP:37200312, OMOP:37200312, OMOP:45572468, ...",DOID:219,5
6,D8672D017775BF,"[OMOP:35207668, OMOP:35207668, OMOP:35207668, ...",DOID:1612,6
7,D8ED8D57D97F54,"[OMOP:35208287, OMOP:35208287, OMOP:45576183, ...",DOID:9778,7
8,D97D1DDA7182DB,"[OMOP:35208969, OMOP:35208969, OMOP:45553714, ...",DOID:9778,8
9,DA05072A44A583,"[OMOP:35208968, OMOP:35208968, OMOP:35208190, ...",DOID:1612,9


In [11]:
patients_seen, spoke_sigs = make_spoke_sigs(random_patient_concepts)

# filter patients w/o SPOKEsigs
if len(random_patient_concepts) > len(patients_seen):
    random_patient_concepts = random_patient_concepts[random_patient_concepts.Patient_Index.isin(patients_seen)]
    random_patient_concepts.loc[:,'Patient_Index'] = np.arange(len(random_patient_concepts))

# save patient info and SPOKEsigs
np.save(new_patient_spokesig_filename, spoke_sigs, allow_pickle=False)
random_patient_concepts.to_csv(new_patient_info_filename, sep='\t', header=True, index=False)

API call was successfull, spoke signature for patient 0 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 1 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 2 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 3 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 4 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 5 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 6 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 7 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 8 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 9 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 10 is available 