# Before generating SPOKEsigs you will need EHR data...

This code uses EHR data from the PatientExplorer RDB (tables: Diagnosis, Medications, and Labs).<br/>

# Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import requests
from requests.auth import HTTPBasicAuth
from retrying import retry
import warnings
from config import read_config
warnings.filterwarnings("ignore")

# Initialize file paths

In [2]:
# All inputs live in a `data` directory located next to the current working directory.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Mapping table, cohort patient info and SPOKE node info, all inside data_path.
omop_to_spoke_cdw_file, cohort_patient_info_file, node_info_file = (
    os.path.join(data_path, tsv_name)
    for tsv_name in (
        'omop_to_spoke_cdw.tsv',
        'patient_info_df.tsv',
        'node_info_df.tsv',
    )
)

# Files from PatientExplorer

In [3]:
# Diagnosis, medication and lab exports expected inside data_path.
diag_filename, med_filename, lab_filename = (
    os.path.join(data_path, csv_name)
    for csv_name in (
        'breast_colon_ibd_conditions.csv',
        'breast_colon_ibd_drugs.csv',
        'breast_colon_ibd_measurements_w_ab.csv',
    )
)


# Report (without stopping execution) every export that is not on disk yet.
for export_path, missing_msg in (
    (diag_filename, 'diag_filename is not saved. save it before proceeding.'),
    (med_filename, 'med_filename is not saved. save it before proceeding.'),
    (lab_filename, 'lab_filename is not saved. save it before proceeding.'),
):
    if not os.path.exists(export_path):
        print(missing_msg)

# API specifications

In [4]:
@retry(stop_max_attempt_number=50, wait_fixed=10000)
def get_api_resp(clinical_variables, timeout=(10, 300)):
    """Send one GET request to the SPOKE signature endpoint and return the response.

    The ``@retry`` decorator re-invokes this function when it raises (up to 50
    attempts with a fixed 10000 ms wait, per the decorator arguments). A
    non-200 response is *not* an exception, so it is returned to the caller
    without any retry.

    Parameters
    ----------
    clinical_variables
        Value sent as the ``src`` query parameter.
    timeout : float or (float, float), optional
        ``(connect, read)`` timeout in seconds handed to ``requests.get``.
        Without a timeout a stalled connection would block forever and the
        retry logic would never get a chance to run. Pass ``None`` to wait
        indefinitely.

    Returns
    -------
    requests.Response
        The raw response; callers inspect ``status_code`` and ``json()``.
    """
    # NOTE(review): verify=False turns off TLS certificate verification, so the
    # basic-auth credentials are sent to an unauthenticated server. Left as-is
    # because the certificate setup is not visible here -- confirm and remove.
    return requests.get(
        url,
        params={'src': clinical_variables},
        auth=HTTPBasicAuth(usr, psw),
        verify=False,
        timeout=timeout,
    )

def make_spoke_sigs(all_concept_df):
    """Request a SPOKE signature from the API for every row of ``all_concept_df``.

    Parameters
    ----------
    all_concept_df : pandas.DataFrame
        Must contain the columns ``Patient_Index`` and ``concept_id``. Each
        ``concept_id`` value is forwarded unchanged to ``get_api_resp``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(patients_seen, spoke_sigs)``: the ``Patient_Index`` values for which
        a non-empty signature was returned and, in the same order, those
        signatures. Rows whose request failed or produced no signature are
        reported with ``print`` and omitted from both arrays.
    """
    patients_seen, spoke_sigs = [], []
    for patient, clinical_variables in all_concept_df[['Patient_Index', 'concept_id']].values:
        result = get_api_resp(clinical_variables)
        if result.status_code != 200:
            print('API call was not successfull for patient %s, returned %s HTTP status code' % (patient, str(result.status_code)))
            continue

        # Decode the response body a single time, and treat a payload without
        # a 'spoke_sig' key like an empty signature rather than raising KeyError.
        spoke_sig = result.json().get('spoke_sig')
        if spoke_sig is not None and len(spoke_sig) > 0:
            print("API call was successfull, spoke signature for patient %s is available in 'spoke_sig' variable" % patient)
            patients_seen.append(patient)
            spoke_sigs.append(spoke_sig)
        else:
            print('API call could not create a spoke signature for patient %s' % patient)
    return np.array(patients_seen), np.array(spoke_sigs)


# Host and route of the SPOKE signature service, combined into the request URL.
base_url, end_point = 'https://spokeapi.ucsf.edu', '/v1/spoke_sig'
url = ''.join((base_url, end_point))

# Basic-auth credentials are taken from the 'API' items returned by read_config().
c = dict(read_config().items('API'))
usr, psw = c['user'], c['psw']

# Load SPOKE and mapping data

In [5]:
# Both inputs are tab-separated with a header row; index_col=False stops
# pandas from using the first column as the index.
tsv_kwargs = {'sep': '\t', 'header': 0, 'index_col': False}

print('loading SPOKE nodes ...')
node_info_df = pd.read_csv(node_info_file, **tsv_kwargs)

print('loading mapping files')
omop_to_spoke = pd.read_csv(omop_to_spoke_cdw_file, **tsv_kwargs)
# Notebook preview of the mapping table.
omop_to_spoke.head()

loading SPOKE nodes ...
loading mapping files


Unnamed: 0,OMOP_ID,Node_ID,SPOKE_ID,Node_type,Node_name,concept_id,map_id
0,OMOP:8690,ICD10CM:M76.9,DOID:204,Disease,enthesopathy,8690,M76.9
1,OMOP:1571038,ICD10CM:M76,DOID:204,Disease,enthesopathy,1571038,M76
2,OMOP:1571047,ICD10CM:M76.8,DOID:204,Disease,enthesopathy,1571047,M76.8
3,OMOP:35208994,ICD10CM:M76.89,DOID:204,Disease,enthesopathy,35208994,M76.89
4,OMOP:45567656,ICD10CM:M76.891,DOID:204,Disease,enthesopathy,45567656,M76.891


# Load and filter patient data

In [6]:
from mapping_functions import load_and_filter_rdb_data

# Combine the three PatientExplorer exports with the cohort info file and the
# OMOP -> SPOKE mapping. The filtering itself happens inside
# load_and_filter_rdb_data (defined in mapping_functions, not shown here).
# Original author's note on this call: 32082 (meaning not documented).
all_rdb_db, patient_info_df = load_and_filter_rdb_data(
    diag_filename,
    med_filename,
    lab_filename,
    cohort_patient_info_file,
    omop_to_spoke,
)
# Notebook preview of the resulting patient/concept table.
all_rdb_db.head()

(27104,)
(15558,)


Unnamed: 0,patient_id,concept_id,SPOKE_ID
0,D00040B4CC68B5,OMOP:35207097,C0020461
1,D00040B4CC68B5,OMOP:45552665,C0035321
3,D00040B4CC68B5,OMOP:35207790,C0003811
4,D00040B4CC68B5,OMOP:45548672,C0242350
5,D00040B4CC68B5,OMOP:35206800,DOID:0060025


# Add diseases to patient info df

In [7]:
from mapping_functions import add_diseases_to_patient_info

# Diseases of interest. The captured cell output pairs these SPOKE IDs with
# irritable_bowel_syndrome, breast_cancer and colon_cancer respectively.
disease_list = [
    'DOID:9778',
    'DOID:1612',
    'DOID:219',
]
patient_info_df = add_diseases_to_patient_info(
    disease_list,
    all_rdb_db,
    patient_info_df,
    node_info_df,
)
# Notebook preview of the annotated patient table.
patient_info_df.head()

Unnamed: 0,patient_id,Ethnicity,Sex,Smoking Status,patient_status,Race,Age,Patient_Index,train_or_test,SEP_Count,OMOP_Count,SPOKE_ID,label,Disease
0,D00040B4CC68B5,Not Hispanic or Latino,Male,Never Smoker,Alive,White or Caucasian,60,0,Train,184,152,DOID:9778,0,irritable_bowel_syndrome
2,D0006B6F5F988B,Not Hispanic or Latino,Female,Never Smoker,Deceased,Black or African American,57,1,Test,138,156,DOID:1612,1,breast_cancer
20,D004224317CECE,Not Hispanic or Latino,Male,Never Smoker,Alive,White or Caucasian,70,2,Train,57,82,DOID:219,2,colon_cancer
96,D012F6D43E93F8,Not Hispanic or Latino,Male,Former Smoker,Deceased,Black or African American,68,3,Train,156,192,DOID:219,2,colon_cancer
159,D01CDFCAE04C90,Not Hispanic or Latino,Female,Current Every Day Smoker,Alive,Mixed Race/Other,53,4,Train,90,72,DOID:219,2,colon_cancer


# Convert concept_id into list

In [8]:
# Collapse the long (patient_id, concept_id) table into one row per patient;
# the concept_id cell then holds the list of that patient's concept IDs
# (repeated IDs are kept).
concepts_by_patient = all_rdb_db[['patient_id', 'concept_id']].groupby('patient_id')
all_rdb_db = concepts_by_patient['concept_id'].apply(list).reset_index()
all_rdb_db.head()

Unnamed: 0,patient_id,concept_id
0,D00040B4CC68B5,"[OMOP:35207097, OMOP:45552665, OMOP:35207790, ..."
1,D00061F466FD92,"[OMOP:35207155, OMOP:35207155, OMOP:45548395, ..."
2,D0006B6F5F988B,"[OMOP:35207097, OMOP:35207790, OMOP:45534458, ..."
3,D000FC7BE3A3BE,"[OMOP:35211336, OMOP:35211336, OMOP:35207163, ..."
4,D00136EA1A9254,"[OMOP:37200312, OMOP:37200312, OMOP:35207155, ..."


# Make random cohort

In [9]:
# Only patients flagged 'Test' are eligible for the random cohort.
test_patients_df = patient_info_df[patient_info_df.train_or_test == 'Test']

n_per_disease = 5
# Draw up to n_per_disease distinct patients for every disease.
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than are available, so the draw size is capped at the number of
# candidate patients for that disease.
# NOTE(review): no random seed is set, so the cohort differs between runs.
sampled_ids = []
for disease in test_patients_df.Disease.unique():
    candidates = test_patients_df[test_patients_df.Disease == disease].patient_id.unique()
    n_draw = min(n_per_disease, len(candidates))
    sampled_ids.append(np.random.choice(candidates, n_draw, replace=False))
# np.concatenate rejects an empty list; fall back to an empty selection.
random_patients = np.concatenate(sampled_ids) if sampled_ids else np.array([], dtype=object)

# Attach each sampled patient's concept list to their patient info (inner
# join on patient_id), then number the rows 0..n-1 as Patient_Index.
random_patient_concepts = pd.merge(
    all_rdb_db,
    test_patients_df[test_patients_df.patient_id.isin(random_patients)],
    on='patient_id',
)
random_patient_concepts.loc[:, 'Patient_Index'] = np.arange(len(random_patient_concepts))
random_patient_concepts

Unnamed: 0,patient_id,concept_id,Ethnicity,Sex,Smoking Status,patient_status,Race,Age,Patient_Index,train_or_test,SEP_Count,OMOP_Count,SPOKE_ID,label,Disease
0,D01F016387AAEB,"[OMOP:35206139, OMOP:35206139, OMOP:35206139, ...",Hispanic or Latino,Female,,Alive,Mixed Race/Other,69,0,Test,9,6,DOID:219,2,colon_cancer
1,D0400FA478A5EA,"[OMOP:45600511, OMOP:45600511, OMOP:35209456, ...",Not Hispanic or Latino,Female,*Unknown,Alive,Black or African American,53,1,Test,4,5,DOID:1612,1,breast_cancer
2,D129239DD38C14,"[OMOP:45532939, OMOP:35206139, OMOP:35206139, ...",Not Hispanic or Latino,Male,,Alive,White or Caucasian,69,2,Test,16,19,DOID:219,2,colon_cancer
3,D2AF879CE6695C,"[OMOP:35208287, OMOP:35208287, OMOP:45568114, ...",Unknown/Declined,Male,Former Smoker,Alive,Mixed Race/Other,40,3,Test,30,18,DOID:9778,0,irritable_bowel_syndrome
4,D38541725FC340,"[OMOP:45552539, OMOP:35208287, OMOP:35208287, ...",Not Hispanic or Latino,Female,Former Smoker,Alive,White or Caucasian,42,4,Test,119,110,DOID:9778,0,irritable_bowel_syndrome
5,D3F7BC157234A3,"[OMOP:35211336, OMOP:35211336, OMOP:35208287, ...",Not Hispanic or Latino,Female,,Alive,White or Caucasian,51,5,Test,24,22,DOID:9778,0,irritable_bowel_syndrome
6,D403ECC44F92F2,"[OMOP:45600511, OMOP:45600511, OMOP:35209501, ...",Unknown/Declined,Female,*Unknown,Alive,Mixed Race/Other,43,6,Test,3,4,DOID:1612,1,breast_cancer
7,D408046DB40400,"[OMOP:45534458, OMOP:45534458, OMOP:45582718, ...",Hispanic or Latino,Male,,Alive,Mixed Race/Other,58,7,Test,24,30,DOID:219,2,colon_cancer
8,D5EC9C1A817435,"[OMOP:35206139, OMOP:35206139, OMOP:35206139, ...",Unknown/Declined,Female,,Alive,Mixed Race/Other,67,8,Test,5,4,DOID:219,2,colon_cancer
9,D651CC5B8DE02B,"[OMOP:35207790, OMOP:45562457, OMOP:45562457, ...",Not Hispanic or Latino,Male,,Alive,White or Caucasian,53,9,Test,50,52,DOID:219,2,colon_cancer


# Make SPOKEsigs for random cohort

In [10]:
# Request one SPOKE signature per row; patients_seen holds the Patient_Index
# values that actually received a signature.
patients_seen, spoke_sigs = make_spoke_sigs(random_patient_concepts)

# filter patients w/o SPOKEsigs
if len(random_patient_concepts) > len(patients_seen):
    # .copy() makes the filtered frame independent of the original, so the
    # column assignment below is not an assignment on a slice (which pandas
    # reports as SettingWithCopyWarning when warnings are not silenced).
    random_patient_concepts = random_patient_concepts[random_patient_concepts.Patient_Index.isin(patients_seen)].copy()
    # Renumber so Patient_Index is again 0..n-1 over the remaining rows.
    random_patient_concepts.loc[:, 'Patient_Index'] = np.arange(len(random_patient_concepts))


API call was successfull, spoke signature for patient 0 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 1 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 2 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 3 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 4 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 5 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 6 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 7 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 8 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 9 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 10 is available 

# Save data

In [11]:
# Persist the sampled test cohort: signatures as a .npy array (pickling
# disabled), patient info as a tab-separated file.
new_patient_spokesig_filename = os.path.join(data_path, 'random_patient_spokesigs.npy')
np.save(new_patient_spokesig_filename, spoke_sigs, allow_pickle=False)

# The per-patient concept_id lists are dropped before the info table is written.
new_patient_info_filename = os.path.join(data_path, 'random_patient_info.tsv')
random_patient_concepts = random_patient_concepts.drop(['concept_id'], axis=1)
random_patient_concepts.to_csv(
    new_patient_info_filename,
    sep='\t',
    header=True,
    index=False,
)

# save new train patient info
example_cohort_info_filename = os.path.join(data_path, 'example_cohort.tsv')
# .copy() gives an independent frame, so renumbering Patient_Index below is
# not an assignment on a slice of patient_info_df (pandas reports that as
# SettingWithCopyWarning when warnings are not silenced).
example_cohort = patient_info_df[patient_info_df.train_or_test == 'Train'].copy()
# Renumber the training patients 0..n-1.
example_cohort.loc[:, 'Patient_Index'] = np.arange(len(example_cohort))
example_cohort.to_csv(example_cohort_info_filename, sep='\t', header=True, index=False)
