# Before generating SPOKEsigs you will need EHR data...

This code uses EHR data from the PatientExplorer RDB (tables: Diagnosis, Medications, and Labs).<br/>

# Import modules

In [8]:
import pandas as pd
import numpy as np
import os
import requests
from requests.auth import HTTPBasicAuth
from retrying import retry
import warnings
from config import read_config
warnings.filterwarnings("ignore")

# Initialize file paths

In [9]:
# Every file used below lives in the 'data' directory located next to the
# current working directory, i.e. <parent of cwd>/data.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Paths of the OMOP-to-SPOKE mapping table and of the cohort files inside data_path.
omop_to_spoke_cdw_file, cohort_patient_spokesigs_file, cohort_patient_info_file = [
    os.path.join(data_path, file_name)
    for file_name in (
        'omop_to_spoke_cdw.tsv',
        'cohort_patient_spokesigs.npy',
        'cohort_patient_info.tsv',
    )
]

# Files exported from PatientExplorer

In [10]:
# Input exports: conditions, drugs and measurements.
diag_filename = os.path.join(data_path, 'breast_colon_ibd_conditions.csv')
med_filename = os.path.join(data_path, 'breast_colon_ibd_drugs.csv')
lab_filename = os.path.join(data_path, 'breast_colon_ibd_measurements_w_ab.csv')
# Output files written at the end of the notebook for the randomly drawn patients.
new_patient_info_filename = os.path.join(data_path, 'random_patient_info.tsv')
new_patient_spokesig_filename = os.path.join(data_path, 'random_patient_spokesigs.npy')

# Warn (without aborting) about each input export that is not on disk yet.
for _label, _path in (
    ('diag_filename', diag_filename),
    ('med_filename', med_filename),
    ('lab_filename', lab_filename),
):
    if not os.path.exists(_path):
        print('%s is not saved. save it before proceeding.' % _label)

# Custom functions

In [11]:
@retry(stop_max_attempt_number=50, wait_fixed=10000)
def get_api_resp(clinical_variables):
    """Send one GET request to the SPOKE signature endpoint and return the response.

    ``clinical_variables`` is passed as the ``src`` query parameter. The
    request goes to the module-level ``url`` and authenticates with HTTP
    basic auth built from the module-level ``usr`` / ``psw``.

    The ``retry`` decorator is configured with
    ``stop_max_attempt_number=50`` and ``wait_fixed=10000``, so an exception
    raised by the request triggers another attempt.

    Returns the ``requests`` response object unchanged; the status code is
    not checked here.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # NOTE(review): no timeout is passed, so a stalled connection can block
    # indefinitely instead of raising and being retried -- confirm intended.
    credentials = HTTPBasicAuth(usr, psw)
    query = {'src': clinical_variables}
    return requests.get(url, params=query, auth=credentials, verify=False)

def make_spoke_sigs(all_concept_df):
    """Request a SPOKE signature for every row of ``all_concept_df``.

    Args:
        all_concept_df: DataFrame with the columns ``Patient_Index`` and
            ``concept_id``. Each row's ``concept_id`` value is handed
            unchanged to ``get_api_resp``.

    Returns:
        A tuple ``(patients_seen, spoke_sigs)`` of numpy arrays.
        ``patients_seen`` holds the ``Patient_Index`` of every row whose
        request returned HTTP 200 with a non-empty ``'spoke_sig'`` entry;
        ``spoke_sigs`` holds the matching signatures in the same order.
        Rows that fail either check are reported with ``print`` and skipped.
    """
    patients_seen, spoke_sigs = [], []
    for patient, clinical_variables in all_concept_df[['Patient_Index', 'concept_id']].values:
        result = get_api_resp(clinical_variables)
        if result.status_code != 200:
            print('API call was not successfull for patient %s, returned %s HTTP status code' % (patient, str(result.status_code)))
            continue
        # Decode the response body once; it was previously parsed twice per patient.
        spoke_sig = result.json()['spoke_sig']
        if len(spoke_sig) > 0:
            print("API call was successfull, spoke signature for patient %s is available in 'spoke_sig' variable" % patient)
            patients_seen.append(patient)
            spoke_sigs.append(spoke_sig)
        else:
            print('API call could not create a spoke signature for patient %s' % patient)
    return np.array(patients_seen), np.array(spoke_sigs)

def load_and_filter_rdb_data(diag_filename, med_filename, lab_filename, omop_to_spoke):
    """Load the three RDB exports and map their codes to OMOP concepts / SPOKE nodes.

    Each CSV is read with pandas and its code column(s) are turned into
    ``'<PREFIX>:<value>'`` identifiers:

    * diagnoses:    ``icd10_code`` -> ``ICD10CM:...``, ``icd9_code`` -> ``ICD9CM:...``
    * medications:  ``medication_id`` -> ``MED:...``
    * labs:         ``lab_component_id`` -> ``LAB:...``; only rows whose
      ``lab_result_abnormal`` equals ``'Yes'`` are kept.

    The identifiers are inner-joined against ``omop_to_spoke`` on ``Node_ID``,
    so codes without a mapping are dropped.

    Args:
        diag_filename: Path of the diagnosis CSV (needs ``patient_id``,
            ``icd10_code`` and ``icd9_code`` columns).
        med_filename: Path of the medication CSV (needs ``patient_id`` and
            ``medication_id`` columns).
        lab_filename: Path of the lab CSV (needs ``patient_id``,
            ``lab_component_id`` and ``lab_result_abnormal`` columns).
        omop_to_spoke: DataFrame with at least the columns ``Node_ID``,
            ``concept_id`` and ``SPOKE_ID``.

    Returns:
        DataFrame with the columns ``patient_id``, ``concept_id`` (formatted
        as ``'OMOP:<concept_id>'``) and ``SPOKE_ID``, with duplicate rows
        removed.
    """
    # One entry per export: (file, {raw column -> vocabulary prefix}, domain).
    sources = [
        (diag_filename, {"icd10_code": 'ICD10CM', "icd9_code": 'ICD9CM'}, 'Condition'),
        (med_filename, {'medication_id': 'MED'}, 'Drug'),
        (lab_filename, {'lab_component_id': 'LAB'}, 'Measurement'),
    ]
    frames = []
    for filename, col_change_dict, domain in sources:
        df = pd.read_csv(filename, header=0, index_col=False).rename(columns=col_change_dict)
        if domain == 'Measurement':
            # Only abnormal lab results contribute to the signature input.
            df = df[df.lab_result_abnormal == 'Yes']
        for col in col_change_dict.values():
            # Build the identifier as a brand-new column. Writing the strings
            # back with ``df.loc[:, col] = ...`` is an incompatible-dtype
            # in-place assignment when the raw ids were parsed as integers,
            # which pandas deprecates.
            node_ids = ['%s:%s' % (col, rdb) for rdb in df[col].values]
            frames.append(df[['patient_id']].assign(Node_ID=node_ids))
    # Concatenate once instead of growing an (initially empty) frame per column.
    all_rdb_db = pd.concat(frames, axis=0, ignore_index=True)
    all_rdb_db = pd.merge(all_rdb_db, omop_to_spoke[['Node_ID', 'concept_id', 'SPOKE_ID']], on='Node_ID')
    # Plain column replacement (not ``.loc`` setitem) for the same dtype reason as above.
    all_rdb_db['concept_id'] = ['OMOP:%s' % concept_id for concept_id in all_rdb_db.concept_id.values]
    return all_rdb_db.drop(['Node_ID'], axis=1).drop_duplicates()

def filter_by_sep_count(all_rdb_db, max_seps = 200):
    """Keep only patients that map to at most ``max_seps`` distinct SPOKE_IDs.

    Args:
        all_rdb_db: DataFrame with at least ``patient_id`` and ``SPOKE_ID`` columns.
        max_seps: Largest number of distinct SPOKE_IDs a patient may have
            and still be kept (inclusive).

    Returns:
        The rows of ``all_rdb_db`` belonging to the retained patients,
        produced by a merge on ``patient_id``.

    Prints the number of patients before and after filtering as shape tuples.
    """
    # One row per distinct (patient, SPOKE_ID) pair, then count the pairs per patient.
    patient_sep_pairs = all_rdb_db[['patient_id', 'SPOKE_ID']].drop_duplicates()
    seps_per_patient = patient_sep_pairs.groupby('patient_id').count().reset_index()
    print(seps_per_patient.patient_id.unique().shape)

    within_limit = seps_per_patient.SPOKE_ID <= max_seps
    kept_patients = seps_per_patient.loc[within_limit, ['patient_id']]
    print(kept_patients.patient_id.unique().shape)

    # The merge keeps only rows of patients that passed the limit.
    return pd.merge(all_rdb_db, kept_patients, on='patient_id')

# API specifications

In [12]:
# Address of the SPOKE signature service, kept as base URL + endpoint.
base_url = 'https://spokeapi.ucsf.edu'
end_point = '/v1/spoke_sig'

# Credentials are read from the 'API' section returned by read_config().
c = dict(read_config().items('API'))
usr, psw = c['user'], c['psw']

# Full request URL used by get_api_resp.
url = base_url + end_point

# Load the OMOP+RDB to SPOKE mapping

In [13]:
# Tab-separated mapping table: the first row is the header and no column is
# used as the index.
omop_to_spoke = pd.read_csv(
    omop_to_spoke_cdw_file,
    sep='\t',
    header=0,
    index_col=False,
)
omop_to_spoke.head()

Unnamed: 0,OMOP_ID,Node_ID,SPOKE_ID,Node_type,Node_name,concept_id,map_id
0,OMOP:8690,ICD10CM:M76.9,DOID:204,Disease,enthesopathy,8690,M76.9
1,OMOP:1571038,ICD10CM:M76,DOID:204,Disease,enthesopathy,1571038,M76
2,OMOP:1571047,ICD10CM:M76.8,DOID:204,Disease,enthesopathy,1571047,M76.8
3,OMOP:35208994,ICD10CM:M76.89,DOID:204,Disease,enthesopathy,35208994,M76.89
4,OMOP:45567656,ICD10CM:M76.891,DOID:204,Disease,enthesopathy,45567656,M76.891


# Load and filter patient data

In [14]:
# Read the three exports and map every code to its OMOP concept / SPOKE node.
all_rdb_db = load_and_filter_rdb_data(
    diag_filename=diag_filename,
    med_filename=med_filename,
    lab_filename=lab_filename,
    omop_to_spoke=omop_to_spoke,
)
all_rdb_db.head()

Unnamed: 0,patient_id,concept_id,SPOKE_ID
0,D000074BF3D64D,OMOP:45562457,C0004096
1,D000074BF3D64D,OMOP:45562457,DOID:2841
2,D000074BF3D64D,OMOP:45562457,DOID:9360
3,D000074BF3D64D,OMOP:45562457,DOID:9415
4,D000206A2EACC1,OMOP:45562457,C0004096


# Filter by SEP Count (workshop ONLY)

In [15]:
# Drop patients that map to more than 200 distinct SPOKE_IDs; the two tuples
# printed below are the patient counts before and after the filter.
all_rdb_db = filter_by_sep_count(all_rdb_db, max_seps=200)
all_rdb_db.head()

(52251,)
(51578,)


Unnamed: 0,patient_id,concept_id,SPOKE_ID
0,D000074BF3D64D,OMOP:45562457,C0004096
1,D000074BF3D64D,OMOP:45562457,DOID:2841
2,D000074BF3D64D,OMOP:45562457,DOID:9360
3,D000074BF3D64D,OMOP:45562457,DOID:9415
4,D000074BF3D64D,OMOP:35207990,C0151790


# Make Disease DF

In [16]:
# SPOKE_IDs of the target diseases.
disease_list = ['DOID:1612','DOID:219','DOID:9778']
# one row per distinct (patient, target disease) pair
is_target_disease = all_rdb_db.SPOKE_ID.isin(disease_list)
disease_df = all_rdb_db.loc[is_target_disease, ['patient_id', 'SPOKE_ID']]
disease_df = disease_df.drop_duplicates().rename(columns={'SPOKE_ID':'Disease'})
# keep=False removes every row of a patient that appears with more than one target disease
disease_df = disease_df.drop_duplicates(subset=['patient_id'], keep=False)
disease_df.head()

Unnamed: 0,patient_id,Disease
461,D00040B4CC68B5,DOID:9778
779,D0006B6F5F988B,DOID:1612
2027,D001DF6EB09A06,DOID:219
2268,D0020A85670CAF,DOID:9778
5508,D004728B24000B,DOID:1612


# Collect each patient's concept_ids into a list

In [17]:
# Collapse to one row per patient; the concept_id cell becomes the list of
# that patient's concept ids (repeated ids are kept, not de-duplicated).
concepts_by_patient = all_rdb_db.groupby('patient_id')['concept_id'].apply(list)
all_rdb_db = concepts_by_patient.reset_index()
all_rdb_db.head()

Unnamed: 0,patient_id,concept_id
0,D000074BF3D64D,"[OMOP:45562457, OMOP:45562457, OMOP:45562457, ..."
1,D000078BCE8F37,"[OMOP:45605556, OMOP:45605556, OMOP:45552539, ..."
2,D0000ADAACE372,"[OMOP:45582909, OMOP:35209489, OMOP:35209489, ..."
3,D0000BB51C51E3,"[OMOP:45587068, OMOP:45577227, OMOP:45577227, ..."
4,D0000BE1261731,"[OMOP:45587068, OMOP:45563299, OMOP:35208722, ..."


# Make SPOKEsigs for random cohort

In [19]:
n_per_disease=5

# Draw n_per_disease distinct patients (replace=False) for each disease present
# in disease_df. NOTE(review): no random seed is set in this cell, so the
# selection presumably differs between runs -- confirm that is intended.
sampled_per_disease = []
for d in disease_df.Disease.unique():
    candidates = disease_df[disease_df.Disease==d].patient_id.unique()
    sampled_per_disease.append(np.random.choice(candidates, n_per_disease, replace=False))
random_patients = np.concatenate(sampled_per_disease)

# Attach each sampled patient's disease label to their concept list.
in_sample = all_rdb_db.patient_id.isin(random_patients)
random_patient_concepts = pd.merge(all_rdb_db[in_sample], disease_df, on='patient_id')
# Patient_Index is the row position; make_spoke_sigs reports results by this index.
random_patient_concepts.loc[:,'Patient_Index'] = np.arange(len(random_patient_concepts))
random_patient_concepts

Unnamed: 0,patient_id,concept_id,Disease,Patient_Index
0,D03F0C21992024,"[OMOP:45534422, OMOP:45534422, OMOP:45587496, ...",DOID:1612,0
1,D043F71A76736B,"[OMOP:45552539, OMOP:35207668, OMOP:35207668, ...",DOID:9778,1
2,D04BB7194EC25D,"[OMOP:44824029, OMOP:44824029]",DOID:1612,2
3,D04BB899BF3AD8,"[OMOP:45576183, OMOP:45576183, OMOP:45576183, ...",DOID:9778,3
4,D0512CE0ADC5C5,"[OMOP:35209489, OMOP:35209489, OMOP:45576183, ...",DOID:9778,4
5,D053D962208C2A,"[OMOP:35207668, OMOP:35207668, OMOP:35207668, ...",DOID:1612,5
6,D05779A93D3E18,"[OMOP:35209383, OMOP:35207668, OMOP:35207668, ...",DOID:219,6
7,D0707AE5B73B9D,"[OMOP:45587068, OMOP:45577227, OMOP:45577227, ...",DOID:1612,7
8,D07AE0C89AE03E,"[OMOP:45562457, OMOP:45562457, OMOP:45562457, ...",DOID:219,8
9,D081DA005C12E0,"[OMOP:45576183, OMOP:45576183, OMOP:45576183, ...",DOID:1612,9


In [20]:
patients_seen, spoke_sigs = make_spoke_sigs(random_patient_concepts)

# filter patients w/o SPOKEsigs
if len(patients_seen) < len(random_patient_concepts):
    has_sig = random_patient_concepts.Patient_Index.isin(patients_seen)
    random_patient_concepts = random_patient_concepts[has_sig]
    # Renumber so Patient_Index again equals the row position among the remaining patients.
    random_patient_concepts.loc[:,'Patient_Index'] = np.arange(len(random_patient_concepts))

# save patient info and SPOKEsigs
np.save(new_patient_spokesig_filename, spoke_sigs, allow_pickle=False)
random_patient_concepts.to_csv(new_patient_info_filename, sep='\t', header=True, index=False)

API call was successfull, spoke signature for patient 0 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 1 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 2 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 3 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 4 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 5 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 6 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 7 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 8 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 9 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 10 is available 