# Before running this code, use the following queries to download the patient data from the de-identified (DE-ID) UCSF OMOP database via Microsoft Remote Desktop:

-- Diagnoses: one row per distinct (patient, condition concept, start date).
SELECT DISTINCT
    co.person_id,
    co.condition_concept_id AS concept_id,
    co.condition_start_date AS omop_date
FROM OMOP_DEID.dbo.condition_occurrence AS co
WHERE co.person_id IN (
    742728, 1918661, 2956795, 3075073, 3516176,
    230130, 547231, 1291295, 1504450, 3745109,
    411126, 1509724, 2417557, 3607047, 4260151
);

-- Medications: one row per distinct (patient, drug concept, exposure start date).
SELECT DISTINCT
    de.person_id,
    de.drug_concept_id AS concept_id,
    de.drug_exposure_start_date AS omop_date
FROM OMOP_DEID.dbo.drug_exposure AS de
WHERE de.person_id IN (
    742728, 1918661, 2956795, 3075073, 3516176,
    230130, 547231, 1291295, 1504450, 3745109,
    411126, 1509724, 2417557, 3607047, 4260151
);

-- Lab measurements: distinct rows including the value and its reference range.
SELECT DISTINCT
    m.person_id,
    m.measurement_concept_id AS concept_id,
    m.measurement_date AS omop_date,
    m.value_as_number,
    m.value_as_concept_id,
    m.range_low,
    m.range_high,
    m.value_source_value
FROM OMOP_DEID.dbo.measurement AS m
WHERE m.person_id IN (
    742728, 1918661, 2956795, 3075073, 3516176,
    230130, 547231, 1291295, 1504450, 3745109,
    411126, 1509724, 2417557, 3607047, 4260151
);


# Import modules

In [None]:
import pandas as pd
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
from retrying import retry
import warnings
warnings.filterwarnings("ignore")
import os  # local to this configuration block: only needed for the overrides below

# Alternate SPOKE API deployment, kept for reference (disabled):
# base_url = 'https://spokeapi.ucsf.edu'
# end_point = '/v1/spoke_sig'
# usr = 'user1'
# psw = 'Ou0At7jXZJmt9oGP9fbGZ2TwstNFFE2l7Tru69HxnCI'
# url = base_url + end_point

# Active SPOKE API deployment.
# Each setting can be overridden through an environment variable so that
# credentials do not have to be edited into (or committed with) the notebook;
# when the variable is unset the original hard-coded value is used.
base_url = os.environ.get('SPOKE_API_BASE_URL', 'http://44.233.240.105')
end_point = '/v1/spoke_sig'
url = base_url + end_point
usr = os.environ.get('SPOKE_API_USER', 'bridge')
psw = os.environ.get('SPOKE_API_PASSWORD', 'hrPHsWc3JVm8Sn0s2UPpzDB7v9Ix11iYBJei1gZOfxM')

# SPOKE disease identifiers used to select the OMOP concepts that label a patient.
diseases = ['DOID:9778', 'DOID:1612', 'DOID:219']

@retry(stop_max_attempt_number=50, wait_fixed=10000)
def get_api_resp(clinical_variables, timeout=(10, 300)):
    """Send one GET request to the SPOKE signature endpoint.

    Parameters
    ----------
    clinical_variables
        Value sent as the ``src`` query parameter (the caller passes a
        patient's list of 'OMOP:<concept_id>' strings).
    timeout : float or (float, float), optional
        Passed to ``requests.get``; a tuple is (connect, read) seconds.
        Without a timeout a stalled connection would block forever and the
        retry decorator would never get a chance to run.

    Returns
    -------
    requests.Response
        The raw response; the caller inspects ``status_code`` itself.

    Notes
    -----
    Any exception raised here (including a timeout) is retried by the
    ``@retry`` decorator, up to 50 attempts.
    """
    return requests.get(
        url,
        params={'src': clinical_variables},
        auth=HTTPBasicAuth(usr, psw),
        verify=False,  # certificate verification is disabled, as in the original call
        timeout=timeout,
    )

def make_spoke_sigs(all_concept_df):
    """Request a SPOKE signature for every patient row.

    Parameters
    ----------
    all_concept_df : pandas.DataFrame
        Must contain the columns 'Patient_Index' and 'concept_id'; each
        'concept_id' value is passed unchanged to ``get_api_resp``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The patient indices for which a non-empty signature was returned,
        and the corresponding signatures in the same order.
    """
    patients_seen, spoke_sigs = [], []
    for patient, clinical_variables in all_concept_df[['Patient_Index', 'concept_id']].values:
        result = get_api_resp(clinical_variables)
        if result.status_code != 200:
            print('API call was not successfull for patient %s, returned %s HTTP status code' % (patient, str(result.status_code)))
            continue

        # Decode the body once, and treat an empty or unexpectedly shaped
        # payload as "no signature" instead of aborting the whole loop.
        payload = result.json()
        try:
            spoke_sig = payload[0]['spoke_sig']
        except (IndexError, KeyError, TypeError):
            spoke_sig = None

        if spoke_sig is not None and len(spoke_sig) > 0:
            print("API call was successfull, spoke signature for patient %s is available in 'spoke_sig' variable" % patient)
            patients_seen.append(patient)
            spoke_sigs.append(spoke_sig)
        else:
            print('API call could not create a spoke signature for patient %s' % patient)
    return np.array(patients_seen), np.array(spoke_sigs)

def check_if_float(val):
    """Return True when ``float(val)`` succeeds, otherwise False.

    ``float()`` raises ValueError for unparsable strings, TypeError for
    unsupported types such as ``None`` or a list, and OverflowError for
    integers too large to represent; all three mean "not a float" here.
    """
    try:
        float(val)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def load_patient_information(diag_filename, med_filename, lab_filename, omop_to_diseases):
    """Build one row per (patient, disease) with the patient's OMOP concepts.

    Parameters
    ----------
    diag_filename, med_filename, lab_filename : str
        Tab-separated files with a header row. All three need the columns
        'person_id' and 'concept_id'; the lab file additionally needs
        'value_as_number', 'range_low' and 'range_high'.
    omop_to_diseases : pandas.DataFrame
        Needs a 'concept_id' column to join on; its remaining columns are
        carried into the result (the notebook passes a 'Disease' column).

    Returns
    -------
    pandas.DataFrame
        Columns 'person_id', 'concept_id' (list of 'OMOP:<id>' strings),
        the columns contributed by ``omop_to_diseases``, and
        'Patient_Index' (0..n-1 row counter).
    """
    print('Reading diagnosis file ...')
    diag_df = pd.read_csv(diag_filename, sep='\t', header=0, index_col=False)
    # Patients are labelled by the diseases their diagnosis concepts map to.
    patient_to_disease = (
        pd.merge(diag_df[['person_id', 'concept_id']], omop_to_diseases, on='concept_id')
        .drop(['concept_id'], axis=1)
        .drop_duplicates()
    )
    print('Reading medication file ...')
    med_df = pd.read_csv(med_filename, sep='\t', header=0, index_col=False)
    print('Reading lab file ...')
    lab_df = pd.read_csv(lab_filename, sep='\t', header=0, index_col=False)
    print('Filtering lab file ...')
    # Non-numeric results become NaN, and NaN fails both comparisons below,
    # so those rows are dropped together with the in-range results.
    lab_values = pd.to_numeric(lab_df['value_as_number'], errors='coerce')
    out_of_range = (lab_values < lab_df['range_low']) | (lab_values > lab_df['range_high'])
    lab_df = lab_df[out_of_range]
    print('Merging OMOP files ...')
    cols = ['person_id', 'concept_id']
    all_concept_df = pd.concat((diag_df[cols], med_df[cols], lab_df[cols]), axis=0).drop_duplicates()
    # Replace the column outright: writing strings into the numeric column
    # through .loc[:, ...] is an incompatible-dtype assignment.
    all_concept_df['concept_id'] = ['OMOP:%s' % concept_id for concept_id in all_concept_df.concept_id.values]
    all_concept_df = all_concept_df.groupby('person_id')['concept_id'].apply(list).reset_index()
    all_concept_df = pd.merge(all_concept_df, patient_to_disease, on='person_id')
    all_concept_df['Patient_Index'] = np.arange(len(all_concept_df))
    return all_concept_df

# Load OMOP to SPOKE conversions

In [None]:
# Full OMOP -> SPOKE mapping table, read from the workshop repository.
omop_to_spoke = pd.read_csv(
    'https://raw.githubusercontent.com/BaranziniLab/SPOKEsig-Workshop/main/omop_to_spoke_cdw.tsv',
    sep='\t',
    header=0,
    index_col=False,
)
# Keep only the concepts that map to one of the selected diseases, and expose
# the SPOKE identifier under the column name 'Disease'.
omop_to_diseases = (
    omop_to_spoke.loc[omop_to_spoke['SPOKE_ID'].isin(diseases), ['concept_id', 'SPOKE_ID']]
    .rename(index=str, columns={'SPOKE_ID': 'Disease'})
    .drop_duplicates()
)
omop_to_diseases.head()

# Load and filter patient data

In [3]:
# Tab-separated exports of the three SQL queries (diagnoses, medications, labs).
diag_filename, med_filename, lab_filename = (
    'drop_in_condition_occurrence.tsv',
    'drop_in_drug_exposure.tsv',
    'drop_in_measurement.tsv',
)

# Combine the three exports into one row per patient/disease pair.
all_concept_df = load_patient_information(
    diag_filename=diag_filename,
    med_filename=med_filename,
    lab_filename=lab_filename,
    omop_to_diseases=omop_to_diseases,
)
all_concept_df.head()

Reading diagnosis file ...
Reading medication file ...
Reading lab file ...
Filtering lab file ...
Merging OMOP files ...


Unnamed: 0,person_id,concept_id,Disease,Patient_Index
0,230130,"[OMOP:35225058, OMOP:45556996, OMOP:44824029, ...",DOID:1612,0
1,411126,"[OMOP:44833044, OMOP:37202304, OMOP:44825717, ...",DOID:219,1
2,547231,"[OMOP:45600511, OMOP:35225339, OMOP:35209124, ...",DOID:1612,2
3,742728,"[OMOP:45587523, OMOP:44826854, OMOP:44832716, ...",DOID:9778,3
4,1291295,"[OMOP:45601745, OMOP:45551542, OMOP:44829584, ...",DOID:1612,4


# Make SPOKEsigs

In [4]:
# Query the API once per row of all_concept_df; patients_seen holds the
# Patient_Index values that produced a signature, spoke_sigs the signatures.
patients_seen, spoke_sigs = make_spoke_sigs(all_concept_df)

API call was successfull, spoke signature for patient 0 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 1 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 2 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 3 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 4 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 5 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 6 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 7 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 8 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 9 is available in 'spoke_sig' variable
API call was successfull, spoke signature for patient 10 is available in 'spoke_sig' variable

# Save new patient information

In [5]:
# Drop patients for which no SPOKEsig came back, then renumber the survivors
# so Patient_Index again runs 0..n-1 in step with the rows of spoke_sigs.
if len(patients_seen) < len(all_concept_df):
    all_concept_df = all_concept_df.loc[all_concept_df['Patient_Index'].isin(patients_seen)]
    all_concept_df.loc[:, 'Patient_Index'] = np.arange(len(all_concept_df))

# Persist the signatures (NumPy binary) and the matching patient table (TSV).
np.save(file='new_patient_spokesigs.npy', arr=spoke_sigs, allow_pickle=False)
all_concept_df.to_csv(
    path_or_buf='new_patient_info.tsv',
    sep='\t',
    header=True,
    index=False,
)