In [1]:
import numpy as np
import pandas as pd
import random 

# Seed NumPy's global random state with a fixed value so that later code
# drawing from that global state gives the same result on every fresh run.
# NOTE(review): only NumPy is seeded here; the stdlib `random` module imported
# above is left unseeded.
np.random.seed(22)

In [2]:
def generate_gender_critiera(docset, values, cols, final):
    """Build the NLPQL gender criterion for a row that selects exactly one gender.

    Parameters
    ----------
    docset : str
        Name of the NLPQL documentset the assertion should run against.
    values : sequence of numbers
        Gender flags read by position: index 0 is treated as the male flag,
        any other single non-zero flag as female.
    cols : list of str
        Column names for ``values``. Accepted so that every generate_*
        function shares one signature; not used here.
    final : str
        Text inserted between ``define`` and the criterion name
        (callers pass ``"final"`` or ``""``).

    Returns
    -------
    tuple of (str, str)
        ``(criterion_name, nlpql_text)`` where the name is ``"isMale"`` or
        ``"isFemale"``; ``('', '')`` when zero or more than one flag is set,
        i.e. when gender does not restrict the query.
    """
    termset_f = 'termset Female: ["woman", "women", "female", "girl", "girls", "pregnant", "menopausal", "postmenopausal"];'
    termset_m = 'termset Male: ["man", "men", "male", "boy", "boys"];'

    # Convert once to an array so that [0] is always positional, even when the
    # caller hands in a pandas Series whose index holds column labels.
    flags = np.asarray(values)

    # Gender only constrains the query when exactly one of the flags is set.
    if np.count_nonzero(flags) != 1:
        return '', ''

    if flags[0] == 1:
        name, termset = 'Male', termset_m
    else:
        name, termset = 'Female', termset_f

    # The define name is terminated with ':' like every other define statement
    # emitted by this notebook; without it the statement is malformed.
    pa = '''
%s

define %s is%s:
 Clarity.ProviderAssertion({
   termset:[%s],
   documentset:[%s]
   });
               '''% (termset, final, name, name, docset)
    return ("is" + name), pa
            


In [3]:
def generate_age_critiera(docset, values, cols, final, min_age=0, max_age=100, comparator=">=", target_value=18):
    """Build the NLPQL text that extracts an age value and compares it to a target.

    Parameters
    ----------
    docset : str
        Name of the NLPQL documentset the value extraction should run against.
    values, cols, final
        Accepted so that the signature lines up with the other generate_*
        functions; none of them influences the generated text.
    min_age, max_age : int
        Bounds written into ``minimum_value`` / ``maximum_value`` of the
        value extraction.
    comparator : str
        Comparison operator placed between ``Age.value`` and ``target_value``.
    target_value : int
        Age threshold used in the ``meetsAgeCriteria`` clause.

    Returns
    -------
    str
        The NLPQL snippet (termset, ``Age`` extraction and
        ``meetsAgeCriteria`` clause).
    """
    # The template has exactly five placeholders; supplying anything more makes
    # the % operator raise "not all arguments converted".
    age_nlpql = """
termset AgeTerms:
   ["age","aged"];

define Age:
   Clarity.ValueExtraction({
     termset:[AgeTerms],
     documentset: [%s],
     minimum_value: "%d",
     maximum_value: "%d"
     });

define meetsAgeCriteria:
    where Age.value %s %d;
        """ % (docset, min_age, max_age, comparator, target_value)

    return age_nlpql

In [4]:
def generate_race_critiera(docset, values, cols, final):
    """Produce the NLPQL race criterion when exactly one race flag is non-zero.

    ``values`` and ``cols`` are read in parallel by position. The first column
    whose flag equals 1 names the race to match (its ``is_`` marker removed);
    if no flag equals 1 the match value stays at ``'white'``.

    Returns ``("RaceMatch", nlpql_text)``, or ``('', '')`` when the number of
    non-zero flags is anything other than one. ``final`` is inserted between
    ``define`` and ``RaceMatch``.
    """
    # Guard clause: race is only a criterion for single-race rows.
    if np.count_nonzero(values) != 1:
        return '', ''

    # Stops at the first flag equal to 1, falling back to 'white' otherwise.
    target_race = next(
        (cols[pos].replace('is_', '') for pos in range(len(cols)) if values[pos] == 1),
        'white',
    )

    # Template spelled out piece by piece so that every newline and the
    # trailing indentation are explicit.
    template = (
        '\n'
        'termset Races:["white","caucasian","black","african american","asian","pacific islander","alaska native","native american", "native hawaiian"];\n'
        '\n'
        'define Race:\n'
        '    Clarity.RaceFinderTask({\n'
        '        documentset: [%s],\n'
        '        termset:[Races]\n'
        '    });\n'
        '\n'
        'define %s RaceMatch:\n'
        '    where Race.value_normalized = "%s";\n'
        '        '
    )
    return "RaceMatch", template % (docset, final, target_race)


In [5]:
def generate_location_critiera(docset, values, cols, final):
    """Stub for location criteria: ignores its arguments and yields an empty name and empty NLPQL text."""
    no_name, no_nlpql = '', ''
    return no_name, no_nlpql

In [6]:
# Maps each medication feature column to the search terms placed in the
# MedicationTerms termset when that column is flagged.
medication_lookup = {
    'is_sodium_chloride': ["Sodium Chloride"],
    'is_glucose': ["Glucose"],
    'is_potassium': ["potassium"],
    'is_docusate': ["docusate"],
    'is_heparin': ["heparin"],
    'is_magnesium_sulfate': ["magnesium sulfate"],
    'is_acetaminophen': ["acetaminophen"],
    'is_pantoprazole': ["pantoprazole"],
    'is_metoprolol': ["metoprolol"],
    'is_furosemide': ["furosemide"],
}

def generate_medication_criteria(docset, values, cols, final):
    """Build the NLPQL medication criterion for a row with one medication flag set.

    Parameters
    ----------
    docset : str
        Name of the NLPQL documentset the assertion should run against.
    values : sequence of numbers
        Medication flags, positionally aligned with ``cols``.
    cols : list of str
        Medication column names; each must be a key of ``medication_lookup``.
    final : str
        Text inserted between ``define`` and ``Medications``
        (callers pass ``"final"`` or ``""``).

    Returns
    -------
    tuple of (str, str)
        ``("Medications", nlpql_text)``, or ``('', '')`` when the number of
        non-zero flags is not exactly one.

    NOTE(review): the gate is "exactly one non-zero flag" although the term
    collection below would cope with several; confirm whether rows with
    multiple medications are meant to be skipped.
    """
    # Array view keeps the pairing with `cols` positional even when a
    # label-indexed pandas Series is passed in.
    flags = np.asarray(values)
    if np.count_nonzero(flags) != 1:
        return '', ''

    name = 'Medications'
    terms = [
        term
        for col, flag in zip(cols, flags)
        if flag == 1
        for term in medication_lookup[col]
    ]
    term_names = '"' + '", "'.join(terms) + '"'

    # The assertion must point at the MedicationTerms termset declared just
    # above it in the same snippet.
    nlpql = '''
        
termset MedicationTerms:[
    %s
];

define %s Medications:
  Clarity.ProviderAssertion({
    termset:[MedicationTerms],
    documentset:[%s]
    }); 
        '''% (term_names, final, docset)

    return name, nlpql

In [7]:
# Search terms for each condition feature column; the lists are dropped into
# the ConditionTerms termset verbatim.
condition_lookup = {
    'is_hypertension': ["hypertension", "High Blood Pressure"],
    'is_chf': [
        "congestive heart failure",
        "chf",
        "ccf - congestive cardiac failure",
        "chf - congestive heart failure",
        "congestive cardiac failure",
        "congestive heart disease",
        "congestive heart failure",
        "congestive heart failure (disorder)",
        "congestive heart failure (finding)",
    ],
    'is_afib': [
        "Atrial fibrillation",
        "Atrial fibrulation",
        "Atrial fabrillation",
        "Atrial fibrilation",
        "a fib",
        "afib",
        "atrial fib",
        "atr fibrillation",
        "atr fibrulation",
        "atr fabrillation",
        "atr fibrilation",
        "atr fib",
        "auricular fibrillation",
        "auricular fibrulation",
        "auricular fabrillation",
        "auricular fibrilation",
        "auricular fib",
        "aflutter",
        "atrial flutter",
    ],
    'is_diabetes': ["diabetes", "dm"],
    'is_renal_failure': ["renal failure"],
    'is_high_cholesterol': [
        "Hyperlipidemia",
        "High blood cholesterol",
        "High cholesterol",
    ],
    'is_uti': ["Urinary tract infectious disease", "UTI", "urinary tract infection"],
    'is_gerd': ["Gastroesophageal reflux", "GERD"],
    'is_arteriosclerosis': ["arteriosclerosis"],
    'is_respiratory_failure': ["respiratory failure"],
}

def generate_condition_criteria(docset, values, cols, final):
    """Produce the NLPQL condition criterion when exactly one condition flag is non-zero.

    ``values`` and ``cols`` are read in parallel by position; every column
    whose flag equals 1 contributes its ``condition_lookup`` terms to the
    ``ConditionTerms`` termset. ``final`` is inserted between ``define`` and
    ``Conditions``.

    Returns ``("Conditions", nlpql_text)``, or ``('', '')`` when the number of
    non-zero flags is anything other than one.
    """
    # Guard clause: rows without exactly one non-zero flag give no criterion.
    if np.count_nonzero(values) != 1:
        return '', ''

    matched_terms = [
        term
        for pos in range(len(cols))
        if values[pos] == 1
        for term in condition_lookup[cols[pos]]
    ]
    quoted_terms = '"{}"'.format('", "'.join(matched_terms))

    # Template spelled out piece by piece so that every newline and each run
    # of padding spaces is explicit.
    template = (
        '\n'
        '        \n'
        'termset ConditionTerms:[\n'
        '    %s\n'
        '];\n'
        '\n'
        'define %s Conditions:\n'
        '  Clarity.ProviderAssertion({\n'
        '    termset:[ConditionTerms],\n'
        '    documentset:[%s]\n'
        '    }); \n'
        '        '
    )
    return 'Conditions', template % (quoted_terms, final, docset)

In [8]:
def generate_results(names):
    """Build the final NLPQL result clause that ANDs the given criterion names.

    Parameters
    ----------
    names : iterable of str
        Names of previously defined criteria. Empty strings are ignored.

    Returns
    -------
    str
        A ``define final SyntheticQueryResults`` statement joining the names
        with ``AND``, or ``''`` when no usable name is supplied (an empty
        ``where ;`` clause would not be a valid statement).
    """
    # Drop blanks so the clause never contains a dangling " AND ".
    usable = [name for name in names if name]
    if not usable:
        return ''

    clause = " AND ".join(usable)
    return '''

define final SyntheticQueryResults:
    where %s;
    '''% clause

In [9]:
# Placeholder termset declarations (each an empty termset), one per criteria
# category.
# TODO: need a way to gracefully handle empty strings OR parameterize the list
# of termsets we actually evaluate.
# NOTE(review): "Condtions" below is spelled that way in the emitted text.
gender_criteria = """termset Gender: [];"""
age_criteria =  """termset Age: [];"""
location_criteria =   """termset Location: [];"""
conditions_criteria =  """termset Condtions: [];"""
medications_criteria =   """termset Medications: [];"""

# Skeleton of one generated NLPQL file. It is a %-format template with seven
# %s placeholders that are filled positionally.
synthetic_query_template ="""

// Phenotype library name
phenotype "SyntheticQueryTesting_%s" version "1";

// Phenotype library description 
description "Synthetically generated, paramaterized query; for computing benchark runtime stats";

// # Structured Data Model #
datamodel OMOP version "5.3";

// # Referenced libraries #
include ClarityCore version "1.0" called Clarity;
include OHDSIHelpers version "1.0" called OHDSI;

// ## Code Systems ##
codesystem OMOP: "http://omop.org";

documentset %s:
    Clarity.createReportTagList(["Physician","Nurse","Note","Discharge Summary"]); 

// Gender inclusion criteria termset, if gender criteria present
%s

// Race inclusion criteria termset, if race criteria present
%s

// Condition(s) inclusion criteria termset, if condition criteria present
%s

// Medication(s) inclusion criteria termset, if medication criteria present
%s

// Results
%s

""" 
# Placeholder order, top to bottom (exactly seven values must be supplied):
#   1. suffix appended to the phenotype name
#   2. documentset name
#   3. gender criteria text
#   4. race criteria text
#   5. condition criteria text
#   6. medication criteria text
#   7. results clause


In [10]:
# Read the per-trial feature table, promote its first column to the row index
# and label that index 'id'.
features = (
    pd.read_csv('data/nct_features.csv')
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .rename_axis('id')
)
features

Unnamed: 0_level_0,is_asian,is_black,is_native_american,is_pacific_islander,is_white,is_male,is_female,is_hypertension,is_chf,is_afib,...,is_sodium_chloride,is_glucose,is_potassium,is_docusate,is_heparin,is_magnesium_sulfate,is_acetaminophen,is_pantoprazole,is_metoprolol,is_furosemide
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT00000271,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000475,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000484,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000485,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000487,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000495,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000497,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000498,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000499,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00000537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Draw 1000 rows from the feature table; each sampled row is later turned into
# one generated query.
# NOTE(review): no random_state is passed, so repeatability presumably depends
# on np.random.seed(22) in the first cell having run beforehand — confirm.
sample_df = features.sample(n=1000)
sample_df

Unnamed: 0_level_0,is_asian,is_black,is_native_american,is_pacific_islander,is_white,is_male,is_female,is_hypertension,is_chf,is_afib,...,is_sodium_chloride,is_glucose,is_potassium,is_docusate,is_heparin,is_magnesium_sulfate,is_acetaminophen,is_pantoprazole,is_metoprolol,is_furosemide
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT00763412,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT01879800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00288132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00269191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
NCT02785575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
NCT01041963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT01298245,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT01388842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT00538174,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCT01827826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Plain Python list of the sampled frame's column names, shown for reference.
cols = sample_df.columns.tolist()
cols

['is_asian',
 'is_black',
 'is_native_american',
 'is_pacific_islander',
 'is_white',
 'is_male',
 'is_female',
 'is_hypertension',
 'is_chf',
 'is_afib',
 'is_diabetes',
 'is_renal_failure',
 'is_high_cholesterol',
 'is_uti',
 'is_gerd',
 'is_arteriosclerosis',
 'is_respiratory_failure',
 'is_sodium_chloride',
 'is_glucose',
 'is_potassium',
 'is_docusate',
 'is_heparin',
 'is_magnesium_sulfate',
 'is_acetaminophen',
 'is_pantoprazole',
 'is_metoprolol',
 'is_furosemide']

In [14]:
# Feature columns grouped by criteria category. The order of each list is the
# order in which flags are handed to the matching generate_* function.
conditions = ['is_hypertension',
 'is_chf',
 'is_afib',
 'is_diabetes',
 'is_renal_failure',
 'is_high_cholesterol',
 'is_uti',
 'is_gerd',
 'is_arteriosclerosis',
 'is_respiratory_failure']
medications = ['is_sodium_chloride',
 'is_glucose',
 'is_potassium',
 'is_docusate',
 'is_heparin',
 'is_magnesium_sulfate',
 'is_acetaminophen',
 'is_pantoprazole',
 'is_metoprolol',
 'is_furosemide'
]
races = [
    'is_asian',
 'is_black',
 'is_native_american',
 'is_pacific_islander',
 'is_white'
]
genders = [
    'is_male',
 'is_female'
]

# Positions are resolved in the order of the name lists above (not in frame
# order) so that flag i always belongs to name i, whatever the column order of
# sample_df is. A missing column raises KeyError here instead of silently
# shifting the pairing.
condition_cols = [sample_df.columns.get_loc(c) for c in conditions]
medication_cols = [sample_df.columns.get_loc(c) for c in medications]
race_cols = [sample_df.columns.get_loc(c) for c in races]
gender_cols = [sample_df.columns.get_loc(c) for c in genders]
docset_name = "PotentiallyEligiblePatientsNotes"

# Write one NLPQL file per sampled row.
# NOTE: the output directory 'gen_nlpql' is assumed to exist already.
n = 0
for index, row in sample_df.iterrows():

    vals = row.values.astype(int)
    val_string = str(n) + "_" + str(index) + "_" + "".join(vals.astype(str))

    # Slice the integer array by position; indexing the label-indexed row with
    # integer positions relies on a deprecated pandas fallback.
    condition_vals = vals[condition_cols]
    medication_vals = vals[medication_cols]
    race_vals = vals[race_cols]
    gender_vals = vals[gender_cols]

    # Kept under its own name so the `features` DataFrame loaded earlier is
    # not overwritten by an integer (which would break re-running the
    # sampling cell).
    n_features = np.count_nonzero(vals)
    single_feature = n_features == 1

    # With a single feature, that criterion itself is marked final and no
    # combined results clause is emitted.
    is_feature_final = "final" if single_feature else ""

    g_name, g_criteria = generate_gender_critiera(docset_name, gender_vals, genders, is_feature_final)
    r_name, r_criteria = generate_race_critiera(docset_name, race_vals, races, is_feature_final)
    c_name, c_criteria = generate_condition_criteria(docset_name, condition_vals, conditions, is_feature_final)
    m_name, m_criteria = generate_medication_criteria(docset_name, medication_vals, medications, is_feature_final)

    # Only criteria that were actually generated take part in the final clause.
    valid_names = [name for name in (g_name, r_name, c_name, m_name) if len(name) > 0]

    if single_feature:
        results = ''
    else:
        results = generate_results(valid_names)
        print(n)

    query = synthetic_query_template % (val_string,
           docset_name,
           g_criteria,
           r_criteria,
           c_criteria,
           m_criteria,
           results)

    # Context manager closes the file even if a write fails.
    with open('gen_nlpql/query_%d.nlpql' % n, "w") as f:
        f.write(query)
        f.write('\n')
    n += 1
    


0
1
6
8
10
11
19
20
30
32
35
36
37
38
39
45
46
59
61
62
63
65
67
68
70
71
72
73
74
75
77
78
82
83
84
85
90
93
101
104
108
110
112
113
118
123
124
125
126
130
134
136
142
144
145
147
148
149
150
152
153
159
160
161
166
167
168
169
177
184
185
186
187
191
193
194
198
200
202
203
206
209
212
214
216
220
224
226
228
229
230
231
234
235
236
239
244
245
247
248
251
254
256
259
266
267
272
276
279
280
281
282
284
286
289
292
293
294
299
302
304
306
309
313
319
327
329
330
331
333
334
336
337
338
340
346
361
362
366
368
369
370
371
372
376
377
380
383
384
386
392
395
396
398
403
406
408
409
410
413
416
419
421
423
432
434
438
440
443
444
446
447
448
449
451
453
454
457
459
461
467
474
477
478
480
482
484
486
496
499
500
501
502
504
507
508
509
510
513
516
520
525
528
533
534
537
539
542
543
546
548
551
552
556
559
561
567
571
574
577
579
580
581
582
583
584
592
594
596
598
599
600
601
605
608
611
617
618
619
620
623
624
625
627
628
630
631
632
633
635
639
648
649
650
658
664
669
672
673
675
67