In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import gensim 
import sys
from parse import *
from prep_data import read_aact
import re
from collections import OrderedDict
import numpy as np

In [202]:
# Christine: gender; location; medication
# Charity: age; race; condition

### Gender Approach 1:

Leverages regexes (modified from `nlp.custom_tasks.RaceFinderTask`)

In [3]:
def compile_gender_regex():
    """Build the regular expressions used by the regex-based gender finder.

    Returns:
        list: three compiled, case-insensitive patterns. Each one exposes the
        matched gender word through the named group ``category``:

        1. an explicit ``gender:`` / ``sex:`` label followed by a gender word;
        2. a gender word followed by up to six trailing words;
        3. up to six leading words followed by a gender word.
    """
    str_word = r'\b[-a-z.\d]+'
    str_punct = r'[,.\s]*'
    str_words = r'(' + str_word + str_punct + r'){0,6}'

    # Every alternative needs its own '|'. Without one between 'girl' and
    # 'man' the two literals fuse into 'girlman', so neither word can match.
    # The trailing lookahead requires the word to end here (optionally after
    # a plural 's'); otherwise 'men' fires inside 'menopausal' and 'man'
    # inside 'manage'. Being zero-width, it changes neither the captured text
    # nor the match span.
    str_category = (r'\b(woman|women|female|girl|'
                    r'man|men|male|boy)'
                    r'(?=s?\b)')

    # Pattern 1: explicit label. The race-finder template this was adapted
    # from used the label 'race:', which never precedes a gender word.
    str_gender1 = r'(\b(?:gender|sex):?\s*)' + r'(?P<category>' + str_category + r')'
    regex_gender1 = re.compile(str_gender1, re.IGNORECASE)

    # Pattern 2: gender word plus trailing context.
    str_gender2 = r'(?P<category>' + str_category + r')' + str_punct + str_words
    regex_gender2 = re.compile(str_gender2, re.IGNORECASE)

    # Pattern 3: leading context plus gender word.
    str_gender3 = str_punct + str_words + r'(?P<category>' + str_category + r')'
    regex_gender3 = re.compile(str_gender3, re.IGNORECASE)

    REGEXES = [regex_gender1, regex_gender2, regex_gender3]

    return REGEXES
     

In [4]:
def find_gender(gender_regexes, sentence_list):
    """Return the first gender-regex hit found in a list of sentences.

    Sentences are scanned in order and, for each sentence, the regexes are
    tried in the order given; the scan stops at the first hit.

    Args:
        gender_regexes: iterable of compiled patterns that define a named
            group ``category`` (see ``compile_gender_regex``).
        sentence_list: iterable of strings to search.

    Returns:
        tuple: ``(sentence_index, match_start, match_end, category_text)``
        for the first hit, where start/end are the span of the whole match
        inside that sentence; or the empty string ``''`` when nothing matches
        (kept for backward compatibility with existing callers).
    """
    for sentence_index, sentence in enumerate(sentence_list):
        for regex in gender_regexes:
            match = regex.search(sentence)
            if match:
                # Only the first hit is ever reported, so stop scanning here
                # instead of collecting every later match and discarding it.
                return (sentence_index,
                        match.start(),
                        match.end(),
                        match.group('category'))

    return ''


# Compile the patterns once, then try them on a single example sentence.
gender_regexes = compile_gender_regex()
find_gender(gender_regexes, sentence_list=["15 year old men and women"])

(0, 12, 25, 'men')

### Gender Approach 2:

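Leverages whole-word term matching: a criterion counts only when one of its words equals a gender term (a plain substring search would also hit the "man" in "human").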

In [63]:
def find_gender_boolean(single_study_inclusion_criteria):
    """Flag whether a study's inclusion criteria mention male / female terms.

    The text is lower-cased and split into alphabetic tokens, so a term is
    recognised even when punctuation is attached to it ("women,",
    "male/female", "(female)"). Tokens are compared for equality with the
    term lists, which means "human" does not count as "man".

    Args:
        single_study_inclusion_criteria: the inclusion-criteria text of one
            study (may span several lines). Any non-string value, such as
            ``None`` or a pandas NaN, is treated as text with no mentions.

    Returns:
        list: ``[contains_male_term, contains_female_term,
        any_gender_mention]``, each the int 0 or 1.
    """
    # Plural forms are listed explicitly because matching is on whole tokens.
    male_terms = {"man", "men", "male", "males", "boy", "boys"}
    female_terms = {"woman", "women", "female", "females", "girl", "girls",
                    "pregnant", "menopausal", "postmenopausal"}

    if not isinstance(single_study_inclusion_criteria, str):
        return [0, 0, 0]

    # Runs of letters only: this drops digits, bullets and punctuation and
    # splits hyphenated / slashed compounds into their component words.
    tokens = set(re.findall(r"[a-z]+", single_study_inclusion_criteria.lower()))

    contains_male_term = int(not tokens.isdisjoint(male_terms))
    contains_female_term = int(not tokens.isdisjoint(female_terms))
    any_gender_mention = int(contains_male_term or contains_female_term)

    return [contains_male_term, contains_female_term, any_gender_mention]

    
# Demo input: a multi-line inclusion-criteria text that contains the
# stand-alone word "man" right next to "human".
result = find_gender_boolean("\n".join([
    "Inclusion Criteria:",
    "Patients must have a diagnosis of cancer of any histologic type.",
    "Patients must have a Karnofsky performance status great or equal to 70%.",
    "All patients must sign informed consent approved by the Committee on the Use of man human",
    "Subjects at the University of Minnesota",
]))


result

[1, 0, 1]

### Location Approach 1:

In [141]:
def _contains_phrase(tokens, phrases, max_words):
    """Return True when any run of 1..max_words consecutive tokens, joined by single spaces, is in ``phrases``."""
    for size in range(1, max_words + 1):
        for start in range(len(tokens) - size + 1):
            if " ".join(tokens[start:start + size]) in phrases:
                return True
    return False


def find_location_boolean(single_study_inclusion_criteria, list_of_cities, max_city_words=4):
    """Flag whether a study's inclusion criteria appear to mention a location.

    Three independent checks are run on every line of the text:

    * proper noun: some whitespace-separated word starts with an upper-case
      character. NOTE: this also fires on ordinary sentence-initial capitals.
    * location term: the line contains one of a fixed list of
      location-related words or phrases.
    * city: the line contains an entry of ``list_of_cities``.

    For the last two checks each word is lower-cased and stripped of
    surrounding punctuation, and runs of consecutive words are compared too,
    so "University", "clinic," and multi-word entries such as "study site"
    or "new york" are recognised.

    Args:
        single_study_inclusion_criteria: the inclusion-criteria text of one
            study. Any non-string value, such as ``None`` or a pandas NaN, is
            treated as text with no mentions.
        list_of_cities: collection of lower-case city names. A ``set`` or
            ``frozenset`` is used as is; any other iterable is copied into a
            set on every call.
        max_city_words: longest city name, in words, that is looked for.

    Returns:
        list: ``[contains_proper_noun, contains_loc_term, contains_city,
        any_location_mention]``, each the int 0 or 1.
    """
    loc_terms = {"reside", "resided", "live", "lived", "located", "nearby",
                 "location", "study site", "city", "metropolitan", "clinic",
                 "university"}
    max_term_words = max(len(term.split()) for term in loc_terms)

    if not isinstance(single_study_inclusion_criteria, str):
        return [0, 0, 0, 0]

    if isinstance(list_of_cities, (set, frozenset)):
        city_lookup = list_of_cities
    else:
        # Hash lookups instead of scanning the whole list once per word.
        city_lookup = set(list_of_cities)

    # Characters removed from both ends of a word before it is compared.
    edge_punct = ".,;:!?()[]{}\"'-"

    contains_proper_noun = 0
    contains_loc_term = 0
    contains_city = 0

    for criteria in single_study_inclusion_criteria.split("\n"):
        raw_words = criteria.split()

        # The capital-letter test uses the raw words, punctuation included.
        if any(word[0].isupper() for word in raw_words):
            contains_proper_noun = 1

        # Words that are punctuation only (e.g. the "-" bullet) become empty
        # and are dropped, so they can never equal a blank lookup entry.
        tokens = [token
                  for token in (word.strip(edge_punct).lower() for word in raw_words)
                  if token]

        if not contains_loc_term and _contains_phrase(tokens, loc_terms, max_term_words):
            contains_loc_term = 1
        if not contains_city and _contains_phrase(tokens, city_lookup, max_city_words):
            contains_city = 1

    any_location_mention = int(contains_proper_noun or contains_loc_term or contains_city)

    return [contains_proper_noun, contains_loc_term, contains_city, any_location_mention]

# One lower-cased city name per line of the reference file.
cities = []

with open('./US_cities.txt') as file:
    cities.extend(line.split("\n")[0].lower() for line in file)


# Quick check of the location flags on a single example sentence.
results = find_location_boolean("Must live within 20 minutes of metro Atlanta", cities)

In [134]:
"normal" in cities

True

### Medication Approach 1:

In [157]:
# One lower-cased entry per line of the medication reference file.
medications = []

with open('./medications.txt') as file:
    medications.extend(line.split("\n")[0].lower() for line in file)


In [163]:
def find_medication_boolean(single_study_inclusion_criteria, list_of_medications):
    """Flag whether a study's inclusion criteria mention a known medication.

    Every whitespace-separated word is lower-cased, stripped of surrounding
    punctuation and looked up in ``list_of_medications``, so "Metformin,"
    and "metformin." both count as a mention of "metformin".

    Args:
        single_study_inclusion_criteria: the inclusion-criteria text of one
            study. Any non-string value, such as ``None`` or a pandas NaN, is
            treated as text with no mentions.
        list_of_medications: collection of lower-case, single-word medication
            names. A ``set`` or ``frozenset`` is used as is; any other
            iterable is copied into a set on every call.

    Returns:
        list: ``[contains_medication, any_medication_mention]``, each the int
        0 or 1. Both entries carry the same value.
    """
    if not isinstance(single_study_inclusion_criteria, str):
        return [0, 0]

    if isinstance(list_of_medications, (set, frozenset)):
        medication_lookup = list_of_medications
    else:
        # Hash lookups instead of scanning the whole list once per word.
        medication_lookup = set(list_of_medications)

    # Characters removed from both ends of a word before it is compared.
    edge_punct = ".,;:!?()[]{}\"'-"

    contains_medication = 0
    # str.split() with no argument also splits on newlines, so this visits
    # the words of every criteria line.
    for word in single_study_inclusion_criteria.split():
        token = word.strip(edge_punct).lower()
        # Skip punctuation-only words so they cannot equal a blank entry.
        if token and token in medication_lookup:
            contains_medication = 1
            break

    any_medication_mention = int(contains_medication)

    return [contains_medication, any_medication_mention]

# Rebuild the list, this time keeping only the second space-separated field
# of every line in the reference file.
medications = []

with open('./medications.txt') as file:
    medications.extend(line.split("\n")[0].lower().split(" ")[1] for line in file)


# Quick check of the medication flags on a single example sentence.
results = find_medication_boolean("Must be taking metformin", medications)
results

[1, 1]

### Goal 1: 

Develop a query and a model for each primitive; output = {0,1}, indicating whether the inclusion criteria statement in question mentions (and, by extension, restricts on) the primitive. Note that a single clinical trial will generally include more than one criteria statement. Run against MIMIC. Use manual review and/or regex-based queries as ground truth on which to train the model(s).

#### 1.1: Gender

In [56]:
# Load the study records via the project-local helper prep_data.read_aact.
# NOTE(review): the number shown under this cell is not printed by any code
# visible here; presumably read_aact prints a record count -- confirm.
data = read_aact()
df = pd.DataFrame(data)
# Name the six columns; the assignment assumes read_aact yields exactly six
# fields per record, in this order -- TODO confirm against prep_data.
df.columns = ['study_id', 'inclusion', 'exclusion', 'gender', 'age_min', 'age_max']
df.head()

6795


Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years


In [313]:
# Regex-based approach: split every inclusion text into lines and store the
# first hit reported by find_gender. Despite its name, the new column holds
# find_gender's raw return value (a tuple, or '' when nothing matched).
df['gender_regex_bool'] = df['inclusion'].apply(
    lambda inclusion_text: find_gender(gender_regexes, inclusion_text.split("\n"))
)
df.head()

Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_gender,gender_regex_bool
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,1,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,1,
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,1,
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,


In [61]:
# Apply string search-based function.
# This must call find_gender_boolean: it takes the inclusion text alone and
# returns the three 0/1 flags [male, female, any] that are unpacked into the
# three columns below. The regex-based find_gender needs the compiled
# patterns as its first argument and cannot be called with the text only.
df[['ic_mentions_male', 'ic_mentions_female','ic_mentions_gender']] = df.apply(lambda row: pd.Series(find_gender_boolean(row['inclusion'])), axis=1)
df.head(n=5)

Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_male,ic_mentions_female,ic_mentions_gender
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,0,0,0
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,0,0,0
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,0,0
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,0,1,1
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,0,0


In [60]:
# for i,row in df.iterrows():
#     if i > 20:
#         if row['ic_mentions_female'] == 1:
#             print(i)
#             print(row['inclusion'])

#### 1.2: Location

In [142]:
# Rebuild the lower-cased city list from the reference file.
cities = []

with open('./US_cities.txt') as file:
    cities.extend(line.split("\n")[0].lower() for line in file)

# One row of four location flags per study, written into four new columns.
df[['ic_mentions_proper_noun', 'ic_mentions_loc_term','ic_mentions_city', 'ic_loc_bool']] = df.apply(
    lambda row: pd.Series(find_location_boolean(row['inclusion'], cities)),
    axis=1,
)
df.head(n=5)

Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_male,ic_mentions_female,ic_mentions_gender,ic_mentions_proper_noun,ic_mentions_loc_term,ic_mentions_city,ic_loc_bool
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,0,0,0,1,0,0,1
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,0,0,0,1,0,0,1
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,0,0,1,0,0,1
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,0,1,1,1,0,0,1
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,0,0,1,0,0,1


In [149]:
# Spot check: print the inclusion text of every study with index 51..99 whose
# location-term flag is set.
for i, row in df.iterrows():
    if 50 < i < 100 and row['ic_mentions_loc_term'] == 1:
        print(i)
        print(row['inclusion'])

68
Inclusion Criteria:
          -  Probable Alzheimer's disease
          -  Mini-Mental State Examination (MMSE) 10-22 and ADAS greater than or equal to 18
          -  Alzheimer's Disease Assessment Scale cognitive portion (ADAS-cog-11) score of at
             least 18
          -  Opportunity for Activities of Daily Living
          -  Caregiver
          -  Subjects who live with or have regular daily visits from a responsible caregiver
             (visit frequency: preferably daily but at least 5 days/week). This includes a friend
             or relative or paid personnel. The caregiver should be capable of assisting with the
             subject's medication, prepared to attend with the subject for assessments, and
             willing to provide information about the subject.
        


In [151]:
#### 1.3 Medication


In [167]:
# Rebuild the medication list: second space-separated field of every line,
# lower-cased.
medications = []

with open('./medications.txt') as file:
    medications.extend(line.split("\n")[0].lower().split(" ")[1] for line in file)

# One row of two medication flags per study, written into two new columns.
df[['ic_mentions_medication', 'ic_mentions_anymed']] = df.apply(
    lambda row: pd.Series(find_medication_boolean(row['inclusion'], medications)),
    axis=1,
)
df.head(n=5)


Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_male,ic_mentions_female,ic_mentions_gender,ic_mentions_proper_noun,ic_mentions_loc_term,ic_mentions_city,ic_loc_bool,ic_mentions_medication,ic_mentions_anymed
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,0,0,0,1,0,0,1,0,0
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,0,0,0,1,0,0,1,0,0
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,0,0,1,0,0,1,0,0
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,0,1,1,1,0,0,1,0,0
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,0,0,1,0,0,1,0,0


In [173]:
# Spot check: print the inclusion text of every study with index below 30
# whose medication flag is set.
for i, row in df.iterrows():
    if not i < 30:
        continue
    if row['ic_mentions_medication'] == 1:
        print(i)
        print(row['inclusion'])

18
Inclusion Criteria:
        Eligible patients must:
          -  Be between the ages of 18 and 56
          -  Be able to see the entire face of someone sitting across the table from them without
             scanning
          -  Read newspaper-size print without special magnifying aids
          -  Walk unaided in daylight
          -  Have a normal fasting serum vitamin A and normal liver function profile
          -  Be in good general health
          -  Reside in the United States
        
