In [201]:
%matplotlib inline
import pandas as pd
import numpy as np
import gensim 
import sys
from parse import *
from prep_data import read_aact
import re
from collections import OrderedDict
import numpy as np

In [202]:
# Christine: gender; location; medication
# Charity: age; race; condition

### Approach 1:

Leverages regexes (modified from `nlp.custom_tasks.RaceFinderTask`)

In [203]:
def compile_gender_regex():
    """Build the regexes used to spot gender terms in a criteria sentence.

    Returns:
        list: three compiled, case-insensitive patterns. Each exposes the
        matched gender term through the named group ``category``:

        1. an explicit label followed by the term (``gender: female``),
        2. the term followed by up to six trailing words,
        3. up to six leading words followed by the term.
    """
    str_word = r'\b[-a-z.\d]+'
    str_punct = r'[,.\s]*'
    str_words = r'(' + str_word + str_punct + r'){0,6}'

    # Joining the terms programmatically avoids the missing-"|" mistake that
    # previously fused "girl" and "man" into the never-matching "girlman".
    gender_terms = ('woman', 'women', 'female', 'girl',
                    'man', 'men', 'male', 'boy')
    # The lookahead requires the term to end at a word boundary (allowing a
    # plural "s") so that "mental", "management" or "boycott" do not match,
    # while keeping the captured text limited to the bare term.
    str_category = r'\b(' + '|'.join(gender_terms) + r')(?=s?\b)'

    # Explicit label form; the label is "gender"/"sex" (not "race", which was
    # a leftover from the race finder this code was adapted from).
    str_gender1 = r'(\b(?:gender|sex):?\s*)' + r'(?P<category>' + str_category + r')'
    regex_gender1 = re.compile(str_gender1, re.IGNORECASE)

    # Term followed by trailing context words.
    str_gender2 = r'(?P<category>' + str_category + r')' + str_punct + str_words
    regex_gender2 = re.compile(str_gender2, re.IGNORECASE)

    # Leading context words followed by the term.
    str_gender3 = str_punct + str_words + r'(?P<category>' + str_category + r')'
    regex_gender3 = re.compile(str_gender3, re.IGNORECASE)

    REGEXES = [regex_gender1, regex_gender2, regex_gender3]

    return REGEXES
     

In [226]:
def find_gender(gender_regexes, sentence_list):
    """Return the first gender match found across a list of sentences.

    Every sentence is searched with every regex, in order; all hits are
    collected and the earliest one is returned.

    Args:
        gender_regexes: compiled patterns that define a ``category`` group.
        sentence_list: list of strings to search.

    Returns:
        A tuple ``(sentence_index, match_start, match_end, category_text)``
        for the first hit, or the empty string ``''`` when nothing matches.
        The start/end offsets are those of the whole regex match, not just
        the ``category`` group.
    """
    hits = []

    for sentence_index, sentence in enumerate(sentence_list):
        for pattern in gender_regexes:
            found = pattern.search(sentence)
            if not found:
                continue
            category_text = found.group('category')
            hits.append(
                (sentence_index, found.start(), found.end(), category_text)
            )

    return hits[0] if hits else ''


# Compile the patterns once (reused by later cells) and smoke-test the finder
# on a single example sentence; the cell displays the returned match tuple.
gender_regexes = compile_gender_regex()
find_gender(gender_regexes, ["15 year old men and women"])

(0, 12, 25, 'men')

### Approach 2:

Leverages substring search 

In [231]:
# Whole-word, case-insensitive patterns. Plain substring search would flag
# "treatment", "document" or "human" (they contain "men"/"man") and would
# miss capitalised terms such as "Men" or "Male".
_MALE_TERM_RE = re.compile(r'\b(?:man|men|males?|boys?)\b', re.IGNORECASE)
_FEMALE_TERM_RE = re.compile(r'\b(?:woman|women|females?|girls?)\b', re.IGNORECASE)


def find_gender_boolean(single_study_inclusion_criteria):
    """Flag whether a study's inclusion criteria mention any gender term.

    Args:
        single_study_inclusion_criteria: the full inclusion-criteria text of
            one study (may span several newline-separated statements).
            Non-string values (e.g. NaN for a missing field) are treated as
            containing no mention.

    Returns:
        int: 1 if a male or female term occurs as a whole word anywhere in
        the text (case-insensitive), otherwise 0.
    """
    if not isinstance(single_study_inclusion_criteria, str):
        return 0

    contains_male_term = _MALE_TERM_RE.search(single_study_inclusion_criteria) is not None
    contains_female_term = _FEMALE_TERM_RE.search(single_study_inclusion_criteria) is not None

    return int(contains_male_term or contains_female_term)
    

# Example: a two-line criteria text whose second line contains "men";
# the cell displays the resulting 0/1 flag.
any_gender_mention = find_gender_boolean("Non-smoking \n men over the age of 50")
any_gender_mention

1

#### Goal 1: 

Develop a query and a model for each primitive. The output is a binary label in {0, 1} indicating whether the inclusion-criteria statement in question mentions (and, by extension, restricts on) that primitive. Note that a single clinical trial will generally include more than one criteria statement. Run against MIMIC. Use manual review and/or regex-based queries as ground truth to train the model(s) on. 

In [238]:
# Load the trial records through the project helper `read_aact` and wrap them
# in a DataFrame with readable column names.
# NOTE(review): assumes read_aact() yields exactly six fields per record, in
# the order listed below — confirm against prep_data.
data = read_aact()
df = pd.DataFrame(data)
df.columns = ['study_id', 'inclusion', 'exclusion', 'gender', 'age_min', 'age_max']
df.head()

6795


Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years


In [None]:
# Run the regex-based finder over each trial's inclusion text, treating every
# newline-separated line as one sentence.
# NOTE(review): despite the `_bool` suffix, find_gender returns a
# (sentence_index, start, end, term) tuple or '' rather than 0/1 — confirm
# whether a binary flag was intended here.
df['gender_regex_bool'] = df['inclusion'].apply(
    lambda inclusion_text: find_gender(gender_regexes, inclusion_text.split("\n"))
)
df.head()

In [None]:
# Flag each trial with the string-search finder: 1 when its inclusion text
# mentions a gender term, 0 otherwise.
df['ic_mentions_gender'] = df['inclusion'].apply(find_gender_boolean)
df.head()