In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import gensim 
import sys
from parse import *
from prep_data import read_aact
import re
from collections import OrderedDict
import numpy as np

In [202]:
# Christine: gender; location; medication
# Charity: age; race; condition

### Approach 1:

Leverages regexes (modified from `nlp.custom_tasks.RaceFinderTask`)

In [3]:
def compile_gender_regex():
    
    str_sep = r'(\s-\s|-\s|\s-|\s)'
    str_word = r'\b[-a-z.\d]+'
    str_punct = r'[,.\s]*'
    str_words = r'(' + str_word + str_punct + r'){0,6}'
    str_category = r'\b(woman|women|female|girl'                      +\
               r'man|men|male|boy)'
    
    str_gender1 = r'(\brace:?\s*)' + r'(?P<category>' + str_category + r')'
    regex_gender1 = re.compile(str_gender1, re.IGNORECASE)
    str_gender2 = r'(?P<category>' + str_category + r')' + str_punct    +\
            str_words 
    regex_gender2 = re.compile(str_gender2, re.IGNORECASE)
    str_gender3 =  str_punct + str_words + r'(?P<category>' +\
            str_category + r')'
    regex_gender3 = re.compile(str_gender3, re.IGNORECASE)
    REGEXES = [regex_gender1, regex_gender2, regex_gender3]

    return REGEXES
     

In [4]:
def find_gender(gender_regexes, sentence_list):

    result_list = []

    found_match = False
    for i in range(len(sentence_list)):
        s = sentence_list[i]
        for regex in gender_regexes:
            match = regex.search(s)
            if match:
                match_text = match.group('category')
        
                start = match.start()
                end   = match.end()
                result = (i, start, end, match_text)
                result_list.append(result)
                found_match = True

    if len(result_list) > 0:
        return result_list[0]
    else:
        return ''


gender_regexes = compile_gender_regex()
find_gender(gender_regexes, ["15 year old men and women"])

(0, 12, 25, 'men')

### Approach 2:

Leverages substring search 

In [45]:
def find_gender(single_study_inclusion_criteria):
    
    male_terms = ["man", "men", "male", "boy", "boys"]
    female_terms = ["woman", "women", "female", "girl", "girls", "pregnant", "menopausal", "postmenopausal"]

    contains_female_term = False
    contains_male_term = False
    
    male_results = OrderedDict()
    female_results = OrderedDict()
    
    results = OrderedDict({"male_results":{}, "female_results":{}, "gender_boolean":{}})
    
    for i, criteria in enumerate(single_study_inclusion_criteria.split("\n")): 
           
        criteria = criteria.lower().replace("human", "person")
        
        male_results[i]  = {"term":[], "start_index":[]}
        female_results[i]  = {"term":[], "start_index":[]}
        
        male_results[i]["term"] = [male_term for male_term in male_terms for word in criteria.split() if len(word) <= len(male_term)]
        male_results[i]["start_index"] = [word.find(male_term) for male_term in male_terms for word in criteria.split() if len(word) <= len(male_term)]
        
        
        #print(male_results[i])
        
        female_results[i]["term"] = [female_term for female_term in female_terms for word in criteria.split() if len(word) <= len(female_term)]
        female_results[i]["start_index"] = [word.find(female_term) for female_term in female_terms for word in criteria.split() if len(word) <= len(female_term)]
        
        male_results[i]["male_boolean"] = 1 if 0 in male_results[i]['start_index']  else 0
        female_results[i]["female_boolean"] = 1 if 0 in female_results[i]['start_index']  else 0
 
    
    contains_male_term = int(np.any([male_results[x]["male_boolean"] for x in male_results.keys()]))
    contains_female_term = int(np.any([female_results[x]["female_boolean"] for x in female_results.keys()]))
    any_gender_mention = int(contains_male_term or contains_female_term)
    
    return [contains_male_term, contains_female_term, any_gender_mention]
    
    #return  male_results, female_results, contains_male_term, contains_female_term, any_gender_mention


def find_gender_boolean(single_study_inclusion_criteria):
    mr, fr, cm, cf, gm = find_gender(single_study_inclusion_criteria)
    return gm 

def find_male_boolean(single_study_inclusion_criteria):
    mr, fr, cm, cf, gm = find_gender(single_study_inclusion_criteria)
    return cm

def find_female_boolean(single_study_inclusion_criteria):
    mr, fr, cm, cf, gm = find_gender(single_study_inclusion_criteria)
    return cf
    
result = find_gender("Inclusion Criteria:\n" + \
"Patients must have a diagnosis of cancer of any histologic type.\n" +\
          "Patients must have a Karnofsky performance status great or equal to 70%.\n" +\
        "All patients must sign informed consent approved by the Committee on the Use of man human\n" +\
             "Subjects at the University of Minnesota")


result

[1, 0, 1]

#### Goal 1: 

Develop a query and a model for each primitive; output = {0,1} whether the inclusion criteria statement in question mentions (and, by extension, restricts on) the primitive in question. Note that a single clinical trial will generally include > 1 criteria statement. Run against MIMIC. Use either manual review and/or regex-based queries as ground truth to train model(s) on. 

In [56]:
data = read_aact()
df = pd.DataFrame(data)
df.columns = ['study_id', 'inclusion', 'exclusion', 'gender', 'age_min', 'age_max']
df.head()

6795


Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years


In [313]:
# Apply regex-based function
df['gender_regex_bool'] = df['inclusion'].apply(lambda x: find_gender(gender_regexes,[criteria for criteria in x.split("\n")]))
df.head()

Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_gender,gender_regex_bool
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,1,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,1,
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,1,
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,


In [57]:
# Apply string search-based function
df[['ic_mentions_male', 'ic_mentions_female','ic_mentions_gender']] = df.apply(lambda row: pd.Series(find_gender(row['inclusion'])), axis=1)
df.head(n=10)

Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max,ic_mentions_male,ic_mentions_female,ic_mentions_gender
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,,0,0,0
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years,0,0,0
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years,0,0,0
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years,0,1,1
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years,0,0,0
5,NCT00000104,Inclusion Criteria:\n - Pregnant mot...,,Female,,,0,1,1
6,NCT00000117,"To be eligible, patients must have a history o...",,Both,,50 Years,0,0,0
7,NCT00000118,All patients must have had AIDS as defined by ...,,Both,,,0,0,0
8,NCT00000119,Women and men 18 years or older with a documen...,Exclusion criteria include corneal pathology o...,Both,18 Years,,1,1,1
9,NCT00000120,The study included male and female infants del...,,Both,,1 Year,1,1,1


In [59]:
for i,row in df.iterrows():
    if i > 20:
        if row['ic_mentions_female'] == 1:
            print(i)
            print(row['inclusion'])

24
An eligible male or female must have been age 3 years or older (adults were included) and
        must have had esotropia that occurred at age 6 months or older, with no history of
        previous eye muscle surgery.
25
Men and women with uncontrolled intraocular pressure greater than 21 mm Hg in one or both
        eyes despite maximal tolerated therapy and who were aphakic or had undergone previous
        filtering surgery were eligible to participate.
27
Men and women eligible for the study must be age 21 or older, have primary choroidal
        melanoma in only one eye, and have no evidence of metastatic disease. Accurate estimation
        of tumor thickness by echography must also be possible.
28
Men and nonpregnant women between the ages of 40 and 80 with IOP greater than or equal to
        24 mm Hg but less than or equal to 32 mm Hg in at least one eye and IOP greater than or
        equal to 21 but less than or equal to 32 mm Hg in the fellow eye, as well as normal visua

1755
Inclusion Criteria:
          -  Patients must have a diagnosis of multiple myeloma confirmed by the presence of:
               -  Bone marrow plasmacytosis with >= 10% plasma cells or sheets of plasma cells or
                  biopsy-proven plasmacytoma
               -  In addition, at least 1 of the following ancillary criteria must be documented:
                    -  M-protein in the serum
                    -  M-protein in the urine
                    -  Radiographic evidence of osteolytic lesions (generalized osteoporosis
                       qualifies only if the bone marrow aspirate contains >= 20% plasma cells)
          -  Patients must have measurable disease; the following will constitute measurable
             disease; tests used to document measurable disease must be done within two weeks
             prior to registration; a bone marrow biopsy performed =< 6 weeks prior to
             registration is acceptable; Note: If present, all of these parameters mu

2596
Inclusion Criteria:
          -  Patients with biopsy proven B-cell malignancies [e.g. chronic lymphocytic leukemia
             (CLL), non-Hodgkin's lymphoma (NHL), multiple myeloma (MM)]; HIV-associated lymphomas
             and acute leukemias are not eligible
          -  Performance status: ECOG 0, 1, or 2
          -  Life expectancy of at least 12 weeks
          -  Patients with aggressive NHL will be enrolled after having failed all possible
             therapy with curative intent
          -  Patients with CLL must have failed an alkylating agent-containing regimen as well as
             fludarabine chemotherapy
          -  Patients with multiple myeloma must have received at least one prior chemotherapy
             regimen and not be eligible for a dose intensification treatment approach
          -  At least 4 weeks must have elapsed since prior large-field radiation therapy
          -  Patients must have been off previous anti-cancer therapy for at least 3 week

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

