In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

mimic_demo_csv = pd.read_csv('data/mimic_demographics.csv', header=None, names=['id', 'year_of_birth', 'gender', 'race_long', 'race'])
mimic_demo_csv = mimic_demo_csv.replace("Black or African American", "Black")
mimic_demo_csv = mimic_demo_csv.replace("American Indian or Alaska Native", "AI/AN")
mimic_demo_csv = mimic_demo_csv.replace("Native Hawaiian or Other Pacific Islander", "NH/PI")
mimic_demo_csv = mimic_demo_csv.replace("No matching concept", "Unknown")
mimic_demo_csv = mimic_demo_csv.replace("Other Race", "Other")

conditions_csv = pd.read_csv('data/mimic_conditions.csv', header=None, names=['id', 'condition_id', 'condition_name'])
drugs_csv = pd.read_csv('data/mimic_drugs.csv', header=None, names=['id', 'drug_id', 'drug_name'])
drugs_csv = drugs_csv[drugs_csv.drug_name.str.contains("vaccine") == False]

conditions = mimic_demo_csv.merge(conditions_csv, how='inner', on='id')
drugs = mimic_demo_csv.merge(drugs_csv, how='inner', on='id')

mimic_demo_csv.head()

Unnamed: 0,id,year_of_birth,gender,race_long,race
0,186,2128,M,OTHER,Unknown
1,471,2076,M,UNKNOWN/NOT SPECIFIED,Unknown
2,475,2056,F,UNKNOWN/NOT SPECIFIED,Unknown
3,479,2121,M,UNKNOWN/NOT SPECIFIED,Unknown
4,487,2126,F,UNKNOWN/NOT SPECIFIED,Unknown


In [2]:
np.max(mimic_demo_csv['year_of_birth'])
np.min(mimic_demo_csv['year_of_birth'])

1800

### MIMIC Crosstabs on race and gender

* Age will have to use NLP if available (structured data is offset)

In [3]:
race_gender = pd.crosstab(index=mimic_demo_csv["race"], 
                            columns=[mimic_demo_csv["gender"]],
                            margins=True)
race_gender

gender,F,M,All
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AI/AN,21,26,47
Asian,744,947,1691
Black,2059,1816,3875
NH/PI,7,8,15
Other,50,59,109
Unknown,2813,4071,6884
White,14705,19194,33899
All,20399,26121,46520


### MIMIC Crosstabs on Conditions

In [4]:
conditions_ct = pd.crosstab(index=conditions["condition_name"], 
                            columns=[conditions["gender"],conditions["race"]],
                            margins=True)
conditions_ct = conditions_ct.sort_values('All', ascending=False)
conditions_ct.head(50)

gender,F,F,F,F,F,F,F,M,M,M,M,M,M,M,All
race,AI/AN,Asian,Black,NH/PI,Other,Unknown,White,AI/AN,Asian,Black,NH/PI,Other,Unknown,White,Unnamed: 15_level_1
condition_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
All,250,5337,31429,114,426,26892,175979,140,7616,23766,47,646,38437,229054,540133
Essential hypertension,3,178,975,4,15,1133,6698,1,236,703,1,23,1704,8209,19883
Congestive heart failure,9,83,791,0,5,664,4176,0,129,488,0,5,854,5045,12249
Atrial fibrillation,3,96,402,2,4,676,4038,0,154,299,3,5,965,5481,12128
Coronary arteriosclerosis in native artery,1,57,373,1,5,731,3016,3,151,339,2,13,1519,5855,12066
Type 2 diabetes mellitus,6,92,637,2,7,490,2651,1,130,445,0,10,791,3752,9014
Acute renal failure syndrome,5,84,609,0,9,361,2750,2,140,524,2,8,519,3807,8820
Hyperlipidemia,1,73,410,0,9,326,2586,2,110,312,0,13,569,3930,8341
Acute respiratory failure,4,71,430,1,3,427,2447,2,117,316,2,5,574,2931,7330
Urinary tract infectious disease,4,91,434,3,6,412,2975,1,62,203,1,2,294,1880,6368


### MIMIC Crosstabs on Drugs

In [5]:
drugs_ct = pd.crosstab(index=drugs["drug_name"], 
                            columns=[drugs["gender"],drugs["race"]],
                            margins=True)
drugs_ct = drugs_ct.sort_values('All', ascending=False)
drugs_ct.head(50)

gender,F,F,F,F,F,F,F,M,M,M,M,M,M,M,All
race,AI/AN,Asian,Black,NH/PI,Other,Unknown,White,AI/AN,Asian,Black,NH/PI,Other,Unknown,White,Unnamed: 15_level_1
drug_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
All,406,10232,54863,188,882,48824,342109,271,15204,42054,90,1350,72751,458139,1047363
Sodium Chloride,15,456,2155,7,37,1971,13648,15,646,1708,5,52,2948,17938,41601
Glucose,14,384,1877,7,33,1855,11911,11,574,1534,4,48,2745,16006,37003
Potassium Chloride,10,354,1597,8,30,1705,11196,6,481,1181,5,37,2579,14431,33620
Docusate,7,277,1477,4,26,1071,8879,5,416,1107,3,35,1675,11600,26582
"heparin, porcine",8,287,1558,5,25,1166,8861,9,389,1201,2,32,1614,11176,26333
Magnesium Sulfate,11,272,1317,5,22,1171,8865,4,385,939,5,35,1804,11436,26271
Acetaminophen,8,213,1087,4,18,1118,7629,5,323,828,2,31,1917,10744,23927
pantoprazole,8,220,1278,3,20,1058,7794,3,333,975,2,28,1544,10029,23295
Metoprolol,6,181,1073,1,13,973,7225,2,326,849,2,24,1639,10349,22663


### Comparing Vectors

In [6]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec
from collections import defaultdict
from prep_data import read_aact

In [7]:
data = read_aact()
df = pd.DataFrame(data)
df.columns = ['study_id', 'inclusion', 'exclusion', 'gender', 'age_min', 'age_max']
df.head()

6795


Unnamed: 0,study_id,inclusion,exclusion,gender,age_min,age_max
0,NCT00000105,Inclusion Criteria:\n - Patients mus...,Exclusion Criteria:\n - Pregnant or ...,Both,18 Years,
1,NCT00000106,Inclusion Criteria:\n - Patients are...,,Both,18 Years,65 Years
2,NCT00000107,Inclusion Criteria:\n - Resting bloo...,,Both,17 Years,60 Years
3,NCT00000108,Inclusion Criteria:\n - Postmenopaus...,,Female,50 Years,65 Years
4,NCT00000102,Inclusion Criteria:\n - diagnosed wi...,Exclusion Criteria:\n - history of l...,Both,14 Years,35 Years


In [8]:
# cleanup and model building
df['inclusion'].replace('[\\n!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ', inplace=True, regex=True)
inclusion = filter(None, [next_text.lower().strip(" ").split(" ") for next_text in df["inclusion"]])
model = Word2Vec(inclusion, size=100, window=5, min_count=5, workers=4)

In [9]:
model.wv.most_similar("hemophilia")

[('glomerulopathy', 0.40327781438827515),
 ('neither', 0.3382846415042877),
 ('minutes', 0.3372820019721985),
 ('pis', 0.3328104317188263),
 ('provider', 0.31975698471069336),
 ('percutaneous', 0.3175405263900757),
 ('myelotoxic', 0.30776888132095337),
 ('mouse', 0.30301523208618164),
 ('conforming', 0.3029557466506958),
 ('read', 0.2927960455417633)]

In [10]:
word_vectors = model.wv.vectors
n_words = word_vectors.shape[0]
vec_size = word_vectors.shape[1]
print("words = {0}, vector size = {1}".format(n_words, vec_size))

words = 5998, vector size = 100


In [11]:
cluster_size = 50
kmeans = KMeans(n_clusters=cluster_size, random_state=0).fit(word_vectors)
labels = kmeans.predict(word_vectors)
centers = kmeans.cluster_centers_

vocab_list = list(model.wv.vocab.keys())
word_indexes = dict()
for i in range(len(vocab_list)):
    word_indexes[i] = vocab_list[i]
vector_df = pd.DataFrame(word_vectors, index=vocab_list)

In [12]:
df_vocab = pd.DataFrame(vocab_list, columns=['word'])
df_vocab.insert((df_vocab.shape[1]),'kmeans', labels)
df_vocab.head()

Unnamed: 0,word,kmeans
0,,3
1,itself,38
2,neovascularization,28
3,pirfenidone,38
4,nephrectomy,38


In [13]:
for i in range(cluster_size):
    c = df_vocab[df_vocab['kmeans'] == i]
    print("cluster {0}, size {1}".format(i, c.shape[0]))
    print(list(c['word']))
    print()

cluster 0, size 128
['form', 'anthracyclines', 'pathological', 'resections', 'manifestation', 'main', 'described', 'pneumothorax', 'trastuzumab', 'rsr', 'bcc', 'uicc', 'reagents', 'trimethoprim', 'mesodermal', 'reflexes', 'review', 'exogenous', 'meperidine', 'sparing', 'refused', 'vein', 'megakaryocytes', 'hypertrophy', 'intraperitoneal', 'initial', 'neuromuscular', 'purified', 'distress', 'exploration', 'aspirin', 'hypo', 'interactions', 'lactating', 'tolerance', 'chairman', 'antihypertensives', 'sampled', 'deposition', 'sensitive', 'cc', 'spongioblastoma', 'glandular', 'astrocytoma', 'thoracotomy', 'examined', 'neutrophil', 'cortisol', 'etiology', 'partially', 'affected', 'filling', 'space', 'fluoropyrimidine', 'vestibular', 'valproic', 'longer', 'seronegativity', 'sulfadiazine', 'schwannoma', 'designate', 'tendon', 'implants', 'unusual', 'philadelphia', 'mr', 'aplasia', 'highest', 'ankle', 'pervasive', 'ic', 'their', 'lower', 'gondii', 'seminoma', 'promyelocytic', 'myeloid', 'autoan