In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the raw inputs used by the rest of the notebook.
# `names=['id']` tells pandas the id file has no header row, so its first
# line is read as data; the ids then become the (column-less) frame's index.
nct_ids = pd.read_csv('data/nct_ids.txt', names=['id']).set_index('id', drop=True)
conditions_and_medications = pd.read_csv('data/aact_conditions_and_medications.csv')
race = pd.read_csv('data/aact_race.csv')
gender = pd.read_csv('data/aact_gender.csv')

# A notebook cell only renders its last expression, so a bare `nct_ids.shape`
# in the middle of the cell is silently discarded. Print it explicitly.
print(nct_ids.shape)
nct_ids.head()

NCT02555917
NCT02555904
NCT02555891
NCT02555735
NCT02555878


In [3]:
# Build one row per trial id with a 0/1 flag for every condition / medication
# of interest. A flag is 1 when at least one of the trial's terms matches
# (exactly, after lower-casing) one of the synonyms listed for that flag.

# Flag column -> exact lower-case terms that switch the flag on.
# Insertion order defines the column order of `cm_light`.
CM_TERM_GROUPS = {
    # conditions
    'is_hypertension': ["hypertension", "high blood pressure"],
    'is_chf': [
        "congestive heart failure",
        "chf",
        "ccf - congestive cardiac failure",
        "chf - congestive heart failure",
        "congestive cardiac failure",
        "congestive heart disease",
        "congestive heart failure (disorder)",
        "congestive heart failure (finding)",
    ],
    'is_afib': [
        "atrial fibrillation",
        "atrial fibrulation",
        "atrial fabrillation",
        "atrial fibrilation",
        "a fib",
        "afib",
        "atrial fib",
        "atr fibrillation",
        "atr fibrulation",
        "atr fabrillation",
        "atr fibrilation",
        "atr fib",
        "auricular fibrillation",
        "auricular fibrulation",
        "auricular fabrillation",
        "auricular fibrilation",
        "auricular fib",
        "aflutter",
        "atrial flutter",
    ],
    'is_diabetes': ["diabetes", "dm"],
    'is_renal_failure': ["renal failure"],
    'is_high_cholesterol': [
        "hyperlipidemia",
        "high blood cholesterol",
        "high cholesterol",
    ],
    'is_uti': ["urinary tract infectious disease", "uti", "urinary tract infection"],
    'is_gerd': ["gastroesophageal reflux", "gerd"],
    # medications
    'is_sodium_chloride': ["sodium chloride"],
    'is_glucose': ["glucose"],
    'is_potassium': ["potassium chloride"],
    'is_docusate': ["docusate"],
    'is_heparin': ["heparin"],
    'is_magnesium_sulfate': ["magnesium sulfate"],
    'is_acetaminophen': ["acetaminophen"],
    'is_pantoprazole': ["pantoprazole"],
    'is_metoprolol': ["metoprolol"],
    'is_furosemide': ["furosemide"],
}

cm_terms = conditions_and_medications[['subject', 'term']].rename(columns={"subject": "id"})

# `.str.lower()` leaves missing terms as NaN (which then match no synonym),
# whereas calling `x.lower()` on each value raises AttributeError on NaN.
cm_term_lower = cm_terms['term'].str.lower()

# One 0/1 column per flag, then collapse the (possibly many) term rows of each
# trial with max(): the flag is 1 if any of the trial's terms matched.
# The flags are already integers, so no `pd.to_numeric(..., errors='ignore')`
# pass is needed (that option is deprecated in recent pandas).
cm_light = (
    cm_terms[['id']]
    .assign(**{
        flag: cm_term_lower.isin(synonyms).astype(int)
        for flag, synonyms in CM_TERM_GROUPS.items()
    })
    .groupby('id')
    .max()
)
cm_light.head(10)

Unnamed: 0_level_0,is_hypertension,is_chf,is_afib,is_diabetes,is_renal_failure,is_high_cholesterol,is_uti,is_gerd,is_sodium_chloride,is_glucose,is_potassium,is_docusate,is_heparin,is_magnesium_sulfate,is_acetaminophen,is_pantoprazole,is_metoprolol,is_furosemide
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
NCT00000271,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000475,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000484,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000485,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000487,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000495,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000497,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000498,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000499,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
NCT00000537,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
import warnings
# NOTE(review): this blanket filter silences every warning for the rest of the
# kernel session, including pandas deprecation notices. Kept for compatibility
# with later cells; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# Build one row per trial id with a 0/1 flag for each race category.
# Flag column -> exact `value_normalized` values that switch the flag on.
# Matching is case-sensitive; presumably the column is already lower-cased
# upstream, as its name suggests -- TODO confirm.
RACE_VALUE_GROUPS = {
    'is_asian': ["asian"],
    'is_black': ["black"],
    'is_native_american': ["native american", "alaska native", "alaskan native"],
    'is_pacific_islander': ["pacific islander", "native hawaiian"],
    'is_white': ["white"],
}

race_values = race[['subject', 'value_normalized']].rename(columns={"subject": "id"})

# One 0/1 column per flag, collapsed per trial with max(): the flag is 1 if
# any of the trial's rows matched. The flags are already integers, so the
# former `pd.to_numeric(..., errors='ignore')` pass (a deprecated option in
# recent pandas) is not needed.
race_light = (
    race_values[['id']]
    .assign(**{
        flag: race_values['value_normalized'].isin(values).astype(int)
        for flag, values in RACE_VALUE_GROUPS.items()
    })
    .groupby('id')
    .max()
)
race_light.head(10)


Unnamed: 0_level_0,is_asian,is_black,is_native_american,is_pacific_islander,is_white
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NCT00017732,0,1,0,0,0
NCT00324389,0,0,0,0,1
NCT00426543,0,0,0,0,1
NCT00484640,0,0,0,0,1
NCT00668902,1,0,0,0,0
NCT00698984,0,0,0,0,1
NCT00736190,1,0,0,0,0
NCT00798785,0,0,0,0,1
NCT00874757,1,0,0,0,0
NCT00891267,0,0,0,0,1


In [5]:
# Build one row per trial id with 0/1 flags `is_male` / `is_female`.
# Flag column -> the exact `nlpql_feature` value that switches the flag on.
GENDER_FEATURES = {
    'is_male': 'isMale',
    'is_female': 'isFemale',
}

gender_features = gender[['subject', 'nlpql_feature']].rename(columns={"subject": "id"})

# Compare the Series (single brackets), not a one-column DataFrame: assigning
# a DataFrame to a single column only works by special-casing inside pandas.
# Rows are then collapsed per trial with max(), so a flag is 1 if any of the
# trial's rows carried that feature. The flags are already integers, so no
# `pd.to_numeric(..., errors='ignore')` pass (deprecated option) is needed.
gender_light = (
    gender_features[['id']]
    .assign(**{
        flag: (gender_features['nlpql_feature'] == feature).astype(int)
        for flag, feature in GENDER_FEATURES.items()
    })
    .groupby('id')
    .max()
)

gender_light.head(10)

Unnamed: 0_level_0,is_male,is_female
id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000460,1,0
NCT00000461,1,0
NCT00000468,1,0
NCT00000469,1,0
NCT00000474,1,0
NCT00000475,1,0
NCT00000476,1,0
NCT00000524,1,0
NCT00000558,1,0
NCT00000627,1,0


In [17]:
# Put the id list and the three per-trial flag tables side by side, aligned on
# the trial id index. Ids missing from a table come through as NaN, which is
# replaced with 0 ("flag not set").
nct_features = pd.concat(
    [nct_ids, race_light, gender_light, cm_light],
    axis=1,
).fillna(0)

# match_count = number of flags set for the trial (row-wise sum of the flags).
nct_features = nct_features.assign(match_count=nct_features.sum(axis=1))

# Trials with the most flags set come first.
nct_features.sort_values('match_count', ascending=False)

Unnamed: 0,is_asian,is_black,is_native_american,is_pacific_islander,is_white,is_male,is_female,is_hypertension,is_chf,is_afib,...,is_glucose,is_potassium,is_docusate,is_heparin,is_magnesium_sulfate,is_acetaminophen,is_pantoprazole,is_metoprolol,is_furosemide,match_count
NCT02696382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
NCT03515889,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT01251484,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT01474018,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT00911508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT02599714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT00433511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT00520208,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT01476995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
NCT00422630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0


In [18]:
# Show every flag (and the match_count) recorded for a single trial id.
nct_features.loc['NCT02696382', :]

is_asian                0.0
is_black                0.0
is_native_american      0.0
is_pacific_islander     0.0
is_white                0.0
is_male                 0.0
is_female               0.0
is_hypertension         1.0
is_chf                  1.0
is_afib                 1.0
is_diabetes             0.0
is_renal_failure        1.0
is_high_cholesterol     0.0
is_uti                  1.0
is_gerd                 0.0
is_sodium_chloride      0.0
is_glucose              0.0
is_potassium            0.0
is_docusate             0.0
is_heparin              0.0
is_magnesium_sulfate    0.0
is_acetaminophen        0.0
is_pantoprazole         0.0
is_metoprolol           0.0
is_furosemide           0.0
match_count             5.0
Name: NCT02696382, dtype: float64