In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## NHANES Data set

National Health and Nutrition Examination Survey  
https://www.cdc.gov/nchs/nhanes/index.htm

This data set contains a large variety of data from thousands of individuals about topics including: 
- Demographics - https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.htm
- Medical conditions - https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/MCQ_H.htm
- Medications - https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/RXQ_RX_H.htm

We will first create a model to predict an individual's mdeical conditions given the drugs they take. Then, we will try to improve the model by adding in demographics. Refer to the links next to the topics for column descriptions.

In [78]:
health_status_df = pd.read_csv('health_status_cleaned.txt', sep='\t', low_memory=False, index_col=0)
health_status_df.head(3)

Unnamed: 0_level_0,HSD010,HSQ500,HSQ510,HSQ520,HSQ571,HSQ580,HSQ590,HSAQUEX
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
73557,2.0,2.0,2.0,2.0,2.0,,2.0,2
73558,4.0,2.0,2.0,2.0,2.0,,2.0,2
73559,3.0,2.0,2.0,2.0,2.0,,2.0,2


In [79]:
medical_cond_df = pd.read_csv('medical_conditions_cleaned.txt', sep='\t', low_memory=False, index_col=0)
medical_cond_df.head(3)

Unnamed: 0_level_0,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ070,MCQ075,MCQ080,...,MCQ300c,MCQ365a,MCQ365b,MCQ365c,MCQ365d,MCQ370a,MCQ370b,MCQ370c,MCQ370d,MCQ380
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
73557,2.0,,,,,,2.0,2.0,,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
73558,1.0,8.0,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
73559,2.0,,,,,,2.0,2.0,,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0


In [80]:
drugs_df = pd.read_csv('prescription_meds_cleaned.txt', sep='\t', low_memory=False)
drugs_df.head(3)

Unnamed: 0,SEQN,RXDUSE,RXDDRUG,RXDDRGID,ICD-10-CM,RXDRSC1,RXDRSD1,RXDCOUNT
0,73557,1,INSULIN,d00262,1460.0,E11,Type 2 diabetes mellitus,2
1,73558,1,GABAPENTIN,d03182,243.0,G25.81,Restless legs syndrome,4
2,73558,1,INSULIN GLARGINE,d04538,365.0,E11,Type 2 diabetes mellitus,4


<hr/>

In [131]:
print(drugs_df['RXDDRGID'].unique().size)
print(drugs_df['RXDRSC1'].unique().size)

696
474


We have over 1100 combined unique drugs and diseases, for simplicity, lets focus on diabetes.

RXDRSC1:
 - E11 - Type 2 diabetes mellitus
 - E11.4 - Type 2 diabetes mellitus with neurological complications
 - E11.2 - Type 2 diabetes mellitus with kidney complications
 - E10 - Type 1 diabetes mellitus

In [135]:
#add boolean column to determine if user has diabetes
drugs_df['has_diabetes'] = drugs_df['RXDRSC1'].apply(
    lambda x: 1 if x == 'E11' or x == 'E11.4' or x == 'E11.2' or x == 'E10' else 0
)
drugs_df.head(2)

Unnamed: 0,SEQN,RXDUSE,RXDDRUG,RXDDRGID,ICD-10-CM,RXDRSC1,RXDRSD1,RXDCOUNT,has_diabetes
0,73557,1,INSULIN,d00262,1460.0,E11,Type 2 diabetes mellitus,2,1
1,73558,1,GABAPENTIN,d03182,243.0,G25.81,Restless legs syndrome,4,0


In [73]:
#turn categorical values into dummies
drug_users_df = pd.get_dummies(drugs_df['RXDDRGID'])

In [137]:
drug_users_merged = drugs_df.join(drug_users_df) 
drug_users_merged = drug_users_merged.groupby('SEQN').sum()
drug_users_merged.drop(['RXDUSE','RXDCOUNT','ICD-10-CM'], axis=1, inplace=True)
#reset has_diabetes after group by
drug_users_merged['has_diabetes'] = drug_users_merged['has_diabetes'].apply(
    lambda x: 1 if x > 0 else 0
)
drug_users_merged.head(2)

Unnamed: 0_level_0,has_diabetes,a54115,a56545,a59812,a71066,c00001,c00019,c00040,c00049,c00088,...,d07928,d07965,d08080,d08086,d08100,d08114,d08182,d08184,h00024,h00035
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
73557,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73558,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


First column represents if they have diabetes

In [138]:
#make sure everything was merged correctly
test = pd.DataFrame(drug_users_merged.loc[73558])
test[test[73558] == 1]

Unnamed: 0,73558
has_diabetes,1
d00746,1
d03182,1
d04538,1
d04801,1


In [139]:
has_diabetes_y = drug_users_merged['has_diabetes']
X = drug_users_merged.drop('has_diabetes', axis=1)
print(X.shape)
print(has_diabetes_y.shape)

(4033, 696)
(4033,)


### Random forest classifier

In [118]:
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

In [140]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X, has_diabetes_y,random_state = 0)

clf = RandomForestClassifier().fit(X_train, y_train)
title = 'Random Forest Classifier, complex binary dataset, default settings'

In [141]:
from sklearn.metrics import accuracy_score

y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.95341922695738357

In [142]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted Diabetes', 'Predicted No Diabetes'],
    index=['True Diabetes', 'True No Diabetes']
)

Unnamed: 0,Predicted Diabetes,Predicted No Diabetes
True Diabetes,861,18
True No Diabetes,29,101


#### Results

Pretty good! Random Forest has an accuracy score of 95.34% for predicting if someone has diabetes given the drugs they are taking. Next, we will incorporate demographics to see if that will improve the accuracy.