In [109]:
# Import dependencies
import pandas as pd
from sodapy import Socrata

# Load Data

In [110]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)



In [111]:
# Return results as csv from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("swc5-untb", limit=2000000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# View all columns in dataframes
pd.set_option('display.max_columns', None)  

# Display dataframe
results_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
0,2019,WY,Wyoming,Washakie,BRFSS,Health Outcomes,High cholesterol among adults aged >=18 years ...,%,Crude prevalence,32.1,31.2,33.1,7760,56043,HLTHOUT,HIGHCHOL,CrdPrv,High Cholesterol,"{'type': 'Point', 'coordinates': [-107.669052,...",14,3127
1,2020,WY,Wyoming,Laramie,BRFSS,Health Risk Behaviors,No leisure-time physical activity among adults...,%,Crude prevalence,23.8,21.5,26.2,100595,56021,RISKBEH,LPA,CrdPrv,Physical Inactivity,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
2,2020,WY,Wyoming,Carbon,BRFSS,Health Status,Mental health not good for >=14 days among adu...,%,Crude prevalence,12.9,12.1,13.7,14711,56007,HLTHSTAT,MHLTH,CrdPrv,Mental Health,"{'type': 'Point', 'coordinates': [-106.9331526...",14,3081
3,2020,WY,Wyoming,Sublette,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,Crude prevalence,16.0,15.0,17.1,9856,56035,HLTHOUT,DEPRESSION,CrdPrv,Depression,"{'type': 'Point', 'coordinates': [-109.9161701...",14,3124
4,2020,WY,Wyoming,Uinta,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,3.0,2.7,3.4,20215,56041,HLTHOUT,STROKE,CrdPrv,Stroke,"{'type': 'Point', 'coordinates': [-110.5589468...",14,893


# General Data Overview

In [112]:
# View how many years worth of data we have
results_df['year'].unique()

array(['2019', '2020'], dtype=object)

In [113]:
# View null values by year
results_df.isnull().sum()

year                              0
stateabbr                         0
statedesc                         0
locationname                     60
datasource                        0
category                          0
measure                           0
data_value_unit                   0
data_value_type                   0
data_value                        0
low_confidence_limit              0
high_confidence_limit             0
totalpopulation                   0
locationid                        0
categoryid                        0
measureid                         0
datavaluetypeid                   0
short_question_text               0
geolocation                      60
:@computed_region_bxsw_vy29    1192
:@computed_region_he4y_prf8     660
dtype: int64

In [114]:
# Check data types for results_df
results_df.dtypes

year                           object
stateabbr                      object
statedesc                      object
locationname                   object
datasource                     object
category                       object
measure                        object
data_value_unit                object
data_value_type                object
data_value                     object
low_confidence_limit           object
high_confidence_limit          object
totalpopulation                object
locationid                     object
categoryid                     object
measureid                      object
datavaluetypeid                object
short_question_text            object
geolocation                    object
:@computed_region_bxsw_vy29    object
:@computed_region_he4y_prf8    object
dtype: object

In [115]:
# View health categories - (which include metrics we could use as features for machine learning)
results_df['category'].unique()

array(['Health Outcomes', 'Health Risk Behaviors', 'Health Status',
       'Prevention'], dtype=object)

In [116]:
# View contributing health metrics (possible features for machine learning)
results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

array(['LPA', 'MHLTH', 'DENTAL', 'BINGE', 'COREW', 'CERVICAL', 'PHLTH',
       'BPMED', 'COLON_SCREEN', 'CHOLSCREEN', 'COREM', 'SLEEP',
       'CSMOKING', 'MAMMOUSE', 'ACCESS2', 'GHLTH', 'CHECKUP'],
      dtype=object)

In [117]:
# View disease outcomes (possible targets for machine learning)
# This analysis will focus on COPD (Chronic Obstructive Pulmonary Disease)
# However, this notebook will make it easy to analyze other diseases
results_df['measureid'].unique()

array(['HIGHCHOL', 'LPA', 'MHLTH', 'DEPRESSION', 'STROKE', 'CASTHMA',
       'BPHIGH', 'DIABETES', 'DENTAL', 'BINGE', 'COPD', 'COREW',
       'CERVICAL', 'CHD', 'PHLTH', 'BPMED', 'COLON_SCREEN', 'KIDNEY',
       'CHOLSCREEN', 'COREM', 'SLEEP', 'CSMOKING', 'MAMMOUSE',
       'ARTHRITIS', 'ACCESS2', 'TEETHLOST', 'CANCER', 'GHLTH', 'CHECKUP',
       'OBESITY'], dtype=object)

# Prepare Dataframe

In [118]:
# Create a list of columns from results_df we want to convert from object to numeric
# This will allow us to perform calculations on the data
numeric_list = ['year', 'data_value', 'low_confidence_limit', 'high_confidence_limit', 'totalpopulation']

# Convert columns in numeric_list to numeric data type in results_df
results_df[numeric_list] = results_df[numeric_list].apply(pd.to_numeric, errors='coerce')

# View data types for results_df
results_df.dtypes

year                             int64
stateabbr                       object
statedesc                       object
locationname                    object
datasource                      object
category                        object
measure                         object
data_value_unit                 object
data_value_type                 object
data_value                     float64
low_confidence_limit           float64
high_confidence_limit          float64
totalpopulation                  int64
locationid                      object
categoryid                      object
measureid                       object
datavaluetypeid                 object
short_question_text             object
geolocation                     object
:@computed_region_bxsw_vy29     object
:@computed_region_he4y_prf8     object
dtype: object

In [119]:
# Create a list of the health metrics which will become our features and make them into dataframes
# And print the feature and the measure of each dataframe as a key to more easily understand the data

# Add the health outcomes to a list
features_list = results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

# Create unique dataframes for each feature in the features_list
for feature in features_list:
    globals()[feature + "_df"] = results_df.loc[results_df['measureid'] == feature]

# Create a list that contains the names of the dataframes made in the step above
dataframes_list = [feature + "_df" for feature in features_list]

# Print the feature and the measure of each dataframe
for dataframe in dataframes_list:
    print(f"{dataframe} - {globals()[dataframe]['measure'].unique()}")

LPA_df - ['No leisure-time physical activity among adults aged >=18 years']
MHLTH_df - ['Mental health not good for >=14 days among adults aged >=18 years']
DENTAL_df - ['Visits to dentist or dental clinic among adults aged >=18 years']
BINGE_df - ['Binge drinking among adults aged >=18 years']
COREW_df - ['Older adult women aged >=65 years who are up to date on a core set of clinical preventive services: Flu shot past year, PPV shot ever, Colorectal cancer screening, and Mammogram past 2 years']
CERVICAL_df - ['Cervical cancer screening among adult women aged 21-65 years']
PHLTH_df - ['Physical health not good for >=14 days among adults aged >=18 years']
BPMED_df - ['Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure']
COLON_SCREEN_df - ['Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years']
CHOLSCREEN_df - ['Cholesterol screening among adults aged >=18 years']
COREM_df - ['Older adult men aged >=65 ye

In [120]:
# Create the COPD dataframe by filtering results_df which will be our taget
copd_df = results_df.loc[results_df["measureid"] == "COPD", :]

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
12,2020,WY,Wyoming,Niobrara,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,8.4,7.3,9.5,2275,56027,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-104.4683727...",14,3121
21,2020,WY,Wyoming,Laramie,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,6.3,5.5,7.3,100595,56021,HLTHOUT,COPD,CrdPrv,COPD,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
38,2020,WY,Wyoming,Sheridan,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.7,4.9,6.6,30863,56033,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-106.8812114...",14,3123


In [121]:
# View dataframes list
dataframes_list

['LPA_df',
 'MHLTH_df',
 'DENTAL_df',
 'BINGE_df',
 'COREW_df',
 'CERVICAL_df',
 'PHLTH_df',
 'BPMED_df',
 'COLON_SCREEN_df',
 'CHOLSCREEN_df',
 'COREM_df',
 'SLEEP_df',
 'CSMOKING_df',
 'MAMMOUSE_df',
 'ACCESS2_df',
 'GHLTH_df',
 'CHECKUP_df']

In [122]:
# Add copd_df to dataframes_list
dataframes_list.append("copd_df")

# Filter dataframes to only include datavaluetypeid AgeAdjPrv
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].loc[globals()[dataframe]['datavaluetypeid'] == 'AgeAdjPrv']

# Change the name of data_value to the name of the measureid + data_value for all dataframes
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].rename(columns={'data_value': globals()[dataframe]['measureid'].unique()[0] + '_data_value'})


# Drop unnecessary columns for all dataframes in dataframes_list
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].drop(columns=['statedesc', 'datasource', 'category', 'measure', 
                                                              'data_value_unit', 'locationid', 'categoryid', 
                                                              'measureid', 'data_value_type', 'low_confidence_limit', 
                                                              'high_confidence_limit', 'short_question_text', 'geolocation', 
                                                              'datavaluetypeid', ':@computed_region_bxsw_vy29', 
                                                              ':@computed_region_he4y_prf8'], axis=1)

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation
12,2020,WY,Niobrara,8.4,2275
38,2020,WY,Sheridan,5.7,30863
39,2020,WY,Hot Springs,6.8,4425


In [123]:
# View dataframes list
dataframes_list

['LPA_df',
 'MHLTH_df',
 'DENTAL_df',
 'BINGE_df',
 'COREW_df',
 'CERVICAL_df',
 'PHLTH_df',
 'BPMED_df',
 'COLON_SCREEN_df',
 'CHOLSCREEN_df',
 'COREM_df',
 'SLEEP_df',
 'CSMOKING_df',
 'MAMMOUSE_df',
 'ACCESS2_df',
 'GHLTH_df',
 'CHECKUP_df',
 'copd_df']

In [124]:
# Copy the copd_df dataframe to a new dataframe
ml_df = copd_df.copy()

# Merge the dataframes in dataframes_list to ml_df
for dataframe in dataframes_list:
    ml_df = ml_df.merge(globals()[dataframe], how='left', on=['year', 'totalpopulation', 'stateabbr', 'locationname'])

# Rename the column COPD_data_value_x to COPD_data_value
ml_df = ml_df.rename(columns={'COPD_data_value_x': 'COPD_data_value'})

# Drop the column COPD_data_value_y
ml_df = ml_df.drop(columns=['COPD_data_value_y'], axis=1)

# Display dataframe and view all columns
ml_df.head(3)




Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,LPA_data_value,MHLTH_data_value,DENTAL_data_value,BINGE_data_value,COREW_data_value,CERVICAL_data_value,PHLTH_data_value,BPMED_data_value,COLON_SCREEN_data_value,CHOLSCREEN_data_value,COREM_data_value,SLEEP_data_value,CSMOKING_data_value,MAMMOUSE_data_value,ACCESS2_data_value,GHLTH_data_value,CHECKUP_data_value
0,2020,WY,Niobrara,8.4,2275,28.2,16.0,58.0,16.2,26.7,78.2,11.6,,57.9,,38.4,33.1,22.6,59.4,17.5,15.6,67.9
1,2020,WY,Sheridan,5.7,30863,20.4,13.3,65.3,19.3,32.3,81.9,8.4,,63.4,,50.0,31.9,17.6,69.2,14.7,10.8,64.0
2,2020,WY,Hot Springs,6.8,4425,24.6,14.1,61.8,17.8,28.8,80.7,9.7,,61.5,,41.4,32.3,19.9,60.1,16.2,12.9,62.4


In [125]:
# Get summary statistics for all columns
ml_df.describe()

Unnamed: 0,year,COPD_data_value,totalpopulation,LPA_data_value,MHLTH_data_value,DENTAL_data_value,BINGE_data_value,COREW_data_value,CERVICAL_data_value,PHLTH_data_value,BPMED_data_value,COLON_SCREEN_data_value,CHOLSCREEN_data_value,COREM_data_value,SLEEP_data_value,CSMOKING_data_value,MAMMOUSE_data_value,ACCESS2_data_value,GHLTH_data_value,CHECKUP_data_value
count,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,0.0,3144.0,0.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0
mean,2020.0,7.183938,210220.5,25.709796,15.730184,58.988963,17.834669,36.866349,81.565331,10.963836,,67.734765,,42.759351,34.471947,20.034256,70.444593,16.797328,16.032029,72.762309
std,0.0,1.747671,5918720.0,5.192709,2.040912,7.488897,3.02008,4.477454,2.34434,2.152859,,4.738214,,5.143715,3.63718,4.105682,3.959298,6.606075,4.442595,4.045601
min,2020.0,2.6,87.0,10.2,8.3,33.2,8.2,16.2,68.4,5.6,,49.0,,19.8,23.8,5.8,50.6,5.7,6.5,58.3
25%,2020.0,5.9,10884.25,22.0,14.3,53.6,15.7,33.9,80.1,9.3,,64.8,,39.3,31.9,17.5,67.9,12.1,12.6,69.8
50%,2020.0,7.0,25620.5,25.2,15.7,59.8,17.6,36.8,81.8,10.8,,68.1,,42.7,34.4,19.85,70.7,15.0,15.3,73.2
75%,2020.0,8.2,68297.25,29.2,17.2,64.525,19.7,39.8,83.2,12.4,,71.3,,46.0,36.8,22.6,73.3,19.5,19.0,75.6
max,2020.0,16.5,331449300.0,47.2,23.3,80.7,27.6,53.4,87.7,20.9,,82.1,,60.6,48.4,41.1,83.4,53.3,36.8,82.6


# Machine Learning

In [126]:
# Import dependencies for machine learning
import tensorflow as tf
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [127]:
# View ml_df
ml_df.head(3)

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,LPA_data_value,MHLTH_data_value,DENTAL_data_value,BINGE_data_value,COREW_data_value,CERVICAL_data_value,PHLTH_data_value,BPMED_data_value,COLON_SCREEN_data_value,CHOLSCREEN_data_value,COREM_data_value,SLEEP_data_value,CSMOKING_data_value,MAMMOUSE_data_value,ACCESS2_data_value,GHLTH_data_value,CHECKUP_data_value
0,2020,WY,Niobrara,8.4,2275,28.2,16.0,58.0,16.2,26.7,78.2,11.6,,57.9,,38.4,33.1,22.6,59.4,17.5,15.6,67.9
1,2020,WY,Sheridan,5.7,30863,20.4,13.3,65.3,19.3,32.3,81.9,8.4,,63.4,,50.0,31.9,17.6,69.2,14.7,10.8,64.0
2,2020,WY,Hot Springs,6.8,4425,24.6,14.1,61.8,17.8,28.8,80.7,9.7,,61.5,,41.4,32.3,19.9,60.1,16.2,12.9,62.4


In [128]:
# Check for null values
ml_df.isnull().sum()

year                          0
stateabbr                     0
locationname                  1
COPD_data_value               0
totalpopulation               0
LPA_data_value                0
MHLTH_data_value              0
DENTAL_data_value             0
BINGE_data_value              0
COREW_data_value              0
CERVICAL_data_value           0
PHLTH_data_value              0
BPMED_data_value           3144
COLON_SCREEN_data_value       0
CHOLSCREEN_data_value      3144
COREM_data_value              0
SLEEP_data_value              0
CSMOKING_data_value           0
MAMMOUSE_data_value           0
ACCESS2_data_value            0
GHLTH_data_value              0
CHECKUP_data_value            0
dtype: int64

In [129]:
# Drop BPMED_data_value and CHOLSCREEN_data_value columns
ml_df = ml_df.drop(columns=['BPMED_data_value', 'CHOLSCREEN_data_value'], axis=1)

In [130]:
# View the null value in the locationname column
ml_df.loc[ml_df['locationname'].isnull()]

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,LPA_data_value,MHLTH_data_value,DENTAL_data_value,BINGE_data_value,COREW_data_value,CERVICAL_data_value,PHLTH_data_value,COLON_SCREEN_data_value,COREM_data_value,SLEEP_data_value,CSMOKING_data_value,MAMMOUSE_data_value,ACCESS2_data_value,GHLTH_data_value,CHECKUP_data_value
365,2020,US,,5.6,331449281,22.9,13.9,64.5,16.7,37.4,83.7,9.4,70.6,44.0,33.3,14.6,77.8,13.5,13.7,73.0


In [131]:
# Drop the null value in the locationname column
ml_df = ml_df.dropna()

# Check for null values
ml_df.isnull().sum()

year                       0
stateabbr                  0
locationname               0
COPD_data_value            0
totalpopulation            0
LPA_data_value             0
MHLTH_data_value           0
DENTAL_data_value          0
BINGE_data_value           0
COREW_data_value           0
CERVICAL_data_value        0
PHLTH_data_value           0
COLON_SCREEN_data_value    0
COREM_data_value           0
SLEEP_data_value           0
CSMOKING_data_value        0
MAMMOUSE_data_value        0
ACCESS2_data_value         0
GHLTH_data_value           0
CHECKUP_data_value         0
dtype: int64

In [132]:
# Create label encoder instance
le = LabelEncoder()

# Create a copy of ml_df
ml_df_encoded = ml_df.copy()

# Generate our categorical variable lists
dataset_cat = ml_df_encoded.dtypes[ml_df_encoded.dtypes == "object"].index.tolist()

# View the list of categorical variables
dataset_cat

['stateabbr', 'locationname']

In [133]:
# For loop to encode text columns to numerical values
for textColumn in dataset_cat:
    ml_df_encoded[textColumn] = le.fit_transform(ml_df_encoded[textColumn])

# View ml_df
ml_df_encoded.head(3)

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,LPA_data_value,MHLTH_data_value,DENTAL_data_value,BINGE_data_value,COREW_data_value,CERVICAL_data_value,PHLTH_data_value,COLON_SCREEN_data_value,COREM_data_value,SLEEP_data_value,CSMOKING_data_value,MAMMOUSE_data_value,ACCESS2_data_value,GHLTH_data_value,CHECKUP_data_value
0,2020,50,1179,8.4,2275,28.2,16.0,58.0,16.2,26.7,78.2,11.6,57.9,38.4,33.1,22.6,59.4,17.5,15.6,67.9
1,2020,50,1509,5.7,30863,20.4,13.3,65.3,19.3,32.3,81.9,8.4,63.4,50.0,31.9,17.6,69.2,14.7,10.8,64.0
2,2020,50,779,6.8,4425,24.6,14.1,61.8,17.8,28.8,80.7,9.7,61.5,41.4,32.3,19.9,60.1,16.2,12.9,62.4


In [134]:
# Create list of columns to be our features
feature_cols = [i for i in ml_df_encoded.columns if i not in ('COPD_data_value')]
X = ml_df_encoded[feature_cols]

# Create our target column
y = ml_df_encoded['COPD_data_value']

In [135]:
# Split the data into testing and training datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

In [136]:
# Train the Random Forest Regression model on the training set

# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

In [137]:
# View feature importance
regressor.feature_importances_

array([0.        , 0.00411489, 0.00117561, 0.00187912, 0.00358573,
       0.07826905, 0.00381925, 0.00464962, 0.0014287 , 0.00446476,
       0.11047682, 0.00307309, 0.00164987, 0.00310639, 0.75625874,
       0.00204146, 0.00242726, 0.01326383, 0.00431581])

In [139]:
# Create a dataframe of the feature importance
feature_importance_df = pd.DataFrame(regressor.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

# View the dataframe
feature_importance_df

Unnamed: 0,importance
CSMOKING_data_value,0.756259
PHLTH_data_value,0.110477
MHLTH_data_value,0.078269
GHLTH_data_value,0.013264
BINGE_data_value,0.00465
CERVICAL_data_value,0.004465
CHECKUP_data_value,0.004316
stateabbr,0.004115
DENTAL_data_value,0.003819
LPA_data_value,0.003586
