In [55]:
# Import dependencies
import pandas as pd
from sodapy import Socrata

# Load Data

In [56]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)



In [57]:
# Return results as csv from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("swc5-untb", limit=2000000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# View all columns in dataframes
pd.set_option('display.max_columns', None)  

# Display dataframe
results_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
0,2019,WY,Wyoming,Washakie,BRFSS,Health Outcomes,High cholesterol among adults aged >=18 years ...,%,Crude prevalence,32.1,31.2,33.1,7760,56043,HLTHOUT,HIGHCHOL,CrdPrv,High Cholesterol,"{'type': 'Point', 'coordinates': [-107.669052,...",14,3127
1,2020,WY,Wyoming,Laramie,BRFSS,Health Risk Behaviors,No leisure-time physical activity among adults...,%,Crude prevalence,23.8,21.5,26.2,100595,56021,RISKBEH,LPA,CrdPrv,Physical Inactivity,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
2,2020,WY,Wyoming,Carbon,BRFSS,Health Status,Mental health not good for >=14 days among adu...,%,Crude prevalence,12.9,12.1,13.7,14711,56007,HLTHSTAT,MHLTH,CrdPrv,Mental Health,"{'type': 'Point', 'coordinates': [-106.9331526...",14,3081
3,2020,WY,Wyoming,Sublette,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,Crude prevalence,16.0,15.0,17.1,9856,56035,HLTHOUT,DEPRESSION,CrdPrv,Depression,"{'type': 'Point', 'coordinates': [-109.9161701...",14,3124
4,2020,WY,Wyoming,Uinta,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,3.0,2.7,3.4,20215,56041,HLTHOUT,STROKE,CrdPrv,Stroke,"{'type': 'Point', 'coordinates': [-110.5589468...",14,893


# General Data Overview

In [58]:
# View how many years worth of data we have
results_df['year'].unique()

array(['2019', '2020'], dtype=object)

In [59]:
# View null values by year
results_df.isnull().sum()

year                              0
stateabbr                         0
statedesc                         0
locationname                     60
datasource                        0
category                          0
measure                           0
data_value_unit                   0
data_value_type                   0
data_value                        0
low_confidence_limit              0
high_confidence_limit             0
totalpopulation                   0
locationid                        0
categoryid                        0
measureid                         0
datavaluetypeid                   0
short_question_text               0
geolocation                      60
:@computed_region_bxsw_vy29    1192
:@computed_region_he4y_prf8     660
dtype: int64

In [60]:
# View health categories - (which include metrics we could use as features for machine learning)
results_df['category'].unique()

array(['Health Outcomes', 'Health Risk Behaviors', 'Health Status',
       'Prevention'], dtype=object)

In [61]:
# View contributing health metrics (possible features for machine learning)
results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

array(['LPA', 'MHLTH', 'DENTAL', 'BINGE', 'COREW', 'CERVICAL', 'PHLTH',
       'BPMED', 'COLON_SCREEN', 'CHOLSCREEN', 'COREM', 'SLEEP',
       'CSMOKING', 'MAMMOUSE', 'ACCESS2', 'GHLTH', 'CHECKUP'],
      dtype=object)

In [62]:
# View disease outcomes (possible targets for machine learning)
# This analysis will focus on COPD (Chronic Obstructive Pulmonary Disease)
# However, this notebook will make it easy to analyze other diseases
results_df['measureid'].unique()

array(['HIGHCHOL', 'LPA', 'MHLTH', 'DEPRESSION', 'STROKE', 'CASTHMA',
       'BPHIGH', 'DIABETES', 'DENTAL', 'BINGE', 'COPD', 'COREW',
       'CERVICAL', 'CHD', 'PHLTH', 'BPMED', 'COLON_SCREEN', 'KIDNEY',
       'CHOLSCREEN', 'COREM', 'SLEEP', 'CSMOKING', 'MAMMOUSE',
       'ARTHRITIS', 'ACCESS2', 'TEETHLOST', 'CANCER', 'GHLTH', 'CHECKUP',
       'OBESITY'], dtype=object)

# Prepare Dataframe

In [63]:
# Create a list of the health metrics which will become our features and make them into dataframes
# And print the feature and the measure of each dataframe as a key to more easily understand the data

# Add the health outcomes to a list
features_list = results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

# Create unique dataframes for each feature in the features_list
for feature in features_list:
    globals()[feature + "_df"] = results_df.loc[results_df['measureid'] == feature]

# Create a list that contains the names of the dataframes made in the step above
dataframes_list = [feature + "_df" for feature in features_list]

# Print the feature and the measure of each dataframe
for dataframe in dataframes_list:
    print(f"{dataframe} - {globals()[dataframe]['measure'].unique()}")

LPA_df - ['No leisure-time physical activity among adults aged >=18 years']
MHLTH_df - ['Mental health not good for >=14 days among adults aged >=18 years']
DENTAL_df - ['Visits to dentist or dental clinic among adults aged >=18 years']
BINGE_df - ['Binge drinking among adults aged >=18 years']
COREW_df - ['Older adult women aged >=65 years who are up to date on a core set of clinical preventive services: Flu shot past year, PPV shot ever, Colorectal cancer screening, and Mammogram past 2 years']
CERVICAL_df - ['Cervical cancer screening among adult women aged 21-65 years']
PHLTH_df - ['Physical health not good for >=14 days among adults aged >=18 years']
BPMED_df - ['Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure']
COLON_SCREEN_df - ['Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years']
CHOLSCREEN_df - ['Cholesterol screening among adults aged >=18 years']
COREM_df - ['Older adult men aged >=65 ye

In [64]:
# Create the COPD dataframe by filtering results_df which will be our taget
copd_df = results_df.loc[results_df["measureid"] == "COPD", :]

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
12,2020,WY,Wyoming,Niobrara,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,8.4,7.3,9.5,2275,56027,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-104.4683727...",14,3121
21,2020,WY,Wyoming,Laramie,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,6.3,5.5,7.3,100595,56021,HLTHOUT,COPD,CrdPrv,COPD,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
38,2020,WY,Wyoming,Sheridan,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.7,4.9,6.6,30863,56033,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-106.8812114...",14,3123


In [65]:
# View dataframes list
dataframes_list

['LPA_df',
 'MHLTH_df',
 'DENTAL_df',
 'BINGE_df',
 'COREW_df',
 'CERVICAL_df',
 'PHLTH_df',
 'BPMED_df',
 'COLON_SCREEN_df',
 'CHOLSCREEN_df',
 'COREM_df',
 'SLEEP_df',
 'CSMOKING_df',
 'MAMMOUSE_df',
 'ACCESS2_df',
 'GHLTH_df',
 'CHECKUP_df']

In [66]:
# Add copd_df to dataframes_list
dataframes_list.append("copd_df")

# Filter dataframes to only include datavaluetypeid AgeAdjPrv
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].loc[globals()[dataframe]['datavaluetypeid'] == 'AgeAdjPrv']

# Change the name of data_value to the name of the measureid + data_value for all dataframes
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].rename(columns={'data_value': globals()[dataframe]['measureid'].unique()[0] + '_data_value'})


# Drop unnecessary columns for all dataframes in dataframes_list
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].drop(columns=['statedesc', 'datasource', 'category', 'measure', 
                                                              'data_value_unit', 'locationid', 'categoryid', 
                                                              'measureid', 'data_value_type', 'low_confidence_limit', 
                                                              'high_confidence_limit', 'short_question_text', 'geolocation', 
                                                              'datavaluetypeid', ':@computed_region_bxsw_vy29', 
                                                              ':@computed_region_he4y_prf8'], axis=1)

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation
12,2020,WY,Niobrara,8.4,2275
38,2020,WY,Sheridan,5.7,30863
39,2020,WY,Hot Springs,6.8,4425


In [68]:
# View dataframes list
dataframes_list

['LPA_df',
 'MHLTH_df',
 'DENTAL_df',
 'BINGE_df',
 'COREW_df',
 'CERVICAL_df',
 'PHLTH_df',
 'BPMED_df',
 'COLON_SCREEN_df',
 'CHOLSCREEN_df',
 'COREM_df',
 'SLEEP_df',
 'CSMOKING_df',
 'MAMMOUSE_df',
 'ACCESS2_df',
 'GHLTH_df',
 'CHECKUP_df']

In [67]:
# Remove copd_df from dataframes_list
dataframes_list.remove("copd_df")

# Merge copd_df with all dataframes in dataframes_list to create the ml_df
for dataframe in dataframes_list:
    ml_df = copd_df.merge(globals()[dataframe], on=['year', 'stateabbr', 'locationname', 'totalpopulation'])

# Display dataframe and view all columns
ml_df.head(3)




Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,CHECKUP_data_value
0,2020,WY,Niobrara,8.4,2275,67.9
1,2020,WY,Sheridan,5.7,30863,64.0
2,2020,WY,Hot Springs,6.8,4425,62.4
