In [147]:
# Import dependencies
import pandas as pd
from sodapy import Socrata

# Load Data

In [148]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)



In [149]:
# Return results as csv from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("swc5-untb", limit=2000000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# View all columns in dataframes
pd.set_option('display.max_columns', None)  

# Display dataframe
results_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
0,2020,WY,Wyoming,Teton,BRFSS,Health Status,Physical health not good for >=14 days among a...,%,Crude prevalence,7.3,6.4,8.2,23497,56039,HLTHSTAT,PHLTH,CrdPrv,Physical Health,"{'type': 'Point', 'coordinates': [-110.426087,...",14,3126
1,2020,WY,Wyoming,Goshen,BRFSS,Health Status,Fair or poor self-rated health status among ad...,%,Crude prevalence,13.8,12.0,15.8,13235,56015,HLTHSTAT,GHLTH,CrdPrv,General Health,"{'type': 'Point', 'coordinates': [-104.3535403...",14,890
2,2020,WY,Wyoming,Laramie,BRFSS,Prevention,"Fecal occult blood test, sigmoidoscopy, or col...",%,Age-adjusted prevalence,61.6,58.5,64.6,100595,56021,PREVENT,COLON_SCREEN,AgeAdjPrv,Colorectal Cancer Screening,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
3,2020,WY,Wyoming,Park,BRFSS,Prevention,Visits to doctor for routine checkup within th...,%,Crude prevalence,71.0,70.0,72.0,29331,56029,PREVENT,CHECKUP,CrdPrv,Annual Checkup,"{'type': 'Point', 'coordinates': [-109.5935975...",14,3122
4,2020,WY,Wyoming,Lincoln,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.9,5.1,6.8,20253,56023,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-110.6829614...",14,3120


# General Data Overview

In [150]:
# View how many years worth of data we have
results_df['year'].unique()

array(['2020', '2019'], dtype=object)

In [151]:
# View null values by year
results_df.isnull().sum()

year                              0
stateabbr                         0
statedesc                         0
locationname                     60
datasource                        0
category                          0
measure                           0
data_value_unit                   0
data_value_type                   0
data_value                        0
low_confidence_limit              0
high_confidence_limit             0
totalpopulation                   0
locationid                        0
categoryid                        0
measureid                         0
datavaluetypeid                   0
short_question_text               0
geolocation                      60
:@computed_region_bxsw_vy29    1192
:@computed_region_he4y_prf8     660
dtype: int64

In [152]:
# View health categories - (which include metrics we could use as features for machine learning)
results_df['category'].unique()

array(['Health Status', 'Prevention', 'Health Outcomes',
       'Health Risk Behaviors'], dtype=object)

In [153]:
# View contributing health metrics (possible features for machine learning)
results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

array(['PHLTH', 'GHLTH', 'COLON_SCREEN', 'CHECKUP', 'DENTAL',
       'CHOLSCREEN', 'CSMOKING', 'MHLTH', 'ACCESS2', 'BINGE', 'LPA',
       'SLEEP', 'COREW', 'CERVICAL', 'MAMMOUSE', 'BPMED', 'COREM'],
      dtype=object)

In [154]:
# View disease outcomes (possible targets for machine learning)
# This analysis will focus on COPD (Chronic Obstructive Pulmonary Disease)
# However, this notebook will make it easy to analyze other diseases
results_df['measureid'].unique()

array(['PHLTH', 'GHLTH', 'COLON_SCREEN', 'CHECKUP', 'COPD', 'CASTHMA',
       'TEETHLOST', 'CANCER', 'DENTAL', 'STROKE', 'CHOLSCREEN',
       'CSMOKING', 'MHLTH', 'OBESITY', 'DIABETES', 'ARTHRITIS', 'ACCESS2',
       'BINGE', 'LPA', 'CHD', 'KIDNEY', 'BPHIGH', 'HIGHCHOL', 'SLEEP',
       'COREW', 'CERVICAL', 'MAMMOUSE', 'BPMED', 'DEPRESSION', 'COREM'],
      dtype=object)

# Prepare Dataframe

In [155]:
# Create a list of the health metrics which will become our features and make them into dataframes
# And print the feature and the measure of each dataframe as a key to more easily understand the data

# Add the health outcomes to a list
features_list = results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

# Create unique dataframes for each feature in the features_list
for feature in features_list:
    globals()[feature + "_df"] = results_df.loc[results_df['measureid'] == feature]

# Create a list that contains the names of the dataframes made in the step above
dataframes_list = [feature + "_df" for feature in features_list]

# Print the feature and the measure of each dataframe
for dataframe in dataframes_list:
    print(f"{dataframe} - {globals()[dataframe]['measure'].unique()}")

PHLTH_df - ['Physical health not good for >=14 days among adults aged >=18 years']
GHLTH_df - ['Fair or poor self-rated health status among adults aged >=18 years']
COLON_SCREEN_df - ['Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years']
CHECKUP_df - ['Visits to doctor for routine checkup within the past year among adults aged >=18 years']
DENTAL_df - ['Visits to dentist or dental clinic among adults aged >=18 years']
CHOLSCREEN_df - ['Cholesterol screening among adults aged >=18 years']
CSMOKING_df - ['Current smoking among adults aged >=18 years']
MHLTH_df - ['Mental health not good for >=14 days among adults aged >=18 years']
ACCESS2_df - ['Current lack of health insurance among adults aged 18-64 years']
BINGE_df - ['Binge drinking among adults aged >=18 years']
LPA_df - ['No leisure-time physical activity among adults aged >=18 years']
SLEEP_df - ['Sleeping less than 7 hours among adults aged >=18 years']
COREW_df - ['Older adult women aged >=65 ye

In [156]:
# Create the COPD dataframe by filtering results_df which will be our taget
copd_df = results_df.loc[results_df["measureid"] == "COPD", :]

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
4,2020,WY,Wyoming,Lincoln,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.9,5.1,6.8,20253,56023,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-110.6829614...",14,3120
15,2020,WY,Wyoming,Crook,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.1,5.1,7.0,7593,56011,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-104.5672975...",14,3083
20,2020,WY,Wyoming,Platte,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.3,5.4,7.1,8578,56031,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-104.9539626...",14,892


In [157]:
# View dataframes list
dataframes_list

['PHLTH_df',
 'GHLTH_df',
 'COLON_SCREEN_df',
 'CHECKUP_df',
 'DENTAL_df',
 'CHOLSCREEN_df',
 'CSMOKING_df',
 'MHLTH_df',
 'ACCESS2_df',
 'BINGE_df',
 'LPA_df',
 'SLEEP_df',
 'COREW_df',
 'CERVICAL_df',
 'MAMMOUSE_df',
 'BPMED_df',
 'COREM_df']

In [158]:
# Add copd_df to dataframes_list
dataframes_list.append("copd_df")

# Filter dataframes to only include datavaluetypeid AgeAdjPrv
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].loc[globals()[dataframe]['datavaluetypeid'] == 'AgeAdjPrv']

# Change the name of data_value to the name of the measureid + data_value for all dataframes
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].rename(columns={'data_value': globals()[dataframe]['measureid'].unique()[0] + '_data_value'})


# Drop unnecessary columns for all dataframes in dataframes_list
for dataframe in dataframes_list:
    globals()[dataframe] = globals()[dataframe].drop(columns=['statedesc', 'datasource', 'category', 'measure', 
                                                              'data_value_unit', 'locationid', 'categoryid', 
                                                              'measureid', 'data_value_type', 'low_confidence_limit', 
                                                              'high_confidence_limit', 'short_question_text', 'geolocation', 
                                                              'datavaluetypeid', ':@computed_region_bxsw_vy29', 
                                                              ':@computed_region_he4y_prf8'], axis=1)

# Display dataframe and view all columns
copd_df.head(3)

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation
4,2020,WY,Lincoln,5.9,20253
15,2020,WY,Crook,6.1,7593
20,2020,WY,Platte,6.3,8578


In [159]:
# View dataframes list
dataframes_list

['PHLTH_df',
 'GHLTH_df',
 'COLON_SCREEN_df',
 'CHECKUP_df',
 'DENTAL_df',
 'CHOLSCREEN_df',
 'CSMOKING_df',
 'MHLTH_df',
 'ACCESS2_df',
 'BINGE_df',
 'LPA_df',
 'SLEEP_df',
 'COREW_df',
 'CERVICAL_df',
 'MAMMOUSE_df',
 'BPMED_df',
 'COREM_df',
 'copd_df']

In [161]:
# Copy the copd_df dataframe to a new dataframe
ml_df = copd_df.copy()

# Merge the dataframes in dataframes_list to ml_df
for dataframe in dataframes_list:
    ml_df = ml_df.merge(globals()[dataframe], how='left', on=['year', 'totalpopulation', 'stateabbr', 'locationname'])

# Rename the column COPD_data_value_x to COPD_data_value
ml_df = ml_df.rename(columns={'COPD_data_value_x': 'COPD_data_value'})

# Drop the column COPD_data_value_y
ml_df = ml_df.drop(columns=['COPD_data_value_y'], axis=1)

# Display dataframe and view all columns
ml_df.head(3)




Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,PHLTH_data_value,GHLTH_data_value,COLON_SCREEN_data_value,CHECKUP_data_value,DENTAL_data_value,CHOLSCREEN_data_value,CSMOKING_data_value,MHLTH_data_value,ACCESS2_data_value,BINGE_data_value,LPA_data_value,SLEEP_data_value,COREW_data_value,CERVICAL_data_value,MAMMOUSE_data_value,BPMED_data_value,COREM_data_value
0,2020,WY,Lincoln,5.9,20253,8.6,11.0,64.0,65.3,70.7,,17.9,13.4,16.0,18.9,22.7,31.6,28.0,81.3,64.2,,34.4
1,2020,WY,Crook,6.1,7593,8.7,11.0,62.4,65.0,66.3,,17.7,13.6,15.6,18.4,21.7,30.7,33.9,82.0,67.1,,41.3
2,2020,WY,Platte,6.3,8578,9.3,12.1,58.7,63.9,66.8,,19.0,13.7,17.5,18.7,23.8,31.4,28.0,81.0,61.9,,42.9


In [162]:
# Get summary statistics for all columns
ml_df.describe()

Unnamed: 0,year,stateabbr,locationname,COPD_data_value,totalpopulation,PHLTH_data_value,GHLTH_data_value,COLON_SCREEN_data_value,CHECKUP_data_value,DENTAL_data_value,CHOLSCREEN_data_value,CSMOKING_data_value,MHLTH_data_value,ACCESS2_data_value,BINGE_data_value,LPA_data_value,SLEEP_data_value,COREW_data_value,CERVICAL_data_value,MAMMOUSE_data_value,BPMED_data_value,COREM_data_value
count,3144,3144,3143,3144.0,3144,3144.0,3144.0,3144.0,3144.0,3144.0,0.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,3144.0,0.0,3144.0
unique,1,52,1840,113.0,3068,130.0,228.0,256.0,209.0,371.0,0.0,238.0,120.0,324.0,170.0,274.0,205.0,252.0,145.0,223.0,0.0,292.0
top,2020,TX,Washington,6.1,2986,9.3,12.7,67.4,75.0,59.9,,19.1,14.4,12.1,15.9,21.9,35.3,37.3,82.1,71.6,,42.8
freq,3144,254,31,92.0,3,76.0,43.0,36.0,59.0,28.0,,42.0,70.0,44.0,52.0,38.0,47.0,43.0,70.0,45.0,,38.0
