In [1]:
# Import dependencies
import pandas as pd
from sodapy import Socrata

# Load Data

In [2]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("chronicdata.cdc.gov", None)



In [3]:
# Return results as csv from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("swc5-untb", limit=2000000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# View all columns in dataframes
pd.set_option('display.max_columns', None)  

# Display dataframe
results_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
0,2020,WY,Wyoming,Teton,BRFSS,Health Status,Physical health not good for >=14 days among a...,%,Crude prevalence,7.3,6.4,8.2,23497,56039,HLTHSTAT,PHLTH,CrdPrv,Physical Health,"{'type': 'Point', 'coordinates': [-110.426087,...",14,3126
1,2020,WY,Wyoming,Goshen,BRFSS,Health Status,Fair or poor self-rated health status among ad...,%,Crude prevalence,13.8,12.0,15.8,13235,56015,HLTHSTAT,GHLTH,CrdPrv,General Health,"{'type': 'Point', 'coordinates': [-104.3535403...",14,890
2,2020,WY,Wyoming,Laramie,BRFSS,Prevention,"Fecal occult blood test, sigmoidoscopy, or col...",%,Age-adjusted prevalence,61.6,58.5,64.6,100595,56021,PREVENT,COLON_SCREEN,AgeAdjPrv,Colorectal Cancer Screening,"{'type': 'Point', 'coordinates': [-104.660395,...",14,3119
3,2020,WY,Wyoming,Park,BRFSS,Prevention,Visits to doctor for routine checkup within th...,%,Crude prevalence,71.0,70.0,72.0,29331,56029,PREVENT,CHECKUP,CrdPrv,Annual Checkup,"{'type': 'Point', 'coordinates': [-109.5935975...",14,3122
4,2020,WY,Wyoming,Lincoln,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.9,5.1,6.8,20253,56023,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-110.6829614...",14,3120


# General Data Overview

In [4]:
# View how many years worth of data we have
results_df['year'].unique()

array(['2020', '2019'], dtype=object)

In [5]:
# View null values by year
results_df.isnull().sum()

year                              0
stateabbr                         0
statedesc                         0
locationname                     60
datasource                        0
category                          0
measure                           0
data_value_unit                   0
data_value_type                   0
data_value                        0
low_confidence_limit              0
high_confidence_limit             0
totalpopulation                   0
locationid                        0
categoryid                        0
measureid                         0
datavaluetypeid                   0
short_question_text               0
geolocation                      60
:@computed_region_bxsw_vy29    1192
:@computed_region_he4y_prf8     660
dtype: int64

In [6]:
# Check data types for results_df
results_df.dtypes

year                           object
stateabbr                      object
statedesc                      object
locationname                   object
datasource                     object
category                       object
measure                        object
data_value_unit                object
data_value_type                object
data_value                     object
low_confidence_limit           object
high_confidence_limit          object
totalpopulation                object
locationid                     object
categoryid                     object
measureid                      object
datavaluetypeid                object
short_question_text            object
geolocation                    object
:@computed_region_bxsw_vy29    object
:@computed_region_he4y_prf8    object
dtype: object

In [7]:
# View health categories - (which include metrics we could use as features for machine learning)
results_df['category'].unique()

array(['Health Status', 'Prevention', 'Health Outcomes',
       'Health Risk Behaviors'], dtype=object)

In [8]:
# View contributing health metrics (possible features for machine learning)
results_df.loc[results_df['category'].isin(['Prevention', 'Health Risk Behaviors', 'Health Status'])]['measureid'].unique()

array(['PHLTH', 'GHLTH', 'COLON_SCREEN', 'CHECKUP', 'DENTAL',
       'CHOLSCREEN', 'CSMOKING', 'MHLTH', 'ACCESS2', 'BINGE', 'LPA',
       'SLEEP', 'COREW', 'CERVICAL', 'MAMMOUSE', 'BPMED', 'COREM'],
      dtype=object)

In [9]:
# View disease outcomes (possible targets for machine learning)
# This analysis will focus on COPD (Chronic Obstructive Pulmonary Disease)
# However, this notebook will make it easy to analyze other diseases
results_df['measureid'].unique()

array(['PHLTH', 'GHLTH', 'COLON_SCREEN', 'CHECKUP', 'COPD', 'CASTHMA',
       'TEETHLOST', 'CANCER', 'DENTAL', 'STROKE', 'CHOLSCREEN',
       'CSMOKING', 'MHLTH', 'OBESITY', 'DIABETES', 'ARTHRITIS', 'ACCESS2',
       'BINGE', 'LPA', 'CHD', 'KIDNEY', 'BPHIGH', 'HIGHCHOL', 'SLEEP',
       'COREW', 'CERVICAL', 'MAMMOUSE', 'BPMED', 'DEPRESSION', 'COREM'],
      dtype=object)

# Prepare Dataframe

In [10]:
# Create a list of columns from results_df we want to convert from object to numeric
# This will allow us to perform calculations on the data
numeric_list = ['year', 'data_value', 'low_confidence_limit', 'high_confidence_limit', 'totalpopulation']

# Convert columns in numeric_list to numeric data type in results_df
results_df[numeric_list] = results_df[numeric_list].apply(pd.to_numeric, errors='coerce')

# View data types for results_df
results_df.dtypes

year                             int64
stateabbr                       object
statedesc                       object
locationname                    object
datasource                      object
category                        object
measure                         object
data_value_unit                 object
data_value_type                 object
data_value                     float64
low_confidence_limit           float64
high_confidence_limit          float64
totalpopulation                  int64
locationid                      object
categoryid                      object
measureid                       object
datavaluetypeid                 object
short_question_text             object
geolocation                     object
:@computed_region_bxsw_vy29     object
:@computed_region_he4y_prf8     object
dtype: object