In [2]:
# Import dependencies
import pandas as pd
from sodapy import Socrata

In [3]:
# Build an unauthenticated Socrata client for the CDC chronic-data portal.
# No application token, username or password is supplied, so only public
# data sets can be read.
client = Socrata(domain="chronicdata.cdc.gov", app_token=None)



In [4]:
# Fetch up to 2,000,000 rows of data set "swc5-untb". sodapy requests JSON
# from the API by default and returns the rows as a Python list of
# dictionaries.
results = client.get("swc5-untb", limit=2_000_000)

In [5]:
# Turn the list of row dictionaries into a pandas DataFrame
results_df = pd.DataFrame.from_records(data=results)

In [6]:
# Lift pandas' column cap first so wide frames render every column. This is a
# session-wide display option, so it also applies to previews in later cells.
pd.set_option('display.max_columns', None)

# Preview the first rows. This has to be the cell's last expression:
# Jupyter only renders the value of the final expression in a cell.
results_df.head()

In [7]:
# List the distinct measure identifiers present in results_df
results_df["measureid"].unique()

array(['HIGHCHOL', 'LPA', 'MHLTH', 'DEPRESSION', 'STROKE', 'CASTHMA',
       'BPHIGH', 'DIABETES', 'DENTAL', 'BINGE', 'COPD', 'COREW',
       'CERVICAL', 'CHD', 'PHLTH', 'BPMED', 'COLON_SCREEN', 'KIDNEY',
       'CHOLSCREEN', 'COREM', 'SLEEP', 'CSMOKING', 'MAMMOUSE',
       'ARTHRITIS', 'ACCESS2', 'TEETHLOST', 'CANCER', 'GHLTH', 'CHECKUP',
       'OBESITY'], dtype=object)

In [8]:
# List the distinct categories present in results_df
results_df["category"].unique()

array(['Health Outcomes', 'Health Risk Behaviors', 'Health Status',
       'Prevention'], dtype=object)

In [14]:
# Categories whose measures are treated as health factors in this notebook:
# 'Prevention', 'Health Risk Behaviors' and 'Health Status'.
factor_categories = ['Prevention', 'Health Risk Behaviors', 'Health Status']

# Collect the distinct measureid values belonging to those categories.
# The filter is evaluated once, and a single .loc[rows, column] lookup
# replaces the chained .loc[rows]['measureid'], so the whole filtered frame
# is not built just to read one column.
# Note: .unique() returns a NumPy array, not a Python list.
health_factors = results_df.loc[
    results_df['category'].isin(factor_categories), 'measureid'
].unique()

# View the collected measure ids
health_factors

array(['LPA', 'MHLTH', 'DENTAL', 'BINGE', 'COREW', 'CERVICAL', 'PHLTH',
       'BPMED', 'COLON_SCREEN', 'CHOLSCREEN', 'COREM', 'SLEEP',
       'CSMOKING', 'MAMMOUSE', 'ACCESS2', 'GHLTH', 'CHECKUP'],
      dtype=object)

In [9]:
# List the distinct years present in results_df
results_df["year"].unique()

array(['2019', '2020'], dtype=object)

In [10]:
# Build the COPD frame in one chain: first keep the rows of results_df whose
# measureid is "COPD", then keep only those whose data_value_type is
# "Age-adjusted prevalence".
copd_df = (
    results_df
    .loc[results_df["measureid"] == "COPD", :]
    .loc[lambda frame: frame["data_value_type"] == "Age-adjusted prevalence", :]
)

# Preview the first three rows
copd_df.head(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
12,2020,WY,Wyoming,Niobrara,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,8.4,7.3,9.5,2275,56027,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-104.4683727...",14,3121
38,2020,WY,Wyoming,Sheridan,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,5.7,4.9,6.6,30863,56033,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-106.8812114...",14,3123
39,2020,WY,Wyoming,Hot Springs,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.8,5.9,7.9,4425,56017,HLTHOUT,COPD,AgeAdjPrv,COPD,"{'type': 'Point', 'coordinates': [-108.4350765...",14,3117


In [11]:
# Build the smokers frame in one chain: first keep the rows of results_df
# whose measureid is "CSMOKING", then keep only those whose data_value_type
# is "Age-adjusted prevalence".
smokers_df = (
    results_df
    .loc[results_df["measureid"] == "CSMOKING", :]
    .loc[lambda frame: frame["data_value_type"] == "Age-adjusted prevalence", :]
)

# Preview the first three rows
smokers_df.head(3)

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,locationid,categoryid,measureid,datavaluetypeid,short_question_text,geolocation,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
81,2020,WY,Wyoming,Park,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,15.9,13.0,19.0,29331,56029,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,"{'type': 'Point', 'coordinates': [-109.5935975...",14,3122
141,2020,WY,Wyoming,Fremont,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,22.0,19.1,24.8,39317,56013,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,"{'type': 'Point', 'coordinates': [-108.6089349...",14,3084
156,2020,WY,Wyoming,Sheridan,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,17.6,14.4,20.8,30863,56033,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,"{'type': 'Point', 'coordinates': [-106.8812114...",14,3123


In [12]:
# NOTE(review): this whole cell is commented out, so no columns are dropped.
# Before re-enabling it, check the names against the actual columns: the COPD
# list ends with an empty-string name, and the smokers list uses CamelCase
# names (e.g. "Year", "DataSource") whereas the frame previews in this
# notebook show lowercase columns ("year", "datasource"). The smokers list
# also contains "Data_Value_Footnote" twice. DataFrame.drop raises KeyError
# by default for labels that are not present.
# # Remove uninformative columns
# copd_df = copd_df.drop(["datasource", "category", ""], axis=1)

# # smokers_df = smokers_df.drop(["Data_Value_Footnote", "Year", "DataSource",
# #                            "Data_Value_Footnote_Symbol", "Data_Value_Footnote",
# #                            "CategoryID", "MeasureId", "DataValueTypeID",
# #                            "Short_Question_Text", "Category", "LocationName",
# #                            "StateAbbr", "StateDesc"], axis=1)