# Processing PLACES_2019 for ML and DataViz

In [1]:
# Dependencies
import pandas as pd

In [2]:
# Data for 2019, taken from the 2021 release of PLACES
# https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/swc5-untb/data

# Lift the column cap so every column is shown when a frame is displayed
pd.options.display.max_columns = None

# Read the PLACES extract filtered for COPD
file_path = "Resources/PLACES_COPD.csv"
copd_df = pd.read_csv(file_path)

# Preview the first rows
copd_df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
0,2019,IA,Iowa,Cass,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,,,7.0,9.8,12836,19029,HLTHOUT,COPD,CrdPrv,COPD,POINT (-94.92791359 41.33146007)
1,2019,IA,Iowa,Monona,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,,,7.0,9.5,8615,19133,HLTHOUT,COPD,CrdPrv,COPD,POINT (-95.95997093 42.05236881)
2,2019,AZ,Arizona,Graham,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.6,,,6.0,7.3,38837,4009,HLTHOUT,COPD,AgeAdjPrv,COPD,POINT (-109.8871087 32.93306592)
3,2019,AK,Alaska,Prince of Wales-Hyder,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,7.9,,,7.2,8.6,6203,2198,HLTHOUT,COPD,AgeAdjPrv,COPD,POINT (-132.7096152 55.50495667)
4,2019,AK,Alaska,Dillingham,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.9,,,7.7,10.0,4916,2070,HLTHOUT,COPD,CrdPrv,COPD,POINT (-158.2125392 59.79999618)


In [3]:
# Data for 2019, taken from the 2021 release of PLACES
# https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/swc5-untb/data

# Lift the column cap so every column is shown when a frame is displayed
pd.options.display.max_columns = None

# Read the PLACES extract filtered for current smokers
file_path = "Resources/PLACES_smokers.csv"
smokers_df = pd.read_csv(file_path)

# Preview the first rows
smokers_df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
0,2019,CA,California,Sierra,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,13.8,,,11.9,15.6,3005,6091,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-120.5160862 39.58014381)
1,2019,CO,Colorado,Pitkin,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,11.0,,,8.7,13.4,17767,8097,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,POINT (-106.9159741 39.21744595)
2,2019,CA,California,San Mateo,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,9.2,,,7.8,10.5,766573,6081,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-122.3273639 37.42322968)
3,2019,AL,Alabama,Dale,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,22.0,,,19.4,24.4,49172,1045,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,POINT (-85.61159073 31.43150889)
4,2019,AR,Arkansas,St. Francis,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,24.6,,,22.2,26.8,24994,5123,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-90.74737815 35.02173053)


In [4]:
# Print the (rows, columns) shape of each frame, COPD first
for frame in (copd_df, smokers_df):
    print(frame.shape)

(6244, 21)
(6244, 21)


In [5]:
# Print the distinct values of selected COPD columns, one line per column
inspected_columns = [
    "Data_Value_Footnote",
    "Year",
    "DataSource",
    "Data_Value_Footnote_Symbol",
    "Data_Value_Footnote",  # listed twice, so its values are printed twice
    "CategoryID",
    "MeasureId",
    "DataValueTypeID",
    "Short_Question_Text",
]
for column in inspected_columns:
    print(copd_df[column].unique())

[nan]
[2019]
['BRFSS']
[nan]
[nan]
['HLTHOUT']
['COPD']
['CrdPrv' 'AgeAdjPrv']
['COPD']


In [6]:
# Display the column labels of the COPD frame
copd_df.columns

Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'LocationID', 'CategoryID', 'MeasureId', 'DataValueTypeID',
       'Short_Question_Text', 'Geolocation'],
      dtype='object')

In [7]:
# Add a "State_County" column to both frames, formatted "<LocationName>, <StateDesc>"
# (county name followed by the full state name)
for frame in (copd_df, smokers_df):
    frame["State_County"] = frame["LocationName"].astype(str) + ", " + frame["StateDesc"]

In [8]:
# Columns removed from both frames
uninformative_columns = [
    "Data_Value_Footnote", "Year", "DataSource", "Data_Value_Footnote_Symbol",
    "CategoryID", "MeasureId", "DataValueTypeID", "Short_Question_Text",
    "Category", "StateAbbr",
]

# The COPD frame keeps "LocationName" and "StateDesc"
copd_df = copd_df.drop(columns=uninformative_columns)

# The smokers frame additionally drops "LocationName" and "StateDesc"
smokers_df = smokers_df.drop(columns=uninformative_columns + ["LocationName", "StateDesc"])

In [9]:
# Keep only the rows whose Data_Value_Type is not "Age-adjusted prevalence"
excluded_type = "Age-adjusted prevalence"
copd_df = copd_df.loc[copd_df["Data_Value_Type"].ne(excluded_type)]
smokers_df = smokers_df.loc[smokers_df["Data_Value_Type"].ne(excluded_type)]

In [10]:
# Display the first two rows of the COPD frame after the column drop and row filter
copd_df.head(2)

Unnamed: 0,StateDesc,LocationName,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,Geolocation,State_County
0,Iowa,Cass,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,7.0,9.8,12836,19029,POINT (-94.92791359 41.33146007),"Cass, Iowa"
1,Iowa,Monona,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,7.0,9.5,8615,19133,POINT (-95.95997093 42.05236881),"Monona, Iowa"


In [11]:
# Display the first two rows of the smokers frame after the column drop and row filter
smokers_df.head(2)

Unnamed: 0,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,Geolocation,State_County
0,Current smoking among adults aged >=18 years,%,Crude prevalence,13.8,11.9,15.6,3005,6091,POINT (-120.5160862 39.58014381),"Sierra, California"
2,Current smoking among adults aged >=18 years,%,Crude prevalence,9.2,7.8,10.5,766573,6081,POINT (-122.3273639 37.42322968),"San Mateo, California"


In [12]:
# Join the COPD frame (left) with the smokers frame (right) on "State_County".
# With the pandas defaults this is an inner join, and columns present in both
# frames get the suffixes "_x" (COPD) and "_y" (smokers).
# NOTE(review): "LocationID" exists in both frames and may be a safer join key
# if the "State_County" labels are not unique — confirm.
merged_df = copd_df.merge(smokers_df, on="State_County")
merged_df.head(2)

Unnamed: 0,StateDesc,LocationName,Measure_x,Data_Value_Unit_x,Data_Value_Type_x,Data_Value_x,Low_Confidence_Limit_x,High_Confidence_Limit_x,TotalPopulation_x,LocationID_x,Geolocation_x,State_County,Measure_y,Data_Value_Unit_y,Data_Value_Type_y,Data_Value_y,Low_Confidence_Limit_y,High_Confidence_Limit_y,TotalPopulation_y,LocationID_y,Geolocation_y
0,Iowa,Cass,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,7.0,9.8,12836,19029,POINT (-94.92791359 41.33146007),"Cass, Iowa",Current smoking among adults aged >=18 years,%,Crude prevalence,19.3,16.4,22.1,12836,19029,POINT (-94.92791359 41.33146007)
1,Iowa,Monona,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,7.0,9.5,8615,19133,POINT (-95.95997093 42.05236881),"Monona, Iowa",Current smoking among adults aged >=18 years,%,Crude prevalence,18.5,15.8,20.9,8615,19133,POINT (-95.95997093 42.05236881)


In [13]:
# Remove the merge columns that are not needed further on; the two Data_Value
# columns and the "_y" population, location id and geolocation columns remain
redundant_columns = [
    "Measure_x", "Data_Value_Unit_x", "Data_Value_Type_x",
    "Low_Confidence_Limit_x", "High_Confidence_Limit_x",
    "TotalPopulation_x", "LocationID_x", "Geolocation_x",
    "Measure_y", "Data_Value_Unit_y", "Data_Value_Type_y",
    "Low_Confidence_Limit_y", "High_Confidence_Limit_y",
]
merged_df = merged_df.drop(columns=redundant_columns)

In [14]:
# Display the first rows of merged_df to confirm the columns were dropped
merged_df.head()

Unnamed: 0,StateDesc,LocationName,Data_Value_x,State_County,Data_Value_y,TotalPopulation_y,LocationID_y,Geolocation_y
0,Iowa,Cass,8.3,"Cass, Iowa",19.3,12836,19029,POINT (-94.92791359 41.33146007)
1,Iowa,Monona,8.3,"Monona, Iowa",18.5,8615,19133,POINT (-95.95997093 42.05236881)
2,Alaska,Dillingham,8.9,"Dillingham, Alaska",31.1,4916,2070,POINT (-158.2125392 59.79999618)
3,Colorado,Custer,7.5,"Custer, Colorado",13.8,5068,8027,POINT (-105.3669993 38.10865771)
4,Alaska,Ketchikan Gateway,6.6,"Ketchikan Gateway, Alaska",19.4,13901,2130,POINT (-130.9352394 55.58863162)


In [15]:
# Order the merged frame by COPD value ("Data_Value_x"), highest first,
# and keep the leading five rows
# NOTE(review): the variable is named top10_df but only 5 rows are kept — confirm the intended count
copd_ranked = merged_df.sort_values(by=["Data_Value_x"], ascending=False)
top10_df = copd_ranked.head(5)
top10_df

Unnamed: 0,StateDesc,LocationName,Data_Value_x,State_County,Data_Value_y,TotalPopulation_y,LocationID_y,Geolocation_y
3020,West Virginia,McDowell,18.2,"McDowell, West Virginia",31.9,17624,54047,POINT (-81.65398301 37.37831053)
1179,Kentucky,Bell,17.4,"Bell, Kentucky",32.3,26032,21013,POINT (-83.67371026 36.7310214)
1097,Kentucky,Lee,17.4,"Lee, Kentucky",35.4,7403,21129,POINT (-83.71567753 37.59439368)
997,Kentucky,Wolfe,17.3,"Wolfe, Kentucky",32.6,7157,21237,POINT (-83.49340584 37.73915782)
1105,Kentucky,Harlan,16.9,"Harlan, Kentucky",31.0,26010,21095,POINT (-83.21787827 36.85671097)


In [16]:
# Display the "State_County" labels of the rows in top10_df as a Python list
top10_df["State_County"].tolist()

['McDowell, West Virginia',
 'Bell, Kentucky',
 'Lee, Kentucky',
 'Wolfe, Kentucky',
 'Harlan, Kentucky']

In [17]:
# Display the column labels of merged_df as a Python list
merged_df.columns.tolist()

['StateDesc',
 'LocationName',
 'Data_Value_x',
 'State_County',
 'Data_Value_y',
 'TotalPopulation_y',
 'LocationID_y',
 'Geolocation_y']

In [18]:
# Rename the merge-suffixed columns to something more digestible.
# The mapping has its own name instead of `dict`, which would shadow the Python
# builtin for every later cell in the kernel session; each source column is
# listed exactly once.
column_renames = {
    "Data_Value_x": "Levels_COPD",
    "Data_Value_y": "Levels_Smokers",
    "TotalPopulation_y": "Total_Population",
    "LocationID_y": "Location_ID",
    "Geolocation_y": "Geolocation",
}

# rename() returns a new frame; columns not named in the mapping keep their labels
merged_df = merged_df.rename(columns=column_renames)

# View dataframe
merged_df.head()

Unnamed: 0,StateDesc,LocationName,Levels_COPD,State_County,Levels_Smokers,Total_Population,Location_ID,Geolocation
0,Iowa,Cass,8.3,"Cass, Iowa",19.3,12836,19029,POINT (-94.92791359 41.33146007)
1,Iowa,Monona,8.3,"Monona, Iowa",18.5,8615,19133,POINT (-95.95997093 42.05236881)
2,Alaska,Dillingham,8.9,"Dillingham, Alaska",31.1,4916,2070,POINT (-158.2125392 59.79999618)
3,Colorado,Custer,7.5,"Custer, Colorado",13.8,5068,8027,POINT (-105.3669993 38.10865771)
4,Alaska,Ketchikan Gateway,6.6,"Ketchikan Gateway, Alaska",19.4,13901,2130,POINT (-130.9352394 55.58863162)


In [19]:
# For ML Model
# Keep only the columns listed below, in this order
ml_columns = ["State_County", "StateDesc", "LocationName", "Levels_Smokers", "Levels_COPD"]
merged_df = merged_df.loc[:, ml_columns]

# Preview the result
merged_df.head()

Unnamed: 0,State_County,StateDesc,LocationName,Levels_Smokers,Levels_COPD
0,"Cass, Iowa",Iowa,Cass,19.3,8.3
1,"Monona, Iowa",Iowa,Monona,18.5,8.3
2,"Dillingham, Alaska",Alaska,Dillingham,31.1,8.9
3,"Custer, Colorado",Colorado,Custer,13.8,7.5
4,"Ketchikan Gateway, Alaska",Alaska,Ketchikan Gateway,19.4,6.6


In [21]:
# Rename "StateDesc" -> "State" and "LocationName" -> "County".
# The rename is done by label rather than by assigning a positional list to
# .columns, so a change in column order upstream cannot silently mislabel data.
merged_df = merged_df.rename(columns={"StateDesc": "State", "LocationName": "County"})
merged_df.head()

Unnamed: 0,State_County,State,County,Levels_Smokers,Levels_COPD
0,"Cass, Iowa",Iowa,Cass,19.3,8.3
1,"Monona, Iowa",Iowa,Monona,18.5,8.3
2,"Dillingham, Alaska",Alaska,Dillingham,31.1,8.9
3,"Custer, Colorado",Colorado,Custer,13.8,7.5
4,"Ketchikan Gateway, Alaska",Alaska,Ketchikan Gateway,19.4,6.6


In [22]:
# # For DataViz
# # Reorder the columns
# merged_df = merged_df[["StateDesc", "LocationName", "Location_ID", "State_County", "Geolocation",
#                        "Levels_Smokers", "Levels_COPD"]]

# # View the dataframe
# merged_df.head()

In [23]:
# merged_df.loc[merged_df['State_County'] == 'Aleutians West, Alaska']

In [24]:
# Display the dtype of each column in merged_df
merged_df.dtypes

State_County       object
State              object
County             object
Levels_Smokers    float64
Levels_COPD       float64
dtype: object

In [25]:
# Write the processed frame to a CSV file
# NOTE(review): to_csv writes the index as an unnamed first column by default —
# confirm that downstream readers expect it
output_path = './Resources/processed_PLACES_COPD.csv'
merged_df.to_csv(output_path)