In [1]:
# Dependencies
import pandas as pd

In [2]:
# This dataset is for 2019 but is from the 2021 release of PLACES
# https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/swc5-untb/data

# Load in PLACES dataset filtered for COPD
file_path = "Resources/PLACES_COPD.csv"
copd_df = pd.read_csv(file_path)

# Display all rows 
pd.set_option('display.max_columns', None)

# Show the dataframe
copd_df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
0,2019,IA,Iowa,Cass,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,,,7.0,9.8,12836,19029,HLTHOUT,COPD,CrdPrv,COPD,POINT (-94.92791359 41.33146007)
1,2019,IA,Iowa,Monona,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.3,,,7.0,9.5,8615,19133,HLTHOUT,COPD,CrdPrv,COPD,POINT (-95.95997093 42.05236881)
2,2019,AZ,Arizona,Graham,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.6,,,6.0,7.3,38837,4009,HLTHOUT,COPD,AgeAdjPrv,COPD,POINT (-109.8871087 32.93306592)
3,2019,AK,Alaska,Prince of Wales-Hyder,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,7.9,,,7.2,8.6,6203,2198,HLTHOUT,COPD,AgeAdjPrv,COPD,POINT (-132.7096152 55.50495667)
4,2019,AK,Alaska,Dillingham,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,8.9,,,7.7,10.0,4916,2070,HLTHOUT,COPD,CrdPrv,COPD,POINT (-158.2125392 59.79999618)


In [3]:
# This dataset is for 2019 but is from the 2021 release of PLACES
# https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/swc5-untb/data

# Load in PLACES dataset filtered for current smokers
file_path = "Resources/PLACES_smokers.csv"
smokers_df = pd.read_csv(file_path)

# Display all rows 
pd.set_option('display.max_columns', None)

# Show the dataframe
smokers_df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
0,2019,CA,California,Sierra,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,13.8,,,11.9,15.6,3005,6091,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-120.5160862 39.58014381)
1,2019,CO,Colorado,Pitkin,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,11.0,,,8.7,13.4,17767,8097,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,POINT (-106.9159741 39.21744595)
2,2019,CA,California,San Mateo,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,9.2,,,7.8,10.5,766573,6081,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-122.3273639 37.42322968)
3,2019,AL,Alabama,Dale,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,22.0,,,19.4,24.4,49172,1045,RISKBEH,CSMOKING,AgeAdjPrv,Current Smoking,POINT (-85.61159073 31.43150889)
4,2019,AR,Arkansas,St. Francis,BRFSS,Health Risk Behaviors,Current smoking among adults aged >=18 years,%,Crude prevalence,24.6,,,22.2,26.8,24994,5123,RISKBEH,CSMOKING,CrdPrv,Current Smoking,POINT (-90.74737815 35.02173053)


In [4]:
# View rows and columns of the dataframe
print(copd_df.shape)
print(smokers_df.shape)

(6244, 21)
(6244, 21)


In [5]:
# View unique values for columns
print(copd_df["Data_Value_Footnote"].unique())
print(copd_df["Year"].unique())
print(copd_df["DataSource"].unique())
print(copd_df["Data_Value_Footnote_Symbol"].unique())
print(copd_df["Data_Value_Footnote"].unique())
print(copd_df["CategoryID"].unique())
print(copd_df["MeasureId"].unique())
print(copd_df["DataValueTypeID"].unique())
print(copd_df["Short_Question_Text"].unique())

[nan]
[2019]
['BRFSS']
[nan]
[nan]
['HLTHOUT']
['COPD']
['CrdPrv' 'AgeAdjPrv']
['COPD']


In [6]:
# View all columns
copd_df.columns

Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'LocationID', 'CategoryID', 'MeasureId', 'DataValueTypeID',
       'Short_Question_Text', 'Geolocation'],
      dtype='object')

In [7]:
# Make a new column that merges county name ("LocationName") and the State ("StateAbbr")
copd_df["State&County"] = copd_df['LocationName'].astype(str) +", "+ copd_df["StateAbbr"]

# Make a new column that merges county name ("LocationName") and the State ("StateAbbr")
smokers_df["State&County"] = smokers_df['LocationName'].astype(str) +", "+ smokers_df["StateAbbr"]

In [8]:
# Remove uninformative columns
copd_df = copd_df.drop(["Data_Value_Footnote", "Year", "DataSource",
                           "Data_Value_Footnote_Symbol", "Data_Value_Footnote",
                           "CategoryID", "MeasureId", "DataValueTypeID",
                           "Short_Question_Text", "Category", "LocationName",
                           "StateAbbr", "StateDesc"], axis=1)

smokers_df = smokers_df.drop(["Data_Value_Footnote", "Year", "DataSource",
                           "Data_Value_Footnote_Symbol", "Data_Value_Footnote",
                           "CategoryID", "MeasureId", "DataValueTypeID",
                           "Short_Question_Text", "Category", "LocationName",
                           "StateAbbr", "StateDesc"], axis=1)

In [9]:
# Filter out "Crude prevalence"
copd_df = copd_df[copd_df.Data_Value_Type != "Crude prevalence"]
smokers_df = smokers_df[smokers_df.Data_Value_Type != "Crude prevalence"]

In [10]:
copd_df.head(2)

Unnamed: 0,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,Geolocation,State&County
2,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.6,6.0,7.3,38837,4009,POINT (-109.8871087 32.93306592),"Graham, AZ"
3,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,7.9,7.2,8.6,6203,2198,POINT (-132.7096152 55.50495667),"Prince of Wales-Hyder, AK"


In [11]:
smokers_df.head(2)

Unnamed: 0,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,LocationID,Geolocation,State&County
1,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,11.0,8.7,13.4,17767,8097,POINT (-106.9159741 39.21744595),"Pitkin, CO"
3,Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,22.0,19.4,24.4,49172,1045,POINT (-85.61159073 31.43150889),"Dale, AL"


In [12]:
merged_df = pd.merge(copd_df, smokers_df, on="State&County")
merged_df.head(2)

Unnamed: 0,Measure_x,Data_Value_Unit_x,Data_Value_Type_x,Data_Value_x,Low_Confidence_Limit_x,High_Confidence_Limit_x,TotalPopulation_x,LocationID_x,Geolocation_x,State&County,Measure_y,Data_Value_Unit_y,Data_Value_Type_y,Data_Value_y,Low_Confidence_Limit_y,High_Confidence_Limit_y,TotalPopulation_y,LocationID_y,Geolocation_y
0,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,6.6,6.0,7.3,38837,4009,POINT (-109.8871087 32.93306592),"Graham, AZ",Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,17.4,15.7,19.2,38837,4009,POINT (-109.8871087 32.93306592)
1,Chronic obstructive pulmonary disease among ad...,%,Age-adjusted prevalence,7.9,7.2,8.6,6203,2198,POINT (-132.7096152 55.50495667),"Prince of Wales-Hyder, AK",Current smoking among adults aged >=18 years,%,Age-adjusted prevalence,25.6,23.3,27.9,6203,2198,POINT (-132.7096152 55.50495667)


In [13]:
merged_df = merged_df.drop(["Data_Value_Unit_x", "TotalPopulation_x", "LocationID_x",
                           "Geolocation_x"], axis=1)

In [14]:
merged_df = merged_df[["State&County", "TotalPopulation_y", "Measure_x", "Data_Value_Unit_y", "Low_Confidence_Limit_x", 
                       "Data_Value_x", "High_Confidence_Limit_x", "Measure_y", "Low_Confidence_Limit_y", 
                      "Data_Value_y", "High_Confidence_Limit_y", "LocationID_y", "Geolocation_y"]]

In [25]:
merged_df.head(10)

Unnamed: 0,State&County,TotalPopulation_y,Measure_x,Data_Value_Unit_y,Low_Confidence_Limit_x,Data_Value_x,High_Confidence_Limit_x,Measure_y,Low_Confidence_Limit_y,Data_Value_y,High_Confidence_Limit_y,LocationID_y,Geolocation_y
0,"Graham, AZ",38837,Chronic obstructive pulmonary disease among ad...,%,6.0,6.6,7.3,Current smoking among adults aged >=18 years,15.7,17.4,19.2,4009,POINT (-109.8871087 32.93306592)
1,"Prince of Wales-Hyder, AK",6203,Chronic obstructive pulmonary disease among ad...,%,7.2,7.9,8.6,Current smoking among adults aged >=18 years,23.3,25.6,27.9,2198,POINT (-132.7096152 55.50495667)
2,"Conecuh, AL",12067,Chronic obstructive pulmonary disease among ad...,%,8.5,9.4,10.3,Current smoking among adults aged >=18 years,22.7,25.1,27.6,1035,POINT (-86.99409002 31.42928542)
3,"Nevada, AR",8252,Chronic obstructive pulmonary disease among ad...,%,7.7,8.5,9.4,Current smoking among adults aged >=18 years,20.7,23.0,25.3,5099,POINT (-93.30711084 33.66372397)
4,"Tehama, CA",65084,Chronic obstructive pulmonary disease among ad...,%,6.4,7.2,8.1,Current smoking among adults aged >=18 years,14.7,16.6,18.5,6103,POINT (-122.2337288 40.12516798)
5,"San Diego, CA",3338330,Chronic obstructive pulmonary disease among ad...,%,4.0,4.6,5.2,Current smoking among adults aged >=18 years,9.5,11.0,12.6,6073,POINT (-116.7353793 33.03401356)
6,"Clark, AR",22320,Chronic obstructive pulmonary disease among ad...,%,7.0,7.9,8.8,Current smoking among adults aged >=18 years,18.1,20.5,23.0,5019,POINT (-93.17677056 34.05109731)
7,"Drew, AR",18219,Chronic obstructive pulmonary disease among ad...,%,7.3,8.3,9.3,Current smoking among adults aged >=18 years,19.0,21.5,24.0,5043,POINT (-91.71953511 33.58969442)
8,"La Plata, CO",56221,Chronic obstructive pulmonary disease among ad...,%,3.8,4.5,5.2,Current smoking among adults aged >=18 years,11.8,14.2,16.7,8067,POINT (-107.8432983 37.28668213)
9,"San Joaquin, CA",762148,Chronic obstructive pulmonary disease among ad...,%,4.7,5.3,5.9,Current smoking among adults aged >=18 years,12.4,13.9,15.5,6077,POINT (-121.2718065 37.93484195)


In [22]:
merged_df.columns.tolist()

['State&County',
 'TotalPopulation_y',
 'Measure_x',
 'Data_Value_Unit_y',
 'Low_Confidence_Limit_x',
 'Data_Value_x',
 'High_Confidence_Limit_x',
 'Measure_y',
 'Low_Confidence_Limit_y',
 'Data_Value_y',
 'High_Confidence_Limit_y',
 'LocationID_y',
 'Geolocation_y']

In [24]:
# Working on renaming columns to something more digestable.
dict = {'State&County': 'Location',
 'TotalPopulation_y': "TotalPopulation",
 'Measure_x',
 'Data_Value_Unit_y' :'Data_Value_Unit',
 'Low_Confidence_Limit_x',
 'Data_Value_x',
 'High_Confidence_Limit_x',
 'Measure_y',
 'Low_Confidence_Limit_y',
 'Data_Value_y',
 'High_Confidence_Limit_y',
 'LocationID_y',
 'Geolocation_y'}
 
# call rename () method
merged_df.rename(columns=dict,
          inplace=True)

SyntaxError: invalid syntax (4131983591.py, line 3)