In [3]:
# Import dependencies
import pandas as pd

In [4]:
# Render every column whenever a dataframe is displayed
pd.set_option("display.max_columns", None)

# Load the processed coal-mine CSV into a dataframe
file_path = "./Resources/processed_Coal_Mines.csv"
coal_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
coal_df.head()

Unnamed: 0,State&County,Surface_Mines,Underground_Mines
0,"Pennsylvania, Clearfield",22,2.0
1,"Pennsylvania, Schuylkill",22,5.0
2,"Pennsylvania, Somerset",13,5.0
3,"West Virginia, Mcdowell",13,12.0
4,"Kentucky, Pike",13,14.0


In [5]:
# Source: PLACES county data, 2021 release (the figures themselves are for 2019)
# https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/swc5-untb/data

# Display all columns (not rows) when a dataframe is rendered
pd.set_option("display.max_columns", None)

# Read the PLACES extract that was filtered for COPD
file_path = "Resources/processed_PLACES_COPD.csv"
copd_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows
copd_df.head()

Unnamed: 0,State&County,Total_Population,Levels_COPD,Levels_Smokers
0,"Graham, Arizona",38837,6.6,17.4
1,"Prince of Wales-Hyder, Alaska",6203,7.9,25.6
2,"Conecuh, Alabama",12067,9.4,25.1
3,"Nevada, Arkansas",8252,8.5,23.0
4,"Tehama, California",65084,7.2,16.6


In [6]:
# Merge the dataframes
# The two files spell the join key in opposite orders: coal_df uses
# "State, County" (e.g. "Pennsylvania, Clearfield") while copd_df uses
# "County, State" (e.g. "Graham, Arizona"). Joining on the raw strings does not
# match any coal county, so every row would silently end up with NaN mines
# (later filled with 0). Re-order the coal key and compare case-insensitively
# (coal names look title-cased, e.g. "Mcdowell") before joining.
coal_parts = coal_df["State&County"].str.partition(", ")  # 0 = state, 1 = separator, 2 = county
coal_lookup = coal_df.drop(columns="State&County").assign(
    _join_key=(coal_parts[2] + ", " + coal_parts[0]).str.casefold()
)

# Left join keeps every PLACES county. The helper key is dropped afterwards so
# the result keeps copd_df's "State&County" spelling and the same column order
# as a direct merge on "State&County" would produce.
merged_df = (
    copd_df.assign(_join_key=copd_df["State&County"].str.casefold())
    .merge(coal_lookup, on="_join_key", how="left")
    .drop(columns="_join_key")
)
merged_df.head()

Unnamed: 0,State&County,Total_Population,Levels_COPD,Levels_Smokers,Surface_Mines,Underground_Mines
0,"Graham, Arizona",38837,6.6,17.4,,
1,"Prince of Wales-Hyder, Alaska",6203,7.9,25.6,,
2,"Conecuh, Alabama",12067,9.4,25.1,,
3,"Nevada, Arkansas",8252,8.5,23.0,,
4,"Tehama, California",65084,7.2,16.6,,


In [7]:
# Rows with no coal data come out of the left merge as NaN in both mine
# columns; treat those counties as having 0 surface and 0 underground mines.
merged_df = merged_df.fillna({'Surface_Mines': 0, 'Underground_Mines': 0})

# Preview the result
merged_df.head()

Unnamed: 0,State&County,Total_Population,Levels_COPD,Levels_Smokers,Surface_Mines,Underground_Mines
0,"Graham, Arizona",38837,6.6,17.4,0.0,0.0
1,"Prince of Wales-Hyder, Alaska",6203,7.9,25.6,0.0,0.0
2,"Conecuh, Alabama",12067,9.4,25.1,0.0,0.0
3,"Nevada, Arkansas",8252,8.5,23.0,0.0,0.0
4,"Tehama, California",65084,7.2,16.6,0.0,0.0


In [18]:
# Pull the county key and its population into a separate dataframe,
# to be used for working out the Urban / UrbanCluster categories
pop_df = merged_df.loc[:, ['State&County', 'Total_Population']]
pop_df.head()

Unnamed: 0,State&County,Total_Population
0,"Graham, Arizona",38837
1,"Prince of Wales-Hyder, Alaska",6203
2,"Conecuh, Alabama",12067
3,"Nevada, Arkansas",8252
4,"Tehama, California",65084


In [11]:
# Show the current column order as a plain list
list(merged_df.columns)

['State&County',
 'Total_Population',
 'Levels_COPD',
 'Levels_Smokers',
 'Surface_Mines',
 'Underground_Mines']

In [14]:
# Put the target (Levels_COPD) in the last column, with the key and features in front
feature_columns = [
    'State&County',
    'Total_Population',
    'Levels_Smokers',
    'Surface_Mines',
    'Underground_Mines',
]
merged_df = merged_df[feature_columns + ['Levels_COPD']]

merged_df.head(20)

Unnamed: 0,State&County,Total_Population,Levels_Smokers,Surface_Mines,Underground_Mines,Levels_COPD
0,"Graham, Arizona",38837,17.4,0.0,0.0,6.6
1,"Prince of Wales-Hyder, Alaska",6203,25.6,0.0,0.0,7.9
2,"Conecuh, Alabama",12067,25.1,0.0,0.0,9.4
3,"Nevada, Arkansas",8252,23.0,0.0,0.0,8.5
4,"Tehama, California",65084,16.6,0.0,0.0,7.2
5,"San Diego, California",3338330,11.0,0.0,0.0,4.6
6,"Clark, Arkansas",22320,20.5,0.0,0.0,7.9
7,"Drew, Arkansas",18219,21.5,0.0,0.0,8.3
8,"La Plata, Colorado",56221,14.2,0.0,0.0,4.5
9,"San Joaquin, California",762148,13.9,0.0,0.0,5.3


In [15]:
# Write the merged dataframe to CSV, leaving out the index column
output_path = './Resources/ProcessMerge.csv'
merged_df.to_csv(output_path, index=False)