In [88]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
warnings.filterwarnings('ignore')

In [89]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

In [90]:
data_path = os.path.join(DATA_RESULTS, 'processed', 'sex_cdc_pulmonary_data.csv')
all_df = pd.read_csv(data_path)
df = all_df[['state', 'county_name', 'res_statefips', 'sex', 'mort_count', 'fileyear']]

For hotspot analysis, we divide our data into two epochs, i.e. 2005 - 2015 and 2016 - 2022.

In [91]:
df['epoch'] = df['fileyear'].apply(
    lambda x: 'pre_2015' if x <= 2015 else 'after_2015'
)
df

Unnamed: 0,state,county_name,res_statefips,sex,mort_count,fileyear,epoch
0,Alabama,Autauga,AL,F,1,2005,pre_2015
1,Alabama,Autauga,AL,M,1,2005,pre_2015
2,Alabama,Autauga,AL,F,2,2006,pre_2015
3,Alabama,Autauga,AL,M,3,2006,pre_2015
4,Alabama,Autauga,AL,F,2,2007,pre_2015
...,...,...,...,...,...,...,...
50212,Wyoming,Weston,WY,F,1,2008,pre_2015
50213,Wyoming,Weston,WY,M,1,2009,pre_2015
50214,Wyoming,Weston,WY,M,1,2011,pre_2015
50215,Wyoming,Weston,WY,F,1,2015,pre_2015


First import geospatial county data to link with all our subsequent processed data.

In [92]:
county_path = os.path.join(DATA_RAW, 'geodata', 'NRI_Shapefile_Counties.shp')
gdf = gpd.read_file(county_path)
gdf = gdf[['STATE','STATEABBRV', 'STATEFIPS', 'COUNTYFIPS', 'geometry', 
           'COUNTY', 'POPULATION']]
gdf.rename(columns = {'COUNTY': 'county_name', 'STATE': 'state', 
                      'STATEFIPS': 'state_fips', 'STATEABBRV': 
                      'res_statefips', 'COUNTYFIPS': 'county_fips', 
                      'POPULATION': 'pop20'}, inplace = True)
territories = ['AS', 'GU', 'MP', 'PR', 'VI']
gdf = gdf[~gdf['res_statefips'].isin(territories)]
gdf = gdf.reset_index(drop = True)

In [93]:
gdf = gdf.to_crs(epsg = 4326)
gdf['centroid'] = gdf.geometry.centroid
gdf['longitude'] = gdf.centroid.x
gdf['latitude'] = gdf.centroid.y
gdf = gdf.drop(columns=['centroid', 'geometry'])
gdf

Unnamed: 0,state,res_statefips,state_fips,county_fips,county_name,pop20,longitude,latitude
0,Alabama,AL,01,001,Autauga,58764,-86.642759,32.534932
1,Alabama,AL,01,003,Baldwin,231365,-87.723352,30.736470
2,Alabama,AL,01,005,Barbour,25160,-85.393206,31.869615
3,Alabama,AL,01,007,Bibb,22239,-87.126450,32.998655
4,Alabama,AL,01,009,Blount,58992,-86.567381,33.980880
...,...,...,...,...,...,...,...,...
3138,Wyoming,WY,56,037,Sweetwater,42238,-108.879453,41.659509
3139,Wyoming,WY,56,039,Teton,23250,-110.589834,43.934650
3140,Wyoming,WY,56,041,Uinta,20412,-110.547585,41.287504
3141,Wyoming,WY,56,043,Washakie,7662,-107.682883,43.905010


### 1. General

In [94]:
df_gen = (df.groupby(['state', 'county_name', 'res_statefips', 'epoch'], 
                 as_index = False).agg({'mort_count': 'sum'}))
df_gen

Unnamed: 0,state,county_name,res_statefips,epoch,mort_count
0,Alabama,Autauga,AL,after_2015,11
1,Alabama,Autauga,AL,pre_2015,22
2,Alabama,Baldwin,AL,after_2015,48
3,Alabama,Baldwin,AL,pre_2015,65
4,Alabama,Barbour,AL,after_2015,13
...,...,...,...,...,...
5884,Wyoming,Uinta,WY,pre_2015,4
5885,Wyoming,Washakie,WY,after_2015,2
5886,Wyoming,Washakie,WY,pre_2015,6
5887,Wyoming,Weston,WY,after_2015,1


Select and save dataframes for each of the two epochs.

In [95]:
df_pre_2015 = df_gen[df_gen['epoch'] == 'pre_2015'].copy()
df_after_2015 = df_gen[df_gen['epoch'] == 'after_2015'].copy()

Combine pre-2015 data with spatial data

In [96]:
gdf1 = gdf.merge(df_pre_2015, on = ['res_statefips', 'county_name'], how = 'left')
gdf1 = gdf1.drop(columns=[col for col in gdf1.columns if col.endswith('_y')])
gdf1 = gdf1.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf1['mort_count'] = gdf1['mort_count'].fillna(0)

Also combine post-2015 data with spatial data

In [97]:
gdf2 = gdf.merge(df_after_2015, on = ['res_statefips', 'county_name'], how = 'left')
gdf2 = gdf2.drop(columns=[col for col in gdf2.columns if col.endswith('_y')])
gdf2 = gdf2.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf2['mort_count'] = gdf2['mort_count'].fillna(0)

Save the two files for further analysis

In [98]:
folder_out = os.path.join(DATA_RESULTS, 'hotspot')
filename = 'gen_pre_2015.csv'
filename1 = 'gen_after_2015.csv'
path_out = os.path.join(folder_out, filename)
path_out1 = os.path.join(folder_out, filename1)
gdf1.to_csv(path_out, index = False)
gdf2.to_csv(path_out1, index = False)

### 2. Sex

We now group our data for the two periods for both males and females.

In [99]:
df_sex = (df.groupby(['state', 'county_name', 'res_statefips', 'epoch', 'sex'], 
                 as_index = False).agg({'mort_count': 'sum'}))

We select all data by sex before and after 2015.

In [100]:
df_pre_2015_sex = df_sex[df_sex['epoch'] == 'pre_2015'].copy()
df_after_2015_sex = df_sex[df_sex['epoch'] == 'after_2015'].copy()

Now we link all sex-based dataframes to geospatial data.

In [101]:
gdf3 = gdf.merge(df_pre_2015_sex, on = ['res_statefips', 'county_name'], how = 'left')
gdf3 = gdf3.drop(columns=[col for col in gdf3.columns if col.endswith('_y')])
gdf3 = gdf3.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf3['mort_count'] = gdf3['mort_count'].fillna(0)

In [102]:
gdf4 = gdf.merge(df_after_2015_sex, on = ['res_statefips', 'county_name'], how = 'left')
gdf4 = gdf4.drop(columns=[col for col in gdf4.columns if col.endswith('_y')])
gdf4 = gdf4.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf4['mort_count'] = gdf4['mort_count'].fillna(0)

#### 2.a Male

Male mortality counts before 2015.

In [103]:
df_pre_2015_male = gdf3[gdf3['sex'] == 'M'].copy()

Male mortality counts after 2015

In [104]:
df_after_2015_male = gdf4[gdf4['sex'] == 'M'].copy()

#### 2.b Female

Female mortality counts before 2015.

In [105]:
df_pre_2015_female = gdf3[gdf3['sex'] == 'F'].copy()

In [106]:
df_after_2015_female = gdf4[gdf4['sex'] == 'F'].copy()

In [107]:
folder_out1 = os.path.join(DATA_RESULTS, 'hotspot', 'sex')
filename2 = 'male_pre_2015.csv'
filename3 = 'male_after_2015.csv'
filename4 = 'female_pre_2015.csv'
filename5 = 'female_after_2015.csv'

path_out2 = os.path.join(folder_out1, filename2)
path_out3 = os.path.join(folder_out1, filename3)
path_out4 = os.path.join(folder_out1, filename4)
path_out5 = os.path.join(folder_out1, filename5)

df_pre_2015_male.to_csv(path_out2, index = False)
df_after_2015_male.to_csv(path_out3, index = False)
df_pre_2015_female.to_csv(path_out4, index = False)
df_after_2015_female.to_csv(path_out5, index = False)

### 3. Age Groups

We now group our data for the two periods for the three age groups.

In [117]:
age_path = os.path.join(DATA_RESULTS, 'processed', 'age_cdc_pulmonary_data.csv')
ages = pd.read_csv(age_path)
ages = ages[['state', 'county_name', 'res_statefips', 'age_cat', 'mort_count', 'fileyear']]
ages['epoch'] = ages['fileyear'].apply(
    lambda x: 'pre_2015' if x <= 2015 else 'after_2015')

In [118]:
df_age = (ages.groupby(['state', 'county_name', 'res_statefips', 'epoch', 'age_cat'], 
                 as_index = False).agg({'mort_count': 'sum'}))
df_age

Unnamed: 0,state,county_name,res_statefips,epoch,age_cat,mort_count
0,Alabama,Autauga,AL,after_2015,30 - 49 years,1
1,Alabama,Autauga,AL,after_2015,70 years or above,10
2,Alabama,Autauga,AL,pre_2015,30 - 49 years,8
3,Alabama,Autauga,AL,pre_2015,70 years or above,14
4,Alabama,Baldwin,AL,after_2015,10 - 29 years,2
...,...,...,...,...,...,...
11282,Wyoming,Washakie,WY,after_2015,70 years or above,2
11283,Wyoming,Washakie,WY,pre_2015,70 years or above,6
11284,Wyoming,Weston,WY,after_2015,70 years or above,1
11285,Wyoming,Weston,WY,pre_2015,30 - 49 years,1


Separate the age group based dataframe into pre and post 2015.

In [121]:
df_pre_2015_age = df_age[df_age['epoch'] == 'pre_2015'].copy()
df_after_2015_age = df_age[df_age['epoch'] == 'after_2015'].copy()

In [124]:
gdf5 = gdf.merge(df_pre_2015_age, on = ['res_statefips', 'county_name'], how = 'left')
gdf5 = gdf5.drop(columns=[col for col in gdf5.columns if col.endswith('_y')])
gdf5 = gdf5.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf5['mort_count'] = gdf5['mort_count'].fillna(0)

In [125]:
gdf6 = gdf.merge(df_after_2015_age, on = ['res_statefips', 'county_name'], how = 'left')
gdf6 = gdf6.drop(columns=[col for col in gdf6.columns if col.endswith('_y')])
gdf6 = gdf6.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
gdf6['mort_count'] = gdf6['mort_count'].fillna(0)

In [123]:
print(df_age['age_cat'].unique())

['30 - 49 years' '70 years or above' '10 - 29 years' ' 9 years or below']


#### 3.a 10 - 29 years

10 - 29 years individual's mortality counts before 2015.

In [127]:
df_pre_2015_1029 = gdf5[gdf5['age_cat'] == '10 - 29 years'].copy()

10 - 29 years individual's mortality counts after 2015.

In [129]:
df_after_2015_1029 = gdf6[gdf6['age_cat'] == '10 - 29 years'].copy()

#### 3.b 30 - 49 years

In [131]:
df_pre_2015_3049 = gdf5[gdf5['age_cat'] == '30 - 49 years'].copy()

In [133]:
df_after_2015_3049 = gdf6[gdf6['age_cat'] == '30 - 49 years'].copy()

#### 3.c 70 years or above

In [134]:
df_pre_2015_70 = gdf5[gdf5['age_cat'] == '70 years or above'].copy()

In [135]:
df_after_2015_70 = gdf6[gdf6['age_cat'] == '70 years or above'].copy()

In [138]:
folder_out2 = os.path.join(DATA_RESULTS, 'hotspot', 'age')
filename6 = '1029_pre_2015.csv'
filename7 = '1029_after_2015.csv'
filename8 = '3049_pre_2015.csv'
filename9 = '3049_after_2015.csv'
filename10 = '70_pre_2015.csv'
filename11 = '70_after_2015.csv'

path_out6 = os.path.join(folder_out2, filename6)
path_out7 = os.path.join(folder_out2, filename7)
path_out8 = os.path.join(folder_out2, filename8)
path_out9 = os.path.join(folder_out2, filename9)
path_out10 = os.path.join(folder_out2, filename10)
path_out11 = os.path.join(folder_out2, filename11)

df_pre_2015_1029.to_csv(path_out6, index = False)
df_after_2015_1029.to_csv(path_out7, index = False)
df_pre_2015_3049.to_csv(path_out8, index = False)
df_after_2015_3049.to_csv(path_out9, index = False)
df_pre_2015_70.to_csv(path_out10, index = False)
df_after_2015_70.to_csv(path_out11, index = False)