In [121]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
warnings.filterwarnings('ignore')

In [122]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

In [138]:
data_path = os.path.join(DATA_RESULTS, 'processed', 'complete_cdc_pulmonary_data.csv')
df = pd.read_csv(data_path)

Only select from the year 2012

In [139]:
df = df[df['fileyear'] >= 2004].copy()

Sum the mortality count for all the years.

In [140]:
df = (df.groupby(['state', 'county_name', 'state_fips', 'sex',
                  'res_countyfips', 'res_statefips', 'age_cat'], 
                 as_index = False).agg({'mort_count': 'sum'}))
df

Unnamed: 0,state,county_name,state_fips,sex,res_countyfips,res_statefips,age_cat,mort_count
0,Alabama,Autauga,1,F,1,AL,30 - 49 years,5
1,Alabama,Autauga,1,F,1,AL,70 years or above,15
2,Alabama,Autauga,1,M,1,AL,30 - 49 years,4
3,Alabama,Autauga,1,M,1,AL,70 years or above,9
4,Alabama,Baldwin,1,F,3,AL,10 - 29 years,1
...,...,...,...,...,...,...,...,...
11318,Wyoming,Washakie,56,F,43,WY,70 years or above,3
11319,Wyoming,Washakie,56,M,43,WY,70 years or above,5
11320,Wyoming,Weston,56,F,45,WY,30 - 49 years,1
11321,Wyoming,Weston,56,F,45,WY,70 years or above,2


### Combine Mortality & County Shapefiles

We read and prepare the county shapefiles data

In [141]:
county_path = os.path.join(DATA_RAW, 'geodata', 'NRI_Shapefile_Counties.shp')
gdf = gpd.read_file(county_path)
gdf = gdf[['STATE','STATEABBRV', 'STATEFIPS', 'COUNTYFIPS', 'geometry', 
           'COUNTY', 'POPULATION']]
gdf.rename(columns = {'COUNTY': 'county_name', 'STATE': 'state', 
                      'STATEFIPS': 'state_fips', 'STATEABBRV': 
                      'res_statefips', 'COUNTYFIPS': 'county_fips', 
                      'POPULATION': 'pop20'}, inplace = True)

<div class="alert alert-block alert-warning">

<b>!! Attention !!</b> We are averaging the county fips but this is not a standard practice. We are only doing it because it will return the same value for each county.

</div>

In [142]:
gdf1 = gdf.merge(df, on = ['res_statefips', 'county_name'], how = 'left')
gdf1 = gdf1.drop(columns=[col for col in gdf1.columns if col.endswith('_y')])
gdf1 = gdf1.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)

Unnamed: 0,state,res_statefips,state_fips,county_fips,geometry,county_name,pop20,sex,res_countyfips,age_cat,mort_count
0,Alabama,AL,01,001,"POLYGON ((-9.62e+06 3.85e+06, -9.62e+06 3.85e+...",Autauga,58764,F,1.0,30 - 49 years,5.0
1,Alabama,AL,01,001,"POLYGON ((-9.62e+06 3.85e+06, -9.62e+06 3.85e+...",Autauga,58764,F,1.0,70 years or above,15.0
2,Alabama,AL,01,001,"POLYGON ((-9.62e+06 3.85e+06, -9.62e+06 3.85e+...",Autauga,58764,M,1.0,30 - 49 years,4.0
3,Alabama,AL,01,001,"POLYGON ((-9.62e+06 3.85e+06, -9.62e+06 3.85e+...",Autauga,58764,M,1.0,70 years or above,9.0
4,Alabama,AL,01,003,"MULTIPOLYGON (((-9.79e+06 3.59e+06, -9.79e+06 ...",Baldwin,231365,F,3.0,10 - 29 years,1.0
...,...,...,...,...,...,...,...,...,...,...,...
11216,Puerto Rico,PR,72,151,"MULTIPOLYGON (((-7.32e+06 2.05e+06, -7.32e+06 ...",Yabucoa,30397,,,,
11217,Puerto Rico,PR,72,153,"POLYGON ((-7.44e+06 2.06e+06, -7.44e+06 2.06e+...",Yauco,34151,,,,
11218,Virgin Islands,VI,78,010,"MULTIPOLYGON (((-7.2e+06 2.01e+06, -7.2e+06 2....",St. Croix,40913,,,,
11219,Virgin Islands,VI,78,020,"MULTIPOLYGON (((-7.22e+06 2.08e+06, -7.22e+06 ...",St. John,3882,,,,


## Prepare Data for GWR

In this section we prepare the pulmonary embolism data for running the Multiscale Geographically Weighted Regression

#### Convert County Geometries into points

First, let's calculate the centroid.

In [144]:
gdf1 = gdf1.to_crs(epsg = 4326)
gdf1['centroid'] = gdf1.geometry.centroid

And then extract longitudes and latitudes.

In [145]:
gdf1['longitude'] = gdf1.centroid.x
gdf1['latitude'] = gdf1.centroid.y

And then drop the redundant columns before saving the data.

In [146]:
gdf1 = gdf1.drop(columns=['centroid', 'geometry'])
gdf1

Unnamed: 0,state,res_statefips,state_fips,county_fips,county_name,pop20,sex,res_countyfips,age_cat,mort_count,longitude,latitude
0,Alabama,AL,01,001,Autauga,58764,F,1.0,30 - 49 years,5.0,-86.642759,32.534932
1,Alabama,AL,01,001,Autauga,58764,F,1.0,70 years or above,15.0,-86.642759,32.534932
2,Alabama,AL,01,001,Autauga,58764,M,1.0,30 - 49 years,4.0,-86.642759,32.534932
3,Alabama,AL,01,001,Autauga,58764,M,1.0,70 years or above,9.0,-86.642759,32.534932
4,Alabama,AL,01,003,Baldwin,231365,F,3.0,10 - 29 years,1.0,-87.723352,30.736470
...,...,...,...,...,...,...,...,...,...,...,...,...
11216,Puerto Rico,PR,72,151,Yabucoa,30397,,,,,-65.896594,18.070595
11217,Puerto Rico,PR,72,153,Yauco,34151,,,,,-66.858283,18.079748
11218,Virgin Islands,VI,78,010,St. Croix,40913,,,,,-64.767808,17.733188
11219,Virgin Islands,VI,78,020,St. John,3882,,,,,-64.738631,18.338808


In [148]:
gdf1['age_cat'] = gdf1['age_cat'].str.strip()
gdf1['sex'] = gdf1['sex'].str.strip().str.upper()
gdf1['age_cat'] = gdf1['age_cat'].map(
    {'70 years or above': 3, '30 - 49 years': 2, 
     '10 - 29 years': 1, ' 9 years or below': 0})
gdf1['sex'] = gdf1['sex'].map({'F': 0, 'M': 1})

In [149]:
folder_out = os.path.join(DATA_RESULTS, 'gwr_data')
filename = 'all_gwr_data.csv'
path_out = os.path.join(folder_out, filename)
gdf1.to_csv(path_out, index = False)