In [1]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
# Ignore every warning category for the rest of the session, presumably to keep
# the cell output uncluttered.
# NOTE(review): this also hides deprecation/dtype warnings emitted by pandas and
# geopandas - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [96]:
# Notebook configuration: read the INI file from the working directory and
# derive the data folders used by the cells below.
BASE_DIR = os.getcwd()

CONFIG = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot find, so a missing INI is not an error.
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

# <working dir>/../data, resolved to an absolute path.
BASE_PATH = os.path.abspath(os.path.join(BASE_DIR, os.pardir, 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
# Not normalised: the resulting string keeps its '..' segment (<BASE_PATH>/../results).
DATA_RESULTS = os.path.join(BASE_PATH, os.pardir, 'results')

First, we load the combined GWR input table and treat missing mortality counts as zero.

In [97]:
# Load the combined GWR input table from the results folder.
data_path = os.path.join(DATA_RESULTS, 'gwr_data', 'all_gwr_data.csv')

# NOTE(review): read_csv infers dtypes, so FIPS-style code columns may be parsed
# as numbers and lose leading zeros - confirm whether an explicit dtype is needed.
df = (
    pd.read_csv(data_path)
    # A missing mortality count is replaced with zero.
    .assign(mort_count=lambda frame: frame['mort_count'].fillna(0))
)

We keep age category as the only stratifying variable, since the occurrence of pulmonary embolism differs with age. Mortality counts are therefore summed within each county and age category.

In [98]:
# Collapse the table to one row per county and age category: mortality counts
# are summed and the population value is taken from the first row of each group.
group_cols = [
    'county_name',
    'state',
    'state_fips',
    'res_countyfips',
    'res_statefips',
    'county_fips',
    'longitude',
    'latitude',
    'age_cat',
]

# NOTE(review): groupby drops rows whose key columns contain NaN by default -
# confirm that none of the grouping columns have missing values.
df = df.groupby(group_cols, as_index=False).agg(
    mort_count=('mort_count', 'sum'),
    pop20=('pop20', 'first'),
)

Next, we calculate the mortality count per 100,000 people.

In [99]:
# Mortality count scaled to 100,000 units of `pop20`; the result is left as an
# unrounded float.
# NOTE(review): a `pop20` of zero would yield inf/NaN here - confirm it cannot occur.
df['mort_per_100k'] = df['mort_count'].mul(100000).div(df['pop20'])

### Air Quality Data

We now add the county-level air quality data (mean PM2.5 and air quality index) from the county shapefile.

In [111]:
# Read the county shapefile that carries the air quality attributes and give
# its columns the names used by the mortality table.
shp_path = os.path.join(DATA_RAW, 'geodata', 'air_US_counties.shp')

gdf = gpd.read_file(shp_path).rename(
    columns={
        'MEAN': 'pm25_mean',
        'MEAN_1': 'air_quality_index',
        'NAME': 'county_name',
        'STATE_NAME': 'state',
    }
)
gdf

Unnamed: 0,ObjectID,county_name,state,FIPS,pm25_mean,air_quality_index,geometry
0,0,Siskiyou,California,06093,6.725839,29.657545,"POLYGON ((-2263443.001 2373524.342, -2263269.8..."
1,1,Big Horn,Montana,30003,5.058616,26.081857,"POLYGON ((-961891.533 2507738.015, -961894.81 ..."
2,8,Del Norte,California,06015,5.313719,25.479666,"POLYGON ((-2292563.856 2394672.986, -2292561.7..."
3,9,Linn,Oregon,41043,5.507526,26.275272,"POLYGON ((-2028816.907 2640191.92, -2029870.14..."
4,10,Benton,Oregon,41003,5.896031,28.884273,"POLYGON ((-2179108.518 2686663.84, -2179075.94..."
...,...,...,...,...,...,...,...
3064,3137,Indian River,Florida,12061,7.038592,37.364350,"MULTIPOLYGON (((1545289.189 626576.115, 154513..."
3065,3138,St. Lucie,Florida,12111,7.297799,38.418076,"MULTIPOLYGON (((1558142.384 592233.686, 155813..."
3066,3139,Broward,Florida,12011,6.914660,36.524386,"MULTIPOLYGON (((1522198.511 446881.286, 152200..."
3067,3140,Miami-Dade,Florida,12086,6.684649,35.439131,"MULTIPOLYGON (((1594417.097 444933.533, 159441..."


In [112]:
# Left-join the mortality table onto the air quality counties by state and
# county name, so every county in the shapefile is kept even without mortality
# rows, then drop the geometry and the columns not wanted in the export.
# NOTE(review): the join key is the county/state name, not a FIPS code, so
# spelling differences between the two sources would leave rows unmatched - verify.
df1 = pd.merge(
    left=gdf,
    right=df,
    how='left',
    on=['state', 'county_name'],
).drop(
    columns=[
        'state_fips',
        'res_statefips',
        'geometry',
        'county_fips',
        'mort_count',
    ]
)

In [113]:
# Write the merged table into the same `gwr_data` results folder the input was
# read from; the row index is not written to the file.
filename = 'full_gwr_ready_data.csv'
folder_out = os.path.join(DATA_RESULTS, 'gwr_data')
path_out = os.path.join(folder_out, filename)

df1.to_csv(path_out, index=False)