Libraries and options

In [48]:
import numpy as np, pandas as pd
import sys, os, re, zipfile, shutil, pickle, matplotlib.pyplot as plt, seaborn as sns, bz2,time
import statsmodels
import scipy.stats as stats
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 50

In [2]:
def importPbz2( file ):
    data = bz2.BZ2File(file,'rb')
    return pd.read_pickle(data)

In [3]:
newDF = importPbz2('covidPollutionCensus.pbz2')
# newDF = importPbz2('covidCensus.pbz2')

In [4]:
newDF['fips'] = newDF['fips'].astype('str').str.zfill(5)
newDF['State Code'] = newDF['State Code'].astype('str').str.zfill(5)
newDF['County Code'] = newDF['County Code'].astype('str').str.zfill(5)
newDF['dates'] = pd.to_datetime(newDF['dates'])

And explore...

In [5]:
newDF[['totalMoved','movedWithinState','movedWithoutState','movedFromAbroad','publicTrans','totalTrans','householdsTotal','houseWith65',
      'house2+with65','houseFamily65','houseNonfam65','houseNo65','house2+No65','houseFamilyNo65','houseNonfamNo65',
      'householdStructuresTotal','householdIncomeMedian','gini','hoursWorkedMean','unitsInStructure','healthInsTotal',
      'healthInsNativeWith','healthInsForeignNatWith','healthInsForeignNoncitWith','healthInsForeignNatNo',
      'healthInsForeignNoncitNo','healthInsNativeNo']]  = \
newDF[['totalMoved','movedWithinState','movedWithoutState','movedFromAbroad','publicTrans','totalTrans','householdsTotal','houseWith65',
      'house2+with65','houseFamily65','houseNonfam65','houseNo65','house2+No65','houseFamilyNo65','houseNonfamNo65',
      'householdStructuresTotal','householdIncomeMedian','gini','hoursWorkedMean','unitsInStructure','healthInsTotal',
      'healthInsNativeWith','healthInsForeignNatWith','healthInsForeignNoncitWith','healthInsForeignNatNo',
      'healthInsForeignNoncitNo','healthInsNativeNo']].astype('float64')

In [6]:
testDF = newDF.groupby([pd.Grouper(key='dates',freq='W'),'pollutant','fips']).agg('mean')

In [7]:
testDF = testDF.reset_index()#.set_index(['dates','fips','pollutant'])

In [8]:
testDF = testDF[testDF['dates'].between('2020-03-11','2021-03-11')]

In [9]:
testDF.sort_values(['dates','fips','pollutant'],inplace=True)

In [10]:
wokka=[i for i in testDF if not (i.endswith('.missing'))|(i.startswith('NYT'))]
testDF = testDF[wokka]

In [11]:
def rateCalc( numerators, denominator, DF  ):
    for col in numerators:
        DF[col+'Rate'] = DF[col] / DF[denominator]

In [12]:
ratesDF = testDF.copy()

rateCalc(['movedWithinState',
         'movedWithoutState',
         'movedFromAbroad'], 'totalMoved', ratesDF)

rateCalc(['publicTrans'], 'totalTrans', ratesDF)

rateCalc(['houseWith65', #householdsTotal
'house2+with65',
'houseFamily65',
'houseNonfam65',
'houseNo65',
'house2+No65',
'houseFamilyNo65',
'houseNonfamNo65'], 'householdsTotal', ratesDF)

# rateCalc(['healthInsNativeWith', #healthInsTotal
# 'healthInsForeignNatWith',
# 'healthInsForeignNoncitWith'], 'healthInsTotal', ratesDF)

rateCalc(['hospitalIcuBeds',
          'hospitalLicensedBeds',
          'hospitalStaffedBeds'], 'latestTotalPopulation', ratesDF)

rateCalc(['MaleAndFemale_AtLeast65_Population.data'], 'latestTotalPopulation', ratesDF)

ratesDF['healthInsRates']  = ratesDF[['healthInsNativeWith','healthInsForeignNatWith','healthInsForeignNoncitWith']].sum(axis=1) / ratesDF['healthInsTotal']
ratesDF['householdsWith65Rate'] = ratesDF[['houseWith65Rate','house2+with65Rate','houseNonfam65Rate','houseFamily65']].sum(axis=1)
ratesDF.drop(columns=['houseWith65', #householdsTotal
'house2+with65',
'houseFamily65',
'houseNonfam65',
'houseNo65',
'house2+No65',
'houseFamilyNo65',
'houseNonfamNo65'],inplace=True)

In [14]:
keep = ['dates','fips','AverageDailyTemperature.data','AveragePrecipitationTotal.data','AverageWindSpeed.data','BLS_UnemploymentRate.data',\
       'gini','hoursWorkedMean','unitsInStructure','Arithmetic Mean','density','pollutant','householdIncomeMedian']
wokka = [x for x in ratesDF if x.endswith('Rate')]
keep.extend(wokka)
ratesDF = ratesDF[keep]

In [159]:
cols = ['dates','fips','deathRate','Arithmetic Mean','householdIncomeMedian','AverageDailyTemperature.data','gini','hospitalStaffedBedsRate','publicTransRate','householdsWith65Rate','density']
rateFipsMean = ratesDF.loc[:,cols].groupby('fips').agg('mean')
rateFipsMean

rateCorrs = rateFipsMean.corr().sort_values('deathRate',ascending=False,key=abs)
rateCorrs
ratesDF['fips'].nunique()

898

In [153]:
def covidPollutantFipsCorr(df, pollutant, cols=[], sort='Arithmetic Mean'):
    if cols:
        polDF =  df.loc[df['pollutant']==pollutant,cols+['fips']]
    else:
        polDF = df.loc[df['pollutant']==pollutant,:]
    fipsDF  = polDF.groupby(by='fips').agg('mean')
    corrDF = fipsDF.corr().sort_values(by=sort,ascending=False,key=abs)
    pVals = fipsDF.corr(method=lambda x, y: stats.pearsonr(x, y)[1]) - np.eye(len(corrDF.columns))
    return corrDF.round(2), pVals.loc[corrDF.index,:].round(3)

NO2

In [148]:
cols = ['dates','deathRate','Arithmetic Mean','householdIncomeMedian','AverageDailyTemperature.data','gini','hospitalStaffedBedsRate','publicTransRate','householdsWith65Rate','density']
no2Corr,pVals=covidPollutantFipsCorr(ratesDF, 'no2', cols)


rows = ['deathRate','publicTransRate','density']
no2Corr

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
deathRate,1.0,0.25,0.19,0.01,0.07,0.05,0.23,0.1,0.08
Arithmetic Mean,0.25,1.0,0.16,0.19,0.35,0.06,0.43,0.49,0.41
householdIncomeMedian,0.19,0.16,1.0,-0.03,-0.2,-0.32,0.25,0.14,0.2
AverageDailyTemperature.data,0.01,0.19,-0.03,1.0,0.3,0.01,-0.0,0.28,0.09
gini,0.07,0.35,-0.2,0.3,1.0,0.38,0.33,0.29,0.39
hospitalStaffedBedsRate,0.05,0.06,-0.32,0.01,0.38,1.0,0.02,-0.0,0.12
publicTransRate,0.23,0.43,0.25,-0.0,0.33,0.02,1.0,0.25,0.88
householdsWith65Rate,0.1,0.49,0.14,0.28,0.29,-0.0,0.25,1.0,0.24
density,0.08,0.41,0.2,0.09,0.39,0.12,0.88,0.24,1.0


In [149]:
# p values
# pVals = no2DF.corr(method=lambda x, y: stats.pearsonr(x, y)[1]) - np.eye(len(no2DF.corr().columns)) 
pVals

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
deathRate,0.0,0.0,0.002,0.875,0.256,0.49,0.0,0.112,0.204
Arithmetic Mean,0.0,0.0,0.011,0.002,0.0,0.379,0.0,0.0,0.0
householdIncomeMedian,0.002,0.011,0.0,0.626,0.001,0.0,0.0,0.027,0.002
AverageDailyTemperature.data,0.875,0.002,0.626,0.0,0.0,0.896,0.964,0.0,0.152
gini,0.256,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0
hospitalStaffedBedsRate,0.49,0.379,0.0,0.896,0.0,0.0,0.803,0.996,0.079
publicTransRate,0.0,0.0,0.0,0.964,0.0,0.803,0.0,0.0,0.0
householdsWith65Rate,0.112,0.0,0.027,0.0,0.0,0.996,0.0,0.0,0.0
density,0.204,0.0,0.002,0.152,0.0,0.079,0.0,0.0,0.0


Ozone

In [154]:
cols = ['dates','deathRate','Arithmetic Mean','householdIncomeMedian','AverageDailyTemperature.data','gini','hospitalStaffedBedsRate','publicTransRate','householdsWith65Rate','density']
ozoneCorr,pVals=covidPollutantFipsCorr(ratesDF, 'ozone', cols)


rows = ['deathRate','publicTransRate','density']
ozoneCorr

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
Arithmetic Mean,-0.05,1.0,-0.0,-0.3,-0.2,-0.18,-0.1,-0.02,-0.14
AverageDailyTemperature.data,-0.01,-0.3,-0.1,1.0,0.24,0.07,-0.05,0.19,0.06
gini,0.11,-0.2,-0.24,0.24,1.0,0.38,0.27,0.27,0.29
hospitalStaffedBedsRate,0.09,-0.18,-0.24,0.07,0.38,1.0,0.04,0.02,0.08
density,0.11,-0.14,0.26,0.06,0.29,0.08,0.84,0.35,1.0
publicTransRate,0.25,-0.1,0.29,-0.05,0.27,0.04,1.0,0.32,0.84
deathRate,1.0,-0.05,0.13,-0.01,0.11,0.09,0.25,0.15,0.11
householdsWith65Rate,0.15,-0.02,0.22,0.19,0.27,0.02,0.32,1.0,0.35
householdIncomeMedian,0.13,-0.0,1.0,-0.1,-0.24,-0.24,0.29,0.22,0.26


In [155]:
# p values
# pVals = ozoneDF.corr(method=lambda x, y: stats.pearsonr(x, y)[1]) - np.eye(len(ozoneDF.corr().columns)) 
pVals

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
Arithmetic Mean,0.139,0.0,0.973,0.0,0.0,0.0,0.009,0.516,0.0
AverageDailyTemperature.data,0.829,0.0,0.005,0.0,0.0,0.051,0.172,0.0,0.118
gini,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hospitalStaffedBedsRate,0.026,0.0,0.0,0.051,0.0,0.0,0.297,0.655,0.047
density,0.003,0.0,0.0,0.118,0.0,0.047,0.0,0.0,0.0
publicTransRate,0.0,0.009,0.0,0.172,0.0,0.297,0.0,0.0,0.0
deathRate,0.0,0.139,0.0,0.829,0.003,0.026,0.0,0.0,0.003
householdsWith65Rate,0.0,0.516,0.0,0.0,0.0,0.655,0.0,0.0,0.0
householdIncomeMedian,0.0,0.973,0.0,0.005,0.0,0.0,0.0,0.0,0.0


PM2.5

In [151]:
cols = ['dates','deathRate','Arithmetic Mean','householdIncomeMedian','AverageDailyTemperature.data','gini','hospitalStaffedBedsRate','publicTransRate','householdsWith65Rate','density']
pm25Corr,pVals=covidPollutantFipsCorr(ratesDF, 'pm25', cols)


rows = ['deathRate','publicTransRate','density']
pm25Corr

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
deathRate,1.0,0.42,0.13,0.25,0.32,0.2,0.59,0.26,0.46
Arithmetic Mean,0.42,1.0,0.06,0.23,0.1,0.05,0.08,0.03,0.04
householdIncomeMedian,0.13,0.06,1.0,-0.06,-0.17,-0.2,0.21,0.23,0.18
AverageDailyTemperature.data,0.25,0.23,-0.06,1.0,0.3,0.16,0.02,0.32,0.15
gini,0.32,0.1,-0.17,0.3,1.0,0.4,0.31,0.29,0.38
hospitalStaffedBedsRate,0.2,0.05,-0.2,0.16,0.4,1.0,0.07,0.04,0.18
publicTransRate,0.59,0.08,0.21,0.02,0.31,0.07,1.0,0.3,0.81
householdsWith65Rate,0.26,0.03,0.23,0.32,0.29,0.04,0.3,1.0,0.34
density,0.46,0.04,0.18,0.15,0.38,0.18,0.81,0.34,1.0


In [144]:
# p values
# pVals = pm25DF.corr(method=lambda x, y: stats.pearsonr(x, y)[1]) - np.eye(len(pm25DF.corr().columns)) 
pVals

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
Arithmetic Mean,0.0,0.0,0.272,0.0,0.044,0.41,0.125,0.587,0.515
deathRate,0.0,0.0,0.011,0.0,0.0,0.0,0.0,0.0,0.0
AverageDailyTemperature.data,0.0,0.0,0.25,0.0,0.0,0.004,0.635,0.0,0.004
gini,0.0,0.044,0.001,0.0,0.0,0.0,0.0,0.0,0.0
publicTransRate,0.0,0.125,0.0,0.635,0.0,0.19,0.0,0.0,0.0
householdIncomeMedian,0.011,0.272,0.0,0.25,0.001,0.0,0.0,0.0,0.001
hospitalStaffedBedsRate,0.0,0.41,0.0,0.004,0.0,0.0,0.19,0.43,0.001
density,0.0,0.515,0.001,0.004,0.0,0.001,0.0,0.0,0.0
householdsWith65Rate,0.0,0.587,0.0,0.0,0.0,0.43,0.0,0.0,0.0


Lead

In [152]:
cols = ['dates','deathRate','Arithmetic Mean','householdIncomeMedian','AverageDailyTemperature.data','gini','hospitalStaffedBedsRate','publicTransRate','householdsWith65Rate','density']
leadCorr,pVals=covidPollutantFipsCorr(ratesDF, 'lead', cols)


rows = ['deathRate','publicTransRate','density']
leadCorr

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
deathRate,1.0,0.05,-0.21,0.11,0.12,0.06,0.19,0.15,0.13
Arithmetic Mean,0.05,1.0,-0.13,-0.01,-0.07,-0.06,-0.13,-0.07,-0.15
householdIncomeMedian,-0.21,-0.13,1.0,-0.05,-0.18,-0.35,0.36,0.26,0.35
AverageDailyTemperature.data,0.11,-0.01,-0.05,1.0,0.1,0.13,-0.1,0.16,-0.04
gini,0.12,-0.07,-0.18,0.1,1.0,0.47,0.41,0.29,0.39
hospitalStaffedBedsRate,0.06,-0.06,-0.35,0.13,0.47,1.0,0.03,-0.08,0.09
publicTransRate,0.19,-0.13,0.36,-0.1,0.41,0.03,1.0,0.3,0.96
householdsWith65Rate,0.15,-0.07,0.26,0.16,0.29,-0.08,0.3,1.0,0.22
density,0.13,-0.15,0.35,-0.04,0.39,0.09,0.96,0.22,1.0


In [146]:
# p values
# pVals = leadDF.corr(method=lambda x, y: stats.pearsonr(x, y)[1]) - np.eye(len(leadDF.corr().columns)) 
pVals

Unnamed: 0,deathRate,Arithmetic Mean,householdIncomeMedian,AverageDailyTemperature.data,gini,hospitalStaffedBedsRate,publicTransRate,householdsWith65Rate,density
Arithmetic Mean,0.659,0.0,0.272,0.931,0.583,0.612,0.273,0.548,0.238
density,0.313,0.238,0.004,0.727,0.001,0.481,0.0,0.073,0.0
householdIncomeMedian,0.086,0.272,0.0,0.674,0.133,0.004,0.002,0.032,0.004
publicTransRate,0.129,0.273,0.002,0.435,0.001,0.83,0.0,0.013,0.0
householdsWith65Rate,0.219,0.548,0.032,0.194,0.016,0.511,0.013,0.0,0.073
gini,0.335,0.583,0.133,0.392,0.0,0.0,0.001,0.016,0.001
hospitalStaffedBedsRate,0.606,0.612,0.004,0.285,0.0,0.0,0.83,0.511,0.481
deathRate,0.0,0.659,0.086,0.38,0.335,0.606,0.129,0.219,0.313
AverageDailyTemperature.data,0.38,0.931,0.674,0.0,0.392,0.285,0.435,0.194,0.727
