# CS109b Final Project: 
# Air Pollution Exposure and COVID-19 Mortality in the U.S.

## Import libraries

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## Load and clean data

In [2]:
data = pd.read_csv("./../PM_COVID-master/processed_data_04-24-2020.csv")

In [3]:
#load data
#data = pd.read_csv('https://raw.githubusercontent.com/CS109b-Team37/Pollution-Prediction/master/PM_COVID-master/processed_data_04-24-2020.csv')

In [4]:
#rename older_pecent to older_percent
data = data.rename(columns={'older_pecent': 'older_percent'})

#drop columns
cols = list(data.columns)
cols.remove('Unnamed: 0') #just a column of integers 1-21560
#cols.remove('Province_State') #redundant information; already captured by 'state'
cols.remove('Country_Region') #only US
cols.remove('Combined_Key') #redundant information; already captured by 'Province_State' and 'Admin2'
cols.remove('year.x') #only 2016
cols.remove('year.y') #only 2012 and nan
cols.remove('Population') #'older_pecent' was calculated by Population/older_Population
cols.remove('older_Population') #'older_pecent' was calculated by Population/older_Population
cols.remove('date') #only 20200502
cols.remove('hash') #useless information
cols.remove('dateChecked') #only '2020-05-02T20:00:00Z'
cols.remove('Abbrev') #redundant information; already captured by 'state'
cols.remove('total') #almost a repeat of 'totalTestResults'
cols.remove('Recovered') #only 0
data = data[cols]


In [5]:
data['Last_Update'].unique()

array(['2020-04-25 06:30:53', '3/30/20 22:52', '2020-04-25 06:31:05',
       '2020-04-23 00:00:00', '2020-04-24 00:00:00'], dtype=object)

In [6]:
#convert Last_Update to binary variable
convert_dict = {'2020-05-03 02:32:28': 0, '3/30/20 22:52': 1}
data = data.replace({'Last_Update': convert_dict})


In [7]:
#variables with NA values
print('Variables with NA values:')
display(data.isna().sum()[data.isna().sum() > 0])

#remove variables with many NA values
na_vars = list(data.isna().sum()[data.isna().sum() > 50].index) #variables with many NA values
data = data[set(cols) - set(na_vars)] #final cleaned data

Variables with NA values:


smoke_rate                 867
mean_bmi                   867
Crude.Rate                   1
older_percent                1
pending                   2819
hospitalizedCurrently      898
hospitalizedCumulative    1284
inIcuCurrently            1808
inIcuCumulative           2608
onVentilatorCurrently     2155
onVentilatorCumulative    2944
recovered                 1479
hospitalized              1284
beds                       811
dtype: int64

In [8]:
#both NA values are for Loving, Texas
null_data = data[data.isnull().any(axis=1)]
print('Rows with NA values:')
display(null_data)

#fill in NA values for 'Crude.Rate' and 'older_percent' with state average
values = {'Crude.Rate': data.groupby('state').mean()['Crude.Rate']['TX'], 'older_percent': data.groupby('state').mean()['older_percent']['TX']}
data = data.fillna(value=values)

Rows with NA values:


Unnamed: 0,population_frac_county,negative,Last_Update,positiveIncrease,Province_State,Lat,Confirmed,Admin2,pct_asian,mean_summer_rm,Crude.Rate,pct_native,mean_winter_rm,population,Deaths,deathIncrease,fips,pct_blk,positive,mean_summer_temp,older_percent,hospitalizedIncrease,totalTestResults_county,q_popdensity,poverty,medianhousevalue,mean_winter_temp,negativeIncrease,posNeg,education,Long_,medhouseholdincome,death,pct_owner_occ,totalTestResults,totalTestResultsIncrease,state,Active,pct_white,mean_pm25,hispanic,popdensity
2635,2e-06,219741,1,862,Texas,31.849476,0,Loving,0.0,73.844694,,0.047619,72.837808,63,0,32,48301,0.0,22806,309.483185,,0,0.567672,1,0.631579,89040.0,290.213523,16607,242547,0.526316,-103.581857,55625.0,593,0.485714,242547,17469,TX,0,0.857143,5.685412,0.142857,0.395035


## Modeling

In [11]:
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
df_nytimes = pd.read_csv(url,index_col=0,parse_dates=[0])

In [17]:
df_nytimes.shape

(118343, 6)

In [29]:
df_nytimes.tail()

Unnamed: 0_level_0,county,state,fips,cases,deaths,fips_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-05,Sublette,Wyoming,56035.0,1,0,56035.0
2020-05-05,Sweetwater,Wyoming,56037.0,11,0,56037.0
2020-05-05,Teton,Wyoming,56039.0,67,1,56039.0
2020-05-05,Uinta,Wyoming,56041.0,6,0,56041.0
2020-05-05,Washakie,Wyoming,56043.0,4,0,56043.0


In [25]:
data.head()

Unnamed: 0,population_frac_county,negative,Last_Update,positiveIncrease,Province_State,Lat,Confirmed,Admin2,pct_asian,mean_summer_rm,Crude.Rate,pct_native,mean_winter_rm,population,Deaths,deathIncrease,fips,pct_blk,positive,mean_summer_temp,older_percent,hospitalizedIncrease,totalTestResults_county,q_popdensity,poverty,medianhousevalue,mean_winter_temp,negativeIncrease,posNeg,education,Long_,medhouseholdincome,death,pct_owner_occ,totalTestResults,totalTestResultsIncrease,state,Active,pct_white,mean_pm25,hispanic,popdensity,fips_new
0,0.011115,46863,2020-04-25 06:30:53,54,Alabama,32.539527,36,Autauga,0.004005,96.055417,859.3,0.001698,85.651845,53708,2,0,1001,0.263016,5832,306.023451,0.119383,0,585.689142,3,0.045571,105021.111111,288.085091,0,52695,0.286126,-86.644082,46433.833333,197,0.771371,52695,54,AL,34,0.708365,11.712587,0.016285,201.040355,1001
1,0.041208,46863,2020-04-25 06:30:53,54,Alabama,30.72775,147,Baldwin,0.004191,97.971544,976.2,0.010138,89.730972,199123,3,0,1003,0.097385,5832,305.516633,0.169574,0,2171.448929,3,0.094227,158367.15873,290.208861,0,52695,0.174485,-87.722071,45903.377778,197,0.784619,52695,54,AL,144,0.866799,10.077723,0.031128,217.415627,1003
2,0.005236,46863,2020-04-25 06:30:53,54,Alabama,31.868263,32,Barbour,0.002032,97.371675,1040.9,0.001576,88.633572,25303,0,0,1005,0.474788,5832,306.062249,0.143296,0,275.930818,2,0.210081,82200.0,289.242107,0,52695,0.396694,-85.387129,33020.25,197,0.696278,52695,54,AL,32,0.485013,10.981967,0.039705,33.8891,1005
3,0.004697,46863,2020-04-25 06:30:53,54,Alabama,32.996421,34,Bibb,0.000534,96.293077,1028.8,0.006186,86.485866,22696,0,0,1007,0.133431,5832,305.982177,0.129414,0,247.501318,2,0.091389,99318.561766,287.362832,0,52695,0.269033,-87.125115,43257.0,197,0.803955,52695,54,AL,34,0.850112,11.998715,0.058361,39.086779,1007
4,0.009112,46863,2020-04-25 06:30:53,54,Alabama,33.982109,31,Blount,0.001501,94.630949,993.7,0.002941,85.449139,44029,0,0,1009,0.009831,5832,305.178865,0.146498,0,480.139034,4,0.086405,103779.411765,285.565676,0,52695,0.356165,-86.567906,41491.083333,197,0.700122,52695,54,AL,31,0.971845,11.793023,0.178954,282.151911,1009


In [43]:
df_nytimes.loc[df_nytimes['fips'].isnull()==True]

Unnamed: 0_level_0,county,state,fips,cases,deaths,fips_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-01,New York City,New York,,1,0,00nan
2020-03-01,Unknown,Rhode Island,,2,0,00nan
2020-03-02,New York City,New York,,1,0,00nan
2020-03-02,Unknown,Rhode Island,,2,0,00nan
2020-03-03,New York City,New York,,2,0,00nan
...,...,...,...,...,...,...
2020-05-05,Unknown,Rhode Island,,1414,36,00nan
2020-05-05,Unknown,Utah,,0,3,00nan
2020-05-05,Unknown,Vermont,,4,0,00nan
2020-05-05,Unknown,Virgin Islands,,66,4,00nan


In [41]:
data['fips_new']=data['fips'].astype(int).apply(lambda x: '{0:0>5}'.format(x)) #add leading zeros to fips code
#data['fips_new']=data['fips'].astype(int)
df_nytimes['fips_new']=df_nytimes['fips'].astype(int)
df_nytimes['fips_new']=df_nytimes['fips_new'].astype(object)
df_nytimes['fips_new'] = df_nytimes['fips_new'].apply(lambda x: '{0:0>5}'.format(x)) #add leading zeros to fips code


ValueError: Cannot convert non-finite values (NA or inf) to integer

In [38]:
df_nytimes.dtypes

county       object
state        object
fips        float64
cases         int64
deaths        int64
fips_new     object
dtype: object

In [39]:
data.dtypes

population_frac_county      float64
negative                      int64
Last_Update                  object
positiveIncrease              int64
Province_State               object
Lat                         float64
Confirmed                     int64
Admin2                       object
pct_asian                   float64
mean_summer_rm              float64
Crude.Rate                  float64
pct_native                  float64
mean_winter_rm              float64
population                    int64
Deaths                        int64
deathIncrease                 int64
fips                          int64
pct_blk                     float64
positive                      int64
mean_summer_temp            float64
older_percent               float64
hospitalizedIncrease          int64
totalTestResults_county     float64
q_popdensity                  int64
poverty                     float64
medianhousevalue            float64
mean_winter_temp            float64
negativeIncrease            

In [37]:
df_joined = df_nytimes.join(data, on ='fips_new', how = 'left')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat