In [305]:
from platform import python_version

print(python_version())

3.6.12


In [306]:
cvid_file='us-counties.csv'
aqi_file='daily_aqi_by_county_2020.csv'
# thresh_pcntg = 0.6 # drop parameter if fraction of null is less than this fraction

## Read Covid-19 data for US

In [307]:
import pandas as pd

cvid=pd.read_csv(cvid_file)
cvid.county=cvid.county.str.lower()
cvidCounties = cvid.county.unique().tolist()

## Read AQI data

In [308]:
aqi=pd.read_csv(aqi_file)
aqi.City=aqi.City.str.lower()
aqiCounties=aqi.City.unique().tolist()
aqi=aqi.drop(columns=['State Code', 'County Code', 'Category', 'Number of Sites Reporting'])

## Consider Counties which have both Covid data and AQI info available

In [309]:
commonCounties=set(aqiCounties).intersection(cvidCounties)
len(commonCounties)

778

## Keep rows with Counties from Common Counties only

In [310]:
cvid=cvid[cvid.county.isin(commonCounties)]
aqi=aqi[aqi.City.isin(commonCounties)]

## Sync Dates (i.e. keep common dates only)

In [311]:
min(cvid.date), max(cvid.date), min(aqi.Date), max(aqi.Date)

('2020-01-21', '2020-12-04', '2020-01-01', '2020-11-04')

In [312]:
startdate=max(min(cvid.date), min(aqi.Date))
enddate=min(max(cvid.date), max(aqi.Date))
startdate, enddate

('2020-01-21', '2020-11-04')

In [313]:
cvid=cvid[cvid.date.between(startdate, enddate, inclusive=True)]
aqi=aqi[aqi.Date.between(startdate, enddate, inclusive=True)]

## Split dataframe by County name
## Sync rows between cvid and aqi for each county

## FillNa Method

In [330]:
cvidByCounty={county:df for county, df in cvid.groupby('county')}
aqiByCounty={county:df for county, df in aqi.groupby('City')}

cvid_aqiByCounty={}

for county, cvidCounty in cvidByCounty.items():
    
    aqiCounty=aqiByCounty[county]
    
    aqiCountyDates=set(aqiCounty.Date)
    cvidCountyDates=set(cvidCounty.date)
    
    # filtering common dates only
    commonCountyDates=aqiCountyDates.intersection(cvidCountyDates)
    cvidByCounty[county]=cvidCounty[cvidCounty.date.isin(commonCountyDates)]
    aqiByCounty[county]=aqiCounty[aqiCounty.Date.isin(commonCountyDates)]  
    
    
    cvidByCounty[county]=cvidByCounty[county].groupby('date').agg({'cases': 'mean', 
                                            'date': lambda x: pd.unique(x)}).reset_index(drop=True)    
    
    
    aqiByCounty[county]=aqiByCounty[county].drop_duplicates(subset = 'Date', keep = 'last')
    aqiByCounty[county]=aqiByCounty[county].pivot(index='Date', columns='Defining Parameter', values='AQI')
    aqiByCounty[county].reset_index(inplace=True)

    if not len(cvidByCounty[county]):
        continue    
    
    aqiByCounty[county]=aqiByCounty[county].sort_values(by=['Date'])
    cvidByCounty[county]=cvidByCounty[county].sort_values(by=['date'])
    
    print(county, len(cvidByCounty[county]), len(aqiByCounty[county]))
    
    cvid_aqiByCounty[county]=aqiByCounty[county].copy()
    cvid_aqiByCounty[county]['cases']=cvidByCounty[county]['cases'].tolist()
    
    cvid_aqiByCounty[county]=cvid_aqiByCounty[county].fillna(cvid_aqiByCounty[county].mean())
    print(cvid_aqiByCounty[county].isna().sum())
    cvid_aqiByCounty[county].reset_index(drop=True, inplace=True)

ada 111 111
Defining Parameter
Date     0
CO       0
NO2      0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
adair 104 104
Defining Parameter
Date     0
CO       0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
adams 216 216
Defining Parameter
Date     0
CO       0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
aiken 179 179
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
alachua 113 113
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
alameda 155 155
Defining Parameter
Date     0
CO       0
Ozone    0
PM2.5    0
cases    0
dtype: int64
albany 203 203
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
albemarle 190 190
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
alexander 185 185
Defining Parameter
Date     0
Ozone    0
PM10     0
cases    0
dtype: int64
alexandria city 55 55
Defining Parameter
Date     0
PM10     0
cases    0
dtype: int64
alle

buncombe 194 194
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
burke 191 191
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
SO2      0
cases    0
dtype: int64
burleigh 198 198
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
butler 202 202
Defining Parameter
Date     0
Ozone    0
PM2.5    0
SO2      0
cases    0
dtype: int64
butte 164 164
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
cabell 186 186
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
cache 228 228
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
caddo 192 192
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
caguas 50 50
Defining Parameter
Date     0
CO       0
NO2      0
cases    0
dtype: int64
calaveras 129 129
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
calcasieu 185 185
Defining Parameter
Date    

darlington 164 164
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
dauphin 163 163
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
davidson 173 173
Defining Parameter
Date     0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
daviess 195 195
Defining Parameter
Date     0
Ozone    0
PM2.5    0
SO2      0
cases    0
dtype: int64
davis 240 240
Defining Parameter
Date     0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
dawson 195 195
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
dekalb 207 207
Defining Parameter
Date     0
CO       0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
del norte 88 88
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
delaware 240 240
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
delta 99 99
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
denton 139 139
Defining Parameter
Date     0
Ozone

greenup 87 87
Defining Parameter
Date     0
Ozone    0
SO2      0
cases    0
dtype: int64
greenville 200 200
Defining Parameter
Date     0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
gregg 112 112
Defining Parameter
Date     0
Ozone    0
SO2      0
cases    0
dtype: int64
guilford 197 197
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
gunnison 165 165
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
gwinnett 208 208
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
hall 228 228
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
hamilton 203 203
Defining Parameter
Date     0
CO       0
Ozone    0
PM2.5    0
cases    0
dtype: int64
hampden 150 150
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
hampshire 141 141
Defining Parameter
Date     0
NO2      0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
hampton city 189 189
Defining Parameter
Date     0
CO   

lafayette 194 194
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
lafourche 201 201
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
lake 235 235
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
SO2      0
cases    0
dtype: int64
lancaster 174 174
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
lane 198 198
Defining Parameter
Date     0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
laporte 232 232
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
laramie 107 107
Defining Parameter
Date     0
CO       0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
larimer 202 202
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
latah 91 91
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
lauderdale 190 190
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
lawrence 195 195
Defining Parameter
D

monongalia 193 193
Defining Parameter
Date     0
Ozone    0
PM2.5    0
SO2      0
cases    0
dtype: int64
monroe 207 207
Defining Parameter
Date     0
CO       0
Ozone    0
PM2.5    0
cases    0
dtype: int64
monterey 107 107
Defining Parameter
Date     0
CO       0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
montezuma 156 156
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
montgomery 241 241
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
morgan 195 195
Defining Parameter
Date     0
Ozone    0
SO2      0
cases    0
dtype: int64
morris 111 111
Defining Parameter
Date     0
NO2      0
Ozone    0
cases    0
dtype: int64
multnomah 124 124
Defining Parameter
Date     0
CO       0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
murray 184 184
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
muscatine 227 227
Defining Parameter
Date     0
PM10     0
PM2.5    0
SO2      0
cases    0
dtype: int64
muscogee 195 195

providence 191 191
Defining Parameter
Date     0
CO       0
NO2      0
Ozone    0
PM2.5    0
cases    0
dtype: int64
prowers 159 159
Defining Parameter
Date     0
PM10     0
cases    0
dtype: int64
pueblo 44 44
Defining Parameter
Date     0
PM10     0
PM2.5    0
cases    0
dtype: int64
pulaski 204 204
Defining Parameter
Date     0
CO       0
Ozone    0
PM2.5    0
cases    0
dtype: int64
putnam 194 194
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
racine 93 93
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
ramsey 117 117
Defining Parameter
Date     0
PM10     0
PM2.5    0
cases    0
dtype: int64
randall 158 158
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
randolph 195 195
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
rapides 34 34
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
ravalli 133 133
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
richland 199 199

stafford 118 118
Defining Parameter
Date     0
Ozone    0
PM10     0
cases    0
dtype: int64
stanislaus 204 204
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
stark 204 204
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
stearns 111 111
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
steuben 195 195
Defining Parameter
Date     0
CO       0
Ozone    0
PM2.5    0
cases    0
dtype: int64
stevens 165 165
Defining Parameter
Date     0
PM10     0
PM2.5    0
cases    0
dtype: int64
sublette 157 157
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
suffolk 243 243
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
suffolk city 193 193
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
sullivan 113 113
Defining Parameter
Date     0
Ozone    0
PM2.5    0
SO2      0
cases    0
dtype: int64
summit 202 202
Defining Parameter
Date     0
Ozone    0
PM2.5    0
case

williamson 117 117
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
wilson 108 108
Defining Parameter
Date     0
NO2      0
cases    0
dtype: int64
winchester city 30 30
Defining Parameter
Date     0
PM10     0
cases    0
dtype: int64
windham 156 156
Defining Parameter
Date     0
Ozone    0
cases    0
dtype: int64
winnebago 201 201
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
wood 199 199
Defining Parameter
Date     0
Ozone    0
PM2.5    0
SO2      0
cases    0
dtype: int64
woodbury 65 65
Defining Parameter
Date     0
PM2.5    0
cases    0
dtype: int64
worcester 152 152
Defining Parameter
Date     0
NO2      0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
wright 103 103
Defining Parameter
Date     0
Ozone    0
PM2.5    0
cases    0
dtype: int64
wyandotte 112 112
Defining Parameter
Date     0
CO       0
NO2      0
Ozone    0
PM10     0
PM2.5    0
cases    0
dtype: int64
wyoming 166 166
Defining Parameter
Date     0
PM2.5    0
cas

## Initialize a model for each county

In [341]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
model=LinearRegression()

In [369]:
df=pd.concat(list(cvid_aqiByCounty.values()))
df=df.sample(frac=0.7)

In [370]:
df.fillna(df.mean(), inplace=True)
df.isnull().sum()

Date     0
CO       0
NO2      0
Ozone    0
PM10     0
PM2.5    0
cases    0
SO2      0
dtype: int64

In [381]:
X, y = df.drop(columns=['cases', 'Date']), df.cases
y=y.to_numpy().reshape(-1, 1)

In [382]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

poly=PolynomialFeatures(2)
scaler=MinMaxScaler()
X_train=poly.fit_transform(X_train)

X_test = poly.transform(X_test)

In [383]:
from sklearn.metrics import accuracy_score, r2_score
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
print(r2_score(y_test, y_pred))

0.19061197015400355
