# Notebook for preparing redesign of data management

First, we need to make sure we are in the root directory of the project so that the custom modules can be imported. The notebooks themselves are stored in `notebooks/` for cleanliness, so we move up one directory.

In [10]:
pwd

'/Users/DanOvadia/Projects/covid-hotspots'

In [3]:
cd ..

/Users/DanOvadia/Projects/covid-hotspots


### Python Libraries

In [9]:
import os
import pandas as pd

In [22]:
from urllib.request import urlopen
import requests
import json

In [None]:
import time

In [24]:
import plotly.express as px

### Custom Modules

In [12]:
from modules import data_processing
from modules import plotting
from config import config

# Extension to auto reload custom modules
%load_ext autoreload

%autoreload 1

%aimport modules.data_processing
%aimport modules.plotting

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Importing

### County Census Data

In [None]:
# Get Census Data
CENSUS_COUNTY_DF = data_processing.get_census_county_data()

### County GeoJson - polygons for choropleth plot

We are getting these data from [plotly](https://plotly.com/python/mapbox-county-choropleth/).

In [None]:
# Get county geojson
COVID_GEOJSON = data_processing.load_county_geojson()

### County Coronavirus Data
We import county-level coronavirus data from the [New York Times GitHub repository](https://github.com/nytimes/covid-19-data).

In [None]:
# Get county data
COVID_COUNTIES_DF = data_processing.get_covid_county_data()

### State Coronavirus Data

We get state-level data from The Atlantic's [Covid Tracking Project](https://covidtracking.com/) through its [Data API](https://covidtracking.com/data/api).

In [15]:
# App Engine default service account credentials.
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file for this
# Python session, unless the variable is already set in the environment.
if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
    print(f"Credentials: {'GOOGLE_APPLICATION_CREDENTIALS' in os.environ}. Setting environment variable.")
    # Retrieve the name of the file from config.py
    CONFIG_FILENAME = config.service_account_credentials_file

    # Build the path from the current working directory instead of hard-coding
    # one machine's home directory. The notebook changes into the project root
    # near the top (`cd ..`), so the key file is expected under ./config/.
    CONFIG_PATH = os.path.join(os.getcwd(), "config", CONFIG_FILENAME)

    # Assign the environment variable for this session of python
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = CONFIG_PATH
else:
    print(f"Credentials: {'GOOGLE_APPLICATION_CREDENTIALS' in os.environ}. Proceeding.")

Credentials: False. Setting environment variable.


In [16]:
%%time
# Get state data
# NOTE(review): the meaning of cache_mode = 3 is defined in data_processing, not
# in this notebook. The recorded run printed "Pulling state data from Cloud Storage",
# so it presumably selects the Cloud Storage copy — confirm against the module.
COVID_STATES_DF = data_processing.get_covid_state_data(cache_mode = 3)

Pulling state data from Cloud Storage
gs://us_covid_hotspot-bucket/covid_states.csv.gz
CPU times: user 607 ms, sys: 177 ms, total: 784 ms
Wall time: 4.73 s


In [54]:

# Define the endpoint in this cell so it runs on a fresh kernel; `states_url`
# was otherwise only assigned in cells further down the notebook.
states_url = "https://covidtracking.com/api/states/daily"
response = requests.get(states_url)

In [62]:
type(response)

requests.models.Response

In [61]:
response.close()

In [68]:
%%time
# Decode the JSON payload once and reuse it; the original decoded it twice, the
# second time only to size an explicit index. from_records already assigns the
# default 0..n-1 labels, so no index argument is needed.
states_records = response.json()
df = pd.DataFrame.from_records(states_records)

CPU times: user 570 ms, sys: 45.5 ms, total: 616 ms
Wall time: 645 ms


In [69]:
df

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,20200830,AK,6086.0,340753.0,,41.0,,,,8.0,...,346839,0,0,23a6d4e06222fff29540cd66f84b3a0085967dc5,0,0,0,0,0,
1,20200830,AL,125235.0,855827.0,,969.0,14267.0,,1467.0,,...,981062,10,0,e134256a7879f82784279693c5236970f5fad17b,0,0,0,0,0,
2,20200830,AR,60856.0,651831.0,,391.0,4182.0,,,84.0,...,712687,12,40,933863f59e9ce543a3f40713964edc94c30893a2,0,0,0,0,0,
3,20200830,AS,0.0,1514.0,,,,,,,...,1514,0,0,2556ceee23885cd503a0a92fa37b9fe852084e41,0,0,0,0,0,
4,20200830,AZ,201661.0,997780.0,,757.0,21421.0,263.0,,161.0,...,1199441,23,-12,a232b57dd11d842b04896f6d263d07ab55773faa,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10004,20200126,WA,0.0,0.0,,,,,,,...,0,0,0,d071f52c7b741eb47e42b29ec43d1fd5e5669698,0,0,0,0,0,
10005,20200125,WA,0.0,0.0,,,,,,,...,0,0,0,626988ec4dfe62f38d72dc25126a264411769056,0,0,0,0,0,
10006,20200124,WA,0.0,0.0,,,,,,,...,0,0,0,6f40087f42d06db4121e09b184785b4110cd4df8,0,0,0,0,0,
10007,20200123,WA,0.0,0.0,,,,,,,...,0,0,0,978c05d8a7a9d46e9fa826d83215f5b9732f2c6d,0,0,0,0,0,


In [64]:
response.json()[0]['state']

'AK'

In [43]:
response.close()

In [36]:
%%time
# Download the daily state records with urllib, decode the JSON body while the
# connection is open, then build the DataFrame from the decoded payload.
states_url = "https://covidtracking.com/api/states/daily"
with urlopen(states_url) as response:
    states_payload = json.load(response)
covid_states_df = pd.DataFrame(states_payload)

CPU times: user 525 ms, sys: 175 ms, total: 700 ms
Wall time: 10.1 s


In [52]:
# Use a context manager so the HTTP connection is closed even if decoding the
# JSON body raises; `response` stays bound (now closed) after the block.
with urlopen(states_url) as response:
    a = json.load(response)

In [53]:
type(a)

list

In [None]:
json.load(response)

In [50]:
type(response)

http.client.HTTPResponse

In [33]:
%%time
# Download the same endpoint through requests and build the DataFrame from the
# decoded JSON once the response context has been released.
states_url = "https://covidtracking.com/api/states/daily"

with requests.get(states_url) as response:
    decoded_states = response.json()
covid_states_df = pd.DataFrame(decoded_states)

CPU times: user 545 ms, sys: 57.9 ms, total: 603 ms
Wall time: 1.69 s


In [30]:
covid_states_df

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,20200830,AK,6086.0,340753.0,,41.0,,,,8.0,...,346839,0,0,23a6d4e06222fff29540cd66f84b3a0085967dc5,0,0,0,0,0,
1,20200830,AL,125235.0,855827.0,,969.0,14267.0,,1467.0,,...,981062,10,0,e134256a7879f82784279693c5236970f5fad17b,0,0,0,0,0,
2,20200830,AR,60856.0,651831.0,,391.0,4182.0,,,84.0,...,712687,12,40,933863f59e9ce543a3f40713964edc94c30893a2,0,0,0,0,0,
3,20200830,AS,0.0,1514.0,,,,,,,...,1514,0,0,2556ceee23885cd503a0a92fa37b9fe852084e41,0,0,0,0,0,
4,20200830,AZ,201661.0,997780.0,,757.0,21421.0,263.0,,161.0,...,1199441,23,-12,a232b57dd11d842b04896f6d263d07ab55773faa,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10004,20200126,WA,0.0,0.0,,,,,,,...,0,0,0,d071f52c7b741eb47e42b29ec43d1fd5e5669698,0,0,0,0,0,
10005,20200125,WA,0.0,0.0,,,,,,,...,0,0,0,626988ec4dfe62f38d72dc25126a264411769056,0,0,0,0,0,
10006,20200124,WA,0.0,0.0,,,,,,,...,0,0,0,6f40087f42d06db4121e09b184785b4110cd4df8,0,0,0,0,0,
10007,20200123,WA,0.0,0.0,,,,,,,...,0,0,0,978c05d8a7a9d46e9fa826d83215f5b9732f2c6d,0,0,0,0,0,


In [7]:
COVID_STATES_DF.head()

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,density,lat,long,Lived,Standard,fips_y,case_pm,death_pm,deaths_14MA,cases_14MA
0,2020-08-30,AK,6086.0,340753.0,,41.0,,,,8.0,...,1.2863,63.59,-154.49,27.0,1.0,2.0,8291.530541,50.408582,,
1,2020-08-30,AL,125235.0,855827.0,,969.0,14267.0,,1467.0,,...,96.9221,32.32,-86.9,93.0,37.0,1.0,25513.275521,440.449568,,
2,2020-08-30,AR,60856.0,651831.0,,391.0,4182.0,,,84.0,...,58.403,35.2,-91.83,68.0,22.0,5.0,20025.014816,257.979683,,
3,2020-08-30,AS,0.0,1514.0,,,,,,,...,,,,,,,,,,
4,2020-08-30,AZ,201661.0,997780.0,,757.0,21421.0,263.0,,161.0,...,64.955,34.05,-111.09,125.0,23.0,4.0,27330.916038,681.710929,,


In [73]:
COVID_STATES_DF['date'] = pd.to_datetime(COVID_STATES_DF['date'], format = '%Y-%m-%d')

In [None]:
time.strftime('%Y-%m-%d',COVID_STATES_DF['date'][0].timetuple())

In [None]:
# time.localtime() expects seconds since the epoch, not a pandas Timestamp, so
# the original call raised TypeError. Format the value directly instead.
# Assumes the 'date' column already holds Timestamps (converted with
# pd.to_datetime earlier in the notebook).
COVID_STATES_DF['date'][0].strftime('%Y-%m-%d')

In [47]:
COVID_STATES_DF.columns

Index(['date', 'state', 'positive', 'negative', 'pending',
       'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'recovered', 'dataQualityGrade', 'lastUpdateEt', 'dateModified',
       'checkTimeEt', 'death', 'hospitalized', 'dateChecked',
       'totalTestsViral', 'positiveTestsViral', 'negativeTestsViral',
       'positiveCasesViral', 'deathConfirmed', 'deathProbable',
       'totalTestEncountersViral', 'totalTestsPeopleViral',
       'totalTestsAntibody', 'positiveTestsAntibody', 'negativeTestsAntibody',
       'totalTestsPeopleAntibody', 'positiveTestsPeopleAntibody',
       'negativeTestsPeopleAntibody', 'totalTestsPeopleAntigen',
       'positiveTestsPeopleAntigen', 'totalTestsAntigen',
       'positiveTestsAntigen', 'fips_x', 'positiveIncrease',
       'negativeIncrease', 'total', 'totalTestResultsSource',
       'totalTestResults', 'totalTestResultsIncrease', 'posNeg',
    

In [72]:
COVID_STATES_DF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10009 entries, 0 to 10008
Data columns (total 66 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         10009 non-null  int64  
 1   state                        10009 non-null  object 
 2   positive                     9970 non-null   float64
 3   negative                     9829 non-null   float64
 4   pending                      1194 non-null   float64
 5   hospitalizedCurrently        7313 non-null   float64
 6   hospitalizedCumulative       5555 non-null   float64
 7   inIcuCurrently               3947 non-null   float64
 8   inIcuCumulative              1545 non-null   float64
 9   onVentilatorCurrently        3389 non-null   float64
 10  onVentilatorCumulative       546 non-null    float64
 11  recovered                    6566 non-null   float64
 12  dataQualityGrade             9857 non-null   object 
 13  lastUpdateEt    

In [82]:
COVID_STATES_DF['case_pm'].mean()

5914.295166039732

In [1]:
## %%time
# Keep rows dated 2020-03-01 or later, order them by ascending date, and hand
# them to plotting.plot_animation together with the 'positiveIncrease' column name.
date_mask = COVID_STATES_DF['date'] >= '2020-03-01'
states_by_date = COVID_STATES_DF[date_mask].sort_values(by='date', ascending=True)
plotting.plot_animation(states_by_date, 'positiveIncrease')

NameError: name 'COVID_STATES_DF' is not defined

In [None]:
px.choropleth()

Let's check for null FIPS codes in the New York Times data.

In [None]:
# Select the rows for one snapshot date whose FIPS code is missing or shorter
# than 5 characters.
date_mask = (COVID_COUNTIES_DF['date'] == '2020-08-28')

# .str.len() yields NaN for a real null instead of raising TypeError the way
# len() does on a float NaN; fillna(0) then counts those nulls as "too short".
# Assumes the 'fips' column holds strings — TODO confirm against data_processing.
fips_error_mask = (COVID_COUNTIES_DF['fips'].str.len().fillna(0) < 5)

# Filter once, report the count, then display the offending rows.
fips_error_rows = COVID_COUNTIES_DF[fips_error_mask & date_mask]
print(len(fips_error_rows))
fips_error_rows

We have 30 counties that have null FIPS codes. Some of them even have null county names. Not sure how to deal with this for the dashboard. New York City, for example, is a single entry that combines the five boroughs to represent the whole city.

------