In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
from datetime import date
import geopandas as gpd
import pydeck as pdk
from pydeck.types import String
import json
import altair as alt

Analysing cases by states

In [2]:
#load geojson files for the states
cur_json = json.load(open('geojson-data/us_states_20m.json', encoding='ISO-8859-1'))
path,ext = os.path.splitext('geojson-data/us_states_20m.json')
new_path =path+"_new"+ext
with open(new_path,"w", encoding='utf-8') as jsonfile:
        json.dump(cur_json,jsonfile,ensure_ascii=False)
us_state = gpd.read_file(new_path, driver='GeoJSON')

#process the table and get lattitude and longitudes
us_state = us_state.sort_values(by='NAME')
us_state['coordinate'] = us_state.geometry.centroid
us_state['long'] = us_state.coordinate.apply(lambda p:p.x)
us_state['lat'] = us_state.coordinate.apply(lambda p:p.y)

In [3]:
us_state.head()

Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA,geometry,coordinate,long,lat
25,0400000US01,1,Alabama,,50645.326,"MULTIPOLYGON (((-88.12466 30.28364, -88.08681 ...",POINT (-86.82817 32.78987),-86.828173,32.789867
26,0400000US02,2,Alaska,,570640.95,"MULTIPOLYGON (((-162.25503 54.97835, -162.2496...",POINT (-152.18843 64.21002),-152.188432,64.210016
0,0400000US04,4,Arizona,,113594.084,"POLYGON ((-112.53859 37.00067, -112.53454 37.0...",POINT (-111.66498 34.29348),-111.664982,34.293481
1,0400000US05,5,Arkansas,,52035.477,"POLYGON ((-94.04296 33.01922, -94.04304 33.079...",POINT (-92.44133 34.90050),-92.441327,34.900502
2,0400000US06,6,California,,155779.22,"MULTIPOLYGON (((-120.24848 33.99933, -120.2473...",POINT (-119.61041 37.24589),-119.610409,37.245893


In [4]:
#drop unnecessary columns
try:
    us_state.drop(['GEO_ID', 'STATE', 'LSAD', 'CENSUSAREA', 'coordinate'], axis=1, inplace=True)
except:
    print('Aready removed the columns')
us_state.head()

Unnamed: 0,NAME,geometry,long,lat
25,Alabama,"MULTIPOLYGON (((-88.12466 30.28364, -88.08681 ...",-86.828173,32.789867
26,Alaska,"MULTIPOLYGON (((-162.25503 54.97835, -162.2496...",-152.188432,64.210016
0,Arizona,"POLYGON ((-112.53859 37.00067, -112.53454 37.0...",-111.664982,34.293481
1,Arkansas,"POLYGON ((-94.04296 33.01922, -94.04304 33.079...",-92.441327,34.900502
2,California,"MULTIPOLYGON (((-120.24848 33.99933, -120.2473...",-119.610409,37.245893


In [5]:
#load csv with statewise covid-19 cases
us_state_df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
#only take data from 2021 onwards
#and remove some state as they are not availabe in the Geojson data
us_state_df = us_state_df.loc[16679:]
us_state_df = us_state_df[~((us_state_df.state == 'Guam').values | (us_state_df.state == 'Northern Mariana Islands').values | (us_state_df.state == 'Virgin Islands').values | (us_state_df.state == 'American Samoa').values)]
us_state_df.head()


Unnamed: 0,date,state,fips,cases,deaths
16679,2020-12-31,Alabama,1,361226,4827
16680,2020-12-31,Alaska,2,46740,198
16681,2020-12-31,Arizona,4,523829,8879
16682,2020-12-31,Arkansas,5,225138,3676
16683,2020-12-31,California,6,2307860,25965


In [6]:
#group the table by dates so that we can view cases over time
us_state_grouped_df = us_state_df.groupby(by=['date', 'state']).sum()

In [7]:
#add daily cases and deaths columns from the cumulative cases and deaths columns
start_date = datetime.date(2020, 12, 31)
end_date = datetime.date(2022, 1, 29)
delta = datetime.timedelta(days=1)
us_state_grouped_df['daily_cases'] = -1 
us_state_grouped_df['daily_deaths'] = -1 
daily_total_results = pd.DataFrame()
while start_date < end_date:
    curr_date = start_date + delta
    prev_date = start_date
    cases = (us_state_grouped_df.loc[curr_date.isoformat(), 'cases'] - us_state_grouped_df.loc[prev_date.isoformat(), 'cases' ]).values
    deaths = (us_state_grouped_df.loc[curr_date.isoformat(), 'deaths'] - us_state_grouped_df.loc[prev_date.isoformat(), 'deaths' ]).values
    us_state_grouped_df.loc[curr_date.isoformat(), 'daily_cases'] = cases
    us_state_grouped_df.loc[curr_date.isoformat(), 'daily_deaths'] = deaths
    daily_total_results.loc[curr_date.isoformat(), 'cases'] = cases.sum()
    daily_total_results.loc[curr_date.isoformat(), 'deaths'] = deaths.sum()
    start_date = curr_date

daily_total_results.index.name ='date'
daily_total_results.reset_index().head()

Unnamed: 0,date,cases,deaths
0,2021-01-01,147234.0,1920.0
1,2021-01-02,291604.0,2372.0
2,2021-01-03,201779.0,1351.0
3,2021-01-04,251813.0,2049.0
4,2021-01-05,235003.0,3689.0


In [8]:
#save the files to load directly into the web app
us_state_grouped_df.to_csv('covid-data/us_state_cases_grouped.csv')
daily_total_results.to_csv('covid-data/us_daily_total_results.csv')

In [9]:
#create a dictionary of state fip ID to state name
#this will be used later to convert state fip ID to state name
state_fips = us_state_df[['state','fips']]
state_dict = state_fips.drop_duplicates().reset_index(drop=True).set_index('fips').to_dict()
state_dict = state_dict['state']

In [10]:
us_state_grouped_df.loc['2021-01-01'].head()

Unnamed: 0_level_0,fips,cases,deaths,daily_cases,daily_deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,1,365747,4872,4521,45
Alaska,2,46740,198,0,0
Arizona,4,530267,9015,6438,136
Arkansas,5,229442,3711,4304,35
California,6,2345811,26236,37951,271


In [11]:
#set the start date and end dates to view the cases over time
start_date = datetime.date(2020, 12, 31)
end_date = datetime.date(2022, 1, 29)
delta = datetime.timedelta(days=1)

In [12]:
#check if we have all the states
state_set = set(us_state.NAME)
for state in us_state_df[us_state_df.date=='2021-01-01'].state:
    if state in state_set:
        pass
    else:
        print(state)

In [29]:
#plot map

LAND_COVER = [[[-123.0, 49.196], [-123.0, 49.324], [-123.306, 49.324], [-123.306, 49.196]]]

INITIAL_VIEW_STATE = pdk.ViewState(latitude=37.090, 
                                    longitude=-94.7, 
                                    zoom=4, 
                                    max_zoom=16, 
                                    pitch=60, 
                                    bearing=0)

text = pdk.Layer(
    "TextLayer",
    us_state,
    #pickable=True,
    get_position=['long', 'lat'],
    get_text="NAME",
    get_size=12,
    get_color=[255, 0, 0],
    get_angle=0,
    get_alignment_baseline=String("bottom"),
)

start_date = start_date + 2*delta       #skip 2 days at a time
iso_date = start_date.isoformat()
covid_daily_cases = us_state_grouped_df.loc[iso_date, 'daily_cases'].values
us_state['daily_cases'] = covid_daily_cases


geojson = pdk.Layer(
        "GeoJsonLayer",
        data=us_state,
        opacity=0.5,
        stroked=True,
        filled=True,
        extruded=True,
        wireframe=True,
        get_elevation='daily_cases',
        elevation_Scale=0.001,
        get_fill_color="[220, 250, 255]",
        get_line_color=[0, 200, 0]
)

column = pdk.Layer(
        'ColumnLayer',
        us_state,
        get_position=['long', 'lat'],
        auto_highlight=True,
        elevation_scale=20,
        pickable=True,
        get_elevation='daily_cases',
        elevation_range=[0, 3000],
        extruded=True,
        coverage=60,
        get_fill_color="[255, 255-255*daily_cases/20000, 0]")

tooltip={"html": "<b>State:</b> {NAME}</br> <b>Cases:</b> {daily_cases}"}
r = pdk.Deck(layers=[text, geojson, column], initial_view_state=INITIAL_VIEW_STATE, tooltip=tooltip)
r.to_html("US_states_covid_cases_demo.html")
r.show()

DeckGLWidget(carto_key=None, custom_libraries=[], google_maps_key=None, json_input='{"initialViewState": {"bea…

Plotting the mask usage data by counties

In [14]:
#getting the counties Geojson file
cur_json = json.load(open('geojson-data/us_counties_20m.json', encoding='ISO-8859-1'))
path,ext = os.path.splitext('geojson-data/us_counties_20m.json')
new_path =path+"_new"+ext
with open(new_path,"w", encoding='utf-8') as jsonfile:
        json.dump(cur_json,jsonfile,ensure_ascii=False)
us_county = gpd.read_file(new_path, driver='GeoJSON')

In [15]:
#loading the mask usage file
county_mask_df = pd.read_csv('covid-data/mask-use-by-county.csv')

In [16]:
#clean up the fip code for county
s = us_county['STATE'].values + us_county['COUNTY'].values
for i in range(len(s)):
    s[i] = s[i].strip('0')

us_county['COUNTYFP'] = s

In [17]:
#combine the mask and the geojson file via county fip ID
us_county.COUNTYFP = us_county.COUNTYFP.astype('int64')
county_us = us_county.set_index('COUNTYFP')
county_mask = county_mask_df.set_index('COUNTYFP')
result = pd.merge(us_county, county_mask_df, on='COUNTYFP')

In [18]:
#compute the lattitude and longitude of each county
#match the State fip ID to State name by using the state_dict we created earlier
result['coordinate'] = result.geometry.centroid
result['long'] = result.coordinate.apply(lambda p:p.x)
result['lat'] = result.coordinate.apply(lambda p:p.y)

result.STATE = result.STATE.astype(int)
result.STATE = result.STATE.map(state_dict)
result.head()

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,coordinate,long,lat
0,0500000US01001,Alabama,1,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...",1001,0.053,0.074,0.134,0.295,0.444,POINT (-86.64120 32.53615),-86.641196,32.536153
1,0500000US01009,Alabama,9,Blount,County,644.776,"POLYGON ((-86.57780 33.76532, -86.75914 33.840...",1009,0.053,0.114,0.18,0.194,0.459,POINT (-86.56976 33.98525),-86.569756,33.985248
2,0500000US01017,Alabama,17,Chambers,County,596.531,"POLYGON ((-85.18413 32.87053, -85.12342 32.772...",1017,0.117,0.037,0.15,0.136,0.56,POINT (-85.38992 32.91159),-85.389924,32.911594
3,0500000US01021,Alabama,21,Chilton,County,692.854,"POLYGON ((-86.51734 33.02057, -86.51596 32.929...",1021,0.06,0.07,0.058,0.194,0.618,POINT (-86.71979 32.85025),-86.719793,32.850253
4,0500000US01033,Alabama,33,Colbert,County,592.619,"POLYGON ((-88.13999 34.58170, -88.13925 34.587...",1033,0.082,0.096,0.152,0.159,0.51,POINT (-87.80589 34.70164),-87.80589,34.701644


In [19]:
#remove the unnecessary columns
try:
    result['ALWAYS_and_FREQUENTLY'] = (result['ALWAYS'] + result['FREQUENTLY']).astype('float32') 
    result['NEVER_and_RARELY'] = (result['NEVER'] + result['RARELY'] +  result['SOMETIMES']).astype('float32')
    result.drop(['GEO_ID','COUNTY','LSAD','CENSUSAREA','COUNTYFP', 'coordinate','NEVER','RARELY','SOMETIMES','FREQUENTLY','ALWAYS'], axis=1, inplace=True)
except:
    pass
result.head()

Unnamed: 0,STATE,NAME,geometry,long,lat,ALWAYS_and_FREQUENTLY,NEVER_and_RARELY
0,Alabama,Autauga,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...",-86.641196,32.536153,0.739,0.261
1,Alabama,Blount,"POLYGON ((-86.57780 33.76532, -86.75914 33.840...",-86.569756,33.985248,0.653,0.347
2,Alabama,Chambers,"POLYGON ((-85.18413 32.87053, -85.12342 32.772...",-85.389924,32.911594,0.696,0.304
3,Alabama,Chilton,"POLYGON ((-86.51734 33.02057, -86.51596 32.929...",-86.719793,32.850253,0.812,0.188
4,Alabama,Colbert,"POLYGON ((-88.13999 34.58170, -88.13925 34.587...",-87.80589,34.701644,0.669,0.33


In [20]:
#save the cleaned file for web app
gpd.GeoDataFrame(result).to_file(r'covid-data\mask_data_cleaned.geojson')

In [21]:
d = gpd.read_file(r'covid-data\mask_data_cleaned.geojson')

In [22]:
d

Unnamed: 0,STATE,NAME,long,lat,ALWAYS_and_FREQUENTLY,NEVER_and_RARELY,geometry
0,Alabama,Autauga,-86.641196,32.536153,0.739,0.261,"POLYGON ((-86.49677 32.34444, -86.71790 32.402..."
1,Alabama,Blount,-86.569756,33.985248,0.653,0.347,"POLYGON ((-86.57780 33.76532, -86.75914 33.840..."
2,Alabama,Chambers,-85.389924,32.911594,0.696,0.304,"POLYGON ((-85.18413 32.87053, -85.12342 32.772..."
3,Alabama,Chilton,-86.719793,32.850253,0.812,0.188,"POLYGON ((-86.51734 33.02057, -86.51596 32.929..."
4,Alabama,Colbert,-87.805890,34.701644,0.669,0.330,"POLYGON ((-88.13999 34.58170, -88.13925 34.587..."
...,...,...,...,...,...,...,...
3085,Virginia,Accomack,-75.636151,37.764307,0.924,0.075,"MULTIPOLYGON (((-75.24227 38.02721, -75.29687 ..."
3086,Virginia,Bland,-81.131775,37.130947,0.606,0.395,"POLYGON ((-81.22510 37.23487, -81.20477 37.243..."
3087,Virginia,Buchanan,-82.041954,37.265360,0.644,0.355,"POLYGON ((-81.96830 37.53780, -81.92787 37.512..."
3088,Virginia,Charlotte,-78.662058,37.016986,0.743,0.257,"POLYGON ((-78.44332 37.07940, -78.49303 36.891..."


In [23]:
#plot county mask usage map

INITIAL_VIEW_STATE = pdk.ViewState(latitude=39.0, 
                                    longitude=-94.7, 
                                    zoom=3.6, 
                                    max_zoom=16, 
                                    pitch=0, 
                                    bearing=0)
geojson = pdk.Layer(
        "GeoJsonLayer",
        data=d,
        pickable=True,
        opacity=0.1,
        stroked=True,
        filled=True,
        extruded=True,
        wireframe=True,
        get_elevation=10,
        elevation_Scale=1,
        get_fill_color="[1.5*255*(1-(1/0.992)*ALWAYS_and_FREQUENTLY), (1/0.992)*255*ALWAYS_and_FREQUENTLY, 0.5*255*ALWAYS_and_FREQUENTLY]",
        get_line_color=[0, 200, 0]
)

tooltip={"html": "<b>County:</b> {NAME}</br> <b>State:</b> {STATE}</br> <b>Fraction Wear Mask:</b> {ALWAYS_and_FREQUENTLY}"}

r = pdk.Deck(layers=[geojson], initial_view_state=INITIAL_VIEW_STATE, tooltip=tooltip)
r.show()

DeckGLWidget(carto_key=None, custom_libraries=[], google_maps_key=None, json_input='{"initialViewState": {"bea…

Map showing mask usage by counties. Green is where the proportion of people wearing mask frequently or always is high. Red is where this proportion is low.

In [24]:
d = pd.read_csv('covid-data/us_state_cases_grouped.csv')
d.groupby(by=['date', 'state']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,fips,cases,deaths,daily_cases,daily_deaths
date,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,Alabama,1,361226,4827,-1,-1
2020-12-31,Alaska,2,46740,198,-1,-1
2020-12-31,Arizona,4,523829,8879,-1,-1
2020-12-31,Arkansas,5,225138,3676,-1,-1
2020-12-31,California,6,2307860,25965,-1,-1
...,...,...,...,...,...,...
2022-01-31,Virginia,51,1545650,16217,-1,-1
2022-01-31,Washington,53,1336783,10838,-1,-1
2022-01-31,West Virginia,54,446771,5763,-1,-1
2022-01-31,Wisconsin,55,1516634,12374,-1,-1


In [25]:
us_state_grouped_df

Unnamed: 0_level_0,Unnamed: 1_level_0,fips,cases,deaths,daily_cases,daily_deaths
date,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,Alabama,1,361226,4827,-1,-1
2020-12-31,Alaska,2,46740,198,-1,-1
2020-12-31,Arizona,4,523829,8879,-1,-1
2020-12-31,Arkansas,5,225138,3676,-1,-1
2020-12-31,California,6,2307860,25965,-1,-1
...,...,...,...,...,...,...
2022-01-31,Virginia,51,1545650,16217,-1,-1
2022-01-31,Washington,53,1336783,10838,-1,-1
2022-01-31,West Virginia,54,446771,5763,-1,-1
2022-01-31,Wisconsin,55,1516634,12374,-1,-1


In [26]:
daily_total_results

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,147234.0,1920.0
2021-01-02,291604.0,2372.0
2021-01-03,201779.0,1351.0
2021-01-04,251813.0,2049.0
2021-01-05,235003.0,3689.0
...,...,...
2022-01-25,510791.0,2972.0
2022-01-26,684263.0,3891.0
2022-01-27,528881.0,3089.0
2022-01-28,549862.0,3836.0


In [27]:
result

Unnamed: 0,STATE,NAME,geometry,long,lat,ALWAYS_and_FREQUENTLY,NEVER_and_RARELY
0,Alabama,Autauga,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...",-86.641196,32.536153,0.739,0.261
1,Alabama,Blount,"POLYGON ((-86.57780 33.76532, -86.75914 33.840...",-86.569756,33.985248,0.653,0.347
2,Alabama,Chambers,"POLYGON ((-85.18413 32.87053, -85.12342 32.772...",-85.389924,32.911594,0.696,0.304
3,Alabama,Chilton,"POLYGON ((-86.51734 33.02057, -86.51596 32.929...",-86.719793,32.850253,0.812,0.188
4,Alabama,Colbert,"POLYGON ((-88.13999 34.58170, -88.13925 34.587...",-87.805890,34.701644,0.669,0.330
...,...,...,...,...,...,...,...
3085,Virginia,Accomack,"MULTIPOLYGON (((-75.24227 38.02721, -75.29687 ...",-75.636151,37.764307,0.924,0.075
3086,Virginia,Bland,"POLYGON ((-81.22510 37.23487, -81.20477 37.243...",-81.131775,37.130947,0.606,0.395
3087,Virginia,Buchanan,"POLYGON ((-81.96830 37.53780, -81.92787 37.512...",-82.041954,37.265360,0.644,0.355
3088,Virginia,Charlotte,"POLYGON ((-78.44332 37.07940, -78.49303 36.891...",-78.662058,37.016986,0.743,0.257


In [28]:
gpd.read_file('covid-data/mask_data_cleaned.csv', driver='GeoJSON')

DriverError: covid-data/mask_data_cleaned.csv: No such file or directory