This notebook explores US covid-19 cases using [covid19py](https://pypi.org/project/COVID19Py/) package

In [1]:
#!pip install COVID19Py

In [2]:
import COVID19Py
import plotly.express as px

import pandas as pd
import numpy as np

### Lets explore the api

In [3]:
covid19 = COVID19Py.COVID19()

There are 3 ways to choose the source of the 
- csbs (data Conference of State Bank Supervisors)
- jhu (John Hopkins University)
- nyt (New York Times)

jhu data-source will be used as a default source if you don't specify a source parameter in your request.

In [4]:
covid19 = COVID19Py.COVID19(data_source="csbs")

In [5]:
# Quick summary
latest = covid19.getLatest()
latest

{'confirmed': 1531275, 'deaths': 90082, 'recovered': 0}

In [6]:
# Location based update (current numbers)
locations = covid19.getLocations()
locations[1]

{'id': 1,
 'country': 'US',
 'country_code': 'US',
 'country_population': 327167434,
 'province': 'New York',
 'county': 'Nassau',
 'last_updated': '2020-05-19T20:15:00Z',
 'coordinates': {'latitude': '40.74165225', 'longitude': '-73.58899619'},
 'latest': {'confirmed': 39295, 'deaths': 2060, 'recovered': 0}}

In [7]:
# longitudinal data
covid19 = COVID19Py.COVID19(data_source="jhu")
data = covid19.getLocationByCountryCode("US",timelines=True)

In [8]:
data

[{'id': 225,
  'country': 'US',
  'country_code': 'US',
  'country_population': 327167434,
  'province': '',
  'last_updated': '2020-05-20T03:30:43.926701Z',
  'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
  'latest': {'confirmed': 1528568, 'deaths': 91921, 'recovered': 0},
  'timelines': {'confirmed': {'latest': 1528568,
    'timeline': {'2020-01-22T00:00:00Z': 1,
     '2020-01-23T00:00:00Z': 1,
     '2020-01-24T00:00:00Z': 2,
     '2020-01-25T00:00:00Z': 2,
     '2020-01-26T00:00:00Z': 5,
     '2020-01-27T00:00:00Z': 5,
     '2020-01-28T00:00:00Z': 5,
     '2020-01-29T00:00:00Z': 5,
     '2020-01-30T00:00:00Z': 5,
     '2020-01-31T00:00:00Z': 7,
     '2020-02-01T00:00:00Z': 8,
     '2020-02-02T00:00:00Z': 8,
     '2020-02-03T00:00:00Z': 11,
     '2020-02-04T00:00:00Z': 11,
     '2020-02-05T00:00:00Z': 11,
     '2020-02-06T00:00:00Z': 11,
     '2020-02-07T00:00:00Z': 11,
     '2020-02-08T00:00:00Z': 11,
     '2020-02-09T00:00:00Z': 11,
     '2020-02-10T00:00:00Z': 1

### Let's get data from India

In [9]:
covid19 = COVID19Py.COVID19(data_source="jhu")
india_data = covid19.getLocationByCountryCode("IN",timelines=True)

In [10]:
india_data

[{'id': 131,
  'country': 'India',
  'country_code': 'IN',
  'country_population': 1352617328,
  'province': '',
  'last_updated': '2020-05-20T03:30:43.276534Z',
  'coordinates': {'latitude': '21.0', 'longitude': '78.0'},
  'latest': {'confirmed': 106475, 'deaths': 3302, 'recovered': 0},
  'timelines': {'confirmed': {'latest': 106475,
    'timeline': {'2020-01-22T00:00:00Z': 0,
     '2020-01-23T00:00:00Z': 0,
     '2020-01-24T00:00:00Z': 0,
     '2020-01-25T00:00:00Z': 0,
     '2020-01-26T00:00:00Z': 0,
     '2020-01-27T00:00:00Z': 0,
     '2020-01-28T00:00:00Z': 0,
     '2020-01-29T00:00:00Z': 0,
     '2020-01-30T00:00:00Z': 1,
     '2020-01-31T00:00:00Z': 1,
     '2020-02-01T00:00:00Z': 1,
     '2020-02-02T00:00:00Z': 2,
     '2020-02-03T00:00:00Z': 3,
     '2020-02-04T00:00:00Z': 3,
     '2020-02-05T00:00:00Z': 3,
     '2020-02-06T00:00:00Z': 3,
     '2020-02-07T00:00:00Z': 3,
     '2020-02-08T00:00:00Z': 3,
     '2020-02-09T00:00:00Z': 3,
     '2020-02-10T00:00:00Z': 3,
     '2020-

In [11]:
# Longitudinal data of confirmed cases from India
india_data[0]['timelines']['confirmed']['timeline'].items()

dict_items([('2020-01-22T00:00:00Z', 0), ('2020-01-23T00:00:00Z', 0), ('2020-01-24T00:00:00Z', 0), ('2020-01-25T00:00:00Z', 0), ('2020-01-26T00:00:00Z', 0), ('2020-01-27T00:00:00Z', 0), ('2020-01-28T00:00:00Z', 0), ('2020-01-29T00:00:00Z', 0), ('2020-01-30T00:00:00Z', 1), ('2020-01-31T00:00:00Z', 1), ('2020-02-01T00:00:00Z', 1), ('2020-02-02T00:00:00Z', 2), ('2020-02-03T00:00:00Z', 3), ('2020-02-04T00:00:00Z', 3), ('2020-02-05T00:00:00Z', 3), ('2020-02-06T00:00:00Z', 3), ('2020-02-07T00:00:00Z', 3), ('2020-02-08T00:00:00Z', 3), ('2020-02-09T00:00:00Z', 3), ('2020-02-10T00:00:00Z', 3), ('2020-02-11T00:00:00Z', 3), ('2020-02-12T00:00:00Z', 3), ('2020-02-13T00:00:00Z', 3), ('2020-02-14T00:00:00Z', 3), ('2020-02-15T00:00:00Z', 3), ('2020-02-16T00:00:00Z', 3), ('2020-02-17T00:00:00Z', 3), ('2020-02-18T00:00:00Z', 3), ('2020-02-19T00:00:00Z', 3), ('2020-02-20T00:00:00Z', 3), ('2020-02-21T00:00:00Z', 3), ('2020-02-22T00:00:00Z', 3), ('2020-02-23T00:00:00Z', 3), ('2020-02-24T00:00:00Z', 3), ('

In [12]:
df = pd.DataFrame.from_dict(india_data[0]['timelines']['confirmed']['timeline'].items())
df.tail()

Unnamed: 0,0,1
114,2020-05-15T00:00:00Z,85784
115,2020-05-16T00:00:00Z,90648
116,2020-05-17T00:00:00Z,95698
117,2020-05-18T00:00:00Z,100328
118,2020-05-19T00:00:00Z,106475


^ So we have data for last 115 days. Let' create a formatted dataframe

In [13]:
covid19 = COVID19Py.COVID19(data_source="jhu")
india_data = covid19.getLocationByCountryCode("IN",timelines=True)
# dict to df
df = pd.DataFrame.from_dict(india_data[0]['timelines']['confirmed']['timeline'].items())
# rename
mapping = {df.columns[0]:'datetime', df.columns[1]: 'confirmed_cases'}
df = df.rename(columns=mapping)
# get date column
df['date'] = pd.to_datetime(df['datetime'])
df['date'] = df['date'].dt.date
india_confirmed = df
india_confirmed['country'] = 'India'
india_confirmed['percent_change'] = 100*india_confirmed.confirmed_cases.pct_change()
india_confirmed['percent_change'][0] = 0.00000
# reverse the df
india_confirmed = india_confirmed.reindex(index=india_confirmed.index[::-1])
india_confirmed.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,datetime,confirmed_cases,date,country,percent_change
118,2020-05-19T00:00:00Z,106475,2020-05-19,India,6.126904
117,2020-05-18T00:00:00Z,100328,2020-05-18,India,4.838137
116,2020-05-17T00:00:00Z,95698,2020-05-17,India,5.571
115,2020-05-16T00:00:00Z,90648,2020-05-16,India,5.670055
114,2020-05-15T00:00:00Z,85784,2020-05-15,India,4.618462


In [14]:
# lets get a T - day column 
india_confirmed.reset_index(drop=True,inplace=True)
india_confirmed['days_ago'] = india_confirmed.index + 1
india_confirmed.head()

Unnamed: 0,datetime,confirmed_cases,date,country,percent_change,days_ago
0,2020-05-19T00:00:00Z,106475,2020-05-19,India,6.126904,1
1,2020-05-18T00:00:00Z,100328,2020-05-18,India,4.838137,2
2,2020-05-17T00:00:00Z,95698,2020-05-17,India,5.571,3
3,2020-05-16T00:00:00Z,90648,2020-05-16,India,5.670055,4
4,2020-05-15T00:00:00Z,85784,2020-05-15,India,4.618462,5


### Let's get data from USA

In [15]:
covid19 = COVID19Py.COVID19(data_source="jhu")
us_data = covid19.getLocationByCountryCode("US",timelines=True)
# dict to df
df = pd.DataFrame.from_dict(us_data[0]['timelines']['confirmed']['timeline'].items())
# rename
mapping = {df.columns[0]:'datetime', df.columns[1]: 'confirmed_cases'}
df = df.rename(columns=mapping)
# get date column
df['date'] = pd.to_datetime(df['datetime'])
df['date'] = df['date'].dt.date
us_confirmed = df
us_confirmed['country'] = 'US'
us_confirmed['percent_change'] = 100*us_confirmed.confirmed_cases.pct_change()
us_confirmed['percent_change'][0] = 0.00000
# reverse the df
us_confirmed = us_confirmed.reindex(index=us_confirmed.index[::-1])
us_confirmed.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,datetime,confirmed_cases,date,country,percent_change
118,2020-05-19T00:00:00Z,1528568,2020-05-19,US,1.343227
117,2020-05-18T00:00:00Z,1508308,2020-05-18,US,1.449531
116,2020-05-17T00:00:00Z,1486757,2020-05-17,US,1.290145
115,2020-05-16T00:00:00Z,1467820,2020-05-16,US,1.732436
114,2020-05-15T00:00:00Z,1442824,2020-05-15,US,1.766854


In [16]:
us_confirmed.reset_index(drop=True,inplace=True)
us_confirmed['days_ago'] = us_confirmed.index + 1
us_confirmed.head()

Unnamed: 0,datetime,confirmed_cases,date,country,percent_change,days_ago
0,2020-05-19T00:00:00Z,1528568,2020-05-19,US,1.343227,1
1,2020-05-18T00:00:00Z,1508308,2020-05-18,US,1.449531,2
2,2020-05-17T00:00:00Z,1486757,2020-05-17,US,1.290145,3
3,2020-05-16T00:00:00Z,1467820,2020-05-16,US,1.732436,4
4,2020-05-15T00:00:00Z,1442824,2020-05-15,US,1.766854,5


In [None]:
# Last 30 days
df = pd.concat([us_confirmed.iloc[:30,],india_confirmed.iloc[:30,]],ignore_index=True)
fig = px.scatter(df, x="days_ago", y="confirmed_cases", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.show()

results = px.get_trendline_results(fig)
print(results)

In [None]:
# Last 10 days
df = pd.concat([us_confirmed.iloc[:10,],india_confirmed.iloc[:10,]],ignore_index=True)
fig = px.scatter(df, x="days_ago", y="confirmed_cases", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.show()

results = px.get_trendline_results(fig)
print(results)

In [None]:
# Last 30 days - Percent Change
df = pd.concat([us_confirmed.iloc[:30,],india_confirmed.iloc[:30,]],ignore_index=True)
fig = px.scatter(df, x="days_ago", y="percent_change", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.update_layout(title_text='Daily Percent Increase in COVID-19 cases (last 30 days)')
fig.show()

#results = px.get_trendline_results(fig)
#print(results)

In [None]:
india_confirmed.head()

In [None]:
us_confirmed.head()

In [None]:
# US - Percent Change
df = us_confirmed
fig = px.scatter(df, x="days_ago", y="percent_change", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.update_layout(title_text='US - Daily Percent Increase in COVID-19 cases (last 120 days)')
fig.show()

In [None]:
# India - Percent Change
df = india_confirmed
fig = px.scatter(df, x="days_ago", y="percent_change", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.update_layout(title_text='India - Daily Percent Increase in COVID-19 cases (last 120 days)')
fig.show()

^ As we can see outliers reduce the usablity of this plot significantly.

So let's add log scale

In [None]:
# India - Percent Change
df = india_confirmed
fig = px.scatter(df, x="days_ago", y="percent_change", color="country", trendline="ols")
fig['layout']['xaxis']['autorange'] = "reversed"
fig.update_layout(yaxis_type="log")
fig.update_layout(title_text='India - Daily Percent Increase in COVID-19 cases (last 120 days on log scale)')
fig.show()