# This notebook uses COVID-19 data from [Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19)
## Utilizes Linux, AWS and Python to store and display data. 


In [1]:
import io
import boto3
import os
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.graph_objects as go # for data visualisation
import plotly.express as px

#### Acquire API key for Mapbox 

In [2]:
# Prefer a token supplied through the MAPBOX_ACCESS_TOKEN environment variable
# so a real key never has to be committed with the notebook. The literal is
# only a placeholder fallback used when the variable is not set.
access_token = os.environ.get('MAPBOX_ACCESS_TOKEN', 'pk.primary_key')
px.set_mapbox_access_token(access_token)

## Pull from AWS S3 Bucket


In [3]:
# Fetch the raw confirmed-cases time series from the S3 bucket and parse it.
s3_client = boto3.client('s3')
response = s3_client.get_object(
    Bucket="davecovid",
    Key="time_series_covid19_confirmed_global.csv",
)
# The "Body" entry of the response is handed to read_csv as a file-like object.
file = response["Body"]

df = pd.read_csv(file)
df

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20,12/8/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,46116,46274,46516,46718,46837,46837,47072,47306,47516,47716
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,37625,38182,39014,39719,40501,41302,42148,42988,43683,44436
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,82221,83199,84152,85084,85927,86730,87502,88252,88825,89416
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6712,6745,6790,6842,6904,6955,7005,7050,7084,7127
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,15103,15139,15251,15319,15361,15493,15536,15591,15648,15729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,Vietnam,14.058324,108.277199,0,2,2,2,2,2,...,1343,1347,1351,1358,1361,1361,1365,1366,1367,1377
267,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,83585,85647,88004,90192,92708,94676,96098,98038,99758,101109
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2177,2191,2197,2217,2239,2267,2304,2337,2383,2078
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17608,17647,17665,17700,17730,17857,17898,17916,17931,17963


### Fill in missing data (Needed later for Plotly Graph)

In [4]:
# Rows with no Province/State value take the Country/Region name as their
# province label, so every row has a non-null province afterwards.
missing_states = df['Province/State'].isna()
df.loc[missing_states, 'Province/State'] = df.loc[missing_states, 'Country/Region']

### Transform data

In [5]:
# Index the table by location, then turn the remaining (per-date) columns
# into rows so each record is one location on one date.
# NOTE(review): groupby drops rows whose key columns contain NaN by default,
# so a location with a missing Lat/Long would vanish here; confirm that is acceptable.
location_keys = ["Province/State", "Country/Region", "Lat", "Long"]
df1 = df.groupby(location_keys).sum()
df1_stack = df1.stack().reset_index()
df1_stack.columns = [
    'Province',
    'Country',
    'Lat',
    'Long',
    'Date',
    'Confirmed',
]
df1_stack

Unnamed: 0,Province,Country,Lat,Long,Date,Confirmed
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,0
1,Afghanistan,Afghanistan,33.939110,67.709953,1/23/20,0
2,Afghanistan,Afghanistan,33.939110,67.709953,1/24/20,0
3,Afghanistan,Afghanistan,33.939110,67.709953,1/25/20,0
4,Afghanistan,Afghanistan,33.939110,67.709953,1/26/20,0
...,...,...,...,...,...,...
86935,Zimbabwe,Zimbabwe,-19.015438,29.154857,12/4/20,10547
86936,Zimbabwe,Zimbabwe,-19.015438,29.154857,12/5/20,10617
86937,Zimbabwe,Zimbabwe,-19.015438,29.154857,12/6/20,10718
86938,Zimbabwe,Zimbabwe,-19.015438,29.154857,12/7/20,10839


### Save dataset to S3 bucket.

In [6]:
import s3fs

s3 = s3fs.S3FileSystem(anon=False)

# Write the long-format confirmed table under the key that the
# "Import datasets" cell reads back (data_fix/confirmed.csv). The previous
# key, data_fix/confirm.csv, is not read anywhere in this notebook, so the
# freshly transformed data was never the data that got loaded.
# Use 'w' for py3, 'wb' for py2
with s3.open('davecovid/data_fix/confirmed.csv','w') as f:
    df1_stack.to_csv(f, index=False)

### Import datasets

In [7]:
def load_s3_csv(client, bucket, key):
    """Fetch one CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    client : S3 client exposing ``get_object`` (as returned by ``boto3.client('s3')``).
    bucket : str
        Name of the bucket holding the object.
    key : str
        Object key of the CSV file inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV object.
    """
    response = client.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(response["Body"])


# One client is enough for all three downloads.
s3_client = boto3.client('s3')

df_confirmed = load_s3_csv(s3_client, "davecovid", "data_fix/confirmed.csv")
df_deaths = load_s3_csv(s3_client, "davecovid", "data_fix/deaths.csv")
df_recovered = load_s3_csv(s3_client, "davecovid", "data_fix/recovered.csv")



### Combine datasets

In [8]:
# Left-join the recovered and then the death counts onto the confirmed rows,
# matching on location and date, so every confirmed row is kept even when
# the other tables have no matching entry.
merge_keys = ['Lat', 'Long', 'Date', 'Country', 'Province']
df_merge = df_confirmed.merge(df_recovered, on=merge_keys, how='left')
df_combined = df_merge.merge(df_deaths, on=merge_keys, how='left')
df_combined.dtypes

Province      object
Country       object
Lat          float64
Long         float64
Date          object
Confirmed      int64
Recovered    float64
Deaths       float64
dtype: object

### Change format, fix erroneous values, and get number of currently active cases.

In [9]:
# Counts that are missing become 0 and every count column is stored as an
# integer. The result is assigned back instead of calling
# fillna(..., inplace=True) on a column selection, which does not update the
# frame when pandas Copy-on-Write is active. The fill value is the number 0
# rather than the string '0', so the columns stay numeric.
count_columns = ['Confirmed', 'Recovered', 'Deaths']
df_combined[count_columns] = df_combined[count_columns].fillna(0).astype(np.int64)

# Active cases = confirmed minus those who recovered or died.
df_combined['Active'] = df_combined['Confirmed'] - df_combined['Recovered'] - df_combined['Deaths']
# NOTE(review): abs() turns a negative balance (recovered + deaths > confirmed)
# into a positive count rather than zero; confirm this is the intended correction.
df_combined['Active'] = df_combined['Active'].abs()

# Re-render the dates as zero-padded 'MM/DD/YY' strings.
df_combined['Date'] = pd.to_datetime(df_combined['Date'], format='%m/%d/%y').dt.strftime('%m/%d/%y')

## Display Covid-19 data


### Display Current Data

In [10]:
# Select the rows of the most recent day. 'Date' holds 'MM/DD/YY' strings,
# whose lexicographic maximum is not the latest day once the data spans more
# than one calendar year (e.g. '12/31/20' > '01/05/21'), so the comparison is
# done on parsed timestamps instead.
parsed_dates = pd.to_datetime(df_combined['Date'], format='%m/%d/%y')
date_mask = parsed_dates == parsed_dates.max()

In [11]:
# Replace Active with its absolute value. This repeats the normalisation
# already applied when Active is computed, so re-running it changes nothing.
df_combined['Active'] = abs(df_combined['Active'])


In [12]:
# Map of the rows selected by date_mask: marker size follows the confirmed
# count and marker colour follows the death count.
current_fig = px.scatter_mapbox(
    df_combined[date_mask],
    lat="Lat",
    lon="Long",
    size="Confirmed",
    size_max=50,
    color="Deaths",
    color_continuous_scale=px.colors.sequential.Pinkyl,
    hover_name="Province",
    # Show the counts in the hover box and hide the raw coordinates.
    hover_data={
        "Province": True,
        "Confirmed": True,
        "Recovered": True,
        "Active": True,
        "Deaths": True,
        "Lat": False,
        "Long": False,
    },
    mapbox_style='dark',
    zoom=1,
)


[Current Covid-19 Data](https://devindra10.github.io/Projects/docs/current_fig.html)

### Display Timeseries Data

In [13]:
# Animated map over the whole table: one frame per Date value, markers
# grouped by Province, sized by active cases and coloured by deaths.
timeseries_fig = px.scatter_mapbox(
    df_combined,
    lat="Lat",
    lon="Long",
    size="Active",
    size_max=50,
    color="Deaths",
    color_continuous_scale=px.colors.sequential.Pinkyl,
    hover_name="Province",
    # Show the counts in the hover box; hide province, coordinates and date.
    hover_data={
        "Province": False,
        "Confirmed": True,
        "Recovered": True,
        "Active": True,
        "Deaths": True,
        "Lat": False,
        "Long": False,
        "Date": False,
    },
    mapbox_style='dark',
    zoom=1,
    animation_frame="Date",
    animation_group="Province",
)


[Timeseries Covid-19 Data](https://devindra10.github.io/Projects/docs/timeseries_fig.html)