# This notebook uses COVID-19 data from [Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19)
## Utilizes Linux, AWS and Python to store and display data. 


In [1]:
import io
import boto3
import os
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.graph_objects as go # for data visualisation
import plotly.express as px

#### Acquire API key for Mapbox 

In [2]:
# Read the Mapbox token from the MAPBOX_ACCESS_TOKEN environment variable so a
# real key never has to be committed with the notebook. The original placeholder
# is kept as the fallback, so behavior is unchanged when the variable is unset.
access_token = os.environ.get('MAPBOX_ACCESS_TOKEN', 'pk.primary_key')
px.set_mapbox_access_token(access_token)

## Pull from AWS S3 Bucket


In [3]:
# Download the raw confirmed-cases time series object from the S3 bucket.
s3_client = boto3.client('s3')
response = s3_client.get_object(
    Bucket="davecovid",
    Key="time_series_covid19_confirmed_global.csv",
)
file = response["Body"]  # response body is handed to pandas like a file handle

# Parse the CSV; the resulting frame is wide, with one column per reporting date.
df = pd.read_csv(file)
df

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/14/20,11/15/20,11/16/20,11/17/20,11/18/20,11/19/20,11/20/20,11/21/20,11/22/20,11/23/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,43035,43240,43403,43628,43851,44228,44443,44503,44706,44988
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,27233,27830,28432,29126,29837,30623,31459,32196,32761,33556
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,66819,67679,68589,69591,70629,71652,72755,73774,74862,75867
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,5725,5872,5914,5951,6018,6066,6142,6207,6256,6304
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,13374,13451,13615,13818,13922,14134,14267,14413,14493,14634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,62167,63031,63867,64935,66186,67296,68768,70254,71644,73196
266,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
267,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2072,2072,2078,2081,2083,2086,2090,2093,2099,2107
268,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17097,17123,17187,17243,17280,17350,17373,17394,17424,17454


### Fill in missing data (Needed later for Plotly Graph)

In [4]:
# Rows without a Province/State get their Country/Region name as a stand-in,
# so every row ends up with a non-null province label.
province_col, country_col = 'Province/State', 'Country/Region'
missing_states = pd.isnull(df[province_col])
df.loc[missing_states, province_col] = df.loc[missing_states, country_col]

### Transform data

In [5]:
# Reshape from wide (one column per date) to long (one row per location/date).
# Grouping by the four location columns moves them into the index, leaving only
# the per-date count columns to be stacked.
# NOTE(review): groupby drops rows with a NaN in any key column by default —
# confirm no location is missing Lat/Long.
location_keys = ["Province/State", "Country/Region", "Lat", "Long"]
df1 = df.groupby(location_keys).sum()

# stack() pivots the date columns into rows; reset_index() turns the location
# keys and the date label back into ordinary columns.
df1_stack = df1.stack().reset_index()
df1_stack.columns = ['Province', 'Country', 'Lat', 'Long', 'Date', 'Confirmed']
df1_stack

Unnamed: 0,Province,Country,Lat,Long,Date,Confirmed
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,0
1,Afghanistan,Afghanistan,33.939110,67.709953,1/23/20,0
2,Afghanistan,Afghanistan,33.939110,67.709953,1/24/20,0
3,Afghanistan,Afghanistan,33.939110,67.709953,1/25/20,0
4,Afghanistan,Afghanistan,33.939110,67.709953,1/26/20,0
...,...,...,...,...,...,...
82885,Zimbabwe,Zimbabwe,-19.015438,29.154857,11/19/20,9046
82886,Zimbabwe,Zimbabwe,-19.015438,29.154857,11/20/20,9120
82887,Zimbabwe,Zimbabwe,-19.015438,29.154857,11/21/20,9172
82888,Zimbabwe,Zimbabwe,-19.015438,29.154857,11/22/20,9220


### Save dataset to S3 bucket.

In [6]:
import s3fs

# Authenticated (non-anonymous) handle to S3.
s3 = s3fs.S3FileSystem(anon=False)

# Write the long-format confirmed table back to the bucket.
# The key must match the one the "Import datasets" cell reads
# (data_fix/confirmed.csv); the previous 'confirm.csv' key was never read back.
# Use 'w' for py3, 'wb' for py2
with s3.open('davecovid/data_fix/confirmed.csv', 'w') as f:
    df1_stack.to_csv(f, index=False)

### Import datasets

In [7]:
def load_csv_from_s3(key, bucket="davecovid", client=None):
    """Fetch a CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    key : str
        Object key inside the bucket, e.g. ``"data_fix/deaths.csv"``.
    bucket : str, optional
        Bucket name; defaults to the bucket used throughout this notebook.
    client : boto3 S3 client, optional
        Client to reuse. A new one is created when omitted.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.
    """
    if client is None:
        client = boto3.client('s3')
    response = client.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(response["Body"])


# One client is enough for all three downloads.
s3_client = boto3.client('s3')

df_confirmed = load_csv_from_s3("data_fix/confirmed.csv", client=s3_client)
df_deaths = load_csv_from_s3("data_fix/deaths.csv", client=s3_client)
df_recovered = load_csv_from_s3("data_fix/recovered.csv", client=s3_client)



### Combine datasets

In [8]:
# Location + date columns shared by all three tables; used as the join key.
merge_keys = ['Lat', 'Long', 'Date', 'Country', 'Province']

# Left joins keep every confirmed row even when the other tables have no match.
df_merge = pd.merge(
    df_confirmed,
    df_recovered,
    on=merge_keys,
    how='left',
)
df_combined = pd.merge(
    df_merge,
    df_deaths,
    on=merge_keys,
    how='left',
)
df_combined.dtypes

Province      object
Country       object
Lat          float64
Long         float64
Date          object
Confirmed      int64
Recovered    float64
Deaths         int64
dtype: object

### Change format, fix erroneous values, and get the number of currently active cases.

In [9]:
# Missing counts (rows left unmatched by the left joins) are treated as zero.
# Fill with the integer 0 and assign the result back: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary column selection
# (chained assignment) and filling with the string '0' would turn the column
# into mixed object dtype before the integer cast.
count_columns = ['Confirmed', 'Recovered', 'Deaths']
df_combined[count_columns] = df_combined[count_columns].fillna(0).astype(np.int64)

# Active = Confirmed - Recovered - Deaths. The absolute value is taken so the
# column is never negative.
# NOTE(review): abs() mirrors negative results to positive rather than clamping
# them to zero — confirm this is the intended handling of inconsistent rows.
df_combined['Active'] = (
    df_combined['Confirmed'] - df_combined['Recovered'] - df_combined['Deaths']
).abs()

# Normalize date labels to zero-padded 'mm/dd/yy' strings.
df_combined['Date'] = pd.to_datetime(df_combined['Date'], format='%m/%d/%y').dt.strftime('%m/%d/%y')

## Display Covid-19 data


### Display Current Data

In [10]:
# Select the rows for the most recent reporting date.
# 'Date' holds 'mm/dd/yy' strings, which compare lexicographically
# (e.g. '12/31/20' > '01/01/21'), so parse them to timestamps before taking the
# maximum; otherwise the wrong date is picked once the data spans two years.
report_dates = pd.to_datetime(df_combined['Date'], format='%m/%d/%y')
date_mask = report_dates == report_dates.max()

In [11]:
# Ensure 'Active' holds non-negative magnitudes before it is plotted.
# Re-running this is harmless: abs() is idempotent.
df_combined['Active'] = abs(df_combined['Active'])


In [12]:
# Bubble map of the rows selected by date_mask: bubble size follows confirmed
# cases, colour follows deaths. Coordinates are hidden from the hover tooltip.
current_fig = px.scatter_mapbox(
    df_combined[date_mask],
    lat="Lat",
    lon="Long",
    size="Confirmed",
    size_max=50,
    color="Deaths",
    color_continuous_scale=px.colors.sequential.Pinkyl,
    hover_name="Province",
    hover_data={
        "Province": True,
        "Confirmed": True,
        "Recovered": True,
        "Active": True,
        "Deaths": True,
        "Lat": False,
        "Long": False,
    },
    mapbox_style='dark',
    zoom=1,
)


[Current Covid-19 Data](https://devindra10.github.io/Projects/docs/current_fig.html)

### Display Timeseries Data

In [13]:
# Animated bubble map over every date: one frame per 'Date' value, with bubbles
# tracked across frames by 'Province'. Bubble size follows active cases, colour
# follows deaths. Province, coordinates and date are hidden from the tooltip body.
timeseries_fig = px.scatter_mapbox(
    df_combined,
    lat="Lat",
    lon="Long",
    size="Active",
    size_max=50,
    color="Deaths",
    color_continuous_scale=px.colors.sequential.Pinkyl,
    hover_name="Province",
    hover_data={
        "Province": False,
        "Confirmed": True,
        "Recovered": True,
        "Active": True,
        "Deaths": True,
        "Lat": False,
        "Long": False,
        "Date": False,
    },
    mapbox_style='dark',
    zoom=1,
    animation_frame="Date",
    animation_group="Province",
)


[Timeseries Covid-19 Data](https://devindra10.github.io/Projects/docs/timeseries_fig.html)