In [1]:
import pandas as pd
import wget
import numpy as np

In [2]:
# Raw CSV time series from the CSSEGISandData/COVID-19 GitHub repository.
# The files are read straight from these URLs by pandas, so no separate
# download step is required.
BASE_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)

url1 = BASE_URL + 'time_series_covid19_confirmed_global.csv'  # confirmed cases
url2 = BASE_URL + 'time_series_covid19_deaths_global.csv'     # deaths
url3 = BASE_URL + 'time_series_covid19_recovered_global.csv'  # recovered


In [3]:
# Load the three wide-format tables directly from their URLs, in the order
# confirmed, deaths, recovered.
confirmed_df, deaths_df, recovered_df = [
    pd.read_csv(source_url) for source_url in (url1, url2, url3)
]

In [4]:
# Reshape each wide table (one column per date) into long format
# (one row per location and date).
dates = confirmed_df.columns[4:]  # everything after the four identifier columns

# Options shared by all three melts; only the name of the value column differs.
melt_options = dict(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
    value_vars=dates,
    var_name='Date',
)

confirmed_df_long = confirmed_df.melt(value_name='Confirmed', **melt_options)
deaths_df_long = deaths_df.melt(value_name='Deaths', **melt_options)
recovered_df_long = recovered_df.melt(value_name='Recovered', **melt_options)

In [5]:
# Exclude every Canadian row from the long recovered table before merging.
# NOTE(review): presumably Canada is reported at a different granularity in
# the recovered file than in the other two — confirm against the source data.
is_canada = recovered_df_long['Country/Region'] == 'Canada'
recovered_df_long = recovered_df_long[~is_canada]

In [6]:
# Attach the death counts to the confirmed counts, matching rows on
# location and date. A left join keeps every confirmed row.
merge_keys = ['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
full_table = pd.merge(confirmed_df_long, deaths_df_long, how='left', on=merge_keys)

In [7]:
# Attach the recovered counts on the same location/date keys. Because this is
# a left join, rows with no match in the recovered table get NaN in 'Recovered'.
join_columns = ['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
full_table = pd.merge(full_table, recovered_df_long, how='left', on=join_columns)

In [8]:
# Convert the 'Date' column (taken from the original column headers) to datetimes.
full_table = full_table.assign(Date=pd.to_datetime(full_table['Date']))

In [9]:
# Missing recovered counts (rows unmatched by the left join) are treated as zero.
full_table = full_table.fillna({'Recovered': 0})

In [10]:
# Boolean mask of the ship entries, which appear either as a Province/State
# or as a Country/Region value.
# na=False makes a missing value count as "no match", so the mask is a clean
# boolean Series and does not rely on how `|` treats NaN.
ship_rows = (
    full_table['Province/State'].str.contains('Grand Princess', na=False)
    | full_table['Province/State'].str.contains('Diamond Princess', na=False)
    | full_table['Country/Region'].str.contains('Diamond Princess', na=False)
    | full_table['Country/Region'].str.contains('MS Zaandam', na=False)
)

# Keep the ship rows in their own table.
full_ship = full_table[ship_rows]

In [11]:
# Remove the ship rows from the main table.
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
full_table = full_table[~ship_rows].copy()

In [12]:
# Active cases: confirmed minus deaths, minus recovered.
full_table['Active'] = (
    full_table['Confirmed']
    .sub(full_table['Deaths'])
    .sub(full_table['Recovered'])
)

In [13]:
# Sum the province/state rows into one row per (date, country).
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas 1.x and raises in pandas 2.x.
value_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
full_grouped = (
    full_table
    .groupby(['Date', 'Country/Region'])[value_columns]
    .sum()
    .reset_index()
)

  """Entry point for launching an IPython kernel.


In [14]:
# Daily new values: day-over-day difference of the per-country totals.
value_cols = ['Confirmed', 'Deaths', 'Recovered']
cols = ['New cases', 'New deaths', 'New recovered']

# Differencing inside each country group leaves the first date of every
# country as NaN, so no difference is taken across two different countries.
# Columns are selected with a list; tuple-style selection on a groupby is
# deprecated in pandas 1.x and raises in pandas 2.x.
temp = (
    full_grouped
    .groupby(['Country/Region', 'Date'])[value_cols]
    .sum()
    .groupby(level='Country/Region')
    .diff()
    .reset_index()
)
# renaming columns
temp.columns = ['Country/Region', 'Date'] + cols

# merging new values; dropping any previous copies of the new columns first
# keeps this cell safe to re-run.
full_grouped = pd.merge(
    full_grouped.drop(columns=cols, errors='ignore'),
    temp,
    on=['Country/Region', 'Date'],
)
# filling na with 0 (this includes the first date of each country)
full_grouped = full_grouped.fillna(0)
# fixing data types
full_grouped[cols] = full_grouped[cols].astype('int')
# A negative daily count is replaced by zero.
# NOTE(review): only 'New cases' is clamped; negative 'New deaths' and
# 'New recovered' values are left as they are — confirm this is intended.
full_grouped['New cases'] = full_grouped['New cases'].clip(lower=0)

  


In [15]:
# Persist the cleaned per-country table.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column, which read_csv later loads as 'Unnamed: 0'.
full_grouped.to_csv('COVID-19-time-series-clean-complete.csv', index=True)

In [16]:
import altair as alt
import pandas as pd

# Reload the cleaned table from disk, parsing 'Date' as datetimes, and keep
# only the United Kingdom rows.
full_grouped = pd.read_csv(
    'COVID-19-time-series-clean-complete.csv',
    parse_dates=['Date'],
)
is_uk = full_grouped['Country/Region'] == 'United Kingdom'
uk = full_grouped[is_uk]

In [17]:
# Shared bar-chart template for the UK data: the x axis is the month+day of
# 'Date' on an ordinal scale. The y encoding is added per chart.
base = (
    alt.Chart(uk)
    .mark_bar()
    .encode(x='monthdate(Date):O')
    .properties(width=500)
)

In [18]:
# Side-by-side totals: confirmed cases (default colour) and deaths (red).
red = alt.value('#f54242')
confirmed_chart = base.encode(y='Confirmed').properties(title='Total confirmed')
deaths_chart = base.encode(y='Deaths', color=red).properties(title='Total deaths')
confirmed_chart | deaths_chart

In [19]:
# Side-by-side daily figures: new cases (default colour) and new deaths (red).
red = alt.value('#f54242')
new_cases_chart = base.encode(y='New cases').properties(title='Daily new cases')
new_deaths_chart = base.encode(y='New deaths', color=red).properties(title='Daily new deaths')
new_cases_chart | new_deaths_chart

In [20]:
def group_by_country(country, path='COVID-19-time-series-clean-complete.csv'):
    """Return the rows of the cleaned time-series CSV for a single country.

    Parameters
    ----------
    country : str
        Value matched exactly against the 'Country/Region' column.
    path : str, optional
        CSV file to read. Defaults to the cleaned file written earlier in
        this notebook.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with 'Date' parsed as datetimes. Empty when no
        row matches ``country``.
    """
    table = pd.read_csv(path, parse_dates=['Date'])
    return table[table['Country/Region'] == country]

In [21]:
# Rebuild the shared bar-chart template for the chosen country.
country = 'Portugal'
df_country = group_by_country(country)

base = (
    alt.Chart(df_country)
    .mark_bar()
    .encode(x='monthdate(Date):O')
    .properties(width=500)
)

In [22]:
# Totals for the selected country: confirmed (default colour) next to deaths (red).
red = alt.value('#f54242')
country_confirmed = base.encode(y='Confirmed').properties(title='Total confirmed')
country_deaths = base.encode(y='Deaths', color=red).properties(title='Total deaths')
country_confirmed | country_deaths

In [23]:
# Daily figures for the selected country: new cases next to new deaths (red).
red = alt.value('#f54242')
country_new_cases = base.encode(y='New cases').properties(title='Daily new cases')
country_new_deaths = base.encode(y='New deaths', color=red).properties(title='Daily new deaths')
country_new_cases | country_new_deaths

In [24]:
import altair as alt
import pandas as pd

# Reload the cleaned table and keep only the countries being compared.
full_grouped = pd.read_csv(
    'COVID-19-time-series-clean-complete.csv',
    parse_dates=['Date'],
)
countries = [
    'US',
    'Italy',
    'China',
    'Spain',
    'Germany',
    'France',
    'Iran',
    'United Kingdom',
    'Switzerland',
    'Portugal',
    'Brazil',
]
in_selection = full_grouped['Country/Region'].isin(countries)
selected_countries = full_grouped[in_selection]

In [25]:
# Bubble chart: one row per country, one circle per day, with the circle size
# showing that day's new cases.
bubble_size = alt.Size(
    'New cases:Q',
    scale=alt.Scale(range=[0, 1000]),
    legend=alt.Legend(title='Daily new cases'),
)

alt.Chart(selected_countries).mark_circle().encode(
    x='monthdate(Date):O',
    y='Country/Region',
    color='Country/Region',
    size=bubble_size,
).properties(width=800, height=300)

In [26]:
# Display the United Kingdom subset built earlier (pandas elides the middle
# rows of a long frame when rendering it).
uk

Unnamed: 0.1,Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered
176,176,2020-01-22,United Kingdom,0,0,0.0,0.0,0,0,0
362,362,2020-01-23,United Kingdom,0,0,0.0,0.0,0,0,0
548,548,2020-01-24,United Kingdom,0,0,0.0,0.0,0,0,0
734,734,2020-01-25,United Kingdom,0,0,0.0,0.0,0,0,0
920,920,2020-01-26,United Kingdom,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
31424,31424,2020-07-08,United Kingdom,288511,44602,1378.0,242531.0,637,126,3
31610,31610,2020-07-09,United Kingdom,289154,44687,1378.0,243089.0,643,85,0
31796,31796,2020-07-10,United Kingdom,289678,44735,1378.0,243565.0,524,48,0
31982,31982,2020-07-11,United Kingdom,290504,44883,1378.0,244243.0,826,148,0


In [27]:
# File name of the historical data file
data_filename = 'COVID-19-time-series-clean-complete.csv'

# Fields kept for the training file, and the names they are given:
#   Date           -> Date            (unchanged)
#   Country/Region -> Country_Region
#   Confirmed      -> ConfirmedCases
#   Deaths         -> Fatalities
# Reading only these columns, instead of deleting the unwanted ones one by
# one, keeps the cell working whether or not the CSV carries an extra
# 'Unnamed: 0' index column. The column order follows the file.
df = pd.read_csv(
    data_filename,
    usecols=['Date', 'Country/Region', 'Confirmed', 'Deaths'],
)
df = df.rename(columns={
    "Country/Region": "Country_Region",
    "Confirmed": "ConfirmedCases",
    "Deaths": "Fatalities",
})
df.head()

Unnamed: 0,Date,Country_Region,ConfirmedCases,Fatalities
0,2020-01-22,Afghanistan,0,0
1,2020-01-22,Albania,0,0
2,2020-01-22,Algeria,0,0
3,2020-01-22,Andorra,0,0
4,2020-01-22,Angola,0,0


In [28]:
# Export the training table. index=True is the pandas default, spelled out
# here: the row index is written as an unnamed first column.
# NOTE(review): confirm the consumer of train.csv expects that index column.
df.to_csv('Bayesian_Model_Covid/data/train.csv', index=True)