# Covid-19

In [1]:
import os

import pandas as pd
import numpy as np


from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt

init_notebook_mode(connected=True)

In [2]:
#### set own path ######

# Directory that contains owid-covid-data.csv. Set the COVID_DATA_DIR
# environment variable to point somewhere else; when it is unset the file is
# looked up in the current working directory. Building the path explicitly
# (instead of os.chdir to a machine-specific absolute folder) keeps the
# notebook runnable on other machines and leaves the process working directory alone.
DATA_DIR = os.environ.get("COVID_DATA_DIR", os.getcwd())
df = pd.read_csv(os.path.join(DATA_DIR, "owid-covid-data.csv"))

In [3]:
# Derived columns used by the plots further down.

# Ratio of smoothed new deaths to smoothed new cases (both per million).
# A zero denominator yields +inf or -inf (the latter when the numerator is
# negative); both are turned into NaN so they cannot distort later means.
df['death_rate'] = (
    df['new_deaths_smoothed_per_million'] / df['new_cases_smoothed_per_million']
).replace([np.inf, -np.inf], np.nan)

# Total tests divided by population.
df['population_coverage'] = df['total_tests'] / df['population']

# Parsed copy of the 'date' column, used for date comparisons in later cells.
df['datetime'] = pd.to_datetime(df['date'])

In [4]:
# Summary statistics, transposed so that each column of df becomes a row
# and each statistic (count, mean, std, min, quartiles, max) becomes a column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,58698.0,29348.5,16944.8,0.0,14674.25,29348.5,44022.75,58697.0
total_cases,55046.0,176367.5,1784635.0,1.0,187.0,2171.0,24088.5,58767140.0
new_cases,57752.0,2035.146,19181.44,-8261.0,0.0,14.0,246.0,679970.0
new_cases_smoothed,56938.0,2002.46,18691.87,-552.0,0.857,19.857,258.0,597734.1
total_deaths,45535.0,7042.994,57379.81,1.0,13.0,87.0,757.0,1388661.0
new_deaths,57752.0,48.08862,400.7257,-1918.0,0.0,0.0,4.0,11938.0
new_deaths_smoothed,56938.0,47.79979,387.1711,-232.143,0.0,0.286,4.286,9797.714
total_cases_per_million,54753.0,3320.269,6549.652,0.001,94.416,626.368,3629.404,80968.1
new_cases_per_million,57688.0,41.39261,137.6125,-2212.545,0.0,2.2735,27.138,8652.658
new_cases_smoothed_per_million,56873.0,40.48079,105.9922,-269.978,0.277,3.977,30.825,2472.188


### Some columns are surprisingly negative, let's filter them

In [5]:
# Columns in which a value below zero marks the whole row as problematic.
non_negative_columns = [
    'new_cases',
    'new_deaths',
    'new_cases_smoothed',
    'new_deaths_smoothed',
    'new_cases_per_million',
    'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
    'new_tests',
    'new_tests_per_thousand',
]

# True for rows with a negative value in at least one of the columns above
# (NaN compares as False, so missing values do not flag a row).
has_negative_value = df[non_negative_columns].lt(0).any(axis=1)

# Rows whose location is 'World' are flagged as well.
is_world_row = df['location'].isin(['World'])

# Index labels of every flagged row.
problem_idx = df[has_negative_value | is_world_row].index

In [6]:
# Remove the rows flagged in problem_idx.
rows_to_keep = ~df.index.isin(problem_idx)
df = df.loc[rows_to_keep]

# Remove two specific dates and renumber the index from zero.
# NOTE(review): the reason for excluding these dates is not visible here —
# presumably incomplete data at the end of the extract; confirm.
excluded_dates = ['2020-09-18', '2020-09-19']
df = df.loc[~df['date'].isin(excluded_dates)].reset_index(drop=True)

Distribution graphs (histogram/bar graph) of sampled columns:

In [7]:
# The next two chunks don't work currently

#from mpl_toolkits.mplot3d import Axes3D
#from sklearn.preprocessing import StandardScaler

#plotPerColumnDistribution(df, 10, 5)

Correlation matrix:

In [8]:
#plotCorrelationMatrix(df, 8)

### New Cases, Tests, Deaths and positive rate over all countries

In [9]:
# 2x2 grid of daily series over all locations: summed smoothed tests, deaths
# and cases, plus the mean positive rate scaled by 100.
per_day = df.groupby('date')
days = per_day.size().index  # one entry per distinct date, in sorted order

tests_trace = go.Scatter(
    x=days,
    y=per_day['new_tests_smoothed'].sum().astype(int),
    xaxis='x2',
    yaxis='y2',
    name="new tests smoothed",
)
deaths_trace = go.Scatter(
    x=days,
    y=per_day['new_deaths_smoothed'].sum().astype(int),
    name="new deaths smoothed",
)
positive_rate_trace = go.Scatter(
    x=days,
    y=(per_day['positive_rate'].mean() * 100).round(3),
    xaxis='x3',
    yaxis='y3',
    name="test positive rate %",
)
cases_trace = go.Scatter(
    x=days,
    y=per_day['new_cases_smoothed'].sum().astype(int),
    xaxis='x4',
    yaxis='y4',
    name="new cases smoothed",
)

data = [tests_trace, deaths_trace, positive_rate_trace, cases_trace]

# Each axis pair occupies one quadrant of the figure.
first_half = [0, 0.45]
second_half = [0.55, 1]
layout = go.Layout(
    title='New tests, deaths, cases and test positive rate',
    xaxis={'domain': first_half},
    yaxis={'domain': first_half},
    xaxis2={'domain': second_half},
    yaxis2={'domain': first_half, 'anchor': 'x2'},
    xaxis3={'domain': first_half, 'anchor': 'y3'},
    yaxis3={'domain': second_half},
    xaxis4={'domain': second_half, 'anchor': 'y4'},
    yaxis4={'domain': second_half, 'anchor': 'x4'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [10]:
# 2x2 grid of per-day means over all locations.
per_day = df.groupby('date')
days = per_day.size().index  # one entry per distinct date, in sorted order

# Mean death rate per day; +inf is blanked and gaps are filled by forward
# linear interpolation before scaling to percent.
interpolated_death_rate = (
    per_day['death_rate'].mean()
    .replace([np.inf], np.nan)
    .interpolate(method='linear', limit_direction='forward', axis=0)
    * 100
).round(3)

# Mean of new_cases_per_million divided by one million, scaled by 100.
daily_case_share = (
    per_day['new_cases_per_million'].apply(lambda x: np.mean(x/1e+6)) * 100
).round(6)

deaths_per_million_trace = go.Scatter(
    x=days,
    y=per_day['new_deaths_smoothed_per_million'].mean(),
    xaxis='x2',
    yaxis='y2',
    name="mean new deaths smoothed per million",
)
tests_per_thousand_trace = go.Scatter(
    x=days,
    y=per_day['new_tests_smoothed_per_thousand'].mean(),
    name="mean new tests smoothed per thousand",
)
death_rate_trace = go.Scatter(
    x=days,
    y=interpolated_death_rate,
    xaxis='x3',
    yaxis='y3',
    name="interpolated death rate %",
)
case_share_trace = go.Scatter(
    x=days,
    y=daily_case_share,
    xaxis='x4',
    yaxis='y4',
    name="mean covid population d2d coverage %",
)

data = [
    deaths_per_million_trace,
    tests_per_thousand_trace,
    death_rate_trace,
    case_share_trace,
]

# Each axis pair occupies one quadrant of the figure.
first_half = [0, 0.45]
second_half = [0.55, 1]
layout = go.Layout(
    title='Mean new deaths per 1M, new tests per 1K, death rate and covid mean coverage',
    xaxis={'domain': first_half},
    yaxis={'domain': first_half},
    xaxis2={'domain': second_half},
    yaxis2={'domain': first_half, 'anchor': 'x2'},
    xaxis3={'domain': first_half, 'anchor': 'y3'},
    yaxis3={'domain': second_half},
    xaxis4={'domain': second_half, 'anchor': 'y4'},
    yaxis4={'domain': second_half, 'anchor': 'x4'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### New Deaths Smoothed

In [11]:
# One line per continent: daily sum of new_deaths_smoothed.
continent_palette = [
    ('Asia', 'green'),
    ('Europe', 'red'),
    ('Africa', 'blue'),
    ('North America', 'black'),
    ('South America', 'brown'),
]

data = []
for continent_name, line_color in continent_palette:
    continent_rows = df[df['continent'] == continent_name]
    daily_deaths = continent_rows.groupby(['date', 'continent'])['new_deaths_smoothed'].sum()
    data.append(
        go.Scatter(
            x=daily_deaths.index.get_level_values('date'),
            y=daily_deaths,
            mode="lines",
            name=continent_name,
            marker=dict(color=line_color),
        )
    )

layout = dict(
    title='New Deaths Smoothed',
    xaxis=dict(title='# deaths day by day', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

### New Tests Smoothed

In [12]:
# One line per continent: daily sum of new_tests_smoothed.
continent_palette = [
    ('Asia', 'green'),
    ('Europe', 'red'),
    ('Africa', 'blue'),
    ('North America', 'black'),
    ('South America', 'brown'),
]

data = []
for continent_name, line_color in continent_palette:
    continent_rows = df[df['continent'] == continent_name]
    daily_tests = continent_rows.groupby(['date', 'continent'])['new_tests_smoothed'].sum()
    data.append(
        go.Scatter(
            x=daily_tests.index.get_level_values('date'),
            y=daily_tests,
            mode="lines",
            name=continent_name,
            marker=dict(color=line_color),
        )
    )

layout = dict(
    title='New tests smoothed',
    xaxis=dict(title='# tests day by day', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

### Death rate

In [13]:
# One line per continent: daily mean of death_rate from 2020-03-01 onwards,
# expressed in percent.
#
# Every continent is scaled by 100 in the same place. Previously the
# South America trace was left as a raw fraction while the other four were
# percentages, so its line was off by a factor of 100 on the shared axis.
continent_palette = [
    ('Asia', 'green'),
    ('Europe', 'red'),
    ('Africa', 'blue'),
    ('North America', 'black'),
    ('South America', 'brown'),
]

since_march = df[df['datetime'] >= '2020-03-01']

data = []
for continent_name, line_color in continent_palette:
    continent_rows = since_march[since_march['continent'] == continent_name]
    # Mean over the continent's locations for each date, as a percentage.
    daily_rate_pct = continent_rows.groupby('date')['death_rate'].mean() * 100
    data.append(
        go.Scatter(
            x=daily_rate_pct.index,
            y=daily_rate_pct,
            mode="lines",
            name=continent_name,
            marker=dict(color=line_color),
        )
    )

layout = dict(
    title='Mean death rate over continents',
    xaxis=dict(title='mean deaths/cases %', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

### Test population coverage

In [14]:
# One line per continent: daily mean of population_coverage
# (total_tests / population) from 2020-03-01 onwards, expressed in percent.
#
# All continents read the same column and use the same scaling. Previously
# the Asia trace plotted death_rate instead of population_coverage, and the
# South America trace was not multiplied by 100 like the others.
continent_palette = [
    ('Asia', 'green'),
    ('Europe', 'red'),
    ('Africa', 'blue'),
    ('North America', 'black'),
    ('South America', 'brown'),
]

since_march = df[df['datetime'] >= '2020-03-01']

data = []
for continent_name, line_color in continent_palette:
    continent_rows = since_march[since_march['continent'] == continent_name]
    # Mean over the continent's locations for each date, as a percentage.
    daily_coverage_pct = continent_rows.groupby('date')['population_coverage'].mean() * 100
    data.append(
        go.Scatter(
            x=daily_coverage_pct.index,
            y=daily_coverage_pct,
            mode="lines",
            name=continent_name,
            marker=dict(color=line_color),
        )
    )

layout = dict(
    title='Mean population test coverage over continents',
    xaxis=dict(title='mean tests/population %', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

### Let's find some gdp_per_capita and new_cases clusters over countries

In [15]:
# Per (location, continent): summed new deaths and new cases, mean GDP per capita.
country_totals = (
    df.groupby(['location', 'continent'])
      .agg({'new_deaths': 'sum', 'gdp_per_capita': 'mean', 'new_cases': 'sum'})
      .reset_index()
)

# Keep only rows where every plotted field is present.
df_grouped = country_totals.dropna(
    subset=['new_deaths', 'new_cases', 'gdp_per_capita', 'continent']
)

In [16]:
# Bubble chart, one bubble per location: new_deaths on a log x-axis against
# gdp_per_capita, bubble size from new_cases, colour by continent.
fig = px.scatter(
    data_frame=df_grouped,
    x="new_deaths",
    y="gdp_per_capita",
    size="new_cases",
    color="continent",
    hover_name="location",
    log_x=True,
    size_max=60,
)
fig.show()

### Let's find some new_deaths_smoothed_per_million, handwashing_facilities and extreme_poverty clusters over countries

In [17]:
# Per (location, continent): mean handwashing facilities and extreme poverty,
# summed smoothed new deaths per million.
plotted_fields = ['handwashing_facilities', 'new_deaths_smoothed_per_million', 'extreme_poverty']
country_stats = (
    df.groupby(['location', 'continent'])
      .agg({
          'handwashing_facilities': 'mean',
          'new_deaths_smoothed_per_million': 'sum',
          'extreme_poverty': 'mean',
      })
      .reset_index()
)

# Keep only rows where every plotted field and the continent are present.
complete_rows = country_stats[plotted_fields + ['continent']].notna().all(axis=1)
df_grouped = country_stats[complete_rows]

In [18]:
# Bubble chart, one bubble per location: new_deaths_smoothed_per_million on a
# log x-axis against handwashing_facilities, bubble size from extreme_poverty.
fig = px.scatter(
    data_frame=df_grouped,
    x="new_deaths_smoothed_per_million",
    y="handwashing_facilities",
    size="extreme_poverty",
    color="continent",
    hover_name="location",
    log_x=True,
    size_max=60,
)
fig.show()

### Let's find some new_deaths_smoothed_per_million, aged_70_older and population_density clusters over countries

In [19]:
# Per (location, continent): mean population density and share aged 70+,
# summed smoothed new deaths per million.
country_stats = (
    df.groupby(['location', 'continent'])
      .agg({
          'population_density': 'mean',
          'new_deaths_smoothed_per_million': 'sum',
          'aged_70_older': 'mean',
      })
      .reset_index()
)

# Keep only rows where every plotted field is present.
df_grouped = country_stats.dropna(
    subset=['population_density', 'new_deaths_smoothed_per_million', 'aged_70_older', 'continent']
)

In [20]:
# Bubble chart, one bubble per location: new_deaths_smoothed_per_million on a
# log x-axis against aged_70_older, bubble size from population_density.
fig = px.scatter(
    data_frame=df_grouped,
    x="new_deaths_smoothed_per_million",
    y="aged_70_older",
    size="population_density",
    color="continent",
    hover_name="location",
    log_x=True,
    size_max=60,
)
fig.show()

### Let's find some new_deaths_smoothed_per_million, life_expectancy and hospital_beds_per_thousand clusters over countries

In [21]:
# Per (location, continent): mean life expectancy and hospital beds per
# thousand, summed smoothed new deaths per million.
plotted_fields = ['life_expectancy', 'new_deaths_smoothed_per_million', 'hospital_beds_per_thousand']
country_stats = (
    df.groupby(['location', 'continent'])
      .agg({
          'life_expectancy': 'mean',
          'new_deaths_smoothed_per_million': 'sum',
          'hospital_beds_per_thousand': 'mean',
      })
      .reset_index()
)

# Keep only rows where every plotted field and the continent are present.
complete_rows = country_stats[plotted_fields + ['continent']].notna().all(axis=1)
df_grouped = country_stats[complete_rows]

In [22]:
# Bubble chart, one bubble per location: new_deaths_smoothed_per_million on a
# log x-axis against life_expectancy, bubble size from hospital_beds_per_thousand.
fig = px.scatter(
    data_frame=df_grouped,
    x="new_deaths_smoothed_per_million",
    y="life_expectancy",
    size="hospital_beds_per_thousand",
    color="continent",
    hover_name="location",
    log_x=True,
    size_max=60,
)
fig.show()

### Stringency Index and death rate correlation

In [23]:
# Per (location, continent): mean death rate and stringency index, summed new cases.
country_stats = (
    df.groupby(['location', 'continent'])
      .agg({'death_rate': 'mean', 'stringency_index': 'mean', 'new_cases': 'sum'})
      .reset_index()
)

# Keep only rows where every plotted field is present.
df_grouped = country_stats.dropna(
    subset=['death_rate', 'stringency_index', 'new_cases', 'continent']
)

In [24]:
# Bubble chart, one bubble per location: death_rate on a log x-axis against
# stringency_index, bubble size from new_cases, colour by continent.
fig = px.scatter(
    data_frame=df_grouped,
    x="death_rate",
    y="stringency_index",
    size="new_cases",
    color="continent",
    hover_name="location",
    log_x=True,
    size_max=60,
)
fig.show()

### Let's check changes in TOP-4 countries by Cases starting from April

In [25]:
# Per (location, month): summed new deaths and new cases, mean GDP per capita.
# The month key is the first seven characters of the 'date' string
# (presumably 'YYYY-MM' — the later cell filters on values like '2020-04').
# It is attached to a temporary copy, so df itself ends up without a
# 'year_month' column.
monthly_totals = (
    df.assign(year_month=df['date'].str[:7])
      .groupby(['location', 'year_month'])
      .agg({'new_deaths': 'sum', 'gdp_per_capita': 'mean', 'new_cases': 'sum'})
      .reset_index()
)

# Keep only rows where every field is present.
df_grouped = monthly_totals.dropna(
    subset=['new_deaths', 'new_cases', 'gdp_per_capita', 'location']
)

In [26]:
# Restrict the monthly table to April through September 2020.
months_of_interest = ['2020-04', '2020-05', '2020-06', '2020-07', '2020-08', '2020-09']
in_selected_months = df_grouped['year_month'].isin(months_of_interest)
df_grouped = df_grouped[in_selected_months]

In [27]:
# Animated bubble chart, one facet per selected country and one frame per
# month: monthly new_cases (log x-axis) against monthly new_deaths.
selected_countries = ['United States', 'India', 'Brazil', 'Russia']
selected_rows = df_grouped[df_grouped['location'].isin(selected_countries)]

fig = px.scatter(
    data_frame=selected_rows,
    x="new_cases",
    y="new_deaths",
    animation_frame="year_month",
    animation_group="location",
    size="new_cases",
    color="location",
    hover_name="location",
    facet_col="location",
    log_x=True,
    size_max=60,
    range_x=[5000, 10000000],
    range_y=[100, 70000],
)
fig.show()