In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

from scipy import signal

%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 10)
pd.set_option('display.max_rows', 500)

import plotly.graph_objects as go

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data load

In [2]:
# try to parse the dates right at the beginning 
# it works out of the box if the date was stored ISO YYYY-MM-DD format

df_analyse=df_plot=pd.read_csv('../data/processed/COVID_table_for_dash.csv',sep=';',
                       parse_dates=[0])  

df_analyse.sort_values('date',ascending=True).tail()

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,Burundi,Sierra Leone,Malawi,South Sudan,Western Sahara,Sao Tome and Principe,Yemen,Comoros,Tajikistan,Lesotho
188,2020-07-28,36368,4997,28615,907,1000,86,173355,37629,15582,...,378,1786,3709,2305,10,867,1703,354,7276,505
189,2020-07-29,36471,5105,29229,918,1078,91,178996,37937,16298,...,387,1803,3738,2322,10,868,1711,378,7320,576
190,2020-07-30,36542,5197,29831,922,1109,91,185373,38196,16903,...,387,1818,3858,2322,10,870,1726,378,7366,604
191,2020-07-31,36675,5276,30394,925,1148,91,191302,38550,17280,...,387,1823,4078,2322,10,871,1728,378,7409,604
192,2020-08-01,36710,5396,30950,925,1164,91,196543,38841,17269,...,395,1823,4186,2352,10,874,1730,386,7451,702


In [3]:
country_list=df_analyse.columns[1:]

# Helper functions

In [4]:
def quick_plot(x_in, df_input,y_scale='log',slider=False):
    """ Quick basic plot for quick static evaluation of a time series
    
        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]
        
        Parameters:
        ----------
        x_in : array 
            array of date time object, or array of numbers
        df_input : pandas dataframe 
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider
    
        
        Returns:
        ----------
        
    """
    fig = go.Figure()

    for each in df_input.columns:
        fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[each],
                        name=each,
                        opacity=0.8))
    
    fig.update_layout(autosize=True,
        width=1024,
        height=768,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color="#7f7f7f"
            )
        )
    fig.update_yaxes(type=y_scale),#range=y_range
    fig.update_xaxes(tickangle=-45,
                 nticks=20,
                 tickfont=dict(size=14,color="#7f7f7f")
                )
    if slider==True:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show(renderer='notebook_connected')
   ##fig.show(renderer='browser')

In [5]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True)

In [6]:
threshold=100

In [7]:
compare_list=[]
for pos,country in enumerate(df_analyse.columns[1:]):
    compare_list.append(np.array(df_analyse[country][df_analyse[country]>threshold]))

In [8]:
pd_sync_timelines=pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

In [9]:
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [10]:
pd_sync_timelines.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,Sierra Leone,Malawi,South Sudan,Western Sahara,Sao Tome and Principe,Yemen,Comoros,Tajikistan,Lesotho,date
0,110.0,104.0,139.0,113.0,113.0,,128.0,115.0,107.0,104.0,...,104.0,101.0,120.0,,174.0,106.0,106.0,128.0,134.0,0
1,110.0,123.0,201.0,133.0,118.0,,158.0,136.0,128.0,131.0,...,104.0,101.0,120.0,,174.0,122.0,106.0,230.0,184.0,1
2,120.0,146.0,230.0,164.0,130.0,,266.0,160.0,128.0,182.0,...,124.0,101.0,120.0,,187.0,128.0,106.0,293.0,184.0,2
3,170.0,174.0,264.0,188.0,138.0,,301.0,194.0,200.0,246.0,...,136.0,203.0,156.0,,208.0,130.0,132.0,379.0,233.0,3
4,174.0,186.0,302.0,224.0,140.0,,387.0,235.0,250.0,302.0,...,155.0,273.0,194.0,,208.0,167.0,132.0,461.0,245.0,4


In [11]:
plt = quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
               
           
           
           
           y_scale='log',
           slider=True)

## Doubling Rate

$N(t)=N_0*2^{t/T}$

In [12]:
def doubling_rate(N_0,t,T_d):
    return N_0*np.power(2,t/T_d)

In [71]:
max_days=192

norm_slopes={
    #'doubling every day':doubling_rate(100,np.arange(10),1),
    'doubling every two days':doubling_rate(100,np.arange(max_days),2),
    'doubling every four days':doubling_rate(100,np.arange(max_days),4),
    'doubling every ten days':doubling_rate(100,np.arange(max_days),10),
}

In [14]:
pd_sync_timelines_w_slope=pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [15]:
pd_sync_timelines_w_slope

Unnamed: 0,doubling every two days,doubling every 4 days,doubling every 10 days,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,...,Sierra Leone,Malawi,South Sudan,Western Sahara,Sao Tome and Principe,Yemen,Comoros,Tajikistan,Lesotho,date
0,100.0,100.0,100.0,110.0,104.0,139.0,113.0,113.0,,128.0,...,104.0,101.0,120.0,,174.0,106.0,106.0,128.0,134.0,0
1,141.4214,118.9207,107.1773,110.0,123.0,201.0,133.0,118.0,,158.0,...,104.0,101.0,120.0,,174.0,122.0,106.0,230.0,184.0,1
2,200.0,141.4214,114.8698,120.0,146.0,230.0,164.0,130.0,,266.0,...,124.0,101.0,120.0,,187.0,128.0,106.0,293.0,184.0,2
3,282.8427,168.1793,123.1144,170.0,174.0,264.0,188.0,138.0,,301.0,...,136.0,203.0,156.0,,208.0,130.0,132.0,379.0,233.0,3
4,400.0,200.0,131.9508,174.0,186.0,302.0,224.0,140.0,,387.0,...,155.0,273.0,194.0,,208.0,167.0,132.0,461.0,245.0,4
5,565.6854,237.8414,141.4214,237.0,197.0,367.0,267.0,142.0,,387.0,...,166.0,279.0,203.0,,208.0,184.0,132.0,522.0,256.0,5
6,800.0,282.8427,151.5717,273.0,212.0,409.0,308.0,148.0,,502.0,...,178.0,284.0,203.0,,208.0,197.0,132.0,612.0,256.0,6
7,1131.371,336.3586,162.4505,281.0,223.0,454.0,334.0,155.0,,589.0,...,199.0,336.0,236.0,,208.0,209.0,141.0,612.0,256.0,7
8,1600.0,400.0,174.1101,299.0,243.0,511.0,370.0,166.0,,690.0,...,225.0,358.0,236.0,,220.0,212.0,141.0,661.0,311.0,8
9,2262.742,475.6828,186.6066,349.0,259.0,584.0,376.0,172.0,,745.0,...,231.0,369.0,290.0,,235.0,222.0,141.0,729.0,359.0,9


In [16]:
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:,0:3],
           y_scale='log',
           slider=True)
#fig.update_meta.linestyle='--', (linestyle = '--'),

In [48]:
X = (pd_sync_timelines_w_slope.date, pd_sync_timelines.date)
Y = (pd_sync_timelines_w_slope.iloc[:,0:-1])
"""""quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.iloc[:,0:6],
           pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1])"""""
quick_plot(X,
                Y,
                y_scale='log',
                slider=True)
#plt = quick_plot(pd_sync_timelines.date,
        #pd_sync_timelines.iloc[:,:-1],
        #y_scale='log',
        #slider=True)
#plt2 = (pd_sync_timelines_w_slope.date,
        #pd_sync_timelines_w_slope.iloc[:,0:3]),
#fig.add_trace(plt2)""""

In [18]:
pd_sync_timelines_w_slope.to_csv('../data/processed/COVID_small_sync_timeline_table.csv',sep=';',index=False)

# Understanding Linear Regression

In [19]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)

In [20]:

l_vec=len(df_analyse['Germany'])
X=np.arange(l_vec-5).reshape(-1, 1)
y=np.log(np.array(df_analyse['Germany'][5:]))

In [21]:
# do a simple regression on all data

In [22]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
X_hat=np.arange(l_vec).reshape(-1, 1)
Y_hat=reg.predict(X_hat)

In [24]:
LR_inspect=df_analyse[['date','Germany']].copy()

In [25]:
LR_inspect['prediction']=np.exp(Y_hat)

In [26]:
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

# Doubling Rate - Piecewise Linear Regression

In [27]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
l_vec=len(df_analyse['Germany'])
X=np.arange(l_vec-5).reshape(-1, 1)
y=np.log(np.array(df_analyse['Germany'][5:]))

In [28]:
reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [29]:
reg.intercept_

4.5151433945700425

In [30]:
reg.coef_

array([0.05617512])

In [31]:
reg.coef_/reg.intercept_

array([0.01244149])

In [32]:
df_analyse

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,Burundi,Sierra Leone,Malawi,South Sudan,Western Sahara,Sao Tome and Principe,Yemen,Comoros,Tajikistan,Lesotho
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-01-26,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
5,2020-01-27,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
6,2020-01-28,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
7,2020-01-29,0,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
8,2020-01-30,0,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
9,2020-01-31,0,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0


In [33]:
def get_rate_via_regression (in_array):
    ' ' ' Use a linear regression to approximate the slope ' ' '
    y = np.array(in_array)
    X = np.arange(-1, 2).reshape(-1, 1)
    
    assert len (in_array) ==3
    
    reg.fit(X,y)
    intercept = reg.intercept_
    slope=reg.coef_
    return intercept/slope

In [34]:
country_list[60]

'Germany'

In [35]:
quick_plot(df_analyse.date,df_analyse.iloc[:,[60]],y_scale='linear')

In [36]:
def doubling_time(in_array):
    ''' Use a classical doubling time formular, 
     see https://en.wikipedia.org/wiki/Doubling_time '''
    y = np.array(in_array)
    return len(y)*np.log(2)/np.log(y[-1]/y[0])

In [37]:
df_analyse['Germany_DT_wiki'] = df_analyse['Germany'].rolling(window = 3,
                                                min_periods = 3).apply(doubling_time)

In [38]:
quick_plot(df_analyse.date,df_analyse.iloc[:,[60]],y_scale='linear')

In [39]:
from scipy import signal

In [40]:
## filter data
df_analyse['US'+'_filter']=signal.savgol_filter(df_analyse['US'],
                         3, # window size used for filtering
                           1) # order of fitted polynomial

In [41]:
start_pos=1
quick_plot(df_analyse.date[start_pos:],
           df_analyse[['US','US_filter']].iloc[start_pos:,:], #['US','US_filter'  filter_cols]
           y_scale='log',
           slider=True)

In [60]:
filter_cols = []
for each in country_list:
    filter_cols.append(each+'_filter')

In [61]:
# calculate slope of regression of last x days
# use always a limited number of days to approximate the triangle, attention exponential base assumption
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

KeyError: 'Afghanistan_filter'

In [70]:
quick_plot(df_analyse.date,df_analyse.iloc[150:],y_scale='linear')

In [63]:
df_analyse.columns
df_analyse.drop(columns=['date'])

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,Malawi,South Sudan,Western Sahara,Sao Tome and Principe,Yemen,Comoros,Tajikistan,Lesotho,Germany_DT_wiki,US_filter
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,0.8333333
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,1.333333
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,1.666667
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,3.0
4,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,,4.0
5,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0.0,5.0
6,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0.0,5.0
7,0,0,0,0,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,1.5,5.0
8,0,0,0,0,0,0,0,0,9,0,...,0,0,0,0,0,0,0,0,inf,5.666667
9,0,0,0,0,0,0,0,0,9,0,...,0,0,0,0,0,0,0,0,9.318851,6.666667


In [64]:
filter_cols = []
for each in country_list:
    filter_cols.append(each+'_filter')

In [65]:
# run on all filtered data
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

KeyError: 'Afghanistan_filter'

In [66]:
# cross check the matematical 
df_analyse['Germany_DR_math']=df_analyse['Germany'].rolling(
                                window=days_back,
                                min_periods=days_back).apply(doubling_time, raw=False)

In [67]:
# run on all filtered data
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

KeyError: 'Afghanistan_filter'

In [68]:
df_analyse.columns

Index(['date', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       ...
       'South Sudan', 'Western Sahara', 'Sao Tome and Principe', 'Yemen',
       'Comoros', 'Tajikistan', 'Lesotho', 'Germany_DT_wiki', 'US_filter',
       'Germany_DR_math'],
      dtype='object', length=192)

In [69]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[11,12,13,14]], #
           y_scale='linear',
           slider=True)

In [None]:
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[16,17,18,19]], #17,18,19   # US comparison 12,17
           y_scale='linear',
           slider=True)