In [1]:
# Imports
import numpy as np

import pandas as pd

import plotly.graph_objects as go

import dash
import dash_core_components as dcc
import dash_html_components as dhtml
from dash.dependencies import Input, Output

from sklearn import linear_model


## CRISP-DM
![CRISP-DM](../reports/figures/CRISP_DM.png)

# Modelling- Spread
- We want to focus on modelling how the virus spreads

In [2]:
# Utility plotting function
def quick_plot(x_data, y_data, type="log", xslider=False):
    """ Plot every column of a dataframe as one trace over a shared x-axis

    Parameters:
    ----------
    x_data: array
        x-axis values (numbers or datetime objects) shared by all traces
    y_data: pandas dataframe
        data to plot; each column becomes one trace and the
        column name is used as the legend entry of that trace
    type: str
        y-axis scale, 'log' or 'linear'
    xslider: bool
        show a range slider on the x-axis when True

    Returns:
    -------
    None
        the figure is displayed with fig.show()
    """
    # One line+marker trace per column, in column order
    traces = [
        go.Scatter(
            x=x_data,
            y=y_data[col_name],
            mode="markers+lines",
            opacity=0.8,
            line_width=2,
            marker_size=5,
            name=col_name,
        )
        for col_name in y_data.columns
    ]
    fig = go.Figure(data=traces)

    # Figure size, axis titles and tick styling
    x_axis_style = {
        "tickangle": -45,
        "nticks": 20,
        "tickfont": dict(size=14, color="#7f7f7f"),
    }
    y_axis_style = {"type": type}
    fig.update_layout(
        width=900,
        height=600,
        xaxis_title="Time",
        xaxis=x_axis_style,
        yaxis_title="Quick Plot",
        yaxis=y_axis_style,
    )

    # Apply the requested scale to the y-axes
    fig.update_yaxes(type=type)

    # Optional range slider on the x-axis
    if xslider:
        fig.update_layout(xaxis_rangeslider_visible=True)

    fig.show()


In [3]:
# Load the compact dataset (semicolon-separated) and preview its last rows
df_base = pd.read_csv("../data/processed/COVID_flat_small.csv", delimiter=";")
df_base.tail(5)

Unnamed: 0,date,Spain,Nigeria,Germany,Afghanistan,Italy
172,2020-07-12,253908,32558,199919,34451,243061
173,2020-07-13,255953,33153,200180,34455,243230
174,2020-07-14,256619,33616,200456,34740,243344
175,2020-07-15,257494,34259,200890,34994,243506
176,2020-07-16,258855,34854,201450,35070,243736


In [4]:
# Try the quick plot: first column on the x-axis, all remaining columns as traces
quick_plot(
    x_data=df_base.iloc[:, 0],
    y_data=df_base.iloc[:, 1:],
    type="linear",
    xslider=True,
)

In [5]:
# Value at which every country's time series is cut so that all curves
# start from a common point and their behaviour can be compared
threshold = 1_000


In [6]:
# Every column except "date" holds the series of one country
country_list = df_base.drop(columns="date").columns

In [7]:
# For each country keep only the samples above the threshold.
# The resulting arrays have different lengths, so they are collected
# in a plain list (one array per country, in country_list order).
temp_list = [
    np.array(df_base[country][df_base[country] > threshold])
    for country in country_list
]

In [8]:
# Build a DataFrame with one row per country; shorter series are padded with NaN
df_thres = pd.DataFrame(data=temp_list)
df_thres

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,138
0,1073,1695,2277,2277,5232,6391,7798,9942,11748,13910,...,258855.0,,,,,,,,,
1,1095,1182,1273,1337,1532,1728,1932,2170,2388,2558,...,,,,,,,,,,
2,1040,1176,1457,1908,2078,3675,4585,5795,7272,9257,...,200890.0,201450.0,,,,,,,,
3,1026,1092,1176,1279,1351,1463,1531,1703,1828,1939,...,,,,,,,,,,
4,1128,1694,2036,2502,3089,3858,4636,5883,7375,9172,...,241956.0,242149.0,242363.0,242639.0,242827.0,243061.0,243230.0,243344.0,243506.0,243736.0


In [9]:
# Label each row with the name of its country
df_thres = df_thres.set_index(keys=country_list)
df_thres

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,138
Spain,1073,1695,2277,2277,5232,6391,7798,9942,11748,13910,...,258855.0,,,,,,,,,
Nigeria,1095,1182,1273,1337,1532,1728,1932,2170,2388,2558,...,,,,,,,,,,
Germany,1040,1176,1457,1908,2078,3675,4585,5795,7272,9257,...,200890.0,201450.0,,,,,,,,
Afghanistan,1026,1092,1176,1279,1351,1463,1531,1703,1828,1939,...,,,,,,,,,,
Italy,1128,1694,2036,2502,3089,3858,4636,5883,7375,9172,...,241956.0,242149.0,242363.0,242639.0,242827.0,243061.0,243230.0,243344.0,243506.0,243736.0


In [10]:
# Transpose so that each country becomes a column and each row a time step
df_thres = df_thres.transpose()
df_thres

Unnamed: 0,Spain,Nigeria,Germany,Afghanistan,Italy
0,1073.0,1095.0,1040.0,1026.0,1128.0
1,1695.0,1182.0,1176.0,1092.0,1694.0
2,2277.0,1273.0,1457.0,1176.0,2036.0
3,2277.0,1337.0,1908.0,1279.0,2502.0
4,5232.0,1532.0,2078.0,1351.0,3089.0
...,...,...,...,...,...
134,,,,,243061.0
135,,,,,243230.0
136,,,,,243344.0
137,,,,,243506.0


In [11]:
# x-axis for the aligned series: one step per row of df_thres
x_data = np.arange(len(df_thres))

# Quick plot of the aligned series (default log scale)
quick_plot(x_data=x_data, y_data=df_thres)

**Exponential Function**  
$N(t, T) = N_0 \cdot 2^{t/T}$  
$t$ - time (in days)  
$T$ - doubling time (e.g. $N(t, 2)$ describes a series that doubles every 2 days)  

In [12]:
def doubled_series(N_0, t, T):
    """ Calculate the series N_0 * 2**(t/T), i.e. a value that doubles every T time units

    Parameters:
    ----------
    N_0: double
        initial value (value of the series at t = 0)
    t: scalar or array-like Nx1
        time; lists and tuples are accepted as well as numpy arrays
    T: int or double
        doubling time, in the same unit as t; must be non-zero

    Returns:
    -------
    series: scalar or array Nx1
        the doubled series evaluated at every entry of t

    Raises:
    ------
    ValueError
        if T is zero
    """
    # A doubling time of zero would divide by zero below
    if np.any(np.equal(T, 0)):
        raise ValueError("T (doubling time) must be non-zero")

    # np.true_divide also handles plain Python sequences, for which t/T is undefined
    return N_0 * np.power(2.0, np.true_divide(t, T))

In [13]:
# Doubling times (in days) for which reference series are generated
d_rates = [10, 12, 15]

# Legend label -> reference series starting at the threshold value
df_series = {
    "doubling every {0} days".format(rate): doubled_series(threshold, x_data, rate)
    for rate in d_rates
}

In [14]:
# Put the reference doubling series side by side with the aligned country series
df_sync_with_slopes = pd.concat(
    [pd.DataFrame(df_series), df_thres],
    axis="columns",
)

In [15]:
# Plot the country data together with the doubling series,
# which act as normalized reference slopes
quick_plot(x_data=x_data, y_data=df_sync_with_slopes)

## Understanding Linear Regression

In [16]:
# Linear regression model without an intercept term:
# fit_intercept=False fixes the y-axis intercept at zero
reg = linear_model.LinearRegression(fit_intercept=False)

In [72]:
# Target values for the fit: the Nigeria column as a numpy array
y = np.array(df_base.loc[:, "Nigeria"])

In [73]:
# Positions of all samples greater than zero (tuple with one index array)
not_0_idx = np.nonzero(y > 0)
# Drop the leading samples before the first non-zero value
first_nonzero = not_0_idx[0][0]
y = y[first_nonzero:]

In [74]:
# scikit-learn expects X with shape (n_samples, n_features):
# one row per sample and one column per feature
nig_row = np.arange(len(y))
# Adding a trailing axis turns the 1-D sample index into an (n, 1) column
X = nig_row[:, np.newaxis]

#### Linear Regression Fit

In [75]:
# Fit the zero-intercept line to the truncated series
reg.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [76]:
# Inspect the solution: predict on the same inputs that were used for fitting
X_hat = X
y_hat = reg.predict(X_hat)

In [77]:
# Dates and reported values for inspection, truncated to start from the
# first non-zero index so that the rows match the fitted data one-to-one.
# NOTE: the former `LR_inspect.set_index= "date"` did not set an index; it
# replaced the DataFrame's set_index method with a string. "date" stays a
# regular column because the plotting cells read LR_inspect["date"].
# .copy() makes the frame independent of df_base, so adding columns to it
# later does not act on a slice of another frame.
LR_inspect = df_base[["date", "Nigeria"]].iloc[not_0_idx[0][0]:].copy()

In [78]:
# Store the fitted values next to the reported ones
LR_inspect["prediction"] = y_hat

In [79]:
# Show the reported values next to the linear-regression prediction
LR_inspect

Unnamed: 0,date,Nigeria,prediction
37,2020-02-28,1,0.000000
38,2020-02-29,1,152.712686
39,2020-03-01,1,305.425371
40,2020-03-02,1,458.138057
41,2020-03-03,1,610.850742
...,...,...,...
172,2020-07-12,32558,20616.212545
173,2020-07-13,33153,20768.925231
174,2020-07-14,33616,20921.637916
175,2020-07-15,34259,21074.350602


In [80]:
# Quick plot of every column after "date" against the date, on a linear scale
quick_plot(
    x_data=LR_inspect["date"],
    y_data=LR_inspect.iloc[:, 1:],
    type='linear',
    xslider=True,
)

### Data Transformation to Log Domain
Here, we transform our data into the logarithmic domain before applying linear regression.  
However, it's imperative to remember that the prediction outcome must be transformed back to the 
linear domain for comparison with the original data.

In [102]:
# Fresh model for the log-domain fit; the intercept is estimated this time
# (fit_intercept=True is the scikit-learn default, spelled out for clarity)
reg = linear_model.LinearRegression(fit_intercept=True)

In [103]:
# Slice out the leading zero-values first, then transform to the log domain.
# Taking the log before slicing evaluates log(0) = -inf for the leading
# zeros and raises a divide-by-zero RuntimeWarning.
y = np.array(df_base["Nigeria"])[not_0_idx[0][0]:]
y = np.log(y)

In [110]:
# Regressor input: the sample index as a single-feature column of shape (n, 1)
nig_row = np.arange(len(y))
X = nig_row[:, np.newaxis]

# Fit a line to the log-transformed data
reg.fit(X, y)

# Predict in the log domain and map the result back to the linear domain
y_hat = np.exp(reg.predict(X))

# Overwrite the earlier prediction column with the log-domain fit
LR_inspect["prediction"] = y_hat

# Plot reported values and prediction on a logarithmic y-axis
quick_plot(
    x_data=LR_inspect["date"],
    y_data=LR_inspect.iloc[:, 1:],
    type='log',
    xslider=True,
)