## Evaluation Steps
1. Update all data
2. Process pipeline
3. Filtering and doubling rate calculation
4. Visual board

### 1. Update all data

In [1]:
# %load ../src/data/get_data.py
# Imports
import os, subprocess, json

# Environmental Variables
from dotenv import load_dotenv

import pandas as pd

# HTTP Client
import requests

# For parsing and sifting through HTML
from bs4 import BeautifulSoup


# Load environmental variables specified in .env
# NOTE(review): neither function defined in this cell reads an environment variable;
# confirm this call is still needed
load_dotenv()


def get_johns_hopkings():
    """ Update data from Johns Hopkings (GITHUB)

    Runs "git pull" inside the local clone of the Johns Hopkins COVID-19
    repository and prints the captured stdout and stderr of the command.
    If the pull does not finish within the timeout, a failure message is
    printed, the child process is killed and the output collected up to
    that point is printed.

    Parameters:
    ----------

    Returns:
    -------
    """

    # GIT PULL
    cmd= "git pull"
    cmd_wd= "../data/raw/JH_dataset/COVID-19"
    # Maximum time (in seconds) to wait for the pull to finish
    proc_timeout= 600

    # Pull from Git repo
    git_proc= subprocess.Popen(
        cmd,
        cwd=cmd_wd, shell=True,
        stdout= subprocess.PIPE, stderr= subprocess.PIPE
    )

    try:
        (git_proc_out, git_proc_err)= git_proc.communicate(timeout=proc_timeout)
    except subprocess.TimeoutExpired:
        # communicate() signals a timeout with subprocess.TimeoutExpired, which is
        # not a subclass of the builtin TimeoutError, and leaves the child running.
        print("Update operation on Johns Hopkins Dataset from GITHUB failed...\n")
        # Kill the child and finish communication so the output variables are
        # always bound and no process is left behind.
        git_proc.kill()
        (git_proc_out, git_proc_err)= git_proc.communicate()

    print("Output: " + str(git_proc_out))
    print("Error: " + str(git_proc_err))



def get_current_nigeria():
    """ Update data from Nigeria Centre for Disease Control (NCDC)

    Update data from NCDC via webscraping: the first HTML table found on
    https://covid19.ncdc.gov.ng/ is read into a DataFrame, the column
    "No. of Cases (on admission)" is dropped, "No. of Cases (Lab Confirmed)"
    is renamed to "No. of Cases" and the result is written to
    "../data/processed/NCDC.csv" (separator ";").

    Parameters:
    ----------

    Returns:
    -------

    Raises:
    ------
    requests.HTTPError
        if the server answers with an error status
    RuntimeError
        if the page contains no <table> element
    """
    # WEB SCRAPING
    # Pull page on COVID-19. The timeout keeps an unresponsive server from
    # blocking the update forever.
    page= requests.get("https://covid19.ncdc.gov.ng/", timeout=60)
    # Do not try to parse an error page
    page.raise_for_status()
    # Parse HTML
    parsed_page= BeautifulSoup(page.content, 'html.parser')
    # Pull Table
    html_table= parsed_page.find('table')
    if html_table is None:
        raise RuntimeError("No <table> element found on https://covid19.ncdc.gov.ng/")
    # Pull table rows
    table_rows= html_table.find_all('tr')

    # Table Header: position -> column title, taken from the first row.
    # Stays empty (instead of undefined) when the table has no rows.
    table_headers= dict()
    if table_rows:
        table_headers= {
            idx: col_header.get_text(strip=True)
            for idx, col_header in enumerate(table_rows[0].find_all('th'))
        }

    # Table data: one list of cell texts per row. Rows without <td> cells
    # (e.g. the header row) yield an empty list and are removed by dropna() below.
    table_data= [
        [ col.get_text(strip=True) for col in row.find_all('td') ]
        for row in table_rows
    ]

    # Make data into Pandas Frame
    pd_table= pd.DataFrame(table_data)
    # Remove empty rows
    pd_table= pd_table.dropna()
    # Insert column names
    pd_table= pd_table.rename(columns=table_headers)

    # Drop column "No. of Cases (on admission)"
    pd_table= pd_table.drop(["No. of Cases (on admission)"], axis=1)
    # Rename "No. of Cases (Lab Confirmed)"
    pd_table= pd_table.rename(
        columns={"No. of Cases (Lab Confirmed)": "No. of Cases"}
    )

    # UPDATE DATASET
    pd_table.to_csv(
        "../data/processed/NCDC.csv", sep=";",
    )
    print("Updated data for all {0} states in Nigeria.".format(pd_table.shape[0]))


if __name__ == "__main__":
    # Refresh both sources in order: Johns Hopkins first, then NCDC
    for update_source in (get_johns_hopkings, get_current_nigeria):
        update_source()


Output: b'Already up to date.\n'
Error: b'From https://github.com/CSSEGISandData/COVID-19\n   4abbeceb..2b07dfce  web-data   -> origin/web-data\n'
Updated data for all 37 states in Nigeria.


### 2. Process pipeline

In [2]:
# %load ../src/data/process_JH_data.py
# Imports
import os, subprocess, json
from datetime import datetime

# Environmental Variables
from dotenv import load_dotenv

import pandas as pd

# HTTP Client
import requests
# For parsing and sifting through HTML
from bs4 import BeautifulSoup


def store_relational_model():
    """ Process Johns Hopkings data into a Relational dataset

    Reads the global "confirmed" time series (one row per state/country,
    one column per date), reshapes it into one row per (date, state, country)
    with the columns "date", "state", "country" and "confirmed", and writes
    the result to "../data/processed/COVID_relational_full.csv" (separator ";").

    Parameters:
    ----------

    Returns:
    -------
    """
    # Read data into dataframe
    data_path= "../data/raw/JH_dataset/COVID-19/" + \
        "csse_covid_19_data/csse_covid_19_time_series/" + \
        "time_series_covid19_confirmed_global.csv"
    pd_raw= pd.read_csv(data_path)

    # Discard Lat and Long columns (drop returns a new frame, pd_raw stays untouched)
    rel_fr= pd_raw.drop(["Lat", "Long"], axis=1)

    # Rename columns for convienence
    rel_fr= rel_fr.rename(
        columns={"Province/State": "state", "Country/Region": "country"}
        )

    # Set missing states to 'no'. Important for indexing.
    # Only the state key is filled: filling the whole frame would also put the
    # string 'no' into the numeric case counts wherever a value is missing.
    rel_fr["state"]= rel_fr["state"].fillna('no')

    # Index data by (state, country)
    rel_fr= rel_fr.set_index(["state", "country"])
    # Make dates row headers and state/country column headers
    rel_fr= rel_fr.T
    # Stack the data by dates and reset indices
    rel_fr= rel_fr.stack(["state", "country"]).reset_index()
    # Set new column names
    rel_fr= rel_fr.rename(columns={"level_0": "date", 0:"confirmed"})

    # Convert the date labels (former column headers) to datetime type
    rel_fr["date"]= pd.to_datetime(rel_fr["date"])

    # UPDATE DATASET
    rel_fr.to_csv(
        "../data/processed/COVID_relational_full.csv", sep=";",index=False
    )
    print("Number of rows stored: {0}.".format(rel_fr.shape[0]))


if __name__ == "__main__":
    # Rebuild the relational CSV from the raw Johns Hopkins time series
    store_relational_model()

Number of rows stored: 63308.


### 3. Filtering and Doubling Rate Calculation

In [3]:
# %load ../src/features/build_features.py
import numpy as np
import pandas as pd
from sklearn import linear_model
from scipy import signal

# Create Linear Regression Model
# Module-level instance used by get_doubling_rate_via_regression, which re-fits it on every call
reg= linear_model.LinearRegression(fit_intercept= True)  


def get_doubling_rate_via_regression(in_array):
    """ Approximate the doubling time using linear regression.

    A straight line is fitted through exactly 3 datapoints placed at the
    positions -1, 0 and +1. The fitted intercept divided by the fitted slope
    is returned as the approximated number of days it takes for the number
    of infected people to double.

    Parameters:
    ----------
    in_array: List/ numpy Array
        input data, must hold exactly 3 entries

    Returns:
    -------
    doubling_time: intercept/slope of the fitted line
        indexable result (the __main__ check reads element 0)
    """

    # Exactly 3 datapoints are required
    assert len(in_array)==3

    # Regress the observations on the positions -1, 0, +1
    positions= np.arange(-1,2).reshape(-1,1)
    observations= np.array(in_array)
    reg.fit(positions, observations)

    return reg.intercept_/reg.coef_


def rolling_regression(df_input, col="confirmed"):
    """ Approximate the doubling time for every entry of a column.

    A window of 3 consecutive entries is moved over the column and
    get_doubling_rate_via_regression is applied to each complete window.

    Parameters:
    ----------
    df_input: pandas DataFrame
        input data
    col: string
        key to column which holds data entries

    Returns:
    -------
    result: pandas Series
    """

    # Window length; windows with fewer entries are not evaluated
    days_back= 3

    rolling_window= df_input[col].rolling(window=days_back, min_periods=days_back)
    return rolling_window.apply(get_doubling_rate_via_regression, raw=False)


def savgol_filter(df_input, col='confirmed', window=5):
    """ Filter data using savgol filter.

    Missing entries of the column are treated as 0 before filtering.
    The input frame is left unchanged; a copy with the extra column is returned.

    Parameters:
    ----------
    df_input: pandas DataFrame
        input data
    col: string
        key to column which holds data entries
    window: int
        window length handed to scipy.signal.savgol_filter (default 5)

    Returns:
    -------
    df_result: pandas DataFrame
        copy of df_input with additional column with name col+"_filtered"
    """

    # Polynomial order of the filter
    degree=1

    # Work on a copy so the caller's frame is not modified
    df_result=df_input.copy()

    filter_in= df_input[col].fillna(0)
    result= signal.savgol_filter(
            np.array(filter_in), window, degree
        )

    df_result[col+ "_filtered"]= result
    return df_result
    

def calc_filtered_data(df_input, filter_on='confirmed'):
    """ Filter data using savgol filter and return merged dataframe

    The filter is applied separately to every (state, country) group and the
    filtered values are merged back into df_input via the 'index' column.

    Parameters:
    ----------
    df_input: pandas DataFrame
        input data; must contain the columns 'state', 'country', 'index'
        and the column named by filter_on
    filter_on: string
        key to column which holds data entries on which to filter

    Returns:
    -------
    df_out: pandas DataFrame
        df_input with additional column with name filter_on+"_filtered"
    """

    # Assertion ('index' is the merge key used below)
    must_contain= set(['state', 'country', 'index', filter_on])
    assert must_contain.issubset(set(df_input.columns))

    filtered_col= filter_on+'_filtered'

    # group_keys=False keeps the original row index on the result. With the
    # group keys prepended (the pandas >= 2.0 default) a reset_index() would
    # collide with the 'state'/'country' columns still present in the result.
    pd_filt_res= df_input.groupby(
            ['state','country'], group_keys=False
        ).apply(savgol_filter, filter_on)

    df_out= pd.merge(df_input, pd_filt_res[['index', filtered_col]], on=['index'], how='left')

    return df_out


def calc_doubling_rate(df_input, double_on='confirmed'):
    """ Calculate doubling rate using linear regression and return merged dataframe

    rolling_regression is applied separately to every (state, country) group
    and the result is merged back into df_input via the 'index' column.

    Parameters:
    ----------
    df_input: pandas DataFrame
        input data; must contain 'state', 'country' and the column named by
        double_on, and is merged on a column named 'index'
    double_on: string
        key to column which holds data entries

    Returns:
    -------
    df_out: pandas DataFrame
        df_input with additional column with name double_on+"_DR"
    """

    # Assertion
    required_cols= {'state', 'country', double_on}
    assert required_cols.issubset(set(df_input.columns))

    rate_col= double_on+"_DR"

    # One rolling regression per (state, country) group
    grouped= df_input.groupby(['state','country'])
    pd_doub_res= grouped.apply(rolling_regression, double_on).reset_index()

    # 'level_2' is relabelled to 'index' and used as merge key
    # NOTE(review): this presumably requires the row labels of df_input to equal
    # its 'index' column - confirm against the caller
    pd_doub_res= pd_doub_res.rename(columns={'level_2': 'index', double_on: rate_col})

    return pd.merge(df_input, pd_doub_res[['index', rate_col]], on=['index'], how='left')



if __name__ == "__main__":
    # Test data
    test_data= np.array([2,4,6])
    # Expected result= 2
    result= get_doubling_rate_via_regression(test_data)
    # Compare with a tolerance: int() truncates, so a result of 1.999... would
    # fail an exact integer comparison
    assert np.isclose(result[0], 2)

    pd_JH_rel= pd.read_csv(
            '../data/processed/COVID_relational_full.csv', 
            sep=';', parse_dates=[0]
        )
    # Order chronologically, then materialise the row number as column 'index'
    # (used as merge key by calc_filtered_data and calc_doubling_rate)
    pd_JH_rel= pd_JH_rel.sort_values('date', ascending=True).reset_index(drop=True)
    pd_JH_rel= pd_JH_rel.reset_index()

    pd_res= calc_filtered_data(pd_JH_rel, filter_on='confirmed')
    pd_res= calc_doubling_rate(pd_res, double_on='confirmed')
    pd_res= calc_doubling_rate(pd_res, double_on='confirmed_filtered')
    
    
    # Cleanup confirmed_filtered_DR: keep the value only where more than 100
    # cases are confirmed (np.nan, since the np.NaN alias was removed in NumPy 2.0)
    DR_mask= pd_res['confirmed']>100
    pd_res['confirmed_filtered_DR']= pd_res['confirmed_filtered_DR'].where(DR_mask, other=np.nan)

    # Save
    pd_res.to_csv('../data/processed/COVID_final_set.csv', sep=';', index=False)
    
    print(pd_res.tail())



       index       date state   country  confirmed  confirmed_filtered  \
63303  63303 2020-09-15    no  Barbados      184.0               183.8   
63304  63304 2020-09-15    no   Belarus    74552.0             74553.0   
63305  63305 2020-09-15    no   Belgium    94795.0             94983.0   
63306  63306 2020-09-15    no   Albania    11672.0             11677.6   
63307  63307 2020-09-15    no  Zimbabwe     7576.0              7567.4   

       confirmed_DR  confirmed_filtered_DR  
63303    121.777778             166.090909  
63304    392.409850             387.094742  
63305    140.575124             112.833893  
63306     72.194357              70.335370  
63307    301.773333             347.728111  


### 4. Visual Board

In [None]:
# %load ../src/visualization/visualize.py
import numpy as np
import pandas as pd
import plotly.graph_objects as go

import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as dhtml
from dash.dependencies import Input, Output

import os

# Show the working directory, since the data path below is relative to it
print("Working Directory: {0}".format(os.getcwd()))

# Load the processed dataset that feeds the dropdown options and the plot
df_JH_data= pd.read_csv('../data/processed/COVID_final_set.csv', sep=';')

# Create figure (empty; handed to the graph component in the layout)
fig= go.Figure()

# Create Dash App
app= dash.Dash(external_stylesheets=[dbc.themes.LUX])

# Country List Select
# One dropdown entry per distinct country found in the loaded data
country_options= [
    {'label': country_name, 'value': country_name}
    for country_name in df_JH_data['country'].unique()
]
ctry_input= dbc.FormGroup([
    dhtml.H5("Select Countries"),
    dcc.Dropdown(
        id="country_dropdown",
        options=country_options,
        value=['Nigeria', 'Germany', 'Italy'],
        multi=True
    )
])

# Visualization Select
# Column of the dataset -> label shown in the dropdown
timeline_labels= {
    'confirmed': 'Confirmed Cases',
    'confirmed_filtered': 'Confirmed Cases Filtered',
    'confirmed_DR': 'Doubling Rate of Confirmed Cases',
    'confirmed_filtered_DR': 'Doubling Rate of Confirmed Cases Filtered',
}
timeline_options= [
    {'label': label, 'value': column}
    for column, label in timeline_labels.items()
]
vis_input= dbc.FormGroup([
    dhtml.H5("Select Timeline"),
    dcc.Dropdown(
        id="visual_time",
        options=timeline_options,
        value='confirmed',
        multi=False,
        clearable=False
    )
])

#Create layout
# Page structure: title, introductory paragraph, then one row holding the
# two dropdowns and the plot.
app.layout= dbc.Container(
    fluid=True,
    children=[
        # Header
        dhtml.H1("COVID-19 Dashboard Prototype", className="text-center"),
        dhtml.Br(),dhtml.Br(),
        # Adjacent string literals are joined by Python, which keeps the source
        # indentation out of the displayed text
        dhtml.P(children=[
            "A COVID-19 Dashboard Prototype developed using the Cross Industry "
            "Standard Process for Data Mining. The data is sourced from ",
            dhtml.A("Johns Hopkins University", href="https://github.com/CSSEGISandData/COVID-19"),
            " and the Doubling Times (the estimated number of days it will take for the current number of "
            "confirmed cases to double) are calculated using Linear Regression over a window of 3 days."
        ]),
        
        # Body
        dbc.Row([
            dbc.Col(md=6, lg=4, children=[ctry_input]),
            dbc.Col(md=6, lg=4, children=[vis_input]),
            dhtml.Br(),dhtml.Br(),
            # Plot
            dbc.Col(sm=12, children=[
                dbc.Col(dhtml.H4("Plots", className="text-center"), sm=12),
                dcc.Graph(figure=fig, id="main_figure")
            ]
            )
        ], className="align-items-center"
        )        
    ],
)

# Add callback for Dropdown

# Callback wrapper
@app.callback(
    Output("main_figure", "figure"),
    [
        Input("country_dropdown", "value"),
        Input('visual_time', 'value')
    ]
)
# Callback function
def update_fig(selected_countries, visual_name):
    """ Build the figure for "main_figure" from the current dropdown selection.

    For every selected country the values of all its states are combined per
    date: averaged for the doubling rate columns, summed otherwise.

    Parameters:
    ----------
    selected_countries: list of strings
        value of the "country_dropdown" dropdown; None or an empty list
        yields a figure without traces
    visual_name: string
        value of the "visual_time" dropdown, i.e. the column of df_JH_data to plot

    Returns:
    -------
    figure: dict
        dictionary with the keys "data" (one trace per country) and "layout"
    """

    is_doubling_rate= 'DR' in visual_name

    # Title
    if is_doubling_rate:
        my_yaxis={
            'type': 'log',
            'title': 'Approximated doubling rate over 3 days (the larger the number, the better)'
        }

    else:
        my_yaxis={
            'type': 'linear',
            'title': 'Confirmed cases (source: Johns Hopkins, linear-scale)'
        }

    # Only the numeric columns are aggregated. The text column 'state' is left
    # out because averaging strings raises a TypeError on pandas >= 2.0.
    value_cols= [
        'confirmed', 'confirmed_filtered',
        'confirmed_DR', 'confirmed_filtered_DR'
    ]

    #Traces
    traces= []
    # The selection may be None when nothing is chosen; treat it like an empty list
    for country in (selected_countries or []):

        # Selected country mask
        df_country= df_JH_data[df_JH_data['country']== country]

        # Aggregate country-wide data
        grouped= df_country.groupby(['country', 'date'])[value_cols]
        if is_doubling_rate:
            # If doubling rate is being calculated, use the mean over the states
            df_plot= grouped.mean().reset_index()
        else:
            # Otherwise, sum up the values for all states
            df_plot= grouped.sum().reset_index()

        # Add a trace
        traces.append(
            {
                "x": df_plot.date,
                "y": df_plot[visual_name],
                "mode":"markers+lines",
                "opacity": 0.8,
                "name": country
            }
        )

    # Layout
    fig_design= dict(
        xaxis_title="Timeline",
        xaxis={
            "tickangle": -75,
            "nticks": 20,
            "tickfont": dict(size=14, color="#7f7f7f")
        },
        yaxis=my_yaxis
    )

    return {
        "data": traces,
        "layout": fig_design
    }



if __name__ == "__main__":

    # Start the Dash server with debug mode on and the reloader switched off
    app.run_server(debug=True, use_reloader=False)


Working Directory: /home/faaizz/Desktop/SS_2020/EDS/personal/ads_covid-19
Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


In [18]:
# NOTE(review): exploratory lookup of the dhtml.A signature; not part of the
# evaluation steps above - consider removing this cell
help(dhtml.A)

Help on class A in module dash_html_components.A:

class A(dash.development.base_component.Component)
 |  A(children=None, id=undefined, n_clicks=undefined, n_clicks_timestamp=undefined, key=undefined, role=undefined, download=undefined, href=undefined, hrefLang=undefined, media=undefined, rel=undefined, shape=undefined, target=undefined, accessKey=undefined, className=undefined, contentEditable=undefined, contextMenu=undefined, dir=undefined, draggable=undefined, hidden=undefined, lang=undefined, spellCheck=undefined, style=undefined, tabIndex=undefined, title=undefined, loading_state=undefined, **kwargs)
 |  
 |  An A component.
 |  A is a wrapper for the <a> HTML5 element.
 |  For detailed attribute info see:
 |  https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a
 |  
 |  Keyword arguments:
 |  - children (a list of or a singular dash component, string or number; optional): The children of this component
 |  - id (string; optional): The ID of this component, used to identif