SECTION OBJECTIVES:
* Importing and Cleaning Dataset
* Building web application layout
* Build interactive elements
* Launch application

* WORKFLOW:
    * Preparing Data
        * Importing: `wrangle` function
    * Building Dashboard
        * Application layout: **Application layers**
        * Variance bar chart:
            * **Callback:** Allows interactive elements of a Dash application to update automatically
            * **Decorator:** Wraps a Python function to give it additional behavior
        * K-Means slider
        * PCA scatter plot
    * Application Deployment

In [1]:
# importing necessary libraries
import pandas as pd

import plotly.express as px
from dash import Input, Output, dcc, html # dash core components, html elements for layouts
from jupyter_dash import JupyterDash # used for building dash app inside Jupyter notebook

from scipy.stats.mstats import trimmed_var

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 1. Preparing Data
## Importing
### Wrangle Function

In [2]:
def wrangle(filepath):
    """Load the SCF data file and keep only the households of interest.

    A row is kept when ``TURNFEAR`` equals 1 (credit fearful) and
    ``NETWORTH`` is strictly below $2 million.

    Parameters
    ----------
    filepath : str
        Location of CSV file; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with every original column and the original
        index labels.
    """
    survey = pd.read_csv(filepath)
    # Name each condition separately so the filter mirrors the docstring
    is_credit_fearful = survey['TURNFEAR'] == 1
    below_net_worth_cap = survey['NETWORTH'] < 2e6
    return survey[is_credit_fearful & below_net_worth_cap]

In [3]:
# Load the filtered survey data into the module-level `df` that the
# business-layer functions in this notebook read directly
df = wrangle('SCFP2019.zip')
# Report the (rows, columns) shape of the filtered frame
print(df.shape)
# Last expression in the cell, so the notebook renders the first rows
df.head()

(4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


# 2. Building Dashboard
## 2.1 Application Layout
### Instantiating Application
Instantiating a `JupyterDash` application and assigning it to the variable name `app`

In [4]:
# `__name__` is a dunder variable (not a method) holding the current module's name
app = JupyterDash(__name__)

### Layout
* **NOTE:**
    * Layout needs to be updated as new business layers are added
        * **Presentation Layer:** What the user sees
        * **Service Layer:** Translation/Go-between Presentation and Business layers to ensure smooth changes
            * It gets input from other two layers and translates it into an output the other layers can understand
        * **Business Layer:** The actual code feeding data to presentation layer
    * Building a dash app is an iterative process

In [30]:
# Presentation layer: every component the callbacks read from or write to.
# The `id` values are the hooks used by the `Input`/`Output` declarations.
app.layout = html.Div(
    children=[
        # --- Page title ---
        html.H1('Survey of Consumer Finances'),

        # --- Variance section: bar chart plus trimmed / not-trimmed toggle ---
        html.H2('High Variance Features'),
        dcc.Graph(id='bar-chart'),
        dcc.RadioItems(
            id='trim-button',
            options=[
                dict(label='trimmed', value=True),
                dict(label='not trimmed', value=False),
            ],
            value=True,
        ),

        # --- Clustering section: k slider and a placeholder for the metrics ---
        html.H2('K-Means Clustering'),
        html.H3('Number of Clusters (k)'),
        dcc.Slider(id='k-slider', min=2, max=12, step=1, value=2),
        html.Div(id='metrics'),

        # --- PCA scatter plot of the clustered data ---
        dcc.Graph(id='pca-scatter'),
    ]
)

## 2.2 Variance Bar Chart
### Business Layer

In [6]:
def get_high_var_features(trimmed=True, return_feat_names=True):
    """Pick the five columns of ``df`` with the largest variance.

    Parameters
    ----------
    trimmed : bool, default = True
        If ``True``, each column's variance is computed with ``trimmed_var``
        (trimmed variance, removing bottom and top 10% of observations).
        If ``False``, ``DataFrame.var`` is used instead.

    return_feat_names : bool, default = True
        If ``True``, only the feature names are returned, as a ``list``.
        If ``False``, a ``Series`` is returned whose index is the feature
        names and whose values are the variances.

    Returns
    -------
    list or pandas.Series
        The five selected features, ordered from lowest to highest variance.
    """
    variances = df.apply(trimmed_var) if trimmed else df.var()
    # Ascending sort + tail keeps the five largest, smallest of them first
    leaders = variances.sort_values().tail(5)
    return leaders.index.tolist() if return_feat_names else leaders

### Service Layer

In [7]:
# Re-render the bar chart whenever the trim radio button changes
@app.callback(
    Output('bar-chart', 'figure'),
    Input('trim-button', 'value')
)
def serve_bar_chart(trimmed=True):
    """Build the horizontal bar chart of the five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_high_var_features``; ``True`` selects trimmed
        variance, ``False`` selects plain variance.

    Returns
    -------
    Figure
        The object produced by ``px.bar`` with variance on the x axis and
        feature name on the y axis.
    """
    # Ask for the Series form so both names (index) and variances are available
    variances = get_high_var_features(trimmed=trimmed, return_feat_names=False)
    chart = px.bar(x=variances, y=variances.index, orientation='h')
    # `update_layout` hands back the same figure, so it can be returned directly
    return chart.update_layout(xaxis_title='Variance', yaxis_title='Feature')

## 2.3 K-Means Slider
### Business Layer

In [18]:
def get_model_metrics(trimmed = True, k = 2, return_metrics = False):

    """Builds ``KMeans`` model based on five highest-variance features in ``df``.

    The model is a pipeline of ``StandardScaler`` followed by ``KMeans``, so
    clustering happens on standardized features. Both reported metrics are
    computed in that same standardized space.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    Returns
    -------
    sklearn.pipeline.Pipeline or dict
        The fitted pipeline when ``return_metrics`` is ``False``; otherwise a
        ``dict`` with keys ``'inertia'`` (rounded to an integer) and
        ``'silhouette'`` (rounded to three decimals).
    """
    # Getting high variance features
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    # Creating feature matrix
    X = df[features]
    # Building model
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters = k, random_state = 108)
    )
    # Fitting model
    model.fit(X)

    if not return_metrics:
        return model

    kmeans = model.named_steps['kmeans']
    # The clusters were found on standardized data, so the silhouette score
    # must be measured on the same standardized data; scoring the raw `X`
    # would evaluate the labels in a space the model never saw.
    X_scaled = model.named_steps['standardscaler'].transform(X)
    metrics = {
        'inertia': round(kmeans.inertia_),
        'silhouette': round(silhouette_score(X_scaled, kmeans.labels_), 3)
    }
    return metrics

### Service Layer
* Step 1: This function will get input for `k` from Presentation Layer
* Step 2: It will then use `get_model_metrics` to get that information
* Step 3: It will then put those metrics into an HTML element
* Step 4: Finally, it will pass that element on to the Presentation Layer

In [28]:
# Refresh the metrics text whenever the trim button or the k slider changes
@app.callback(
    Output('metrics', 'children'),
    Input('trim-button', 'value'),
    Input('k-slider', 'value')
)
def serve_metrics(trimmed=True, k=2):
    """Format the ``KMeans`` inertia and silhouette score as ``H3`` elements.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_model_metrics``; ``True`` selects features by
        trimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    list
        Two ``html.H3`` elements: the inertia line followed by the
        silhouette-score line.
    """
    scores = get_model_metrics(trimmed=trimmed, k=k, return_metrics=True)
    inertia_line = f'Inertia: {scores["inertia"]}'
    silhouette_line = f'Silhouette Score: {scores["silhouette"]}'
    return [html.H3(inertia_line), html.H3(silhouette_line)]

## 2.4 PCA Scatter Plot
### Business Layer

In [50]:
def get_pca_labels(trimmed = True, k = 2):

    """Reduces the high-variance features of ``df`` to two principal
    components and attaches ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        Columns ``'PC1'`` and ``'PC2'`` hold the PCA-transformed data;
        ``'labels'`` holds the cluster label of each row as a string. Rows
        are sorted by the numeric value of the label.
    """
    # Creating feature matrix
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    X = df[features]
    # Building Transformer
    transformer = PCA(n_components = 2, random_state = 108)
    # Transforming data and putting it into a DataFrame
    X_pca = pd.DataFrame(transformer.fit_transform(X), columns = ['PC1', 'PC2'])
    # Fitting the clustering model to obtain one label per row
    model = get_model_metrics(trimmed = trimmed, k = k, return_metrics = False)
    X_pca['labels'] = model.named_steps['kmeans'].labels_
    # Sort while the labels are still integers: sorting them as strings puts
    # '10' and '11' before '2' once k exceeds 10.
    X_pca = X_pca.sort_values('labels')
    # Cast to string only after sorting, keeping the column's string type
    X_pca['labels'] = X_pca['labels'].astype(str)

    return X_pca

### Service Layer

In [56]:
# Redraw the scatter plot whenever the trim button or the k slider changes
@app.callback(
    Output('pca-scatter', 'figure'),
    Input('trim-button', 'value'),
    Input('k-slider', 'value')
)
def serve_scatter_plot(trimmed=True, k=2):
    """Draw the 2D PCA scatter plot of ``df`` colored by ``KMeans`` label.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_pca_labels``; ``True`` selects features by
        trimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    Figure
        The object produced by ``px.scatter`` with ``PC1`` on the x axis,
        ``PC2`` on the y axis, and one color per label.
    """
    clustered = get_pca_labels(trimmed=trimmed, k=k)
    scatter = px.scatter(
        clustered,
        x='PC1',
        y='PC2',
        color='labels',
        title='PCA Representation of Clusters',
    )
    # Set the axis titles explicitly
    scatter.update_layout(xaxis_title='PC1', yaxis_title='PC2')
    return scatter

# 3. Application Deployment

In [57]:
# Start the Dash server with default settings (no arguments are passed)
app.run_server()

Dash app running on http://127.0.0.1:8050/
