In [1]:
import pandas as pd
import plotly.express as px
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted

In [8]:
# for interactive dashboards
from jupyter_plotly_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
#JupyterDash.infer_jupyter_proxy_config()

In [5]:
#pip install jupyter-plotly-dash

In [None]:
def wrangle(filepath):
    """Load the SCF survey file and keep only the credit-fearful subset.

    Parameters
    ----------
    filepath : str
        Location of CSV file.

    Returns
    -------
    pd.DataFrame
        Rows of the file where ``TURNFEAR`` equals 1 and ``NETWORTH`` is
        strictly below 2e6. All columns and the original index labels are kept.
    """
    survey = pd.read_csv(filepath)

    # Build each condition separately so the filter reads like the docstring.
    is_credit_fearful = survey['TURNFEAR'] == 1
    below_networth_cap = survey['NETWORTH'] < 2e6

    return survey[below_networth_cap & is_credit_fearful]

In [None]:
# Load the filtered survey into the module-level ``df`` that the helper
# functions further down read directly.
df = wrangle("data/SCFP2019.csv.gz")

# Quick sanity check of what was loaded; the trailing ``df.head()`` is the
# cell's displayed output.
print("df type:", type(df))
print("df shape:", df.shape)
df.head()

In [10]:
# Create the dashboard application object; the layout and the callbacks
# defined in later cells are attached to this ``app``.
app = JupyterDash(__name__)

print("app type:", type(app))
#app.run_server()

app type: <class 'jupyter_plotly_dash.dash_wrapper.JupyterDash'>


In [None]:
# Page layout: a title, the variance bar chart with its trimmed/not-trimmed
# toggle, the cluster-count slider with the metrics read-out, and the PCA scatter.
app.layout = html.Div(
    [
        # Application title
        html.H1("Survey of Consumer Finances"),
        # Bar chart section
        html.H2("High Variance Features"),
        # No initial ``figure`` is passed here: ``serve_bar_chart`` is defined in
        # a later cell, so calling it at layout time raises ``NameError`` on a
        # fresh kernel. The callback registered on "bar-chart" fills the figure
        # when the page first loads.
        dcc.Graph(id="bar-chart"),
        dcc.RadioItems(
            options=[
                {"label": "trimmed", "value": True},
                {"label": "not trimmed", "value": False},
            ],
            value=True,  # default selection
            id='trim-button',  # component id referenced by the callbacks
        ),
        # K-means clustering section
        html.H2("K-means Clustering"),
        html.H3("Number of Clusters (k)"),
        dcc.Slider(min=2, max=12, step=1, value=2, id='k-slider'),
        # Placeholder filled by the metrics callback
        html.Div(id='metrics'),
        # PCA scatter plot, filled by its callback
        dcc.Graph(id="pca-scatter"),
    ]
)
        

In [None]:
#app.run_server(host="0.0.0.0", mode="external")

In [None]:
def get_high_var_features(trimmed=True, return_feat_names = False):
    """Pick the five columns of the module-level ``df`` with the largest variance.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, variance is computed with ``trimmed_var`` (called with
        its default limits) on each column; otherwise ``DataFrame.var`` is used.

    return_feat_names : bool, default=False
        If ``True``, return only the feature names as a ``list``. If ``False``,
        return a ``Series`` indexed by feature name whose values are the
        variances.

    Returns
    -------
    list or pd.Series
        The five highest-variance features, ordered from smallest to largest
        variance.
    """
    # Variance of every column, by whichever estimator was requested.
    variances = df.apply(trimmed_var) if trimmed else df.var()

    # Ascending sort, then keep the last five entries (the largest ones).
    top_five = variances.sort_values().iloc[-5:]

    if return_feat_names:
        return top_five.index.to_list()
    return top_five

In [None]:
# adding callback decorator
@app.callback(
    Output("bar-chart", "figure"), Input("trim-button", "value")
)
def serve_bar_chart(trimmed=True):
    """Draw a horizontal bar chart of the five highest-variance features.

    Registered as the callback that updates the "bar-chart" figure whenever
    the "trim-button" value changes.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_high_var_features`` to choose between trimmed and
        plain variance.

    Returns
    -------
    Figure
        The figure built by ``px.bar`` with relabelled axes.
    """
    # Series of variances indexed by feature name.
    variances = get_high_var_features(trimmed, return_feat_names =False)

    # The section heading in the layout already titles the chart, so only the
    # axis labels are set here.
    return px.bar(
        x = variances,
        y = variances.index,
        orientation = "h"
    ).update_layout(xaxis_title = 'Variance', yaxis_title = 'Feature')

In [None]:
# Preview the bar chart inline, outside the dashboard, using trimmed variance.
serve_bar_chart(trimmed=True)

In [None]:
def get_model_metrics(trimmed=True, k=2, return_metrics=False):
    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    The model is a pipeline of ``StandardScaler`` followed by ``KMeans`` and is
    fitted on the selected columns of the module-level ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, the features are chosen by trimmed variance; otherwise by
        plain variance (see ``get_high_var_features``).

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns the fitted pipeline. If ``True`` returns ``dict``
        with inertia and silhouette score.

    Returns
    -------
    Pipeline or dict
        The fitted pipeline, or ``{"inertia": ..., "silhouette": ...}`` with
        inertia rounded to the nearest integer and silhouette rounded to three
        decimals.
    """
    # Select the feature matrix.
    features = get_high_var_features(trimmed=trimmed, return_feat_names = True)
    X = df[features]

    # Scale, then cluster.
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters = k, random_state= 42)
    )
    model.fit(X)

    if not return_metrics:
        return model

    kmeans = model.named_steps['kmeans']
    # The labels were assigned in the standardized feature space, so the
    # silhouette score must be measured there as well. Scoring on the raw ``X``
    # would use distances the clustering never saw.
    X_scaled = model.named_steps['standardscaler'].transform(X)

    metrics = {
        "inertia" : round(kmeans.inertia_),
        "silhouette" : round(silhouette_score(X_scaled, kmeans.labels_), 3)
    }
    return metrics

In [None]:
@app.callback(
    Output("metrics", "children"),
    # Input order must match the order of serve_metrics' arguments.
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_metrics(trimmed = True, k=2):
    """Render the clustering metrics as two ``H3`` elements.

    Registered as the callback that fills the "metrics" container whenever the
    "trim-button" or "k-slider" value changes.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_model_metrics`` to choose between trimmed and plain
        variance for feature selection.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    list
        Two ``html.H3`` elements: the inertia, then the silhouette score.
    """
    scores = get_model_metrics(trimmed=trimmed, k=k, return_metrics=True)
    inertia = scores['inertia']
    silhouette = scores['silhouette']

    return [
        html.H3(f"Inertia: {inertia}"),
        html.H3(f"Silhouette Score: {silhouette}"),
    ]

In [None]:
def get_pca_labels(trimmed = True, k=2):
    """Project the high-variance features onto two principal components and
    attach the ``KMeans`` cluster labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, the features are chosen by trimmed variance; otherwise by
        plain variance (see ``get_high_var_features``).

    k : int, default=2
        Number of clusters.

    Returns
    -------
    pd.DataFrame
        Columns ``PC1``, ``PC2`` and ``labels`` (cluster id as ``str``), with
        rows ordered by numeric cluster id.
    """
    # Create feature matrix.
    features = get_high_var_features(trimmed = trimmed, return_feat_names=True)
    X = df[features]

    # Reduce to two principal components.
    transformer = PCA(n_components = 2, random_state=42)
    X_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(X_t, columns = ['PC1', 'PC2'])

    # Attach cluster labels; the label array is in the same row order as ``X``.
    model = get_model_metrics(trimmed = trimmed, k=k, return_metrics=False)
    X_pca['labels'] = model.named_steps['kmeans'].labels_

    # Sort while the labels are still integers. Sorting them as strings puts
    # "10" and "11" before "2", which scrambles the order once k exceeds 10.
    X_pca = X_pca.sort_values('labels', kind='mergesort')

    # Cast to str afterwards, so the labels are categorical rather than numeric
    # for the plot.
    X_pca['labels'] = X_pca['labels'].astype(str)

    return X_pca

In [None]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button","value"),
    Input("k-slider","value")
)
def serve_scatter_plot(trimmed =True, k=2):
    """Draw the 2D PCA scatter plot coloured by ``KMeans`` label.

    Registered as the callback that updates the "pca-scatter" figure whenever
    the "trim-button" or "k-slider" value changes.

    Parameters
    ----------
    trimmed : bool, default=True
        Forwarded to ``get_pca_labels`` to choose between trimmed and plain
        variance for feature selection.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    Figure
        The figure built by ``px.scatter`` with relabelled axes.
    """
    # Named ``pca_points`` rather than ``df`` so the module-level survey frame
    # is not shadowed inside this function.
    pca_points = get_pca_labels(trimmed = trimmed, k=k)

    return px.scatter(
        data_frame = pca_points,
        x = 'PC1',
        y = 'PC2',
        color = 'labels',
        title = "PCA Representation of Clusters"
    ).update_layout(xaxis_title = "PC1", yaxis_title = "PC2")

In [None]:

# TODO: decide where to host this app and its data set so the example can be shared.
# NOTE(review): ``mode="external"`` looks like the ``jupyter_dash`` API, whereas
# ``app`` is created from ``jupyter_plotly_dash`` — confirm this call is supported.
# NOTE(review): host "0.0.0.0" presumably exposes the server on all network
# interfaces — confirm that is intended outside a sandboxed environment.
app.run_server(host="0.0.0.0", mode="external")