## Libraries

In [1]:
#Import Libraries
import pandas as pd
import plotly.express as px
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


## Import Data

In [2]:
def wrangle(filepath):
    """Load an SCF CSV file and keep only the households of interest.

    A row is kept when its ``TURNFEAR`` value equals 1 (credit fearful)
    and its ``NETWORTH`` value is strictly below $2 million.

    Parameters
    ----------
    filepath : str
        Location of CSV file.

    Returns
    -------
    pd.DataFrame
        The matching rows, with every original column and the original
        row index preserved.
    """
    survey = pd.read_csv(filepath)

    # Build each condition separately so the filter reads like the docstring.
    is_credit_fearful = survey["TURNFEAR"] == 1
    below_net_worth_cap = survey["NETWORTH"] < 2e6

    return survey[is_credit_fearful & below_net_worth_cap]

In [3]:
# Load the filtered survey data once; the helper functions defined in later
# cells read this module-level ``df``.
df = wrangle(filepath="SCFP2019.csv")

# Show the dimensions as text, then let the notebook render the first rows.
print(df.shape)
df.head()

(4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


## Build Dashboard

In [4]:
# Instantiate the JupyterDash application. The layout assigned in the next
# cell and the callbacks registered further down all attach to this object.
app = JupyterDash(__name__)

In [5]:
app.layout = html.Div(
    children=[
        # Application title
        html.H1("Survey of Consumer Finances"),
        # Bar chart section heading
        html.H2("High Variance Features"),
        # Bar chart graph; its figure is supplied by a callback
        dcc.Graph(id="bar-chart"),
        # Toggle between trimmed and untrimmed variance
        dcc.RadioItems(
            id="trim-button",
            options=[
                {"label": "trimmed", "value": True},
                {"label": "not trimmed", "value": False},
            ],
            value=True,
        ),
        # K-means section: slider for the number of clusters
        html.H2("K-means Clustering"),
        html.H3("Number of Clusters (k)"),
        dcc.Slider(id="k-slider", min=2, max=12, step=1, value=2),
        # Container whose children (model metrics) are supplied by a callback
        html.Div(id="metrics"),
        # PCA scatter plot; its figure is supplied by a callback
        dcc.Graph(id="pca-scatter"),
    ]
)

## Variance Bar Chart

In [6]:
def get_high_var_features(trimmed=True, return_feat_names=True):
    """Find the five columns of the module-level ``df`` with the largest variance.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations. If ``False``, uses ``DataFrame.var``.

    return_feat_names : bool, default=True
        If ``True``, returns feature names as a ``list``. If ``False``
        returns ``Series``, where index is feature names and values are
        variances.

    Returns
    -------
    list or pd.Series
        The five highest-variance features in ascending order of variance,
        so the feature with the largest variance comes last.
    """
    # Choose the variance estimator, then rank every column with it.
    variances = df.apply(trimmed_var) if trimmed else df.var()

    # Ascending sort + tail keeps the five largest, biggest last.
    highest = variances.sort_values().tail(5)

    if return_feat_names:
        return highest.index.tolist()
    return highest

In [7]:
# Callback: redraw the bar chart whenever the trim radio button changes.
@app.callback(
    Output("bar-chart", "figure"), Input("trim-button", "value")
)
def serve_bar_chart(trimmed=True):
    """Returns a horizontal bar chart of five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    Returns
    -------
    plotly figure
        Horizontal bar chart with variance on the x-axis and feature name
        on the y-axis.
    """
    # Request the Series (names + variances) rather than just the names,
    # because both are needed for the chart.
    top_five_features = get_high_var_features(
        trimmed=trimmed, return_feat_names=False
    )

    # Build bar chart: bar length is the variance, one bar per feature.
    fig = px.bar(
        x=top_five_features, y=top_five_features.index, orientation="h"
    )
    # Axis title previously read "varience" (typo shown to the user).
    fig.update_layout(xaxis_title="variance", yaxis_title="feature")

    return fig

## K-means Slider

In [8]:
def get_model_metrics(trimmed=True, k=2, return_metrics=False):
    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    The model is a pipeline of ``StandardScaler`` followed by ``KMeans``,
    fitted on the selected columns of the module-level ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    Returns
    -------
    sklearn Pipeline or dict
        The fitted pipeline, or a ``dict`` with keys ``"inertia"`` (rounded
        to an integer) and ``"silhoutte_score"`` (rounded to 3 decimals).
    """
    # Get high-variance features and build the feature matrix.
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]

    # Build and fit the model.
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42))
    model.fit(X)

    if not return_metrics:
        return model

    kmeans = model.named_steps["kmeans"]

    # KMeans was fitted on the standardized data, so score the clustering in
    # that same space. Scoring the raw ``X`` would measure distances the
    # model never saw and would be inconsistent with the reported inertia.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    ss = silhouette_score(X_scaled, kmeans.labels_)

    # NOTE: the key "silhoutte_score" is misspelled but is kept as-is because
    # serve_metrics looks it up under this exact name.
    return {
        "inertia": round(kmeans.inertia_),
        "silhoutte_score": round(ss, 3),
    }

In [9]:
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value"),
)
def serve_metrics(trimmed=True, k=2):
    """Build the ``H3`` elements that report inertia and silhouette score
    for the ``KMeans`` model.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    list
        Two ``html.H3`` elements: inertia first, silhouette score second.
    """
    # Fit the model for the current settings and fetch its metrics dict.
    scores = get_model_metrics(trimmed=trimmed, k=k, return_metrics=True)

    # One heading per metric, in display order.
    inertia_heading = html.H3(f"Inertia:{scores['inertia']}")
    silhouette_heading = html.H3(f"Silhouette Score:{scores['silhoutte_score']}")

    return [inertia_heading, silhouette_heading]

## PCA Scatterplot

In [10]:
def get_pca_labels(trimmed=True, k=2):
    """Project the high-variance features onto two principal components and
    attach ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    pd.DataFrame
        Columns ``"PCA1"``, ``"PCA2"`` and ``"labels"`` (cluster label as a
        string), with rows ordered by cluster number.
    """
    # Create feature matrix from the five highest-variance features.
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]

    # Reduce the features to two components for plotting.
    transformer = PCA(n_components=2, random_state=42)
    X_pca = pd.DataFrame(transformer.fit_transform(X), columns=["PCA1", "PCA2"])

    # Add cluster labels from a KMeans model fitted with the same settings.
    model = get_model_metrics(trimmed=trimmed, k=k, return_metrics=False)
    X_pca["labels"] = model.named_steps["kmeans"].labels_

    # Sort while the labels are still integers: sorting them as strings puts
    # "10" and "11" before "2" once k exceeds 10.
    X_pca = X_pca.sort_values("labels")

    # Cast to string only after sorting, so consumers get string labels as before.
    X_pca["labels"] = X_pca["labels"].astype(str)

    return X_pca

In [11]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value"),
)
def serve_scatter_plot(trimmed=True, k=2):
    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    plotly figure
        Scatter plot of the two principal components, colored by cluster
        label.
    """
    fig = px.scatter(
        data_frame=get_pca_labels(trimmed=trimmed, k=k),
        x="PCA1",
        y="PCA2",
        color="labels",
        title="PCA rep of clusters",
    )
    # The y-axis plots the "PCA2" column; it was previously titled "PCA1".
    fig.update_layout(xaxis_title="PCA1", yaxis_title="PCA2")
    return fig

## Dash Application deployment

In [12]:
# Start the Dash server with debug mode enabled, but only when this code is
# executed as the main module.
if __name__=='__main__':
    app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/
