In [34]:
%%writefile SCF.py
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Page setup and data preparation.
# Everything below runs at import time and produces the module-level tables
# that the tabs further down read.
# ---------------------------------------------------------------------------
st.set_page_config(
    layout='wide',
    page_title='Survey Of Consumer Finance Dashboards',
)

# NOTE(review): absolute, machine-specific path — the app only starts on a
# machine that has the survey extract at exactly this location.
df = pd.read_csv("C:/SCFP2019.csv")

# Subset used by most charts: rows with TURNFEAR == 1 and NETWORTH < 2,000,000.
mask = (df['TURNFEAR'] == 1) & (df['NETWORTH'] < 2000000)
df_fear = df[mask]


def _frequency_by_fear(column, labels):
    """Return relative frequencies of ``df[column]`` within each TURNFEAR group.

    ``labels`` maps the raw codes of ``column`` to display strings.  The
    result is a DataFrame holding the TURNFEAR group, the labelled category
    and a ``frequency`` column that is normalised within each group.
    """
    return (
        df[column]
        .replace(labels)
        .groupby(df['TURNFEAR'])
        .value_counts(normalize=True)
        .rename('frequency')
        .to_frame()
        .reset_index()
    )


# --- Age -------------------------------------------------------------------
agecl_dict = {
    1: "Under 35",
    2: "35-44",
    3: "45-54",
    4: "55-64",
    5: "65-74",
    6: "75 or Older",
}
age_cl = df_fear['AGECL'].replace(agecl_dict)
age_cl_value_counts = age_cl.value_counts()

# --- Race ------------------------------------------------------------------
race_dict = {
    1: "White/Non-Hispanic",
    2: "Black/African-American",
    3: "Hispanic",
    5: "Other",
}
# Shares (normalize=True), first for the filtered subset, then for all rows.
race = df_fear["RACE"].replace(race_dict)
race_value_counts = race.value_counts(normalize=True)

race_df = df["RACE"].replace(race_dict)
race_df_value_counts = race_df.value_counts(normalize=True)

# --- Income ----------------------------------------------------------------
inccat_dict = {
    1: "0-20",
    2: "21-39.9",
    3: "40-59.9",
    4: "60-79.9",
    5: "80-89.9",
    6: "90-100",
}
df_inccat = _frequency_by_fear('INCCAT', inccat_dict)

# --- Education -------------------------------------------------------------
educ_dict = {
    -1: "less than 1st grade",
    1: "1st , 2nd, 3rd, 4th grade",
    2: "5th, 6th grade",
    3: "7th, 8th grade",
    4: "9th grade",
    5: "10th grade",
    6: "11th grade",
    7: "12th grade",
    8: "High school Graduate",
    9: "Some College but no degree",
    10: "ASSOCIATE DEGREE IN COLLEGE - OCCUPATION",
    11: "ASSOCIATE DEGREE IN COLLEGE - ACADEMIC",
    12: "BACHELOR'S DEGREE",
    13: "MASTER'S DEGREE",  # was misspelled "DEFREE" in the chart labels
    14: "DOCTORATE DEGREE",
}
df_educ = _frequency_by_fear('EDUC', educ_dict)

# --- Variance rankings -----------------------------------------------------
# Ten columns of the filtered subset with the largest plain variance and the
# ten with the largest trimmed variance; sorted ascending, so the largest
# value is last.
top_ten_var = df_fear.var().sort_values().tail(10)
top_ten_trim_var = df_fear.apply(trimmed_var).sort_values().tail(10)


# Create the four dashboard tabs; the later sections of the script fill them.
tab1, tab2, tab3, tab4 = st.tabs(
    ['EDA', 'Exploring Features', 'Communicating findings', 'Model Prediction']
)

# EDA tab: demographic, income, asset and education charts.
with tab1:
    col1, col2 = st.columns(2)
    with col1:
        st.header('AGE')

        fig = px.bar(age_cl_value_counts, title="Credit Fearful: Age Groups")
        fig.update_layout(xaxis_title="Age Group", yaxis_title="Count")
        st.plotly_chart(fig, use_container_width=True)

        fig = px.histogram(
            df_fear['AGE'], title="Credit Fearful: Age Distribution", nbins=20
        )
        fig.update_layout(xaxis_title="Age")
        st.plotly_chart(fig, use_container_width=True)

        st.header('Race')

        # Both race series were built with value_counts(normalize=True), so
        # the y-axis shows shares, not counts.
        fig = px.bar(race_value_counts, title="Credit Fearful: Racial Groups")
        fig.update_layout(xaxis_title="Race Group", yaxis_title="Frequency")
        st.plotly_chart(fig, use_container_width=True)

        fig = px.bar(race_df_value_counts, title="SCF Respondents: Racial Groups")
        fig.update_layout(xaxis_title="Race Group", yaxis_title="Frequency")
        st.plotly_chart(fig, use_container_width=True)

        st.header('INCOME')

        # TURNFEAR is numeric; Plotly Express treats a numeric `color` as a
        # continuous scale (one trace), which defeats barmode='group'.
        # Casting to str makes it a discrete category with one trace per group.
        income_plot_df = df_inccat.assign(
            TURNFEAR=df_inccat["TURNFEAR"].astype(str)
        )
        fig = px.bar(
            income_plot_df,
            x="INCCAT",
            y='frequency',
            color="TURNFEAR",
            barmode='group',
            # Keep the brackets in their natural (dictionary) order.
            category_orders={"INCCAT": list(inccat_dict.values())},
            title='Income Distribution: Credit Fearful vs. Non-fearful',
        )
        fig.update_layout(xaxis_title="Income Category", yaxis_title="Frequency")
        st.plotly_chart(fig, use_container_width=True)

        st.header('Asset')

        cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
        df_corr = df[cols].corr()
        fig = px.imshow(df_corr, title='Correlation matrix using whole dataset')
        st.plotly_chart(fig, use_container_width=True)

        fear_corr = df_fear[cols].corr()
        fig = px.imshow(
            fear_corr, title='Correlation matrix using only turnfear dataset'
        )
        st.plotly_chart(fig, use_container_width=True)

        st.header('Education')

        # Same discrete-colour treatment as the income chart, grouped so the
        # two TURNFEAR groups sit side by side for every education level.
        educ_plot_df = df_educ.assign(TURNFEAR=df_educ["TURNFEAR"].astype(str))
        fig = px.bar(
            educ_plot_df,
            x="EDUC",
            y='frequency',
            color='TURNFEAR',
            barmode='group',
            category_orders={"EDUC": list(educ_dict.values())},
            title="Educational Attainment: Credit Fearful vs. Non-fearful",
        )
        fig.update_layout(xaxis_title="Education Level", yaxis_title="Frequency")
        st.plotly_chart(fig, use_container_width=True)

# "Exploring Features" tab: variance rankings and an outlier box plot.
with tab2:
    col1, col2 = st.columns(2)
    with col1:
        # Axis titles shared by the two variance bar charts.
        variance_axes = dict(xaxis_title="variance", yaxis_title="features")

        # (section heading, figure) pairs, listed in display order.
        sections = [
            (
                'High Variance Features',
                px.bar(
                    x=top_ten_var,
                    y=top_ten_var.index,
                    title="SCF: High Variance Features",
                ).update_layout(**variance_axes),
            ),
            (
                'Adressing Outliers',
                px.box(
                    data_frame=df_fear,
                    x="NHNFIN",
                    title="Distribution of Non-home, Non-Financial Assets (NHNFIN)",
                ).update_layout(xaxis_title="Value $"),
            ),
            (
                'Trimmed High Variance Features',
                px.bar(
                    x=top_ten_trim_var,
                    y=top_ten_trim_var.index,
                    title="SCF: High Variance Features",
                ).update_layout(**variance_axes),
            ),
        ]

        # Render each section: heading first, then its chart.
        for heading, fig in sections:
            st.header(heading)
            st.plotly_chart(fig, use_container_width=True)

        
# "Communicating findings" tab: K-Means model selection, cluster summary and
# a 2-D PCA view of the clusters.
with tab3:
    col1, col2 = st.columns(2)
    with col1:
        # Features: the five columns with the largest trimmed variance.
        high_var_cols = top_ten_trim_var.tail(5).index.to_list()
        X = df_fear[high_var_cols]

        # Standardised copy of X.  Every pipeline below scales X the same way
        # before clustering, so this is the space the cluster labels live in.
        ss = StandardScaler()
        X_scaled_data = ss.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled_data, columns=X.columns)

        n_clusters = range(2, 13)
        inertia_errors = []
        silhouette_scores = []
        # Train one model per k and record its inertia and silhouette score.
        for k in n_clusters:
            model = make_pipeline(
                StandardScaler(), KMeans(n_clusters=k, random_state=42)
            )
            model.fit(X)
            kmeans = model.named_steps["kmeans"]
            inertia_errors.append(kmeans.inertia_)
            # Score in the standardised space the labels were derived from;
            # scoring against raw X would measure distances in a different
            # geometry than the one K-Means optimised.
            silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

        st.header('Evaluating Model (Inertia)')
        fig = px.line(
            x=list(n_clusters),
            y=inertia_errors,
            title="K-Means Model: Inertia vs Number of Clusters",
        )
        fig.update_layout(xaxis_title="Number of Clusters", yaxis_title="Inertia")
        st.plotly_chart(fig, use_container_width=True)

        st.header('Evaluating Model (Silhouette Score)')
        fig = px.line(
            x=list(n_clusters),
            y=silhouette_scores,
            title="K-Means Model: Silhouette Score vs Number of Clusters",
        )
        fig.update_layout(
            xaxis_title="Number of Clusters", yaxis_title="Silhouette Score"
        )
        st.plotly_chart(fig, use_container_width=True)

        st.header('Clusters Summary')

        # Final model (k=4); also used by the prediction tab.
        final_model = make_pipeline(
            StandardScaler(), KMeans(n_clusters=4, random_state=42)
        )
        final_model.fit(X)
        labels = final_model.named_steps['kmeans'].labels_

        # Mean of every (unscaled) feature per cluster.
        cluster_means = X.groupby(labels).mean()
        fig = px.bar(
            cluster_means,
            barmode='group',
            title="Mean Household Finances by Cluster",
        )
        fig.update_layout(xaxis_title="Cluster", yaxis_title="Value [$]")
        st.plotly_chart(fig, use_container_width=True)

        st.header('2D Representation of Clusters (PCA)')
        # NOTE(review): PCA is fitted on the raw (unscaled) features while the
        # clusters were found on standardised data — confirm this is intended.
        pca = PCA(n_components=2, random_state=42)
        X_t = pca.fit_transform(X)
        X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"])
        fig = px.scatter(
            data_frame=X_pca,
            x="PC1",
            y="PC2",
            color=labels.astype('str'),
            title="PCA Representation of Clusters",
        )
        fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
        st.plotly_chart(fig, use_container_width=True)



def predict_segment(NETWORTH, DEBT, HOUSES, NFIN, ASSET):
    """Return the cluster label predicted by ``final_model`` for one household.

    Each argument may be a number or a numeric string (the form delivers
    strings); it is converted with ``float``.

    Returns:
        The predicted cluster index as a string, e.g. ``"3"``.

    Raises:
        ValueError: if an argument is not numeric, or if the fitted model
            expects a feature this function does not collect.
    """
    features = pd.DataFrame(
        {
            'NETWORTH': [float(NETWORTH)],
            'DEBT': [float(DEBT)],
            'HOUSES': [float(HOUSES)],
            'NFIN': [float(NFIN)],
            'ASSET': [float(ASSET)],
        }
    )

    # The pipeline was fitted on df_fear[high_var_cols]; scikit-learn checks
    # feature names (and their order) at predict time, so present the columns
    # exactly as they were during fit.
    missing = [name for name in high_var_cols if name not in features.columns]
    if missing:
        raise ValueError(
            'model expects features that are not collected: {}'.format(missing)
        )
    features = features[list(high_var_cols)]

    prediction = final_model.predict(features)
    # Return the index itself rather than looking it up in a fixed-size list,
    # so every cluster of the model has a valid label.
    return str(prediction[0])


def main():
    """Render the 'Model Prediction' tab: title, banner, inputs and result."""
    with tab4:
        col1, _ = st.columns(2)
        with col1:
            # Drawn inside the tab container so the title sits above the form.
            st.title('Customer segmentation')
            html_temp = (
                '<div style="background-color:red">'
                '<h2 style="color:white;text-align:center;">'
                ' Consumer finance segmentation </h2>'
                '</div>'
            )
            st.markdown(html_temp, unsafe_allow_html=True)

            NETWORTH = st.text_input("NETWORTH", "Type Here")
            DEBT = st.text_input("DEBT", "Type Here")
            HOUSES = st.text_input("HOUSES", "Type Here")
            NFIN = st.text_input("NFIN", "Type Here")
            ASSET = st.text_input("ASSET", "Type Here")

            if st.button('predict'):
                try:
                    result = predict_segment(NETWORTH, DEBT, HOUSES, NFIN, ASSET)
                except ValueError as err:
                    # Non-numeric input (including the untouched "Type Here"
                    # default) ends up here instead of crashing the app.
                    st.error('Could not predict a cluster: {}'.format(err))
                else:
                    st.success(
                        'this person belongs to cluster name {}'.format(result)
                    )


if __name__ == "__main__":
    main()

Overwriting SCF.py


In [None]:
! streamlit run SCF.py