# Read Data

In [3]:
import pandas as pd

# Column names applied to both CSV files, in file order (passed via `names=`).
feature_names = [
    "Age", "Workclass", "Final Weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

# Training split.
df_train = pd.read_csv("../../data/adult.data", names=feature_names)

# Test split; the first line of this file is skipped before parsing.
df_test = pd.read_csv("../../data/adult.test", names=feature_names, skiprows=1)



# Data prep & problem definition

In [4]:
def df_prep(df):
    """Split a raw frame into features and a binary 0/1 income target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Income" column whose values are "<=50K" or
        ">50K", optionally surrounded by whitespace and optionally followed
        by a trailing "." (both spellings are handled).

    Returns
    -------
    (df_X, y) : tuple
        ``df_X`` is ``df`` without the "Income" column (``df`` itself is not
        modified); ``y`` is an int64 Series named "Income" with 0 for
        "<=50K" and 1 for ">50K".

    Raises
    ------
    ValueError
        If the "Income" column holds any value other than the two classes,
        instead of silently passing the raw value through into the target.
    """
    LABEL = "Income"
    label_to_int = {"<=50K": 0, ">50K": 1}

    # Normalise before mapping so the encoding does not depend on the exact
    # whitespace / trailing-dot spelling present in a particular file.
    normalised = df[LABEL].astype(str).str.strip().str.rstrip(".")
    y = normalised.map(label_to_int)

    unknown = y.isna()
    if unknown.any():
        raise ValueError(
            "Unexpected {} value(s): {}".format(
                LABEL, sorted(normalised[unknown].unique().tolist())
            )
        )

    # map() may yield float/object dtype; estimators expect integer classes.
    y = y.astype("int64")
    df_X = df.drop(LABEL, axis=1)
    return df_X, y

# Feature frame / encoded target for each split.
df_X_train, y_train = df_prep(df_train)
df_X_test, y_test = df_prep(df_test)

# Very basic feature/classifier pipeline

In [5]:
import numpy as np
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper
import sklearn.preprocessing as preprocessing

# One (columns, transformer) pair per numeric column: impute missing values.
nums = [
    ([col], preprocessing.Imputer())
    for col in df_X_train.select_dtypes([np.number]).columns
]

# One (columns, transformer) pair per object-dtype column: binarize labels.
cats = [
    ([col], preprocessing.LabelBinarizer())
    for col in df_X_train.select_dtypes(["object"]).columns
]

# Numeric features first, then categorical; df_out=True keeps a DataFrame.
feature_mapper = DataFrameMapper(nums + cats, df_out=True)
feature_mapper

DataFrameMapper(default=False, df_out=True,
        features=[(['Age'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Final Weight'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Education-Num'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['C...sparse_output=False)), (['Country'], LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False))],
        input_df=False, sparse=False)

In [6]:
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

# Search space; the "classifier__" prefix routes each entry to the pipeline
# step named "classifier".
param_dist = dict(
    classifier__n_estimators=[10, 50, 75, 100],
    classifier__max_depth=[1, 2, 4, 8, 10, 20, 40, 80],
    classifier__min_samples_leaf=sp_randint(1, 50),
    classifier__bootstrap=[True, False],
    classifier__n_jobs=[-1],
)

classifier = RandomForestClassifier(random_state=42)

# Featurize the raw frame, then classify.
pipeline = Pipeline(steps=[
    ('featurize', feature_mapper),
    ('classifier', classifier),
])

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Quick randomized search: only N_ITER_SEARCH candidates are sampled from
# param_dist (this is not an exhaustive grid search).
N_ITER_SEARCH = 5
CV = 5
# Seed for candidate sampling, so Restart & Run All selects the same
# candidates (and therefore the same best estimator) on every run.
RANDOM_STATE = 42

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                   n_jobs=1, n_iter=N_ITER_SEARCH,
                                   scoring="roc_auc",
                                   error_score=0, cv=CV, verbose=10,
                                   random_state=RANDOM_STATE)

random_search.fit(df_X_train, y_train)

# Best pipeline found by the search; displayed as the cell output.
clf = random_search.best_estimator_
clf

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1 
[CV]  classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.9090828109755972, total=   3.0s
[CV] classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV]  classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.9053719902706228, total=   3.4s
[CV] classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.9s remaining:    0.0s


[CV]  classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.9118816329626511, total=   3.0s
[CV] classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.7s remaining:    0.0s


[CV]  classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.9156352809631133, total=   3.0s
[CV] classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   15.6s remaining:    0.0s


[CV]  classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=20, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.9147266347376328, total=   3.4s
[CV] classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   19.9s remaining:    0.0s


[CV]  classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.8700398574937968, total=   1.9s
[CV] classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.6s remaining:    0.0s


[CV]  classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.8688209734743412, total=   1.9s
[CV] classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.4s remaining:    0.0s


[CV]  classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.8688887607530876, total=   1.9s
[CV] classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   28.1s remaining:    0.0s


[CV]  classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.8870207936026353, total=   1.8s
[CV] classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   30.7s remaining:    0.0s


[CV]  classifier__bootstrap=False, classifier__max_depth=2, classifier__min_samples_leaf=44, classifier__n_estimators=50, classifier__n_jobs=-1, score=0.8749658806180238, total=   1.9s
[CV] classifier__bootstrap=True, classifier__max_depth=80, classifier__min_samples_leaf=28, classifier__n_estimators=10, classifier__n_jobs=-1 
[CV]  classifier__bootstrap=True, classifier__max_depth=80, classifier__min_samples_leaf=28, classifier__n_estimators=10, classifier__n_jobs=-1, score=0.9045919138197397, total=   1.6s
[CV] classifier__bootstrap=True, classifier__max_depth=80, classifier__min_samples_leaf=28, classifier__n_estimators=10, classifier__n_jobs=-1 
[CV]  classifier__bootstrap=True, classifier__max_depth=80, classifier__min_samples_leaf=28, classifier__n_estimators=10, classifier__n_jobs=-1, score=0.9010042578924773, total=   1.6s
[CV] classifier__bootstrap=True, classifier__max_depth=80, classifier__min_samples_leaf=28, classifier__n_estimators=10, classifier__n_jobs=-1 
[CV]  classif

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.5min finished


Pipeline(memory=None,
     steps=[('featurize', DataFrameMapper(default=False, df_out=True,
        features=[(['Age'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Final Weight'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Education-Num'], Imputer(ax...timators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

# Prepare dataframe for Dash

In [8]:
# Add predicted probabilities (second predict_proba column)
y_predicted = clf.predict_proba(df_X_test)[:, 1]

# Work on a copy: assigning columns through an alias would also add them to
# df_test, so re-running earlier cells would see a different frame.
eval_df = df_test.copy()
eval_df["Scored_Prob"] = y_predicted

# these two will be hidden from UI, below
eval_df["y"] = y_test
eval_df["index"] = eval_df.index.values

# Preview only; avoid dumping the whole frame into the notebook output.
eval_df.head(10)

Unnamed: 0,Age,Workclass,Final Weight,Education,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income,Scored_Prob,y,index
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.,0.000897,0,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.,0.265733,0,1
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.,0.319508,1,2
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.,0.873189,1,3
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.,0.001344,0,4
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.,0.017028,0,5
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K.,0.006993,0,6
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.,0.703605,1,7
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.,0.005810,0,8
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.,0.087917,0,9


# Dash 

In [9]:
import os
# From awesome dash intro repo by Kevin Mader
# A quick intro to Dash made for the PyData event in Zurich
# https://github.com/4QuantOSS/DashIntro 

# Can use Jupyter nbserverproxy extension (available at /.../proxy/<port>)

def show_app(app, port=10001,
             width=700,
             height=350,
             offline=False,
             in_binder=None):
    """Display a link to a Dash app in the notebook, then start its server.

    Parameters
    ----------
    app : Dash application object to serve.
    port : TCP port the server listens on.
    width, height : kept for backward compatibility; currently unused because
        only an "open in new window" link (no embedded iframe) is rendered.
    offline : when True, configure the app to serve its CSS and scripts locally.
    in_binder : whether to build the URL from the JupyterHub proxy prefix.
        ``None`` auto-detects via the JUPYTERHUB_SERVICE_PREFIX environment
        variable.

    Returns
    -------
    Whatever ``app.run_server`` returns.
    """
    # Imported here so the helper does not depend on a later notebook cell
    # having already brought `display` into the global namespace.
    from IPython import display

    if in_binder is None:
        in_binder = 'JUPYTERHUB_SERVICE_PREFIX' in os.environ

    if in_binder:
        base_prefix = '{}proxy/{}/'.format(os.environ['JUPYTERHUB_SERVICE_PREFIX'], port)
        url = 'https://hub.mybinder.org{}'.format(base_prefix)
        app.config.requests_pathname_prefix = base_prefix
    else:
        url = 'http://localhost:%d' % port

    # Only the link is rendered; the iframe variant that used to be built
    # first was immediately overwritten and never displayed.
    link_html = '<a href="{url}" target="_new">Open in new window</a><hr>'.format(url=url)
    display.display_html(link_html, raw=True)

    if offline:
        app.css.config.serve_locally = True
        app.scripts.config.serve_locally = True

    return app.run_server(debug=False,  # needs to be false in Jupyter
                          host='0.0.0.0',
                          port=port)

In [10]:
import sklearn.metrics as metrics
import plotly.graph_objs as go

# Reference: https://github.com/plotly/dash-svm
def serve_roc_curve(df, y_score_col, y_test_col, group_by):
    """Build a Plotly figure with a global ROC curve, or one curve per group.

    Parameters
    ----------
    df : DataFrame holding the scores and true labels.
    y_score_col : name of the column with predicted scores.
    y_test_col : name of the column with the true labels.
    group_by : name of a column to split the curves by; when falsy a single
        global curve is drawn and its AUC is put in the title.

    Returns
    -------
    plotly.graph_objs.Figure

    Notes
    -----
    A subset (one group, or the whole frame) whose true labels have fewer than
    two distinct values is skipped, because ROC AUC is undefined there and
    scikit-learn would raise or warn. Such a subset contributes no trace.
    """

    def get_trace_data(subset, group_by_val):
        """Return (trace, auc) for ``subset``, or (None, None) if undefined."""
        y_test = subset[y_test_col]
        y_score = subset[y_score_col]

        # Empty subsets and single-class subsets have no defined ROC curve.
        if y_test.nunique() < 2:
            return None, None

        fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
        auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_score)

        trace = go.Scatter(
            x=fpr,
            y=tpr,
            mode='lines',
            name=f"{group_by_val} ({auc_score:.3f})",
        )
        return trace, auc_score

    data = []
    if group_by:
        for group_by_val in df[group_by].unique():
            group_df = df[df[group_by] == group_by_val]
            trace, _ = get_trace_data(group_df, group_by_val)
            if trace is not None:
                data.append(trace)
        title = 'ROC Curve'
    else:
        # Global ROC; reuse the AUC computed for the trace for the title.
        trace, auc_score = get_trace_data(df, "Global AUC")
        if trace is not None:
            data.append(trace)
            title = f'ROC Curve {auc_score:.3f} '
        else:
            title = 'ROC Curve (undefined: fewer than two classes)'

    layout = go.Layout(
        title=title,
        xaxis=dict(
            title='False Positive Rate'
        ),
        yaxis=dict(
            title='True Positive Rate'
        ),
        margin=dict(l=50, r=10, t=55, b=40),
    )

    figure = go.Figure(data=data, layout=layout)

    return figure



In [11]:
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import dash
from dash.dependencies import Input, Output, State,Event
import dash_core_components as dcc
import dash_html_components as html
import dash_table_experiments as dt
import json
import plotly
from IPython import display

app = dash.Dash()

# Columns shown in the table; "index" and "y" stay in the row data only.
visible_cols = eval_df.columns.drop(["index", "y"]).values
# Object-dtype columns offered as choices for splitting the ROC curve.
categorical_cols = eval_df.select_dtypes("object").columns.values

# Reference: https://github.com/plotly/dash-svm/
# .container class is fixed, .container.scalable is scalable
banner = html.Div(className="banner", children=[
    html.Div(className='container scalable', children=[
        html.H2(html.A(
            'Dash Classification Eval - DevScope AI Lab',
            href='https://github.com/DevScope/ai-lab',
            style={
                'text-decoration': 'none',
                'color': 'inherit'
            }
        )),

        html.A(
            html.Img(src="https://s3-us-west-1.amazonaws.com/plotly-tutorials/logo/new-branding/dash-logo-by-plotly-stripe-inverted.png"),
            href='https://plot.ly/products/dash/'
        )
    ]),
])

# Left half of the row: sortable / filterable table of scored rows.
score_table_panel = html.Div(
    [
        dt.DataTable(
            rows=eval_df.to_dict('records'),
            editable=False,
            sortable=True,
            columns=visible_cols,
            row_selectable=False,
            filterable=True,
            id='score_table'
        ),
    ],
    className="six columns")

# Right half of the row: group-by selector plus the callback's output area.
results_panel = html.Div(
    [
        dcc.Dropdown(
            id='group_by',
            options=[{'label': col_name, 'value': col_name} for col_name in categorical_cols],
            value=None,
            placeholder="Split ROC by column"
        ),
        html.Div(id="output")
    ],
    id="results",
    className="six columns")

app.layout = html.Div([
    banner,
    html.Div(id='body', className='container scalable', children=[
        html.Div([score_table_panel, results_panel], className="row")
    ])
])

@app.callback(
    Output('output', 'children'),
    [Input("score_table", "rows"), Input("group_by", "value")])
def update_feature_table(rows, group_by):
    """Redraw the ROC graph from the table's current rows and group-by choice.

    Returns an empty list when the table has no rows; otherwise a one-element
    list holding the ROC graph component.
    """
    if len(rows) == 0:
        return []

    # Rebuild a dataframe from the rows received from the table component.
    df_selected = pd.DataFrame(rows)

    roc_graph = dcc.Graph(
        id='graph-line-roc-curve',
        figure=serve_roc_curve(df_selected, "Scored_Prob", "y", group_by),
    )
    return [roc_graph]
    
    


# Reference: https://github.com/plotly/dash-svm/
external_css = [
    # CSS normalization
    "https://cdnjs.cloudflare.com/ajax/libs/normalize/7.0.0/normalize.min.css",
    # Fonts and icon font
    "https://fonts.googleapis.com/css?family=Open+Sans|Roboto",
    "https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css",
    # Base stylesheet; swap in your own base-styles.css (served via Rawgit)
    "https://rawgit.com/xhlulu/9a6e89f418ee40d02b637a429a876aa9/raw/f3ea10d53e33ece67eb681025cedc83870c9938d/base-styles.css",
    # Custom stylesheet; swap in your own custom-styles.css (served via Rawgit)
    "https://cdn.rawgit.com/plotly/dash-svm/bb031580/custom-styles.css",
]

# Register each stylesheet with the app, in list order.
for css_url in external_css:
    app.css.append_css({"external_url": css_url})
    

In [12]:
# Starts the Dash server via show_app (defined above); the captured output below shows it serving on port 10001.
# To quickly interrupt it and get control back to Jupyter, use <esc> then i+i.
show_app(app)


 * Running on http://0.0.0.0:10001/ (Press CTRL+C to quit)
127.0.0.1 - - [24/Aug/2018 23:27:01] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [24/Aug/2018 23:27:05] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [24/Aug/2018 23:27:05] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [24/Aug/2018 23:27:05] "GET /favicon.ico HTTP/1.1" 200 -
127.0.0.1 - - [24/Aug/2018 23:27:07] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [24/Aug/2018 23:27:13] "POST /_dash-update-component HTTP/1.1" 200 -
