In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# AI Lab tools/utils

In [None]:
import ailab as lab

# Read Data

In [None]:
import pandas as pd

# Column names for both files; they are supplied explicitly through `names=`
# in the two reads below.
feature_names = [
    "Age", "Workclass", "Final Weight", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Income",
]

# Training split.
df_train = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=feature_names,
)

# Test split; the first line of the file is skipped.
# NOTE(review): presumably that line is not a data record - confirm against the file.
df_test = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    names=feature_names,
    skiprows=1,
)



# Data prep & problem definition

In [None]:
# Number of rows df_prep samples from each frame it is given.
SAMPLE_ROWS=5000

def df_prep(df, sample_rows=None, random_state=42):
    """Draw a random sample of rows and split it into features and a binary label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains an "Income" column.
    sample_rows : int, optional
        Number of rows to draw. Defaults to the module-level ``SAMPLE_ROWS``.
        The value is capped at ``len(df)`` because rows are drawn without
        replacement and asking for more than exist would raise.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so that re-running the notebook
        selects the same rows. Pass ``None`` for a different sample per call.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The sampled rows without the "Income" column, and the "Income" column
        of the same rows with " <=50K" / " <=50K." replaced by 0 and
        " >50K" / " >50K." replaced by 1. Any other value is left as it is.
    """
    LABEL = "Income"
    if sample_rows is None:
        sample_rows = SAMPLE_ROWS

    # sample() already returns a new frame, so no defensive copy is needed.
    n_rows = min(sample_rows, len(df))
    sampled = df.sample(n_rows, random_state=random_state)

    # The labels carry a leading space, and some carry a trailing period.
    label_codes = {" <=50K": 0, " <=50K.": 0, " >50K": 1, " >50K.": 1}
    y = sampled[LABEL].replace(label_codes)
    df_X = sampled.drop(LABEL, axis=1)
    return df_X, y

# Sample each frame and split it into a feature frame and a label series.
df_X_train,y_train=df_prep(df_train)
df_X_test,y_test=df_prep(df_test)

# Very basic feature/classifier pipeline

In [None]:
import numpy as np
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper
import sklearn.preprocessing as preprocessing

# Numeric columns: one Imputer per column.
numeric_columns = df_X_train.select_dtypes([np.number]).columns
nums = [([name], preprocessing.Imputer()) for name in numeric_columns]

# Object-typed columns: one LabelBinarizer per column.
categorical_columns = df_X_train.select_dtypes(["object"]).columns
cats = [([name], preprocessing.LabelBinarizer()) for name in categorical_columns]

# Numeric steps first, then the categorical ones; df_out=True is requested
# from the mapper.
feature_mapper = DataFrameMapper(nums + cats, df_out=True)
feature_mapper

In [None]:
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

# Search space for the grid search. Every key is prefixed with "classifier__",
# the name of the classifier step in the pipeline built below.
param_dist = {
              # Note: n_estimators is probably not a true hyperparameter;
              # in general more is better (aside from performance / diminishing returns)
              "classifier__n_estimators": [10,20],
              # 'auto' was removed from this list: for RandomForestClassifier it
              # means the same as 'sqrt', so it only repeated identical fits,
              # and newer scikit-learn releases no longer accept it.
              "classifier__max_features": ['sqrt', 'log2'],
              "classifier__max_depth": [1,8],
              "classifier__min_samples_leaf": [1,8],
              "classifier__bootstrap": [True,False],
              "classifier__n_jobs":[-1],
              "classifier__criterion" :['gini', 'entropy']
             }

# Fixed seed so repeated fits of the same configuration give the same forest.
classifier = RandomForestClassifier(random_state=42)

# Feature mapping followed by the classifier.
pipeline=Pipeline([('featurize', feature_mapper),
                   ('classifier',classifier)])

In [None]:
from sklearn.model_selection import GridSearchCV

# Quick grid search
CV = 3

grid_search_options = dict(
    param_grid=param_dist,
    cv=CV,
    scoring="roc_auc",
    n_jobs=1,
    error_score=0,
    verbose=5,
    # requested explicitly: will not be the default as of sklearn 0.21
    return_train_score=True,
)

search_cv = GridSearchCV(pipeline, **grid_search_options)
search_cv.fit(df_X_train, y_train)

# Best pipeline found by the search; shown as the cell output.
clf = search_cv.best_estimator_
clf

# Prepare dataframe for Dash

In [None]:
# One row per candidate, ordered by rank_test_score.
ranked_results = pd.DataFrame(search_cv.cv_results_).sort_values(by='rank_test_score')

# Difference between the mean train score and the mean test score.
train_test_gap = ranked_results["mean_train_score"] - ranked_results["mean_test_score"]

# The "params" column is dropped: it doesn't work in Dash and is not needed for now.
cv_results_df = (
    ranked_results
    .assign(dif_test_train=train_test_gap)
    .drop("params", axis=1)
)

cv_results_df


# Dash 

In [None]:
import sklearn.metrics as metrics
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt

def serve_param_figure(df,col,metric_1,metric_2):
    """Plot two metrics against one search parameter on a shared x axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``col``, ``metric_1`` and ``metric_2`` columns.
    col : str
        Column used for the x axis.
    metric_1 : str
        Column drawn as a violin plot on the primary y axis.
    metric_2 : str
        Column drawn as a point plot on a twin y axis.

    Returns
    -------
    matplotlib.figure.Figure
        The 8x6 figure holding both plots.
    """
    # Explicit figure/axes handles instead of pyplot's "current figure" state,
    # so the plots always land on the figure that is returned.
    fig, ax1 = plt.subplots(figsize=(8, 6))
    sns.violinplot(x=col, y=metric_1, data=df, ax=ax1)

    ax2 = ax1.twinx()
    sns.pointplot(x=col, y=metric_2, data=df, ax=ax2)

    return fig

# Test: draw one figure directly, outside of Dash
df = cv_results_df
col = "param_classifier__max_depth"
metric_1, metric_2 = "mean_test_score", "mean_fit_time"

fig = serve_param_figure(cv_results_df, col, metric_1, metric_2)

fig

In [None]:
import sklearn.linear_model as linear_model
import statsmodels.api as sm
import numpy as np
from scipy import stats
from IPython.display import display as display

# NOTE(review): the original author flagged this as "needs review... use builtin
# instead". A packaged OLS summary (statsmodels is imported in this cell as `sm`)
# could presumably replace the hand-rolled statistics below - confirm before switching.
def serve_regression_stats(df,score_result):
    """Fit an ordinary least-squares model of a score on the search parameters.

    Every non-constant ``param_*`` column of ``df`` (other than the score
    itself) is used as a predictor. Non-numeric predictors are dummy-encoded
    with the first level dropped and missing values are replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results with at least one row.
    score_result : str
        Name of the column to explain.

    Returns
    -------
    pandas.DataFrame
        One row per fitted term, with the columns "Column" (term name,
        "intercept/default" for the intercept), "Coefficients" (4 decimals),
        "Standard Errors" and "Probabilites" (3 decimals; the spelling is kept
        because it is part of the returned table), "AbsCoef" and "p?"
        ('***' below .005, '*' below .05, '' otherwise). Rows are sorted by
        "p?" and then "AbsCoef", both descending. Standard errors and
        p-values are NaN when there are no residual degrees of freedom.
    """

    def drop_constant_columns(dataframe):
        """Keep only the columns with at least one value differing from the first row."""
        return dataframe.loc[:, (dataframe != dataframe.iloc[0]).any()]

    def coerce_numeric_text(dataframe):
        """Turn text-like columns into numbers when at least one value parses.

        Stands in for the removed ``DataFrame.convert_objects(convert_numeric=True)``:
        unparsable entries of a converted column become NaN, and columns in
        which nothing parses are left untouched.
        """
        converted = dataframe.copy()
        text_like = converted.select_dtypes(exclude=[np.number, "bool", "category"]).columns
        for name in text_like:
            as_number = pd.to_numeric(converted[name], errors="coerce")
            if as_number.notnull().any():
                converted[name] = as_number
        return converted

    # The response is read straight from df, so a constant score is not thrown
    # away together with the constant predictors.
    y = pd.to_numeric(df[score_result], errors="coerce").fillna(0).values.astype(float)

    param_cols = [name for name in df.columns
                  if name.startswith("param_") and name != score_result]
    predictors = coerce_numeric_text(drop_constant_columns(df[param_cols]))

    # Whatever is still non-numeric is treated as categorical.
    cat_cols = predictors.select_dtypes(exclude=[np.number]).columns
    if len(cat_cols) > 0:
        predictors = pd.get_dummies(predictors, columns=cat_cols, drop_first=True)
    predictors = predictors.fillna(0)

    # Design matrix assembled by position (intercept column first), so the
    # index labels of df cannot misalign the rows.
    n_obs = len(y)
    design = np.column_stack([np.ones(n_obs), predictors.values.astype(float)])
    n_coef = design.shape[1]

    # Least-squares estimate. The pseudo-inverse equals the ordinary solution
    # for a full-rank design and does not raise when columns are collinear.
    params = np.linalg.pinv(design).dot(y)
    residuals = y - design.dot(params)

    # https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression/46912457
    # The same residual degrees of freedom (n - p) are used for the error
    # variance and for the t distribution.
    dof = n_obs - n_coef
    mse = residuals.dot(residuals) / dof if dof > 0 else np.nan
    xtx_inv = np.linalg.pinv(design.T.dot(design))

    with np.errstate(divide="ignore", invalid="ignore"):
        sd_b = np.sqrt(np.maximum(mse * xtx_inv.diagonal(), 0))
        ts_b = params / sd_b

    if dof > 0:
        p_values = 2 * stats.t.sf(np.abs(ts_b), dof)
    else:
        p_values = np.full(n_coef, np.nan)

    column_order = ["Column", "Coefficients", "Standard Errors", "Probabilites"]
    results_df = pd.DataFrame(
        {
            "Column": ["intercept/default"] + list(predictors.columns),
            "Coefficients": np.round(params, 4),
            "Standard Errors": np.round(sd_b, 3),
            "Probabilites": np.round(p_values, 3),
        },
        columns=column_order,
    )

    results_df["AbsCoef"] = results_df["Coefficients"].abs()
    results_df["p?"] = np.where(results_df['Probabilites']<.005, '***',
                                np.where(results_df['Probabilites']<.05, '*', ''))

    return results_df.sort_values(["p?", "AbsCoef"], ascending=False)

# Test: run the regression summary on the CV results with the mean test score as the response.
serve_regression_stats(cv_results_df,"mean_test_score")


In [None]:
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import dash
from dash.dependencies import Input, Output, State,Event
import dash_core_components as dcc
import dash_html_components as html
import dash_table_experiments as dt
import json
import plotly
from IPython import display
from matplotlib import rcParams

# Dash application object; the layout, callbacks and stylesheets below are attached to it.
app = dash.Dash()

# Column names of the CV results frame; they are the choices offered by the two metric dropdowns.
metrics_list=cv_results_df.columns.values

def fig_to_uri(in_fig, close_all=True, **save_args):
    """Render a matplotlib figure to PNG and return it as a base64 data URI.

    in_fig is saved with a tight bounding box; any extra keyword arguments are
    forwarded to ``savefig``. When close_all is true the figure is cleared and
    every pyplot figure is closed afterwards.
    """
    rcParams.update({'figure.autolayout': False})

    png_buffer = BytesIO()
    # bbox_inches="tight" avoids cut-off annotations, see
    # https://stackoverflow.com/questions/29901422/matplotlib-with-annotation-cut-off-from-the-saved-figure/29901470
    in_fig.savefig(png_buffer, format='png', bbox_inches="tight", **save_args)

    if close_all:
        in_fig.clf()
        plt.close('all')

    png_bytes = png_buffer.getvalue()
    b64_text = base64.b64encode(png_bytes).decode("ascii").replace("\n", "")
    return "data:image/png;base64," + b64_text

def _results_table(table_id, rows, **extra_props):
    """Build a read-only, sortable, filterable DataTable with max_rows_in_viewport=4."""
    return dt.DataTable(
        id=table_id,
        rows=rows,
        editable=False,
        sortable=True,
        row_selectable=False,
        filterable=True,
        max_rows_in_viewport=4,
        **extra_props
    )


def _metric_dropdown(dropdown_id, default_metric, hint):
    """Build a "six columns" dropdown offering every entry of metrics_list."""
    return dcc.Dropdown(
        id=dropdown_id,
        options=[{'label': label, 'value': label} for label in metrics_list],
        value=default_metric,
        placeholder=hint,
        className="six columns"
    )


# Banner. Reference: https://github.com/plotly/dash-svm/
# .container class is fixed, .container.scalable is scalable
title_link = html.A(
    'Dash Search CV Eval - DevScope AI Lab',
    href='https://github.com/DevScope/ai-lab',
    style={
        'text-decoration': 'none',
        'color': 'inherit'
    }
)
dash_logo_link = html.A(
    html.Img(src="https://s3-us-west-1.amazonaws.com/plotly-tutorials/logo/new-branding/dash-logo-by-plotly-stripe-inverted.png"),
    href='https://plot.ly/products/dash/'
)
banner = html.Div(className="banner", children=[
    html.Div(className='container scalable', children=[
        html.H2(title_link),
        dash_logo_link
    ]),
])

# Left half: the CV results table and, below it, the regression table, which
# starts with a single placeholder row.
tables_column = html.Div(
    [
        _results_table('score_table',
                       cv_results_df.to_dict('records'),
                       columns=cv_results_df.columns.values),
        _results_table('regression_table', [{'No Rows': ''}]),
    ],
    className="six columns")

# Right half: the two metric selectors above the scrollable "output" container.
charts_column = html.Div(
    [
        html.Div(className="row", children=[
            _metric_dropdown('metric_1', "mean_test_score", "Main axis"),
            _metric_dropdown('metric_2', "mean_fit_time", "Secondary axis"),
        ]),
        html.Div(id="output", style={'height': '500px', 'overflow-y': 'scroll'})
    ],
    id="results", className="six columns")

app.layout = html.Div([
    banner,
    html.Div(id='body', className='container scalable', children=[
        html.Div([tables_column, charts_column], className="row")
    ])
])

@app.callback(
   Output('regression_table', 'rows'),
   [Input("score_table","rows"),Input("metric_1","value"),Input("metric_2","value")])
def update_regression_table(rows,metric_1,metric_2):
    """Recompute the regression table from the rows of the score table.

    rows are the records currently held by 'score_table'; metric_1 is the
    column used as the regression response. metric_2 is received because it
    is declared as an input, but it is not used here.

    Returns the regression summary as a list of records, or an empty list
    when there is nothing to regress.
    """
    # `not rows` also covers None, which len() would reject; an empty metric_1
    # (presumably sent when the dropdown is cleared - confirm) cannot be regressed on.
    if not rows or not metric_1:
        return []

    # Rebuild a dataframe from the received records.
    df_selected = pd.DataFrame(rows)

    df_regression = serve_regression_stats(df_selected, metric_1)

    return df_regression.to_dict('records')
    
@app.callback(
   Output('output', 'children'),
   [Input("score_table","rows"),Input("metric_1","value"),Input("metric_2","value")])
def update_output_table(rows,metric_1,metric_2):
    """Draw one figure per search parameter that varies in the received rows.

    rows are the records currently held by 'score_table'; metric_1 and
    metric_2 are the columns plotted on the main and secondary axes.

    Returns a list of html.Img components (one per "param_" column with more
    than one distinct value), or an empty list when there is nothing to plot.
    """
    # `not rows` also covers None, which len() would reject; both metrics are
    # needed to draw a figure (a cleared dropdown presumably sends None - confirm).
    if not rows or not metric_1 or not metric_2:
        return []

    # Rebuild a dataframe from the received records.
    df_selected = pd.DataFrame(rows)

    children = []
    for col in df_selected.columns:
        if not col.startswith("param_"):
            continue
        if len(df_selected[col].unique()) <= 1:
            # A parameter with a single value has nothing to compare.
            continue
        col_figure = serve_param_figure(df_selected, col, metric_1, metric_2)
        children.append(html.Img(src=fig_to_uri(col_figure)))

    return children

# Stylesheets, loaded in this order. Reference: https://github.com/plotly/dash-svm/
external_css = [
    # Normalize the CSS
    "https://cdnjs.cloudflare.com/ajax/libs/normalize/7.0.0/normalize.min.css",

    # Fonts
    "https://fonts.googleapis.com/css?family=Open+Sans|Roboto",
    "https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css",

    # Base Stylesheet, replace this with your own base-styles.css using Rawgit
    "https://rawgit.com/xhlulu/9a6e89f418ee40d02b637a429a876aa9/raw/f3ea10d53e33ece67eb681025cedc83870c9938d/base-styles.css",

    # Custom Stylesheet, replace this with your own custom-styles.css using Rawgit
    "https://cdn.rawgit.com/plotly/dash-svm/bb031580/custom-styles.css",
]

# Register every stylesheet with the app as an external URL.
for css in external_css:
    app.css.append_css(dict(external_url=css))
    

In [None]:
# Press <Esc> and then I twice in Jupyter to interrupt quickly and get control back to the notebook.
# NOTE(review): show_app comes from the ailab package; presumably it serves the
# Dash app on the given port until interrupted - confirm.
lab.show_app(app=app,port=10003)
