In [None]:
# HIDDEN
import warnings

# The two numpy "size changed" warnings below come from an interaction
# between numpy and Cython and are safe to ignore.
# Reference: https://stackoverflow.com/a/40846742
for _ignored_message in ("numpy.dtype size changed",
                         "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message=_ignored_message)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default styling, then select the 'talk' plotting context
# for every figure drawn in this notebook.
sns.set()
sns.set_context('talk')
# Keep printed arrays and displayed DataFrames compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified key: on newer pandas releases the bare 'precision'
# pattern also matches 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    Each slider position selects the top-left corner of a window of at most
    nrows x ncols cells. A dimension that already fits inside the window gets
    no slider (its offset is pinned to 0). After wiring up the widgets, the
    total shape of df is printed.

    Args:
        df: DataFrame to browse.
        nrows: number of rows shown at once; also the row slider's step.
        ncols: number of columns shown at once; also the column slider's step.
    '''
    total_rows, total_cols = df.shape

    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    # Slider stops are 0, step, 2 * step, ... up to the maximum. Capping the
    # range at the last valid position (length - 1) rather than the length
    # keeps the final stop from landing one past the end, which would show an
    # empty frame whenever the length is an exact multiple of the step.
    row_arg = (0, total_rows - 1, nrows) if total_rows > nrows else fixed(0)
    col_arg = (0, total_cols - 1, ncols) if total_cols > ncols else fixed(0)

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(total_rows, total_cols))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary limits on the number of rows and columns

    The pandas display options are overridden only while df is rendered and
    are restored afterwards. The default limits are the pandas settings in
    effect at the moment this function is defined.
    '''
    limits = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*limits):
        display(df)

## Entropy

In [None]:
def entropy(node):
    '''
    Returns the entropy (in bits) of a node given its per-class counts

    Args:
        node: array-like of non-negative counts, one entry per class,
            e.g. [n_class0, n_class1].

    Returns:
        float: sum over classes of p * log2(1 / p), where p is the class
        proportion. Classes with zero count contribute nothing, and a node
        with no points at all is given entropy 0.0.
    '''
    counts = np.asarray(node, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0
    # Drop empty classes first so log2 is never evaluated at zero.
    props = counts[counts > 0] / total
    return float(np.sum(props * np.log2(1 / props)))

In [None]:
def loss(split):
    '''
    Returns the weighted entropy (in bits) of the children in a split

    Args:
        split: 2-D array-like of non-negative counts with one row per child
            node and one column per class, e.g. [[1, 20], [40, 10]].

    Returns:
        float: each child's entropy weighted by the fraction of all points
        that fall in that child. Empty children are skipped, and a split
        with no points at all yields 0.0.
    '''
    children = np.asarray(split, dtype=float)
    sizes = children.sum(axis=1)
    total = sizes.sum()
    if total == 0:
        return 0.0

    weighted = 0.0
    for counts, size in zip(children, sizes):
        if size == 0:
            continue
        # Drop empty classes first so log2 is never evaluated at zero.
        props = counts[counts > 0] / size
        child_entropy = float(np.sum(props * np.log2(1 / props)))
        weighted += (size / total) * child_entropy
    return weighted

In [None]:
def p0(node):
    '''
    Returns the proportion of a node's points that belong to class 0

    Args:
        node: array-like of non-negative per-class counts whose first entry
            is the class-0 count, e.g. [n_class0, n_class1].

    Returns:
        float: node[0] divided by the total count, or NaN for a node with
        no points (the proportion is undefined there).
    '''
    counts = np.asarray(node, dtype=float)
    total = counts.sum()
    if total == 0:
        return float('nan')
    return float(counts[0] / total)

In [None]:
N = 100
# Every possible two-class node holding N points: row k is [k, N - k].
parents = np.array([[k, N - k] for k in range(N + 1)])
# Proportion of class 0 in each node.
ps = parents[:, 0] / N
# Entropy (in bits) of each node: sum of p * log2(1 / p) over both classes.
# Zero proportions are swapped for 1 inside the log (log2(1) == 0) so that
# empty classes contribute nothing instead of producing NaN.
_props = parents / N
entropys = np.sum(_props * np.log2(1 / np.where(_props > 0, _props, 1.0)),
                  axis=1)

In [None]:
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x=ps, y=entropys)
plt.xlabel(r'$p_{C=0}$')
plt.ylabel(r'Entropy')
plt.title('Entropy for all possible two-class nodes');

In [None]:
def plot_split_entropy(split):
    '''
    Plots a split's parent, children and split entropy on the entropy curve

    Rows 0 and 1 of split are treated as the two child nodes and their
    column-wise sum as the parent. Each node is placed at
    (p0(node), entropy(node)); the split itself is placed at
    (p0(parent), loss(split)) and joined to the parent by a line labelled
    'Info gain'.

    Relies on the notebook-level arrays ps and entropys for the background
    curve and on the notebook-level functions p0, entropy and loss.

    Args:
        split: 2-D numpy array with (at least) two rows, one per child.
    '''
    parent = split.sum(axis=0)
    # Evaluate the parent and split quantities once; they are reused below.
    parent_p, parent_S = p0(parent), entropy(parent)
    split_S = loss(split)
    child_p = [p0(split[0]), p0(split[1])]
    child_S = [entropy(split[0]), entropy(split[1])]

    plt.figure(figsize=(8, 5))
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally.
    sns.lineplot(x=ps, y=entropys)
    plt.scatter(parent_p, parent_S, s=150, label='Parent')

    plt.scatter(child_p, child_S, s=150, label='Children')
    plt.plot(child_p, child_S, linestyle='--')
    plt.scatter(parent_p, split_S, s=150, label='Split entropy')
    plt.plot([parent_p, parent_p], [split_S, parent_S],
             label='Info gain')

    plt.xlabel(r'$p_{C=0}$')
    plt.ylabel(r'Entropy')
    plt.legend()

In [None]:
# Example split handed to plot_split_entropy, which treats each row as one
# child node and the column-wise sum as the parent.
# NOTE(review): the two columns are presumably per-class counts (class 0,
# class 1) -- confirm against the p0/entropy definitions.
s = np.array([
    [1, 20], [40, 10],
])

plot_split_entropy(s)

## Decision Trees

In [None]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
# Target data_dict['target'] = 0 is malignant; 1 is benign, so flip it to
# make 1 mean malignant. A constant 'bias' column of ones is added as well.
cancer = (
    pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
    .assign(bias=1.0, malignant=1 - data_dict['target'])
)
cancer

In [None]:
# Browse the full cancer table through row/column sliders.
df_interact(cancer)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# same split come out on every run.
train, test = train_test_split(cancer, test_size=0.25, random_state=100)
print("Training Data Size: ", len(train))
print("Test Data Size: ", len(test))

In [None]:
plt.figure(figsize=(7, 5))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.scatterplot(x='mean radius', y='mean texture', hue='malignant',
                s=40, data=train);

In [None]:
def points_for_boundary(X, clf, resolution=100):
    '''
    Returns a grid of points over X's range along with clf's predictions

    The grid spans the min..max of X['mean radius'] (as 'xs') and
    X['mean texture'] (as 'ys') with `resolution` evenly spaced values per
    axis, i.e. resolution ** 2 rows in total.

    Args:
        X: DataFrame with 'mean radius' and 'mean texture' columns.
        clf: object with a predict method that accepts an (n, 2) array of
            (radius, texture) pairs.
        resolution: number of grid values along each axis.

    Returns:
        DataFrame with columns 'xs', 'ys' and 'pred'.
    '''
    radius, texture = X['mean radius'], X['mean texture']
    xs = np.linspace(radius.min(), radius.max(), resolution)
    ys = np.linspace(texture.min(), texture.max(), resolution)
    points = pd.DataFrame({
        'xs': np.tile(xs, len(ys)),
        'ys': np.repeat(ys, len(xs)),
    })
    # Predict on the raw values rather than the frame: the grid's column
    # names ('xs', 'ys') differ from the feature names, and scikit-learn
    # >= 1.2 raises when a classifier fit on named features is given
    # differently named columns.
    # NOTE(review): assumes clf was fit on (radius, texture) in that order.
    return points.assign(pred=clf.predict(points.to_numpy()))

In [None]:
def decision_boundary(pred):
    '''
    Draws a classifier's predictions over a grid, with and without the data

    Left panel: the grid points in pred (faded, coloured by 'pred') with the
    notebook-level train set drawn on top, coloured by 'malignant'.
    Right panel: the grid points alone.

    Args:
        pred: DataFrame with 'xs', 'ys' and 'pred' columns.
    '''
    plt.figure(figsize=(12, 6))

    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally.
    plt.subplot(121)
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, alpha=0.2, s=40,
                    legend=False)
    sns.scatterplot(x='mean radius', y='mean texture', hue='malignant',
                    s=60, legend=False, data=train)

    plt.subplot(122)
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, s=40, legend=False)

In [None]:
from sklearn.linear_model import LogisticRegression

# TODO: placeholder cell -- the bare Ellipsis below is a no-op, so no
# LogisticRegression model is created or fit here yet.
...

In [None]:
from sklearn.tree import DecisionTreeClassifier

# TODO: placeholder cell -- the bare Ellipsis below is a no-op, so no
# DecisionTreeClassifier model is created or fit here yet.
...