In [None]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Plot styling: seaborn defaults with the larger 'talk' font context.
sns.set()
sns.set_context('talk')
# Keep printed arrays and displayed frames compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified key: the bare 'precision' pattern is ambiguous (and
# raises OptionError) on pandas versions that also define
# 'styler.format.precision'.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    """
    Show a window onto df that can be moved with row/column sliders.

    df: DataFrame to browse.
    nrows, ncols: size of the visible window; also used as the slider step.

    A slider is only created along an axis that is longer than the window;
    otherwise that offset is pinned to 0. After building the widget, the
    total shape of df is printed.
    """
    total_rows = len(df)
    total_cols = len(df.columns)

    def peek(row=0, col=0):
        # Slice the window whose top-left corner is at (row, col).
        return df.iloc[row:row + nrows, col:col + ncols]

    if total_rows > nrows:
        row_arg = (0, total_rows, nrows)
    else:
        row_arg = fixed(0)

    if total_cols > ncols:
        col_arg = (0, total_cols, ncols)
    else:
        col_arg = fixed(0)

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    """
    Display df with temporary limits on the number of rows/columns shown.

    rows, cols: values applied to 'display.max_rows' and
        'display.max_columns' for this call only. The defaults are read from
        the pandas options once, when this function is defined.
    """
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

## Decision Boundaries

In [None]:
import sklearn.datasets
# Load scikit-learn's bundled breast cancer dataset into a labelled frame.
data_dict = sklearn.datasets.load_breast_cancer()
cancer = (
    pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
    # Append a constant 'bias' column, then the label as the last column.
    # Target data_dict['target'] = 0 is malignant; 1 is benign
    .assign(bias=1.0, malignant=1 - data_dict['target'])
)
cancer

In [None]:
# Browse the full table through the row/column slider widget.
df_interact(cancer)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(cancer, test_size=0.25, random_state=100)
print("Training Data Size: ", len(train))
print("Test Data Size: ", len(test))

In [None]:
# Scatter the two features used by the model below, colored by label.
plt.figure(figsize=(7, 5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y.
sns.scatterplot(x='mean radius', y='mean texture', hue='malignant',
                s=40, data=train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Two-feature model so the decision boundary can be drawn in the plane.
X_train = train[['mean radius', 'mean texture']]
y_train = train['malignant']

# SAGA is a stochastic solver, so random_state is pinned to make the fitted
# coefficients (and the boundary plotted below) repeatable across runs.
# NOTE(review): penalty='none' is the spelling for older scikit-learn;
# releases from 1.2 on expect penalty=None instead — adjust to the installed
# version.
clf = LogisticRegression(penalty='none', solver='saga', max_iter=10000,
                         random_state=42)
clf.fit(X_train, y_train)

In [None]:
def points_for_boundary(X, clf):
    """
    Evaluate clf on a 100 x 100 grid covering the range of the data in X.

    X: DataFrame with 'mean radius' and 'mean texture' columns; their min and
        max define the extent of the grid.
    clf: fitted classifier exposing predict(); it is expected to have been
        fit on those two columns, in that order.

    Returns a DataFrame with one row per grid point and columns 'xs'
    (mean radius), 'ys' (mean texture) and 'pred' (predicted label).
    """
    x_col, y_col = 'mean radius', 'mean texture'
    x_min, x_max = X[x_col].agg(['min', 'max'])
    y_min, y_max = X[y_col].agg(['min', 'max'])
    xs = np.linspace(x_min, x_max, 100)
    ys = np.linspace(y_min, y_max, 100)

    # Build the grid under the SAME column names the classifier was fit on.
    # Passing differently named columns ('xs', 'ys') makes recent
    # scikit-learn versions reject the input for mismatched feature names.
    grid = pd.DataFrame({
        x_col: np.tile(xs, len(ys)),
        y_col: np.repeat(ys, len(xs)),
    })
    labels = clf.predict(grid)

    # Rename to the generic plotting names that callers rely on.
    return (grid
            .rename(columns={x_col: 'xs', y_col: 'ys'})
            .assign(pred=labels))

In [None]:
# Grid of predictions from the logistic regression fit above.
pred = points_for_boundary(X_train, clf)
pred

In [None]:
def decision_boundary(pred):
    """
    Draw a classifier's predicted regions in two side-by-side panels.

    Left panel: the prediction grid (faded) with the training points from the
    global `train` frame drawn on top. Right panel: the prediction grid alone.

    pred: DataFrame with 'xs', 'ys' and 'pred' columns, as returned by
        points_for_boundary.
    """
    plt.figure(figsize=(12, 6))

    # x and y are passed by keyword throughout: seaborn >= 0.12 accepts only
    # `data` positionally and raises TypeError for positional x/y.
    plt.subplot(121)
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, alpha=0.2, s=40,
                    legend=False)
    sns.scatterplot(x='mean radius', y='mean texture', hue='malignant',
                    s=60, legend=False, data=train)

    plt.subplot(122)
    sns.scatterplot(x='xs', y='ys', hue='pred', data=pred, s=40, legend=False)

In [None]:
# Plot the logistic regression decision regions.
decision_boundary(pred)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [None]:
def poly_model(degree):
    """
    Placeholder: not implemented yet, so calling it returns None.

    NOTE(review): presumably meant to build a classifier on polynomial
    features of the given degree using the make_pipeline /
    PolynomialFeatures / StandardScaler imports from the previous cell —
    TODO confirm and fill in.
    """
    pass

Remember the k-nearest-neighbors classifier from Data 8?

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbors classifier on the same two features used for the
# logistic regression, so the two decision boundaries can be compared.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [None]:
# Predict over the same grid with the k-NN model and plot its regions.
pred_nn = points_for_boundary(X_train, knn)
decision_boundary(pred_nn)

Why won't gradient descent (GD) converge on linearly separable data?

In [None]:
# One-dimensional toy dataset that is perfectly separable at X = 0:
# every negative X has label 0 and every positive X has label 1.
toy = pd.DataFrame({
    'X': [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5],
    'Y': [ 0,  0,  0,  0,  0, 1, 1, 1, 1, 1]
})
plt.scatter(toy['X'], toy['Y']);

In [None]:
from scipy.special import expit as sigmoid

# Overlay the curve sigmoid(theta * x) for theta = 1 on the toy points.
theta = 1
xs = np.linspace(-5, 5, 100)
ys = sigmoid(theta * xs)

plt.scatter(toy['X'], toy['Y'])
plt.plot(xs, ys);

In [None]:
# Same plot with a ten times larger theta: the curve is much steeper around
# x = 0 and passes closer to the toy points.
theta = 10
xs = np.linspace(-5, 5, 100)
ys = sigmoid(theta * xs)

plt.scatter(toy['X'], toy['Y'])
plt.plot(xs, ys);

## Validation Curves

In [None]:
# Re-display the full table (features, 'bias', then the 'malignant' label).
cancer

In [None]:
# The label ('malignant') is the last column of the frame; everything before
# it, including the constant 'bias' column, is used as a feature.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

In [None]:
%%time

from sklearn.linear_model import LogisticRegressionCV

clf = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=5, solver='saga', max_iter=10000, n_jobs=4),
)
clf.fit(X, y);

In [None]:
def plot_score_curve(scores, column_names, x_axis, y_axis, **kwargs):
    """
    Draw a seaborn point plot summarizing a table of scores.

    scores: 2-D array-like; each column holds the scores for one setting.
    column_names: one label per column of scores (the x tick labels).
    x_axis, y_axis: names given to the setting and score columns, which also
        become the axis labels.
    **kwargs: forwarded to sns.pointplot.
    """
    wide = pd.DataFrame(scores, columns=column_names)
    # Reshape to long form: one (setting, score) pair per row.
    long_form = wide.melt(var_name=x_axis, value_name=y_axis)
    sns.pointplot(x=x_axis, y=y_axis, data=long_form, **kwargs)

In [None]:
# Pull the cross-validation scores stored under class label 1 out of the
# fitted pipeline step.
# NOTE(review): presumably one row per fold and one column per candidate C —
# confirm against the scikit-learn docs for LogisticRegressionCV.scores_.
scores = clf.named_steps['logisticregressioncv'].scores_[1]
# Label each candidate C with one significant digit for the x axis.
Cs = [f'{C:.1g}' for C in clf.named_steps['logisticregressioncv'].Cs_]

plt.figure(figsize=(10, 4))
plot_score_curve(scores, Cs, 'C', 'valid_acc')

## Learning Curves

In [None]:
from sklearn.datasets import fetch_california_housing
# Switch to a regression dataset for the learning-curve demo.
# Note: this rebinds X and y, which held the breast-cancer training data in
# the previous section.
data = fetch_california_housing()
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']
X

In [None]:
# Target against the 'MedInc' feature. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x=X['MedInc'], y=y)

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.linear_model import Ridge

# Fit a lightly regularized Ridge model at training-set sizes
# 1000, 2000, ..., 16000 using 5-fold cross-validation, scoring each fit by
# negated mean squared error.
train_sizes, train_scores, valid_scores = learning_curve(
    Ridge(alpha=0.0001), X, y, train_sizes=np.arange(1000, 17000, 1000),
    scoring='neg_mean_squared_error', cv=5)

In [None]:
plt.figure(figsize=(10, 4))
# The scores are negated MSE, so flip the sign before taking logs. The
# transpose lines the scores up with one column per training size, matching
# the column labels passed alongside them.
plot_score_curve(np.log(-train_scores.T), train_sizes / 1000,
                 'Thousands of samples', 'training_acc')
plot_score_curve(np.log(-valid_scores.T), train_sizes / 1000,
                 'Thousands of samples', 'valid_acc', color='gold')
# Both curves share one axes; replace the per-curve y label with what is
# actually plotted.
plt.ylabel('log(MSE)');

## Regularized Logistic Regression

As with linear regression, one common way of reducing the variance of the parameter estimator is to add a regularization term to the empirical risk objective. For example, adding an L2 penalty with regularization strength $C$ gives:

\begin{align*}
R(\beta, x, y, C) &= - \frac{1}{n}\sum_{i=1}^n \left[ y_i x_i^T\beta + \log \sigma(-x_i^T\beta) \right] + \frac{1}{2} C \sum_{j=1}^J \beta_j^2 \\[10pt]
\nabla_{\beta} R(\beta, x, y, C) &=  - \frac{1}{n}\sum_{i=1}^n \left(y_i - \sigma(x_i^T\beta)\right) x_i + C \beta \\[10pt]
\end{align*}

In [None]:
def regularized_logistic_regression(x, y, c):
    """
    Fit L2-regularized logistic regression by gradient descent.

    x: feature matrix; its first dimension sets the length of beta.
    y: labels, passed through to the gradient helpers.
    c: weight of the penalty gradient term (c * beta).

    Returns whatever gradient_descent produces when started from an all-zero
    beta with the penalized gradient.
    """
    def penalized_gradient(beta, x, y):
        # Unregularized risk gradient plus the gradient of the L2 penalty.
        return risk_gradient(beta, x, y) + c * beta

    start = np.zeros(x.shape[0])
    return gradient_descent(x, y, start, penalized_gradient)

def search_for_c(features):
    """
    Train and evaluate the regularized model over a grid of penalty weights.

    features: function mapping a data frame to the feature matrix handed to
        the trainer; it is applied to the global `train` frame here and also
        passed on to evaluate().

    For each c in 2**-10, 2**-8, ..., 2**8 this prints c, the squared norm of
    the fitted beta, then calls evaluate(), then prints a blank line.
    """
    candidates = np.power(2.0, np.arange(-10, 10, 2))
    for c in candidates:
        print("c =", c)
        design = features(train)
        beta = regularized_logistic_regression(design, y_train, c)
        squared_norm = sum(beta**2)
        print("sum(beta**2) = ", squared_norm)
        evaluate(beta, features)
        print()
        
# Run the penalty search on the unscaled features.
# NOTE(review): all_features (like risk_gradient, gradient_descent and
# evaluate used by the functions above) is not defined earlier in this
# notebook — confirm where these helpers are supposed to come from.
search_for_c(all_features)

In [None]:
from sklearn import preprocessing

def inputs(t):
    """Return every column of t except 'malignant' as a NumPy array."""
    feature_frame = t.drop(columns='malignant')
    return feature_frame.values

# Learn the scaling statistics from the training inputs only; the same fitted
# scaler is then applied to any frame passed to scaled_features.
scaler = preprocessing.StandardScaler().fit(inputs(train))

def scaled_features(t):
    """
    Standardize the inputs of t with the fitted global `scaler`.

    The result is transposed, so it has one row per feature and one column
    per row of t.
    """
    standardized = scaler.transform(inputs(t))
    return standardized.T

# Repeat the penalty search, this time on standardized features.
search_for_c(scaled_features)

In [None]:
# Fit scikit-learn's logistic regression (C=4) on the standardized features.
# scaled_features returns one row per feature, so transpose back to one row
# per sample before handing the matrix to scikit-learn.
model = LogisticRegression(C=4, solver='lbfgs')
model.fit(scaled_features(train).T, y_train)
y_hat = model.predict(scaled_features(test).T)
# y_test is never assigned earlier in the notebook; build it the same way
# y_train was built so this cell does not fail with a NameError.
y_test = test['malignant']
# NOTE(review): print_ratio is not defined earlier in this notebook — confirm
# where it is supposed to come from. It is called with the number of correct
# test predictions and the test-set size.
print_ratio(sum(y_hat == y_test), len(y_test))

### Multiclass classification

\begin{align*}
P(Y=y|X) &= \frac{\exp(X^T\beta_{y})}{\sum_{z=0}^d \exp(X^T\beta_z)} \\[10pt]
L(\beta_0,\dots,\beta_d, x_i, y_i) &= - \log \frac{\exp(x_i^T\beta_{y_i})}{\sum_{z=0}^d \exp(x_i^T\beta_z)} \\[10pt]
\frac{\partial}{\partial \beta_w} L(\beta_0,\dots,\beta_d, x_i, y_i) &= -\left(1[w=y_i] - \frac{\exp(x_i^T\beta_w)}{\sum_{z=0}^d \exp(x_i^T\beta_z)}\right) x_i  \\[10pt]
1[w=y_i] &= \begin{cases}
1 & \text{if}\ w=y_i \\
0 & \text{otherwise}
\end{cases}
\end{align*}
