<a href="https://colab.research.google.com/github/CptK/Deep_and_Machine_Learning_Projects/blob/master/CS441_Class_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Wisconsin Breast Cancer Dataset Example (CS 441)

In [1]:
# initialization code
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# Feature matrix and label vector.
X, y = data.data, data.target
# Names of the feature columns and of the label values.
xnames, ynames = data.feature_names, data.target_names

# Sanity check on the array shapes.
print(X.shape, y.shape, sep='\n')

(569, 30)
(569,)


In [2]:
# What does my data mean?
# Print the feature/label vocabularies, then a few raw rows with their labels.
print('Feature names: {}'.format(xnames))
print('Label names: {}'.format(ynames))

for i in (0, 100, 300, 500):
  print('Example {}'.format(i),
        'Features {}'.format(X[i]),
        'Prediction {}'.format(y[i]),  # this is the ground-truth label of row i
        sep='\n')

Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Label names: ['malignant' 'benign']
Example 0
Features [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
Prediction 0
Example 100
Features [1.361e+01 2.498e+01 8.805e+01 5.82

In [3]:
# How common is each class?
# The fraction of labels equal to 0 is the empirical prior for class 0;
# with two classes the other prior is its complement.
Py0 = np.mean(y == 0)
Py1 = 1 - Py0
print('Class prior: P(y=0)={:.2f}, P(y=1)={:.2f}'.format(Py0, Py1))

Class prior: P(y=0)=0.37, P(y=1)=0.63


In [4]:
# Create a training and validation set
N_tr = 469  # number of training examples; the remainder is held out for validation

# Fixed seed so the shuffled split is the same on every run.
np.random.seed(seed=0)
ind = np.random.permutation(len(y))

# First N_tr shuffled indices -> training, the rest -> validation.
X_tr, y_tr = X[ind[:N_tr]], y[ind[:N_tr]]
X_val, y_val = X[ind[N_tr:]], y[ind[N_tr:]]

print(X_tr.shape, X_val.shape, sep='\n')

(469, 30)
(100, 30)


In [5]:
# 1-Nearest neighbor classifier/regressor
def nn(X_tr, y_tr, X_test):
  """Predict each test row's target by copying that of its closest training row.

  Closeness is squared Euclidean distance over all feature columns.

  Args:
    X_tr: training features, one row per example.
    y_tr: training targets, aligned with the rows of X_tr.
    X_test: query features, one row per example.

  Returns:
    Float array of length len(X_test) holding the predicted targets.
  """
  y_test = np.zeros(len(X_test))
  for row, query in enumerate(X_test):
    diff = X_tr - query                  # broadcast the query against every training row
    sq_dist = (diff * diff).sum(axis=1)  # squared distance to each training row
    # argmin keeps only the single closest row (argsort would be needed for k > 1)
    y_test[row] = y_tr[sq_dist.argmin()]
  return y_test

# Validation error of 1-NN on the raw (unnormalized) features:
# one minus the fraction of predictions that match the labels.
y_pred = nn(X_tr, y_tr, X_val)
err = 1 - (y_val == y_pred).mean()
print('Err: {:0.4f}'.format(err))

Err: 0.0800


In [None]:
# Let's understand the data a little better
# One line per feature: overall mean and std on the training set,
# followed by the mean within class 0 and within class 1.
for m in range(X_tr.shape[1]):
  # Pad after the feature name so the statistics line up in columns.
  space = '                       '[len(xnames[m]):]
  print('{}:{}\tmu={:.2f} std={:0.2f}   mu_0={:.2f}\tmu_1={:.2f}'.format(
      xnames[m], space,
      X_tr[:, m].mean(), X_tr[:, m].std(),
      X_tr[y_tr == 0, m].mean(), X_tr[y_tr == 1, m].mean()))

In [None]:
# normalize by mean and std
# Per-feature statistics are estimated on the training rows only.
X_mu = X_tr.mean(axis=0)
X_std = X_tr.std(axis=0)

X_tr_n = (X_tr - X_mu) / X_std
# note: the validation rows reuse the training mean and std
X_val_n = (X_val - X_mu) / X_std

In [None]:
# Repeat nearest neighbor on normalized values
# After normalization every feature has comparable scale in the distance sum.
y_pred = nn(X_tr_n, y_tr, X_val_n)
err = 1 - (y_val == y_pred).mean()
print('Err: {:0.4f}'.format(err))

In [None]:
# Naive Bayes where P(x_i|y) ~ N(mu[i,y], sig[i,y]^2)
def nb_gauss_train(X, y, eps):
  """Estimate per-feature, per-class Gaussian parameters for two-class naive Bayes.

  Args:
    X: 2-D feature array, one row per example.
    y: labels aligned with the rows of X; only the values 0 and 1 are used.
    eps: constant added to every standard deviation so it is never exactly zero.

  Returns:
    (mu, sig, py0) where mu[i, c] and sig[i, c] are the mean and (eps-padded)
    standard deviation of feature i among examples of class c, and py0 is the
    fraction of examples with label 0.
  """
  n_feat = X.shape[1]
  mu = np.zeros((n_feat, 2))
  sig = np.zeros((n_feat, 2))
  for c in (0, 1):
    in_class = (y == c)
    for i in range(n_feat):
      vals = X[in_class, i]
      mu[i, c] = vals.mean()
      sig[i, c] = vals.std() + eps
  py0 = np.mean(y == 0)
  return mu, sig, py0

def nb_gauss_predict(X, mu, sig, py0):
  """Classify rows of X with a two-class Gaussian naive Bayes model.

  Each class score is log P(y) + sum_i log N(x_i; mu[i, y], sig[i, y]^2).

  Args:
    X: 2-D feature array, one row per example.
    mu: array of shape (n_features, 2) with per-feature, per-class means.
    sig: array of shape (n_features, 2) with per-feature, per-class standard
      deviations (must be strictly positive).
    py0: prior probability of class 0; class 1 gets 1 - py0.

  Returns:
    Integer array of length len(X) with the predicted class (0 or 1) per row.
  """
  X = np.asarray(X, dtype=float)
  mu = np.asarray(mu, dtype=float)
  sig = np.asarray(sig, dtype=float)

  # Standardized residual of every sample w.r.t. each class:
  # shape (n_samples, n_features, 2).
  z = (X[:, :, None] - mu[None, :, :]) / sig[None, :, :]

  # log N(x; mu, sig^2) = -0.5*z^2 - log(sig) - 0.5*log(2*pi).
  # The 0.5 factor and the -log(sig) term both matter: without them classes
  # with different variances are scored incorrectly and the prior is
  # under-weighted. The 2*pi constant is identical for both classes, so it is
  # dropped because it cannot change the argmax.
  log_pxy = np.sum(-0.5 * z**2 - np.log(sig)[None, :, :], axis=1)

  log_pxy[:, 0] += np.log(py0)
  log_pxy[:, 1] += np.log(1 - py0)
  pred_y = np.argmax(log_pxy, axis=1)
  return pred_y

# Gaussian naive Bayes on the raw features; eps = 1/N is passed as the
# padding added to each standard deviation.
mu, sig, py0 = nb_gauss_train(X_tr, y_tr, 1/len(X_tr))
y_pred = nb_gauss_predict(X_val, mu, sig, py0)
err = 1 - (y_val == y_pred).mean()
print('Err: {:0.4f}'.format(err))

# Same model, fitted and evaluated on the normalized features.
mu, sig, py0 = nb_gauss_train(X_tr_n, y_tr, 1/len(X_tr))
y_pred = nb_gauss_predict(X_val_n, mu, sig, py0)
err_n = 1 - (y_val == y_pred).mean()
print('Err: {:0.4f} (X is normalized)'.format(err_n))

# Why doesn't normalization make any difference here?

In [None]:
# Let's try logistic regression
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the raw features.
model = LogisticRegression(C=1, max_iter=10000)
model.fit(X_tr, y_tr)
y_pred = model.predict(X_val)
print('LR Err: {:0.4f} (not normalized)'.format(1 - (y_val == y_pred).mean()))

# Same settings, fitted on the normalized features.
# `model` is left pointing at this second fit.
model = LogisticRegression(C=1, max_iter=10000)
model.fit(X_tr_n, y_tr)
y_pred = model.predict(X_val_n)
print('LR Err: {:0.4f} (normalized)'.format(1 - (y_val == y_pred).mean()))

In [None]:
# ROC curve
import sklearn.metrics
# Per-class probabilities for the validation rows, from the current `model`.
y_conf = model.predict_proba(X_val_n)

# Label 0 is treated as the positive class and is scored with the first
# probability column.
fpr, tpr, thresh = sklearn.metrics.roc_curve(y_val == 0, y_conf[:, 0])

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')



In [None]:
# let's look closer at feature importance
# Fit L2- and L1-regularized logistic regression on the normalized features
# and compare their per-feature weights side by side.
model_l2 = LogisticRegression(C=1,max_iter=10000).fit(X_tr_n, y_tr)
print(model_l2.coef_)
model_l1 = LogisticRegression(C=1,max_iter=10000,penalty="l1",solver="liblinear").fit(X_tr_n, y_tr)
# Evaluate model_l1 here, not the `model` variable left over from an earlier
# cell, so the reported "LR1" error really belongs to the L1 model.
y_pred = model_l1.predict(X_val_n)
print('LR1 Err: {:0.4f} (normalized)'.format(1-np.mean(y_val==y_pred)))
for i in range(X.shape[1]):
  # Pad after the feature name so the weight columns line up.
  space = '                          '
  space = space[len(xnames[i]):]
  print('{}:{}w_2={:.2f}  w_1={:.2f}'.format(xnames[i], space, model_l2.coef_[0,i], model_l1.coef_[0,i]))


In [None]:
# tree display
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# plot_tree is given a plain list of feature names: some scikit-learn releases
# validate `feature_names` as a list and reject a NumPy array.
tree_feature_names = list(xnames)

# Unrestricted tree: grown with the default settings (no depth limit given).
model = DecisionTreeClassifier(random_state=0)
model.fit(X_tr, y_tr)
y_pred = model.predict(X_val)
print('Full tree Err: {:0.4f} '.format(1-np.mean(y_val==y_pred)))
plt.figure(figsize=(20,20))
tree.plot_tree(model, feature_names=tree_feature_names)
plt.show()

# Depth-2 tree: far fewer splits, so the plotted tree is easy to read.
model = DecisionTreeClassifier(random_state=0, max_depth=2)
model.fit(X_tr, y_tr)
y_pred = model.predict(X_val)
print('Short tree Err: {:0.4f} '.format(1-np.mean(y_val==y_pred)))
plt.figure(figsize=(20,20))
tree.plot_tree(model, feature_names=tree_feature_names)
plt.show()


In [None]:
# 10-fold cross-validation experiment for final tests
# NOTE: this cell overwrites X_tr / y_tr / X_val / y_val (and the normalized
# variants) from the earlier single train/validation split.
n_folds = 10
cv_err_nn = np.zeros(n_folds)
cv_err_l1 = np.zeros(n_folds)
np.random.seed(seed=0)
rpind = np.random.permutation(len(y))
for split in range(n_folds):

  # split into train/val: every n_folds-th shuffled index goes to validation,
  # all remaining rows are used for training
  val_ind = rpind[np.arange(split, len(X), n_folds)]
  X_val = X[val_ind]
  y_val = y[val_ind]
  tr_ind = np.delete(np.arange(len(X)), val_ind)
  X_tr = X[tr_ind]
  y_tr = y[tr_ind]

  # normalize features based on train mu/std
  X_mu = np.mean(X_tr, axis=0)
  X_std = np.std(X_tr, axis=0)
  X_tr_n = (X_tr-X_mu) / X_std
  X_val_n = (X_val - X_mu) / X_std # note: divide val by same mean and std as train

  # nn
  y_pred = nn(X_tr_n, y_tr, X_val_n)
  cv_err_nn[split] = 1-np.mean(y_val==y_pred)

  # lr
  model = LogisticRegression(C=1,max_iter=10000,penalty="l1",solver="liblinear").fit(X_tr_n, y_tr)
  y_pred = model.predict(X_val_n)
  cv_err_l1[split] = 1-np.mean(y_val==y_pred)

print(cv_err_nn)
print(cv_err_l1)

# Summarize each method. The standard error of the mean uses the *sample*
# standard deviation (ddof=1); the default ddof=0 underestimates the spread
# when computed from only a handful of folds. The 1.96 multiplier is the
# normal-approximation half-width of a 95% interval.
for name, errs in (('Nearest Neighbor', cv_err_nn), ('L1 Logistic Regression', cv_err_l1)):
  sem = np.std(errs, ddof=1) / np.sqrt(len(errs))
  print('{}: mean err={:0.3f}  standard error of mean={:0.3f}  95% confidence interval={:0.3f}'.format(name, np.mean(errs), sem, sem*1.96))
