In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline

# Classification Examples: Medicine



In [None]:
# Read the patient table from ckd.csv and shorten the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Preview only the first 3 rows.
ckd.show(3)

This dataset represents a doctor's patients.

**Question:** How many patients?

These patients are classified by whether or not they have kidney disease.

**Question:** Which column do you think indicates this?

In [None]:
# List the column labels to look for the one that holds the classification.
ckd.labels

**Question:** Now that we know it is `Class`, how many patients had kidney disease and how many didn't?
<details>
<summary>Solution</summary>
  ckd.group('Class')
</details>


**Question:** Can we predict whether a patient has kidney disease based on their glucose levels?
<details>
<summary>Solution</summary>
  Maybe
</details>


**Question:** What's our method to determine if we can?
       <details>
<summary>Solution</summary>
    <h3>Visualize then quantify</h3>
    
</details>

In [None]:
# skip

In [None]:
# skip

In [None]:
# skip

In [None]:
# skip

**Question:** How can we visualize?

In [None]:
# skip 

In [None]:
# skip

In [None]:
# skip

In [None]:
# Glucose on the horizontal axis against the class label on the vertical axis.
ckd.scatter('Glucose', 'Class')

In [None]:
# Same view for white blood cell count: one attribute against the class label.
ckd.scatter('White Blood Cell Count', 'Class')

In [None]:
# Plot the two attributes against each other; the class is not shown in this version.
ckd.scatter('White Blood Cell Count', 'Glucose')

But let's color the different patients based on their class

In [None]:
# skip

In [None]:
# skip

In [None]:
# NOTE(review): per the surrounding narrative this call is expected to raise an error;
# presumably `colors` is not a keyword this scatter accepts. A later cell uses group='Class' instead.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

Let's fix this error by reading the documentation:

In [None]:
#help(ckd.scatter)

In [None]:
# fix is below

In [None]:
# fix is below

In [None]:
# Same two attributes, with the points grouped by the 'Class' column.
ckd.scatter('White Blood Cell Count', 'Glucose', group='Class')

Let's talk about this visualization. How would we classify whether a patient has kidney disease based on their glucose levels and white blood cell count?

What ideas does the class have?

(skip)

**Question:** What about predicting kidney disease based on Hemoglobin and Glucose?

In [None]:
# make the visualization here

# Classification Examples: Counterfeit Banknotes

Let's look at another example.

In [None]:
# Read the banknote measurements from banknote.csv.
banknotes = Table.read_table('banknote.csv')
# Display the table as the cell's output.
banknotes

**Question:** Which column here indicates for us whether the bill is counterfeit or not?

**Question:** What are the possible values for this column? Is it binary (yes or no), or maybe trinary (yes or no or maybe)?


**Question:** What columns should we use to predict whether a bill was counterfeit or not?

In [None]:
# List the column labels to see which attributes are available as predictors.
banknotes.labels

In [None]:
# skip

In [None]:
# skip

In [None]:
#skip

In [None]:
# First candidate pair of attributes, with points grouped by 'Class'.
banknotes.scatter('WaveletVar', 'WaveletCurt', group='Class')

In [None]:
# Second candidate pair of attributes, for comparison with the previous plot.
banknotes.scatter('WaveletSkew', 'Entropy', group='Class')

**Question:** Which of these is better?

**Question:** What if we try making the prediction based on all of the features?


In [None]:
# Create the 3D axes through the figure itself. Constructing Axes3D(fig) directly
# no longer attaches the axes to the figure on recent Matplotlib releases, which
# leaves the rendered plot blank.
fig = plots.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
# One point per banknote, placed by three wavelet attributes and colored by its 'Class' value.
ax.scatter(banknotes.column('WaveletSkew'), 
           banknotes.column('WaveletVar'), 
           banknotes.column('WaveletCurt'), 
           c=banknotes.column('Class'),
           cmap='viridis',
           s=50)
# Label each axis with the column it shows so the figure can be read on its own.
ax.set_xlabel('WaveletSkew')
ax.set_ylabel('WaveletVar')
ax.set_zlabel('WaveletCurt');

(back to slides "Classifier")

# Defining a Classifier

In [None]:
# Read the breast-cancer table and drop the 'ID' column.
patients = Table.read_table('breast-cancer.csv').drop('ID')
# Preview only the first 5 rows.
patients.show(5)

In [None]:

# Two attributes against each other, with points grouped by 'Class'.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', group='Class')

In [None]:
def randomize_column(a):
    """Return `a` with a small random offset added to every entry.

    One offset is drawn per element of `a` from a normal distribution with
    mean 0.0 and standard deviation 0.09. The draws come from NumPy's global
    random state, so results differ between runs unless that state is seeded.
    """
    jitter = np.random.normal(0.0, 0.09, size=len(a))
    return a + jitter

# Build a table whose two attribute columns carry random jitter (presumably so
# that points sharing identical values become distinguishable in the plot);
# the 'Class' column is copied over unchanged.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jittered)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

jittered.scatter(
    'Bland Chromatin (jittered)',
    'Single Epithelial Cell Size (jittered)',
    group='Class',
)

# Distance

In [None]:
# Three sample points, (0, 0), (2, 2) and (3, 4), plotted on X/Y axes.
(
    Table()
    .with_columns('X', [0, 2, 3], 'Y', [0, 2, 4])
    .scatter('X', 'Y')
)

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Both points are given as numeric arrays. The result is the square root
    of the sum of the squared element-wise differences.
    """
    offsets = pt1 - pt2
    sum_of_squares = np.sum(offsets ** 2)
    return np.sqrt(sum_of_squares)

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table.

    Each row is converted to a 1-D NumPy array and passed to `distance`.
    `np.array(row)` is used rather than `make_array(row)`: the latter treats
    the whole row as a single element and produces a (1, n) 2-D array instead
    of a length-n vector.
    """
    return distance(np.array(row1), np.array(row2))

In [None]:
# Keep only the predictor columns by dropping the 'Class' label.
attributes = patients.drop('Class')
# Preview only the first 3 rows.
attributes.show(3)

In [None]:
# Distance between the first and second rows of the attributes table.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Distance between the first and third rows of the attributes table.
row_distance(attributes.row(0), attributes.row(2))

In [None]:
# Sanity check: the distance from a row to itself should be 0.
row_distance(attributes.row(0), attributes.row(0))

# Classification Procedure

In [None]:
def distances(training, example):
    """Return `training` with an added 'Distance' column.

    The 'Class' column is left out when measuring. For every remaining row,
    `row_distance` to `example` is computed, and the results are attached to
    the full training table (the 'Class' column is still present in the result).
    """
    features = training.drop('Class')
    row_dists = [row_distance(feature_row, example) for feature_row in features.rows]
    return training.with_column('Distance', row_dists)

In [None]:
def closest(training, example, k):
    """Return a table holding the k training rows nearest to `example`.

    The rows come from `distances(training, example)` sorted by the
    'Distance' column in ascending order; the first k rows are kept.
    """
    by_distance = distances(training, example).sort('Distance')
    first_k = np.arange(k)
    return by_distance.take(first_k)

In [None]:
# Show the row at index 12, including its 'Class' label.
patients.take(12)

In [None]:
# The same row (index 12) with the 'Class' label removed, used as the point to classify.
example = patients.drop('Class').row(12)
example

In [None]:
# `example` was taken from `patients`, so the training table here still contains that same row.
closest(patients, example, 5)

In [None]:

# Exclude row 12 from the training table so the example is not compared against itself.
closest(patients.exclude(12), example, 5)

In [None]:
def majority_class(neighbors):
    """Return the 'Class' value that occurs most often among `neighbors`.

    Rows are grouped by 'Class' and the groups are sorted by their 'count'
    in descending order. The class of the first group is returned.
    """
    class_counts = neighbors.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    return most_common_first.column('Class').item(0)

In [None]:
def classify(training, example, k):
    """Classify `example` by a vote among its k nearest training rows.

    The k closest rows are found with `closest`, and the most common
    'Class' among them, as chosen by `majority_class`, is returned.
    """
    return majority_class(closest(training, example, k))

In [None]:
# Classify the example from its 5 nearest neighbors, with row 12 (the example itself) excluded from the training table.
classify(patients.exclude(12), example, 5)

(back to slides)

# Evaluation