In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter('ignore')

# Lecture 11: Prediction

## Regression Review

In original units, the regression line has this equation:

$$\frac{\text{estimate of } y - \text{average of } y}{\text{SD of } y} = r \times \frac{\text{the given } x - \text{average of } x}{\text{SD of } x}$$

The general equation for a line is:
$$y = \mathrm{slope} \times x + \mathrm{intercept}$$

So we can then compute the slope of the regression line as $$r \times \frac{\text{SD of } y}{\text{SD of } x}$$ 

with a $y$-intercept of $(\text{average of } y) - \text{slope} \times (\text{average of } x)$.

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it
    lies above (positive) or below (negative) the mean of `x`.
    `np.std` is called with its defaults, so this is the population SD.
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient of two columns of `t`.

    `x` and `y` identify the columns (anything `t.column` accepts).
    Both columns are converted to standard units; the result is the
    mean of the element-wise products.
    """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    products = x_standard * y_standard
    return np.mean(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    Computed as r * (SD of y) / (SD of x), where r is the correlation
    between columns `x` and `y` of `t`.
    """
    r = correlation(t, x, y)
    sd_y = np.std(t.column(y))
    sd_x = np.std(t.column(x))
    return r * sd_y / sd_x
    
def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    Computed as (average of y) - slope * (average of x), using columns
    `x` and `y` of `t`.
    """
    line_slope = slope(t, x, y)
    mean_y = np.mean(t.column(y))
    mean_x = np.mean(t.column(x))
    return mean_y - line_slope * mean_x

def fitted_values(t, x, y):
    """Return the regression estimates of y for every value in column x.

    The result is slope * (column x of t) + intercept, i.e. the height of
    the regression line at each x value.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

## Least Squares

In [None]:
# Load the data, move the 'Periods' column to the front so it is column 0,
# and preview three rows
little_women = Table.read_table('little_women.csv')
little_women = little_women.move_to_start('Periods')
little_women.show(3)


In [None]:
# Plot the data: column 0 ('Periods') on the horizontal axis, column 1 on the vertical
little_women.scatter(0, 1)

In [None]:
# What's the correlation between column 0 ('Periods') and column 1?
correlation(little_women, 0, 1)

In [None]:
# Add a column to the little_women table with the linear fit 

### Mean-Squared Error

Consider the following function:

$$f(x) = (x-3)^2 + 1$$

What value of $x$ minimizes $f(x)$?

In [None]:
def f(x):
    """Return (x - 3)^2 + 1, the example function from the text above."""
    shifted = x - 3
    return shifted ** 2 + 1

In [None]:
# Compute f for values 1, 2, 3, 4, 5

Given the following definition of error:
$$\text{error} = \text{actual value} - \text{estimate}$$

We'd like to compute the *mean-squared error* (mse) and the *root mean-squared error*.

In [None]:
def lw_mse(any_slope, any_intercept):
    """Compute mean-squared error for little_women given a slope & intercept.

    The candidate line is  estimate = any_slope * x + any_intercept, where x
    is column 0 of `little_women` ('Periods', moved to the front when the
    table is loaded) and the actual y values are column 1 -- the same column
    pair used by the scatter plot and the correlation above.

    Returns the mean of (actual value - estimate) ** 2 over all rows.
    """
    x = little_women.column(0)
    y = little_women.column(1)
    estimates = any_slope * x + any_intercept
    errors = y - estimates
    return np.mean(errors ** 2)


In [None]:
# Mean-squared error of the candidate line with slope 50 and intercept 10000
lw_mse(50, 10000)

In [None]:
# Root mean-squared error: the square root of the mean-squared error
# (requires lw_mse to return a number)
lw_mse(50, 10000) ** 0.5

Compute the slope and intercept that minimize `lw_mse`. What values should they be?

# Classification

## Brittany Wenger's Experiment

In [None]:
# Load the patient data, dropping the 'ID' column, and preview five rows
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients.show(5)

In [None]:
# Plot two of the attributes against each other, with points colored by 'Class'
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')

In [None]:
def randomize_column(a):
    """Return `a` with random noise added to every element.

    The noise is drawn from a normal distribution with mean 0.0 and
    SD 0.09, one independent draw per element of `a`. Uses NumPy's global
    random state, so results differ from run to run unless it is seeded.
    """
    jitter = np.random.normal(0.0, 0.09, size=len(a))
    return a + jitter

# Build a table holding noisy copies of the two attributes plotted above plus
# the unchanged 'Class' labels (presumably so that patients sharing the same
# values no longer plot exactly on top of each other)
jittered = Table().with_columns([
        'Bland Chromatin (jittered)', 
        randomize_column(patients.column('Bland Chromatin')),
        'Single Epithelial Cell Size (jittered)', 
        randomize_column(patients.column('Single Epithelial Cell Size')),
        'Class',
        patients.column('Class')
    ])

# Scatter the two jittered columns (0 and 1), colored by 'Class'
jittered.scatter(0, 1, colors='Class')

### Distance ###

In [None]:
def distance(pt1, pt2):
    """Return the distance between two points, represented as arrays.

    This is the Euclidean distance: the square root of the sum of the
    squared coordinate differences.
    Example: (1, 2, 3) <-> (1, 4, 8) gives sqrt(0 + 4 + 25) = sqrt(29).

    Raises ValueError if the two points do not have the same shape, rather
    than letting NumPy broadcast them silently.
    """
    a = np.asarray(pt1, dtype=float)
    b = np.asarray(pt2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            'points must have the same shape, got {} and {}'.format(a.shape, b.shape))
    return np.sqrt(np.sum((a - b) ** 2))

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table.

    Each row is converted to an array and passed to `distance`, so both rows
    must hold the same number of numerical values.
    """
    return distance(np.array(row1), np.array(row2))

In [None]:
# Keep only the attribute columns by dropping the 'Class' label; preview three rows
attributes = patients.drop('Class')
attributes.show(3)

In [None]:
# Distance between the first two rows of attributes
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Distance from the first row to itself
row_distance(attributes.row(0), attributes.row(0))

In [None]:
# Same pair of rows as two cells above, with the arguments swapped
row_distance(attributes.row(1), attributes.row(0))

### Classification Procedure ###

In [None]:
def distances(training, example):
    """Compute distance between example and every row in training.
    Return training table augmented with Distance column

    `training` must have a 'Class' column; it is left out of the distance
    computation, so `example` should be a row of attribute values only,
    in the same column order as the remaining columns of `training`.
    The returned table keeps every original column (including 'Class') and
    adds 'Distance', holding row_distance(row, example) for each row.
    """
    attributes = training.drop('Class')
    # Build the column in one pass instead of growing an array row by row
    dists = np.array([row_distance(row, example) for row in attributes.rows])
    return training.with_column('Distance', dists)


In [None]:
# Look at the row at index 15
patients.take(15)

In [None]:
# Use the attribute values of row 15 (without its 'Class' label) as the example
example = attributes.row(15)

In [None]:
# Distance from the example to every row of patients
distances(patients, example)

In [None]:
def closest(training, example, k):
    """Return a table of the k closest neighbors to example

    The rows of `training` are ordered by their 'Distance' to `example`
    (as computed by `distances`) and the first k are kept, nearest first.
    If k exceeds the number of training rows, all rows are returned.
    """
    by_distance = distances(training, example).sort('Distance')
    # Clamp k so asking for more neighbors than exist does not raise
    num_neighbors = min(k, by_distance.num_rows)
    return by_distance.take(np.arange(num_neighbors))

In [None]:
def majority_class(topk):
    """Return the class with the highest count

    `topk` is a table with a 'Class' column. Its rows are grouped by
    'Class', the groups are sorted by count from largest to smallest, and
    the label of the first group is returned. When two classes are tied,
    the result is whichever one the sort places first.
    """
    class_counts = topk.group('Class').sort('count', descending=True)
    return class_counts.column('Class').item(0)

def classify(training, example, k):
    """Return the majority class among the k nearest neighbors of example

    `training` is a table of attribute columns plus a 'Class' column and
    `example` is a row of attribute values only. The k nearest rows are
    found with `closest`, and their most common 'Class' is chosen with
    `majority_class`.
    """
    nearest = closest(training, example, k)
    return majority_class(nearest)

In [None]:
# Classify the example from its 5 nearest neighbors. Row 15 is excluded from
# the training table because the example itself was taken from that row.
classify(patients.exclude(15), example, 5)

### Evaluation ###

In [None]:
# Number of rows available to divide between the training and test sets
patients.num_rows

In [None]:
# Randomly permute the rows, then split them into two halves. The split point
# is derived from the table size instead of being hard-coded, so every row is
# used exactly once whatever the number of rows; with an odd count the
# training set gets the extra row (683 rows -> 342 training, 341 test).
shuffled = patients.sample(with_replacement=False)
num_training = (shuffled.num_rows + 1) // 2
training_set = shuffled.take(np.arange(num_training))
test_set = shuffled.take(np.arange(num_training, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples 
    in the test set

    Every row of `test` is classified with `classify(training, row, k)`
    using only its attribute values ('Class' is dropped first), and the
    prediction is compared with that row's actual 'Class'.

    Raises ValueError if `test` has no rows, since the proportion would be
    undefined.
    """
    num_test = test.num_rows
    if num_test == 0:
        raise ValueError('test set has no rows')
    test_attributes = test.drop('Class')
    actual_classes = test.column('Class')
    num_correct = 0
    for i in np.arange(num_test):
        predicted = classify(training, test_attributes.row(i), k)
        if predicted == actual_classes.item(i):
            num_correct += 1
    return num_correct / num_test

In [None]:
# Accuracy on test_set when classifying with the 5 nearest neighbors from training_set
evaluate_accuracy(training_set, test_set, 5)

## Chronic Kidney Disease

In [None]:
# Load the data, shortening the 'Blood Glucose Random' label to 'Glucose';
# preview three rows
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)

In [None]:
# Number of rows in ckd
ckd.num_rows

In [None]:
# Group the rows by 'Class' to see how many fall in each class
ckd.group('Class')

In [None]:
# 'White Blood Cell Count' (horizontal) against 'Glucose' (vertical), colored by 'Class'
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# 'Hemoglobin' (horizontal) against 'Glucose' (vertical), colored by 'Class'
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

## Counterfeit Banknotes

In [None]:
# Load the banknote data and display the table
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# 'WaveletVar' (horizontal) against 'WaveletCurt' (vertical), colored by 'Class'
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')

In [None]:
# 'WaveletSkew' (horizontal) against 'Entropy' (vertical), colored by 'Class'
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')

In [None]:
# 3D scatter of three banknote attributes, with points colored by 'Class'.
# The axes are created through the figure so that they are attached to it:
# on recent Matplotlib releases a bare Axes3D(fig) is no longer added to the
# figure automatically, which leaves the plot empty.
fig = plots.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),
           s=50)
# Label the axes so the figure can be read on its own
ax.set_xlabel('WaveletSkew')
ax.set_ylabel('WaveletVar')
ax.set_zlabel('WaveletCurt');