# Breast Cancer Diagnostic with Adaline

In [1]:
import pandas as pd # for loading data csv into dataframe and cleaning data
import os # for building url path
import numpy as np 
import matplotlib.colors # will use ListedColorMap to plot results
import matplotlib.pyplot as pp

## Read cancer data from ML database

In [2]:
cancer_data_url = os.path.join ('https://archive.ics.uci.edu', 'ml', 'machine-learning-databases', 'breast-cancer-wisconsin', 'breast-cancer-wisconsin.data')
cancer_data = pd.read_csv(cancer_data_url, header=None, encoding='utf-8')
cancer_data.columns = ['id', 'thickness', 'size uniformity', # assign columns more useful names
                       'shape uniformity', 'marginal adhesion',
                       'epi cell size', 'bare nuclei', 'bland chromatin',
                       'normal nucleoi', 'mitoses', 'class']
cancer_data.head()

Unnamed: 0,id,thickness,size uniformity,shape uniformity,marginal adhesion,epi cell size,bare nuclei,bland chromatin,normal nucleoi,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Helper Fn to return a list of every unique grouping of three values in a given list

In [3]:
def every_unique_pair(mylist):
    """returns every positionally unique pair in a given list: assumes that no elements in the list appear more than once"""
    ret_list = []
    list_copy = mylist[1:].copy()
    for i in mylist:
        for j in list_copy:
            ret_list.append([i,j])
        list_copy = list_copy[1:]
    return ret_list

# figure,subplotx = pp.subplots(len(every_unique_pair(cancer_data.columns[1:len(cancer_data.columns)-1])))
# figure.suptitle('Visual Comparison of Linear Separability')
# figure.set_size_inches(20, 40)
# for i, pair in enumerate(every_unique_pair(cancer_data.columns[1:len(cancer_data.columns) - 1])):
#     c1,c2 = pair
#     plot_classes_by_columns(subplotx[i], cancer_data, c1, c2)

## Adaline SGD Classifier Class

### As implemented in Ch02

In [4]:
class AdalineSGD(object):
    """ADAptive LInear NEuron classifier."""
    
    # I added the keyword parameter threshold to allow the user to specify the threshold
    def __init__(self, learning_rate=0.01, epochs=10, shuffle=True, random_seed=None, threshold=0):
        """Initialize the Adaline object"""
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights_initialized = False
        self.shuffle = shuffle
        self.random_seed = random_seed
        self.threshold = threshold
        
    def fit(self, X, y):
        """Initialize and iteratively update weights"""
        self._initialize_weights(X.shape[1])
        self.cost_ = []
        for i in range(self.epochs):
            if self.shuffle:
                X, y = self._shuffle(X, y)
            cost = []
            for xi, target in zip(X, y):
                cost.append(self._update_weights(xi, target))
            avg_cost = sum(cost) / len(y)
            self.cost_.append(avg_cost)
        return self

    def partial_fit(self, X, y):
        """Fit training data without reinitializing the weights"""
        if not self.weights_initialized:
            self._initialize_weights(X.shape[1])
        if y.ravel().shape[0] > 1:
            for xi, target in zip(X, y):
                self._update_weights(xi, target)
        else:
            self._update_weights(X, y)
        return self

    def _shuffle(self, X, y):
        """Shuffle training data"""
        r = self.rgen.permutation(len(y))
        return X[r], y[r]
    
    def _initialize_weights(self, m):
        """Initialize weights to small random numbers"""
        self.rgen = np.random.RandomState(self.random_seed)
        self.weights = self.rgen.normal(loc=0.0, scale=0.01, size=1 + m)
        self.weights_initialized = True
        
    def _update_weights(self, xi, target):
        """Apply Adaline learning rule to update the weights"""
        output = self.activation(self.net_input(xi))
        error = (target - output)
        self.weights[1:] += self.learning_rate * xi.dot(error)
        self.weights[0] += self.learning_rate * error
        cost = 0.5 * error**2
        return cost
    
    def net_input(self, X):
        """Calculate net input"""
        return np.dot(X, self.weights[1:]) + self.weights[0]

    def activation(self, X):
        """Compute linear activation"""
        return X

    def predict(self, X):
        """Return class label after unit step"""
        return np.where(self.activation(self.net_input(X)) >= self.threshold, 1, -1)

# Fns for running Adaline and analyzing results

In [5]:
def accuracy_and_misclasses(prediction, labels):
    """Fn to determine accuracy"""
    missclassifications = 0
    correct_predictions = len(labels)
    for a,b in zip(prediction, labels):
        if a != b:
            missclassifications += 1
            correct_predictions -= 1
    return (correct_predictions / len(labels), missclassifications)

In [6]:
def split_fit_test(c1, c2, c3, testtrain_ratio, dataframe=cancer_data, verbose=False, learning_rate=0.1, epochs=50, threshold=0):
    """split data from feature columns c1 and c2 into train and test sets at tt_ratio proportions and fit/test a perceptron"""
    
    # get the integer indeces corresponding to the column names passed to split_fit_test
    c1_idx = dataframe.columns.get_loc(c1)
    c2_idx = dataframe.columns.get_loc(c2)
    c3_idx = dataframe.columns.get_loc(c2)
    
    # number of rows of dataframe which will belong to the training set (we know the number in the test set from this implicitly)
    num_train = len(dataframe.index) - int(len(dataframe.index) * testtrain_ratio)
    
    # Training set
    y_train = cancer_data.iloc[:num_train,10].values # the array of target values: 2 for benign, 4 for malignant
    y_train = np.where(y_train == 2, -1, 1) # change class labels 2 and 4 to -1 and 1 respectively
    X_train = cancer_data.iloc[:num_train, [c1_idx,c2_idx,c3_idx]].values
    
    # feature scaling to standardize the distribution of values in our training set
    X_train_std = np.copy(X_train)
    X_train_std[:, 0] = (X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()
    X_train_std[:, 1] = (X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()
    X_train_std[:, 2] = (X_train[:, 2] - X_train[:, 2].mean()) / X_train[:, 2].std()
    
    # Testing set
    y_test = cancer_data.iloc[num_train:,10].values # analagous to above
    y_test = np.where(y_test == 2, -1, 1)
    X_test = cancer_data.iloc[num_train:, [c1_idx, c2_idx, c3_idx]].values
    
    # Feature scaling for test set
    X_test_std = np.copy(X_train)
    X_test_std[:, 0] = (X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()
    X_test_std[:, 1] = (X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()
    X_test_std[:, 2] = (X_train[:, 2] - X_train[:, 2].mean()) / X_train[:, 2].std()
    
    # instantiate and train an Adaline object
    ada = AdalineSGD(learning_rate=learning_rate, epochs=epochs, threshold=threshold)
    ada.fit(X_train_std, y_train)

    # predict the classes of the test set and calculate accuracy
    prediction = ada.predict(X_test_std)
    accuracy,misclasses = accuracy_and_misclasses(prediction, y_test)
    if verbose:
        print("For features", c1, ",", c2, "and", c3, ", and test/train ratio", testtrain_ratio, "the perceptron had", misclasses, "missclassifications and had an accuracy of", accuracy, "\n")
    return (accuracy, misclasses)

## Test Run:::

In [7]:
split_fit_test('thickness', 'bland chromatin', 'shape uniformity', 0.3, verbose=True)

For features thickness , bland chromatin and shape uniformity , and test/train ratio 0.3 the perceptron had 82 missclassifications and had an accuracy of 0.6076555023923444 



(0.6076555023923444, 82)

We're already off to a better start in terms of accuracy than with the perceptron implementation!

## Maximizing Accuracy

Using the classes and functions we've created so far, we can now actually utilize and test the Adaline concept on the data-set we loaded in. 

### Pass 1: Maximize accuracy with respect to test/training set ratio

Here we try to maximize the accuracy by the way we split out training and test sets. 

In [8]:
best_accuracy = 0
misses = 0
best_prop = 0
 
for prop in [0.25, 0.3, 0.35, 0.40, 0.45]:
    acc,miss = split_fit_test('thickness', 'bland chromatin', 'shape uniformity', prop)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_prop = prop
        print(acc, misses)
        
        
print("The highest accuracy was", best_accuracy, "for test/train proportion", best_prop, "with", misses, "missclassifications.")

0.6149425287356322 67
The highest accuracy was 0.6149425287356322 for test/train proportion 0.25 with 67 missclassifications.


No significant improvement in accuracy yet, but still slightly better nonetheless! Our best test/train ratio will be 1:3

### Pass 2: Maximize accuracy by learning rate

Next we will maximize accuracy with respect to learning rate. Notice we carry over our test/train ratio which we chose the previous pass.

In [9]:
best_accuracy = 0
misses = 0
best_rate = 0

for rate in [0.0001,0.0005, 0.001, 0.005, 0.01]:
    acc,miss = split_fit_test('thickness', 'bland chromatin', 'shape uniformity', 0.25, learning_rate=rate)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_rate = rate

print("The highest accuracy was", best_accuracy, "for learning rate", best_rate, "with", misses, "missclassifications.")

The highest accuracy was 0.6149425287356322 for learning rate 0.0001 with 67 missclassifications.


No improvement here, but based on the results we can assume that 0.0001 is a reasonable learning rate

### Pass 3: Maximize accuracy with respect to number of epochs

In [10]:
best_accuracy = 0
misses = 0
best_num_epochs = 0

for n in [10, 20, 30, 40, 50, 75, 100]:
    acc,miss = split_fit_test('thickness', 'bland chromatin', 'shape uniformity', 0.25, epochs=n, learning_rate=0.0001)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_num_epochs = n

print("The highest accuracy was", best_accuracy, "for", best_num_epochs, "epochs with", misses, "missclassifications.")

The highest accuracy was 0.6149425287356322 for 30 epochs with 67 missclassifications.


Once again, no change in accuracy: but we now know that we need about 30 epochs for our weights to converge to the ideal linear seperation

### Pass 4: Maximize accuracy with respect to threshold

In [11]:
best_accuracy = 0
misses = 0
best_threshold = 0
for theta in [0, 0.1, 0.01, 0.2, 0.5, 1.0, 2.0, 3.0]:
    acc,miss = split_fit_test('thickness', 'bland chromatin', 'shape uniformity', 0.25, epochs=50, learning_rate=0.0005, threshold=theta)
    if acc > best_accuracy:
        best_accuracy = acc
        misses = miss
        best_threshold = theta
        
print("The highest accuracy was", best_accuracy, "for the threshold", best_threshold, "with", misses, "missclassifications.")

The highest accuracy was 0.7873563218390804 for the threshold 1.0 with 37 missclassifications.


It seems that increasing our threshold pretty drastically improved our accuracy! Compared to the perceptron, Adaline performs pretty well even without tuning our parameters. By tuning the parameters as we did in the perceptron implementation, we got a slightly higher accuracy than in the perceptron model.