In [35]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Breast Cancer Prediction Mid Demo

We start by cleaning the data. Cancer_Type, BMI, and Patient Study ID are being removed due to irrelavence, missing data, and because they were unlisted on the dataset preview and were thus were not needed.

In [19]:
data = MyPyTable()
data.load_from_file("input_data/breastcancer.csv")
data.rem_column("Cancer_Type")
data.rem_column("Body_Mass_Index")
data.rem_column("Patients_Study_ID")
data.save_to_file("input_data/breast_cancer_clean.csv")
y = data.get_column("Radiologists_Assessment")
data.rem_column("Radiologists_Assessment")
X = data.data

In [53]:
nvb_clf = MyNaiveBayesClassifier()
knn_clf = MyKNeighborsClassifier(3)

split = myevaluation.stratified_kfold_split(X[:1000], y[:1000], 10, 10, True)

In [54]:
X_train = []
y_train = []
X_test = []
y_test = []
for train, test in split:
    for i in train:
        X_train.append(X[i])
        y_train.append(y[i])
    for i in test:
        X_test.append(X[i])
        y_test.append(y[i])

nvb_clf.fit(X_train, y_train)
knn_clf.fit(X_train, y_train)

knn_pred = knn_clf.predict(X_test, True)
nvb_pred = nvb_clf.predict(X_test)


In [55]:
knn_acc = myevaluation.accuracy_score(y_test, knn_pred)
nvb_acc = myevaluation.accuracy_score(y_test, nvb_pred)
print("===========================================")
print("STEP 1: Predictive Accuracy")
print("===========================================")
print("Stratified 10-Fold Cross Validation")
print("k Nearest Neighbors Classifier: accuracy = {}, error rate = {}".format(round(knn_acc, 2), round(1-knn_acc, 2)))
print("Naive Bayes Classifier: accuracy = {}, error rate = {}".format(round(nvb_acc, 2), round(1-nvb_acc, 2)))

STEP 1: Predictive Accuracy
Stratified 10-Fold Cross Validation
k Nearest Neighbors Classifier: accuracy = 0.81, error rate = 0.19
Naive Bayes Classifier: accuracy = 0.46, error rate = 0.54


In [61]:
knn_pre = myevaluation.binary_precision_score(y_test, knn_pred, pos_label="Negative")
nvb_pre = myevaluation.binary_precision_score(y_test, nvb_pred, pos_label="Negative")
knn_rec = myevaluation.binary_recall_score(y_test, knn_pred, pos_label="Negative")
nvb_rec = myevaluation.binary_recall_score(y_test, nvb_pred, pos_label="Negative")
knn_f1 = myevaluation.binary_f1_score(y_test, knn_pred, pos_label="Negative")
nvb_f1 = myevaluation.binary_f1_score(y_test, nvb_pred, pos_label="Negative")
print("===========================================")
print("STEP 2: Predictive Accuracy")
print("===========================================")
print("Stratified 10-Fold Cross Validation")
print("k Nearest Neighbors Classifier: precision = {}, recall = {}, f1 = {}".format(round(knn_pre, 2), round(knn_rec, 2), round(knn_f1, 2)))
print("Naive Bayes Classifier: precision = {}, recall = {}, f1 = {}".format(round(nvb_pre, 2), round(nvb_rec, 2), round(nvb_f1, 2)))

STEP 2: Predictive Accuracy
Stratified 10-Fold Cross Validation
k Nearest Neighbors Classifier: precision = 0.71, recall = 0.98, f1 = 0.83
Naive Bayes Classifier: precision = 0.44, recall = 0.94, f1 = 0.6


In [64]:
from tabulate import tabulate 

knn_matrix = myevaluation.confusion_matrix(y_test, knn_pred, ["Negative", "Benign findings", "Probably benign", "Needs additional imaging", "Suspicious abnormality"])
nvb_matrix = myevaluation.confusion_matrix(y_test, nvb_pred, ["Negative", "Benign findings", "Probably benign", "Needs additional imaging", "Suspicious abnormality"])

print("===========================================")
print("STEP 4: Confusion Matrices")
print("===========================================")
print("k Nearest Neighbors Classifier (Stratified 10-Fold Cross Validation Results):")

print(tabulate(knn_matrix, headers=["Negative", "Benign findings", "Probably benign", "Needs additional imaging", "Suspicious abnormality"]))

print("Naive Bayes Classifier (Stratified 10-Fold Cross Validation Results):")

print(tabulate(nvb_matrix, headers=["Negative", "Benign findings", "Probably benign", "Needs additional imaging", "Suspicious abnormality"]))

STEP 4: Confusion Matrices
k Nearest Neighbors Classifier (Stratified 10-Fold Cross Validation Results):
  Negative    Benign findings    Probably benign    Needs additional imaging    Suspicious abnormality
----------  -----------------  -----------------  --------------------------  ------------------------
       196                  4                  0                           0                         0
        49                147                  0                           4                         0
         2                  0                  3                           0                         0
        27                  6                  0                          42                         0
         1                  0                  0                           0                         1
Naive Bayes Classifier (Stratified 10-Fold Cross Validation Results):
  Negative    Benign findings    Probably benign    Needs additional imaging    Suspicious abnormality
-