Ready: Feature 139 confusion matrix #144

Merged
Changes from 2 commits
Binary file added docs/examples/images/confusionMatrix_3_0.png
59 changes: 59 additions & 0 deletions docs/examples/methods.rst
@@ -432,6 +432,65 @@ heatmap in order for easy interpretation and detection.
.. image:: images/examples_32_0.png


Confusion Matrix Visualizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``ConfusionMatrix`` visualizer is a ``ScoreVisualizer`` that takes a
fitted scikit-learn classifier and a set of test X and y values and
returns a report showing how each test value's predicted class compares
to its actual class. Data scientists use confusion matrices to
understand which classes are most easily confused. A confusion matrix
provides information similar to a ``ClassificationReport``, but rather
than top-level scores it offers deeper insight into the classification
of individual data points.

Below are a few examples of using the ``ConfusionMatrix`` visualizer;
more information can be found in the ``sklearn.metrics.confusion_matrix``
documentation.
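
Before those examples, for reference, here is a minimal sketch of the
underlying scikit-learn call that the visualizer's ``score`` method wraps;
the ``y_true`` and ``y_pred`` arrays below are hypothetical:

.. code:: python

    from sklearn.metrics import confusion_matrix

    # Rows are actual classes, columns are predicted classes
    y_true = [0, 1, 2, 2]
    y_pred = [0, 2, 2, 2]
    confusion_matrix(y_true, y_pred)
    # array([[1, 0, 0],
    #        [0, 0, 1],
    #        [0, 0, 2]])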

.. code:: python

    # First, do our imports
    import yellowbrick

    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    from yellowbrick.classifier import ConfusionMatrix

.. code:: python

    # We'll use the handwritten digits dataset from scikit-learn.
    # Each sample in this dataset is an 8x8 pixel image of a handwritten digit.
    # digits.data flattens each image's 64 pixels into a single feature array.
    digits = load_digits()
    X = digits.data
    y = digits.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

    model = LogisticRegression()

    # The ConfusionMatrix visualizer takes a model
    cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])

    # fit() fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
    cm.fit(X_train, y_train)

    # To create the ConfusionMatrix, we need some test data. score() runs predict() on the data
    # and then creates the confusion_matrix from scikit-learn.
    cm.score(X_test, y_test)

    # How did we do?
    cm.poof()



.. image:: images/confusionMatrix_3_0.png
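
The tests added in this PR also exercise a raw-count mode and a zoomed-in
view over a subset of classes. A minimal sketch of that usage, assuming the
fitted ``cm`` and the train/test split from above (``percent=False`` and the
class subset are taken from ``test_raw_count_mode`` and ``test_zoomed_in``
in tests/test_classifier.py below):

.. code:: python

    # Show raw counts rather than percentages (keyword taken from this PR's tests)
    cm.score(X_test, y_test, percent=False)
    cm.poof()

    # Zoom in on a subset of classes, as in test_zoomed_in
    cm_zoom = ConfusionMatrix(LogisticRegression(), classes=[0, 1, 2])
    cm_zoom.fit(X_train, y_train)
    cm_zoom.score(X_test, y_test)
    cm_zoom.poof()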



ROCAUC
~~~~~~

313 changes: 313 additions & 0 deletions examples/nealhumphrey/confusionMatrix.ipynb


1,059 changes: 1,059 additions & 0 deletions examples/nealhumphrey/data/default_features_1059_tracks.txt


1,059 changes: 1,059 additions & 0 deletions examples/nealhumphrey/data/default_plus_chromatic_features_1059_tracks.txt


55 changes: 55 additions & 0 deletions tests/test_classifier.py
@@ -22,6 +22,9 @@

from sklearn.svm import LinearSVC
from sklearn.metrics import *
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

##########################################################################
## Data
@@ -68,3 +71,55 @@ def test_class_report(self):
        model.fit(X,y)
        visualizer = ClassificationReport(model, classes=["A", "B"])
        visualizer.score(X,y)

class ConfusionMatrixTests(VisualTestCase):
    def __init__(self, *args, **kwargs):
        super(ConfusionMatrixTests, self).__init__(*args, **kwargs)
        # Use the same data for all the tests
        self.digits = load_digits()

        X = self.digits.data
        y = self.digits.target

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def test_confusion_matrix(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test)

    def test_no_classes_provided(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model)
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test)

    def test_raw_count_mode(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model)
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test, percent=False)

    def test_zoomed_in(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model, classes=[0,1,2])
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test)

    def test_extra_classes(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model, classes=[0,1,2,11])
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test)
        # Class 11 never appears in the digits data, so its count should be zero
        self.assertTrue(cm.selected_class_counts[3] == 0)

    def test_one_class(self):
        model = LogisticRegression()
        cm = ConfusionMatrix(model, classes=[0])
        cm.fit(self.X_train, self.y_train)
        cm.score(self.X_test, self.y_test)
26 changes: 26 additions & 0 deletions tests/test_utils.py
@@ -326,6 +326,32 @@ def test_classifier_visualizer(self):
        model = ScoreVisualizer(RandomForestClassifier())
        self.assertTrue(is_classifier(model))


class DivSafeTests(unittest.TestCase):

    def test_div_1d_by_scalar(self):
        result = div_safe([-1, 0, 1], 0)
        self.assertTrue(result.all() == 0)

    def test_div_1d_by_1d(self):
        result = div_safe([-1, 0, 1], [0, 0, 0])
        self.assertTrue(result.all() == 0)

    def test_div_2d_by_1d(self):
        numerator = np.array([[-1, 0, 1, 2], [1, -1, 0, 3]])
        denominator = [0, 0, 0, 0]
        # Should not raise; the 1d denominator broadcasts across each row
        result = div_safe(numerator, denominator)

    def test_invalid_dimensions(self):
        numerator = np.array([[-1, 0, 1, 2], [1, -1, 0, 3]])
        denominator = [0, 0]
        with self.assertRaises(ValueError):
            result = div_safe(numerator, denominator)

    def test_div_scalar_by_scalar(self):
        with self.assertRaises(ValueError):
            result = div_safe(5, 0)
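
For context, a minimal sketch of the behavior these tests imply for
``div_safe``: element-wise division that returns 0 wherever the denominator
is 0, and raises ``ValueError`` for scalar-only inputs or incompatible
shapes. This is a hypothetical re-implementation inferred from the tests,
not yellowbrick's actual ``div_safe``:

.. code:: python

    import numpy as np

    def div_safe_sketch(numerator, denominator):
        # Hypothetical re-implementation inferred from DivSafeTests
        numerator = np.asarray(numerator)
        denominator = np.asarray(denominator)
        # Scalar / scalar is rejected, per test_div_scalar_by_scalar
        if numerator.ndim == 0 and denominator.ndim == 0:
            raise ValueError("div_safe requires at least one array argument")
        # Incompatible shapes raise ValueError via NumPy broadcasting,
        # per test_invalid_dimensions
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.true_divide(numerator, denominator)
        # Replace inf (x/0) and nan (0/0) with 0, per the 1d tests
        result[~np.isfinite(result)] = 0
        return result

Under these assumptions, ``div_safe_sketch([-1, 0, 1], 0)`` returns
``array([0., 0., 0.])``, matching ``test_div_1d_by_scalar``.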

##########################################################################
## Decorator Tests
##########################################################################