In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Load scikit-learn's built-in handwritten-digits data set and pull out the
# feature matrix (X) and the label vector (y).
dataset = load_digits()
X = dataset.data
y = dataset.target

# Show how many samples each class has. np.bincount counts the number of
# occurrences of each value in an array of non-negative ints, so entry i is
# the number of samples labelled i.
samples_per_class = np.bincount(dataset.target)
for class_name, class_count in zip(dataset.target_names, samples_per_class):
    print(class_name, class_count)

In [None]:
# The per-class counts printed earlier are roughly uniform. To get an
# imbalanced problem we collapse the labels to binary: digit 1 stays the
# positive class (label 1) and every other digit becomes the negative class
# (label 0). Casting back to y's dtype keeps an integer label array.
y_binary_imbalanced = (y == 1).astype(y.dtype)

# Side-by-side preview of the same window of labels before and after relabelling.
preview = slice(1, 30)
print("original labels: ", y[preview])
print("new binary labels: ", y_binary_imbalanced[preview])

# Index 0 of the bincount is the number of negative instances and index 1 the
# number of positive instances (1615 vs. 182 per the original note), i.e. the
# binary problem is heavily imbalanced.
print("class balance: ", np.bincount(y_binary_imbalanced))

In [None]:
# Split the imbalanced binary problem into training and test portions;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary_imbalanced, random_state=0
)

# Fit a support vector classifier with an RBF kernel on the training portion.
svm = SVC(kernel="rbf", C=1)
svm.fit(X_train, y_train)

# The accuracy looks good on its own, but whether it really indicates a good
# model is checked next by comparing against dummy classifiers.
svm_accuracy = svm.score(X_test, y_test)
print("model accuracy: ", svm_accuracy)

# Dummy Classifiers

In [None]:
# Baseline: a dummy classifier whose strategy is to always predict the most
# frequently occurring class in the training labels. For this data set that
# is the negative class.
#
# Besides most_frequent, `strategy` also accepts:
#   stratified: random predictions that follow the training set's class distribution
#   uniform:    predictions generated uniformly at random
#   constant:   always predicts a constant label provided by the user
dummy_majority = DummyClassifier(strategy="most_frequent")
dummy_majority.fit(X_train, y_train)

y_dummy_predictions = dummy_majority.predict(X_test)

# Header line followed by the raw prediction array on the next line.
print("output of dummy classifiers predictions", y_dummy_predictions, sep="\n")

In [None]:
# The majority-class baseline predicts the negative class for every instance,
# yet it still scores roughly 90% accuracy. On imbalanced data, accuracy alone
# is therefore not a reliable indicator that a model is of high quality.
majority_baseline_accuracy = dummy_majority.score(X_test, y_test)
print("accuracy of dummy classifier: ", majority_baseline_accuracy)

In [None]:
# Refit the majority-class baseline in this cell and predict on the test set.
dummy_majority = DummyClassifier(strategy="most_frequent")
dummy_majority.fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)

# A confusion matrix counts the true negatives, false positives, false
# negatives and true positives in a set of predictions. For these binary
# labels it is laid out as:
#
#     true negative     false positive
#     false negative    true positive
#
# Correct predictions sit on the diagonal; errors sit off the diagonal.
confusion = confusion_matrix(y_true=y_test, y_pred=y_majority_predicted)
print("most frequent class prediction (dummy classifier)\n", confusion)

In [None]:
# Baseline that predicts at random, following the class proportions of the
# training labels. Because these predictions are random, random_state is fixed
# (to the same seed used for train_test_split) so the confusion matrix below
# is reproducible across kernel restarts.
dummy_classprop = DummyClassifier(strategy="stratified", random_state=0).fit(X_train, y_train)

y_classprop_predicted = dummy_classprop.predict(X_test)
confusion = confusion_matrix(y_test, y_classprop_predicted)

# Unlike the most_frequent strategy, the stratified strategy sometimes predicts
# the positive class, so the false-positive and true-positive cells of the
# matrix are no longer zero.
print("random class-proportional predictions (dummy classifier)\n", confusion)