In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

# Load data

In [9]:
# Request plain NumPy arrays explicitly. Since scikit-learn 0.24 the default
# as_frame="auto" makes fetch_openml return pandas objects for this dataset,
# which breaks positional indexing such as `X_train[train_index]`.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

X, y = mnist["data"], mnist["target"]
# Cast the labels to uint8 so numeric comparisons (e.g. `y_train == 5`) work.
y = y.astype(np.uint8)
# The first 60,000 rows are used for training, the remainder for testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Binary classification

In [10]:
# Build boolean targets for the "5 vs. not-5" task: True where the digit is 5.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [11]:
# Train a linear classifier with stochastic gradient descent (SGD)
from sklearn.linear_model import SGDClassifier

# Fixed random_state so repeated runs give the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(random_state=42)

In [12]:
# Manual implementation of stratified 3-fold cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# The folds are not shuffled, so random_state is left unset: passing
# random_state together with shuffle=False has no effect and raises a
# ValueError in scikit-learn >= 0.24.
skfolds = StratifiedKFold(n_splits=3)

# The loop indexes by position, which needs array inputs. np.asarray leaves
# ndarrays untouched and converts pandas objects if the data was loaded as such.
X_train_arr = np.asarray(X_train)
y_train_5_arr = np.asarray(y_train_5)

for train_index, test_index in skfolds.split(X_train_arr, y_train_5_arr):
    # Clone the estimator so every fold is fitted on its own copy.
    clone_clf = clone(sgd_clf)

    X_train_folds = X_train_arr[train_index]
    y_train_folds = y_train_5_arr[train_index]
    X_test_fold = X_train_arr[test_index]
    y_test_fold = y_train_5_arr[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Accuracy on the held-out fold: correct predictions / fold size.
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

# Equivalent to:
# from sklearn.model_selection import cross_val_score
# cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring = "accuracy")



0.95035
0.96035
0.9604


In [15]:
# make naive classifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score

class Never5Classifier(BaseEstimator):
    """Naive baseline that predicts "not 5" (False) for every sample."""

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows call chaining such as ``clf.fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        # One zero (False) per input sample.
        return np.zeros((len(X), 1), dtype=bool)

# Score the baseline with 3-fold cross-validated accuracy.
never_5_clf = Never5Classifier()
cross_val_score(
    never_5_clf,
    X_train,
    y_train_5,
    cv=3,
    scoring="accuracy",
)

array([0.91125, 0.90855, 0.90915])

# Metrics

In [16]:
# Confusion matrix
# rows: true class (negative, positive) / columns: predicted class (negative, positive)
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold predictions from 3-fold cross-validation on the training set.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# Reference case: "predictions" identical to the true labels.
y_train_perfect_pred = y_train_5

# Print the matrix for the real predictions first, then for the perfect ones.
for predictions in (y_train_pred, y_train_perfect_pred):
    print(confusion_matrix(y_train_5, predictions))

[[53892   687]
 [ 1891  3530]]
[[54579     0]
 [    0  5421]]
