In [123]:
import numpy as np
from scipy import stats
from sklearn.datasets import load_digits, load_iris, load_wine
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Fetch the scikit-learn digits dataset as a (feature matrix, label vector) pair
digits_data, digits_labels = load_digits(return_X_y=True)
# Rows of the matrix are samples and columns are features
n_samples, n_features = digits_data.shape
# The number of classes is the count of distinct label values
n_digits = np.unique(digits_labels).size
print(f"- digits: {n_digits}; - samples: {n_samples}; - features: {n_features}")

# Baseline (zeroR: always predicts the most frequent class) and Gaussian Naive Bayes
zR = DummyClassifier(strategy='most_frequent')
gNB = GaussianNB()

# Standardising step placed in front of each classifier
scalar = StandardScaler()

# One pipeline per classifier: scale first, then estimate
pipeline_gNB = Pipeline(steps=[('transformer', scalar), ('estimator', gNB)])
pipeline_zR = Pipeline(steps=[('transformer', scalar), ('estimator', zR)])

# Repeated stratified 10-fold splitter (3 repeats -> 30 folds in total);
# the fixed seed is the one provided by the professor
RSKF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Accuracy of each pipeline on every fold of the repeated cross-validation,
# evaluated for Naive Bayes first and the zeroR baseline second
score_gNB, score_zR = (
    cross_val_score(model, digits_data, digits_labels, scoring='accuracy', cv=RSKF)
    for model in (pipeline_gNB, pipeline_zR)
)

# Report the cross-validation accuracy of the classifiers
# gNB
# Mean and (population) standard deviation of the per-fold accuracies
mean_gNB, std_gNB = np.mean(score_gNB), np.std(score_gNB)
# 95% normal-approximation interval for the mean accuracy; the scale is the
# standard error, i.e. the standard deviation divided by sqrt(number of scores)
lower_gNB, upper_gNB = stats.norm.interval(
    0.95,
    loc=mean_gNB,
    scale=std_gNB / np.sqrt(len(score_gNB)),
)

print("gNB score:\n", score_gNB)
print("gNB: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_gNB, std_gNB))
# "%%" is the printf-style escape for a literal percent sign
print("gNB: Accuracy Confidence Interval (95%%): (%0.2f, %0.2f)\n" % (lower_gNB, upper_gNB))
       
# zR
# Mean and (population) standard deviation of the zeroR per-fold accuracies
mean_zR = score_zR.mean()
std_zR = score_zR.std()
# 95% normal-approximation interval for the mean accuracy; the scale is the
# standard error, i.e. the standard deviation divided by sqrt(number of scores)
lower_zR, upper_zR = stats.norm.interval(0.95, loc=mean_zR,
                                         scale=std_zR / np.sqrt(len(score_zR)))

print("zR score:\n", score_zR)
print("zR: Mean Accuracy: %0.2f Standard Deviation: %0.2f" % (mean_zR, std_zR))
# In printf-style formatting a literal percent sign is written "%%"; no
# backslash is needed (a "\%" would be an invalid escape sequence and the
# backslash would end up in the printed text).
print("zR: Accuracy Confidence Interval (95%%): (%0.2f, %0.2f)\n" % (lower_zR, upper_zR))


- digits: 10; - samples: 1797; - features: 64
gNB score:
 [0.77777778 0.80555556 0.80555556 0.75       0.72222222 0.76666667
 0.76666667 0.81564246 0.82122905 0.82681564 0.77777778 0.78333333
 0.78333333 0.77777778 0.74444444 0.76111111 0.75555556 0.77094972
 0.79888268 0.81005587 0.81111111 0.73888889 0.76111111 0.8
 0.83888889 0.77777778 0.77777778 0.8547486  0.75418994 0.79329609]
gNB: Mean Accuracy: 0.78 Standard Deviation: 0.03
gNB: Accuracy Confidence Interval (95%): (0.77, 0.80)

zR score:
 [0.1        0.1        0.1        0.1        0.1        0.1
 0.1        0.10055866 0.10614525 0.10614525 0.1        0.1
 0.1        0.1        0.1        0.1        0.1        0.10055866
 0.10614525 0.10614525 0.1        0.1        0.1        0.1
 0.1        0.1        0.1        0.10055866 0.10614525 0.10614525]
zR: Mean Accuracy: 0.10 Standard Deviation: 0.00
zR: Accuracy Confidence Interval (95\%): (0.10, 0.10)

