In [None]:
import cv2
import numpy as np
import kagglehub
from kagglehub import KaggleDatasetAdapter
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.linear_model import LogisticRegression


In [None]:
# Pull both MNIST CSV splits from the same Kaggle dataset through the pandas adapter.
train_dataset, test_dataset = (
    kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        "oddrationale/mnist-in-csv",
        csv_name,
    )
    for csv_name in ("mnist_train.csv", "mnist_test.csv")
)


In [None]:
# Column 0 holds the digit label; the remaining columns hold the pixel values.
# Every row is kept: pandas never stores the CSV header line as a data row, so
# positional row 0 is a real sample and skipping it would silently discard one
# training and one test example.
Xtr = train_dataset.iloc[:, 1:]
Ytr = train_dataset.iloc[:, 0]
Xte = test_dataset.iloc[:, 1:]
Yte = test_dataset.iloc[:, 0]

# Raw data

In [None]:
# Print a structural summary of the raw training frame.
train_dataset.info()

# 2x3 grid of axes, flattened so every panel is reachable with a single index.
axs = plt.subplots(2, 3, figsize=(12, 8))[1].ravel()

# Render the samples stored at positional rows 1..6, each titled with its label.
for i, ax in enumerate(axs):
    label = train_dataset.iloc[i + 1, 0]
    img = train_dataset.iloc[i + 1, 1:].to_numpy()
    ax.imshow(img.reshape(28, 28), cmap="binary")
    ax.set_title(f"GT: {label}")

In [None]:
# NOT worth it...
# The per-digit GLM fit stays switched off, so glm_results is left as None.
# NOTE(review): the disabled branch calls `sm`, which is never imported in the
# cells shown (presumably statsmodels.api — confirm and import before enabling).
glm_results = None
if False:
    glm_results = np.ndarray(10, dtype=object)

    for k in range(10):
        # Binary target: 1 where the label equals digit k, 0 everywhere else.
        y = np.zeros_like(Ytr)
        y[Ytr == k] = 1
        glm_results[k] = sm.GLM(y, Xtr).fit()

In [None]:
# Evaluate the optional per-digit GLMs; skipped when they were not fitted.
# The explicit `is not None` test is required: truth-testing a 10-element
# array raises ValueError instead of answering "were the models fitted?".
if glm_results is not None:
    # One column of scores per digit model, one row per test sample. Stacking
    # as columns keeps each sample's ten scores together; reshaping the
    # (10, n_samples) stack to (-1, 10) would mix scores of different samples.
    probs = np.column_stack([results.predict(exog=Xte) for results in glm_results])
    print(f"{probs.shape=}")

    # Predicted digit = index of the model with the highest score.
    pred = np.argmax(probs, axis=1)

    accuracy = accuracy_score(Yte, pred)
    print(f"{accuracy=:.2f}")

    cm = confusion_matrix(Yte, pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap="Blues")
    plt.show()

In [None]:
# One-vs-all: an independent binary logistic regression for each digit 0..9.
ova_models = np.empty(10, dtype=object)

for k in range(10):
    # 1 marks the samples of digit k, 0 marks all others (same dtype as Ytr).
    y = (Ytr == k).to_numpy().astype(Ytr.dtype)
    ova_models[k] = LogisticRegression(max_iter=500, solver='lbfgs').fit(Xtr, y)

In [None]:
# Score every test sample with each one-vs-all model and keep the most confident
# digit. Continuous decision scores are compared instead of the hard 0/1 output
# of predict(): with hard votes, argmax falls back to the lowest digit index
# whenever several models (or none of them) claim a sample.
scores = np.array([m.decision_function(Xte) for m in ova_models])  # one row per digit model
pred = np.argmax(scores, axis=0)
ova_accuracy = accuracy_score(Yte, pred)
print(f"{ova_accuracy=:.2f}")

# Confusion matrix visualization
cm = confusion_matrix(Yte, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.show()

In [None]:
# Single logistic regression trained directly on the digit labels.
model = LogisticRegression(max_iter=500, solver='lbfgs').fit(Xtr, Ytr)
model  # last expression of the cell: displays the fitted estimator, as fit() did

In [None]:
# Test-set predictions of the logistic regression and their accuracy.
pred = model.predict(Xte)
lr_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{lr_accuracy=:.2f}")

# Confusion matrix visualization
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# Display the fitted coefficient array of the logistic regression as the cell output.
model.coef_

In [None]:
# Per-digit coefficient maps of the raw-pixel logistic regression (2x5 grid).
fig, axs = plt.subplots(2, 5, figsize=(15, 6))
axs = axs.flatten()
ims = np.empty(10, dtype=object)

# All panels share one colour scale. Without explicit limits each imshow
# autoscales to its own data, so a single colorbar would only be valid for the
# one panel it was built from.
coef_min, coef_max = model.coef_.min(), model.coef_.max()

for i in range(10):
    ax = axs[i]
    # Each coefficient row is laid back out on the 28x28 pixel grid.
    ims[i] = ax.imshow(model.coef_[i].reshape(28, 28), vmin=coef_min, vmax=coef_max)
    ax.set_title(f"Coefficients for: {i}")
    ax.axis("off")

# Any image can drive the colorbar now that the limits are common to all panels.
plt.colorbar(ims[0], ax=axs, orientation="horizontal", fraction=0.02, pad=0.04)
plt.show()

In [None]:
# Gaussian naive Bayes on the raw pixels: fit, predict the test set, report accuracy.
NB = GaussianNB().fit(Xtr, Ytr)

pred = NB.predict(Xte)
NB_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{NB_accuracy=:.2f}")

# Confusion matrix of the test predictions.
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# Nearest-neighbour classifier (a single neighbour) on the raw pixels.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(Xtr, Ytr)

pred = knn_model.predict(Xte)
knn_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{knn_accuracy=:.2f}")

# Confusion matrix of the test predictions.
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# KNN with different K

knn_accuracy_for_different_k = np.zeros(15, dtype=float)

# Slot k of the array stores the test accuracy obtained with k + 1 neighbours.
for k in range(len(knn_accuracy_for_different_k)):
    knn_model = KNeighborsClassifier(n_neighbors=k + 1).fit(Xtr, Ytr)
    pred = knn_model.predict(Xte)

    knn_accuracy_for_different_k[k] = accuracy_score(y_true=Yte, y_pred=pred)
    print(f"knn_accuracy(k={k+1}) = {knn_accuracy_for_different_k[k]:.2f}")

# OpenCV HOG

In [None]:

# Descriptor shared by every openCVHOG() call; created on first use.
_HOG_DESCRIPTOR = None


def _get_hog_descriptor():
    """Return the shared cv2.HOGDescriptor, constructing it on the first call."""
    global _HOG_DESCRIPTOR
    if _HOG_DESCRIPTOR is None:
        winSize = (20, 20)
        blockSize = (10, 10)
        blockStride = (5, 5)
        cellSize = (10, 10)
        nbins = 9
        derivAperture = 1
        winSigma = -1.
        histogramNormType = 0
        L2HysThreshold = 0.2
        gammaCorrection = 1
        nlevels = 64
        signedGradients = True

        _HOG_DESCRIPTOR = cv2.HOGDescriptor(
            winSize, blockSize, blockStride, cellSize, nbins,
            derivAperture, winSigma, histogramNormType, L2HysThreshold,
            gammaCorrection, nlevels, signedGradients,
        )
    return _HOG_DESCRIPTOR


def openCVHOG(im):
    """Compute the HOG descriptor of one image and return it as a flat array.

    The descriptor settings are fixed (20x20 window, 10x10 blocks and cells,
    5x5 block stride, 9 bins, signed gradients). Because they never change
    between calls, the cv2.HOGDescriptor object is built once and reused
    rather than being reconstructed for every image.

    Parameters
    ----------
    im : image array passed straight to ``cv2.HOGDescriptor.compute``
        (this notebook passes 28x28 ``uint8`` arrays).

    Returns
    -------
    numpy.ndarray
        The output of ``compute(im)`` flattened to one dimension.
    """
    return np.ravel(_get_hog_descriptor().compute(im))

In [None]:
# HOG features for both splits: the pixel table is cast to uint8 and viewed as a
# stack of 28x28 images, and openCVHOG turns each image into one feature vector.
Xtr_hog = np.array([
    openCVHOG(digit)
    for digit in Xtr.to_numpy().astype(np.uint8).reshape(-1, 28, 28)
])

Xte_hog = np.array([
    openCVHOG(digit)
    for digit in Xte.to_numpy().astype(np.uint8).reshape(-1, 28, 28)
])

# Cell output: shapes of the two feature matrices.
Xtr_hog.shape, Xte_hog.shape

In [None]:
# One-vs-all on HOG features: an independent binary logistic regression per digit.
ova_hog_models = np.empty(10, dtype=object)

for k in range(10):
    # 1 marks the samples of digit k, 0 marks all others (same dtype as Ytr).
    y = (Ytr == k).to_numpy().astype(Ytr.dtype)
    ova_hog_models[k] = LogisticRegression(max_iter=500, solver='lbfgs').fit(Xtr_hog, y)

In [None]:
# Score every test sample with each one-vs-all HOG model and keep the most
# confident digit. Continuous decision scores are compared instead of the hard
# 0/1 output of predict(): with hard votes, argmax falls back to the lowest
# digit index whenever several models (or none of them) claim a sample.
scores = np.array([m.decision_function(Xte_hog) for m in ova_hog_models])  # one row per digit model
pred = np.argmax(scores, axis=0)

ova_hog_accuracy = accuracy_score(Yte, pred)
print(f"{ova_hog_accuracy=:.2f}")

# Confusion matrix of the test predictions.
cm = confusion_matrix(Yte, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.show()

In [None]:
# Single logistic regression trained on the HOG features and the digit labels.
model_hog = LogisticRegression(max_iter=500, solver='lbfgs').fit(Xtr_hog, Ytr)
model_hog  # last expression of the cell: displays the fitted estimator, as fit() did

In [None]:
# Test-set predictions of the HOG logistic regression and their accuracy.
pred = model_hog.predict(Xte_hog)
lr_hog_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{lr_hog_accuracy=:.2f}")

# Confusion matrix visualization
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# Per-digit coefficient maps of the HOG logistic regression (2x5 grid).
fig, axs = plt.subplots(2, 5, figsize=(15, 6))
axs = axs.flatten()
ims = np.empty(10, dtype=object)

# All panels share one colour scale. Without explicit limits each imshow
# autoscales to its own data, so a single colorbar would only be valid for the
# one panel it was built from.
coef_min, coef_max = model_hog.coef_.min(), model_hog.coef_.max()

for i in range(10):
    ax = axs[i]
    # The 81 coefficients are shown as a 9x9 grid.
    # NOTE(review): presumably blocks x orientation bins — confirm against the HOG layout.
    ims[i] = ax.imshow(model_hog.coef_[i].reshape(9, 9), vmin=coef_min, vmax=coef_max)
    ax.set_title(f"Coefficients for: {i}")
    ax.axis("off")

# Any image can drive the colorbar now that the limits are common to all panels.
plt.colorbar(ims[0], ax=axs, orientation="horizontal", fraction=0.02, pad=0.04)
plt.show()

In [None]:
# Gaussian naive Bayes refitted on the HOG features (rebinds NB).
NB = GaussianNB().fit(Xtr_hog, Ytr)
NB  # last expression of the cell: displays the fitted estimator, as fit() did

In [None]:
# Test-set predictions of the HOG naive Bayes model and their accuracy.
pred = NB.predict(Xte_hog)
NB_hog_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{NB_hog_accuracy=:.2f}")

# Confusion matrix of the test predictions.
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# Nearest-neighbour classifier (a single neighbour) on the HOG features.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(Xtr_hog, Ytr)

pred = knn_model.predict(Xte_hog)
knn_hog_accuracy = accuracy_score(y_true=Yte, y_pred=pred)
print(f"{knn_hog_accuracy=:.2f}")

# Confusion matrix of the test predictions.
cm = confusion_matrix(y_true=Yte, y_pred=pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.show()

In [None]:
# KNN with different K (HOG features)

knn_hog_accuracy_for_different_k = np.zeros(15, dtype=float)

# The loop is bounded by the array this cell fills, so the cell no longer depends
# on the raw-pixel sweep array having been created with the same length.
# Slot k stores the test accuracy obtained with k + 1 neighbours.
for k in range(knn_hog_accuracy_for_different_k.shape[0]):
    knn_model = KNeighborsClassifier(n_neighbors=k+1)
    knn_model.fit(Xtr_hog, Ytr)

    pred = knn_model.predict(Xte_hog)
    knn_hog_accuracy_for_different_k[k] = accuracy_score(Yte, pred)
    # Label names the HOG sweep so its log lines differ from the raw-pixel sweep.
    print(f"knn_hog_accuracy(k={k+1}) = {knn_hog_accuracy_for_different_k[k]:.2f}")

# Models comparison

In [None]:
# Test accuracy of each model on raw pixels ...
accuracies = np.array([
    ova_accuracy,
    lr_accuracy,
    NB_accuracy,
    knn_accuracy,
])

# ... and of the same models on HOG features, in the same order.
hog_accuracies = np.array([
    ova_hog_accuracy,
    lr_hog_accuracy,
    NB_hog_accuracy,
    knn_hog_accuracy,
])

# One tick label per accuracy entry. The comma after "Naive Bayes" matters:
# without it Python joins the two adjacent string literals into a single label,
# leaving three labels (and three x positions) for four bars.
labels = [
    "1vsALL LR",
    "Logistic",
    "Naive Bayes",
    "KNN(1)",
]

width = 0.35
x = np.arange(len(labels))

ax = plt.subplots()[1]

# Grouped bars: raw-pixel result to the left of each tick, HOG result to the right.
bars1 = ax.bar(x - width / 2, accuracies, width, label="Raw")
bars2 = ax.bar(x + width / 2, hog_accuracies, width, label="HOG")

ax.set_xticks(x)
ax.set_xticklabels(labels)

ax.set_ylabel("Accuracy")

ax.set_title("Models comparison")
ax.legend()
plt.show()