MNIST contains a number of mislabeled images. Finding and removing them gave a massive boost in the accuracy achieved.

- Accuracy before data cleaning (3-fold cross-validated): 0.9762714285714286

In [104]:
import matplotlib.pyplot as plt
from torch.optim import Adam
from cleanlab.filter import find_label_issues

import torch
from torch import nn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from skorch import NeuralNetClassifier
import numpy as np
import time

In [44]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [45]:
# Download the MNIST dataset from OpenML.
mnist = fetch_openml("mnist_784")

# Each image arrives as one flattened row of pixel values. Cast to float32,
# scale the features to the [0, 1] range, then restore the image layout as
# [N, C, H, W] (single channel, 28x28) for PyTorch.
X = mnist.data.astype("float32").to_numpy()
X = X / 255.0
X = X.reshape(-1, 1, 28, 28)

# Class labels as int64.
labels = mnist.target.astype("int64").to_numpy()

In [110]:
class ClassifierModule(nn.Module):
    """Small CNN that maps 1x28x28 images to 10 class probabilities.

    The final layer is a softmax, so ``forward`` returns probabilities
    (each row sums to 1), not raw logits.
    """

    def __init__(self):
        """Build the convolutional feature extractor and the dense head."""
        super().__init__()

        # Feature extractor. The 3x3 convolutions are unpadded, so for a
        # 28x28 input the spatial size goes 28 -> 26 -> 13 -> 11 -> 5 and the
        # result is 16 feature maps of 5x5.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 6, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(6),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(6, 16, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.out = nn.Sequential(
            nn.Flatten(),
            # 16 channels * 5 * 5 = 400 flattened features. The previous
            # in_features of 200 did not match the CNN output and made the
            # forward pass fail with a shape-mismatch error.
            nn.Linear(16 * 5 * 5, 200),
            nn.ReLU(),

            nn.Dropout(0.3),
            nn.Linear(200, 10),
            nn.Softmax(dim=-1),
        )

    def forward(self, X):
        """Return class probabilities of shape (N, 10) for a batch X of shape (N, 1, 28, 28)."""
        X = self.cnn(X)
        X = self.out(X)
        return X

In [62]:
# Wrap the CNN in skorch's scikit-learn-compatible classifier so it can be
# passed to cross_val_predict. Hand over the detected `device`; without it the
# `device` computed earlier is never used and training stays on the CPU even
# when a GPU is available.
model_skorch = NeuralNetClassifier(ClassifierModule, device=device)

In [63]:
# Out-of-sample predicted class probabilities for every image: each sample is
# scored by a model fitted on the folds that do not contain it.
num_crossval_folds = 3  # for efficiency; values like 5 or 10 will generally work better
pred_probs = cross_val_predict(
    estimator=model_skorch,
    X=X,
    y=labels,
    cv=num_crossval_folds,
    method="predict_proba",
)



  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        0.8145       0.8968        0.3606  4.8892
      2        0.2474       0.9290        0.2250  4.6604
      3        0.1660       0.9448        0.1715  5.1241
      4        0.1278       0.9551        0.1415  5.2159
      5        0.1059       0.9611        0.1235  4.5570
      6        0.0917       0.9652        0.1113  4.3854
      7        0.0817       0.9675        0.1025  4.7257
      8        0.0742       0.9703        0.0954  4.6420
      9        0.0683       0.9726        0.0900  4.9586
     10        0.0635       0.9735        0.0856  4.2873




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        0.7762       0.9178        0.3080  4.3092
      2        0.2171       0.9462        0.1885  4.3045
      3        0.1472       0.9580        0.1481  4.2971
      4        0.1162       0.9630        0.1264  4.2354
      5        0.0980       0.9663        0.1127  4.4330
      6        0.0857       0.9676        0.1032  4.3475
      7        0.0767       0.9704        0.0960  4.4524
      8        0.0698       0.9727        0.0903  4.3977
      9        0.0642       0.9741        0.0857  4.4092
     10        0.0596       0.9745        0.0821  4.3762




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        0.7856       0.9190        0.3171  4.3299
      2        0.2248       0.9483        0.1945  4.5750
      3        0.1581       0.9582        0.1520  5.2035
      4        0.1274       0.9644        0.1295  4.4070
      5        0.1088       0.9672        0.1154  4.7813
      6        0.0960       0.9689        0.1051  4.5288
      7        0.0865       0.9713        0.0977  4.4919
      8        0.0791       0.9733        0.0918  4.6070
      9        0.0730       0.9747        0.0870  4.3921
     10        0.0679       0.9756        0.0832  4.2467


In [64]:
# The hard prediction for each sample is its most probable class; compare
# those predictions with the given labels to get the accuracy.
predicted_labels = np.argmax(pred_probs, axis=1)
acc = accuracy_score(y_true=labels, y_pred=predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

Cross-validated estimate of accuracy on held-out data: 0.9762714285714286


In [65]:
# `find_label_issues` is already imported in the notebook's import cell, so it
# is not re-imported here.
#
# Flag samples whose given label disagrees with the out-of-sample predicted
# probabilities. The result is an array of sample indices, ranked by
# self-confidence.
ranked_label_issues = find_label_issues(
    labels,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print(f"Top 15 most likely label errors: \n {ranked_label_issues[:15]}")

Cleanlab found 127 label issues.
Top 15 most likely label errors: 
 [59915 24798 61299 37038 31134 23824 18348  2720 28556  4334 21348 63520
 20773 49543 65973]


In [98]:
# Report the dataset size before cleaning.
print(len(X), len(labels))
print(X.shape, labels.shape)

# Build a keep-mask that is False at every flagged index, then apply the same
# mask to images and labels so the two stay aligned.
keep_mask = np.ones(len(X), dtype=bool)
keep_mask[ranked_label_issues] = False
new_X = X[keep_mask]
new_labels = labels[keep_mask]

# Report the dataset size after cleaning.
print(len(new_X), len(new_labels))

70000 70000
(70000, 1, 28, 28) (70000,)
69873 69873


### Trying again with the cleaned dataset (label errors removed) and a new model

In [121]:
# Larger CNN for 1x28x28 inputs. The last layer emits raw scores for the
# 10 classes (there is no softmax in this network).
model = nn.Sequential(
    # Block 1: 1 -> 32 channels; padding=1 keeps the 28x28 resolution.
    nn.Conv2d(1, 32, 3, padding=1, bias=False),
    nn.BatchNorm2d(32),
    nn.ReLU(inplace=True),

    # Block 2: 32 -> 64 channels, then 2x2 pooling halves 28x28 to 14x14.
    nn.Conv2d(32, 64, 3, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(2),

    # Block 3: 64 -> 128 channels, then 2x2 pooling halves 14x14 to 7x7.
    nn.Conv2d(64, 128, 3, padding=1, bias=False),
    nn.BatchNorm2d(128),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(2),

    # Dense head on the 128 * 7 * 7 flattened features.
    nn.Flatten(),
    nn.Linear(128 * 7 * 7, 128),
    nn.BatchNorm1d(128),
    nn.ReLU(inplace=True),

    nn.Dropout(0.3),
    nn.Linear(128, 64),
    nn.BatchNorm1d(64),
    nn.ReLU(inplace=True),

    nn.Dropout(0.3),
    nn.Linear(64, 10),
)
        

In [122]:
# Hold out 10% of the cleaned data as a dev set.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# the previous unpacking (Xtr, Ytr, Xdev, Ydev) bound Ytr to the dev images and
# Xdev to the training labels. random_state makes the split reproducible.
# NOTE: the returned arrays are NumPy arrays. The recorded TypeError shows
# conv2d receiving a numpy.ndarray, so convert them to tensors (e.g. with
# torch.from_numpy) before passing them to `model`.
Xtr, Xdev, Ytr, Ydev = train_test_split(
    new_X, new_labels, test_size=0.1, random_state=42
)

starting 


TypeError: conv2d() received an invalid combination of arguments - got (numpy.ndarray, Parameter, NoneType, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !NoneType!, !tuple!, !tuple!, !tuple!, int)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !NoneType!, !tuple!, !tuple!, !tuple!, int)
