Kaggle competition: https://www.kaggle.com/c/digit-recognizer/

## WANDB link: https://wandb.ai/ales-2000-09/digit_recognizer?workspace=user-ales-2000-09
## Alessandro Castelli code link: https://www.kaggle.com/code/alessandromajumba/ex3-ml

### Best Score = 0.979

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file found below the Kaggle input
# directory, then print the paths one per line in walk order.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We need to import the PyTorch libraries.

In [None]:
import torchvision
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

from kaggle_secrets import UserSecretsClient
import wandb

# The value passed to wandb.login() as the API key is read from the Kaggle
# secret registered under the name "Label", so it never appears in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Label")

# Authenticate first, then open a run in the 'digit_recognizer' project.
wandb.login(key=secret_value_0)
wandb.init(project='digit_recognizer', save_code=True)

## Data preparation

A custom dataset that reads the CSV files provided by Kaggle, which avoids downloading the dataset from the internet.

In [None]:
class MyMNISTDataset(Dataset):
    """MNIST-style dataset backed by a Kaggle digit-recognizer CSV file.

    Every CSV row is reshaped into one 28x28 single-channel uint8 image.
    Training files carry the label in the first column; test files contain
    pixel columns only.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        transform: Callable applied to each ``(28, 28, 1)`` uint8 array.
            ``None`` (the default) falls back to ``ToPILImage()``; the
            pipeline is built inside ``__init__`` rather than once at
            class-definition time.
        test_data: ``True`` when the file has no label column.
        use_gpu: When true, tensors are moved to the CUDA device on access.
    """

    def __init__(self, file_path, transform=None, test_data=False, use_gpu=torch.cuda.is_available()):
        if transform is None:
            transform = transforms.Compose([transforms.ToPILImage()])

        df = pd.read_csv(file_path)
        if test_data:
            # Test files have no target column: every column is a pixel.
            pixels = df.values
            self.y = None
        else:
            pixels = df.iloc[:, 1:].values
            self.y = torch.from_numpy(df.iloc[:, 0].values)

        # 28x28 grey images with a trailing channel axis: (N, 28, 28, 1).
        self.X = pixels.reshape((-1, 28, 28)).astype(np.uint8)[:, :, :, None]
        self.transform = transform
        self.use_gpu = use_gpu

    def __len__(self):
        """Return the number of images in the file."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for training data, or ``image`` alone for test data."""
        data = self._to_device(self.transform(self.X[idx]))
        if self.y is None:
            return data
        return data, self._to_device(self.y[idx])

    def _to_device(self, value):
        """Move ``value`` to CUDA when requested; non-tensors are returned untouched.

        The tensor check matters because a transform may yield a PIL image
        (the default one does), and PIL images have no ``.cuda()`` method.
        """
        if self.use_gpu and torch.is_tensor(value):
            return value.cuda()
        return value

In [None]:
from torch.utils.data.sampler import SubsetRandomSampler

# Preprocessing shared by every split: array -> PIL image -> tensor,
# followed by normalisation with mean 0.5 and std 0.5.
transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

train_dataset = MyMNISTDataset(
    '/kaggle/input/digit-recognizer/train.csv',
    transform=transformations,
    test_data=False,
)
test_dataset = MyMNISTDataset(
    '/kaggle/input/digit-recognizer/test.csv',
    transform=transformations,
    test_data=True,
)

# Fraction of the labelled data held out for validation
validation_split = 0.2
dataset_size = len(train_dataset)
split = int(np.floor(validation_split * dataset_size))

# Shuffle all sample indices in place before cutting them into two groups
indices = list(range(dataset_size))
np.random.shuffle(indices)

# The first `split` shuffled indices form the validation set, the rest train
val_indices = indices[:split]
train_indices = indices[split:]

# Both samplers draw from the same underlying dataset, restricted to
# their own index subset
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

# Loaders for training, validation and the (unshuffled) test set
train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
val_loader = DataLoader(train_dataset, batch_size=64, sampler=val_sampler)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


## MLP

### Define model architecture
You need to reach at least 70% accuracy on the test set

In [None]:
class Net(nn.Module):
    """Fully connected classifier with layer sizes 784 -> 512 -> 256 -> 10.

    The input is flattened to rows of 28 * 28 values, passed through two
    ReLU-activated linear layers, then dropout (p=0.2), then a final linear
    layer that yields one score per class.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in a fixed order so seeded initialisation is reproducible.
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the 10 raw class scores for every image in ``x``."""
        flat = x.view(-1, 28 * 28)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        # Dropout sits between the last hidden layer and the output layer.
        return self.fc3(self.dropout(hidden))

# Build the network; its parameters are moved to the GPU when CUDA is available
model = Net()
if torch.cuda.is_available():
    model = model.cuda()

# Show the layer layout
print(model)


### Init the model and put it on GPU/TPU

In [None]:
# Pick the compute device: CUDA when present, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

print(model)

# Learning rate, also recorded in the W&B run
lr = 0.1
wandb.log({'learning rate': lr})

# Cross-entropy as the loss function
criterion = nn.CrossEntropyLoss()

# Plain SGD over all model parameters; the optimizer name is recorded as well
optimizer = optim.SGD(model.parameters(), lr=lr)
wandb.log({'optimizer': 'SGD'})

# Alternative optimizer: Adam with lr=0.001 (log {'optimizer': 'ADAM'} when it is used)

### Training loop
Log the accuracy and the loss to wandb

In [None]:
# Number of passes over the training split
n_epochs = 40
# Per-epoch history: mean training loss, mean validation loss, validation accuracy (%)
mean_train = []
mean_valid = []
valid_acc = []
# Save on WANDB
wandb.log({'num_epochs': n_epochs})

# Start training
for epoch in range(n_epochs):
    train_losses = []
    valid_losses = []

    # --- optimisation pass over the training split ---
    model.train()
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # --- evaluation pass over the validation split (no gradients) ---
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            loss = criterion(output, target)
            valid_losses.append(loss.item())

            # The predicted class is the index of the largest score
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)

    mean_train.append(np.mean(train_losses))
    mean_valid.append(np.mean(valid_losses))

    # Guard against an empty validation loader instead of dividing by zero
    accuracy = 100 * correct / total if total else float('nan')
    valid_acc.append(accuracy)

    # Print the results
    print('epoch : {}, train loss : {:.4f}, valid loss : {:.4f}, valid acc : {:.2f}%'
          .format(epoch + 1, mean_train[-1], mean_valid[-1], accuracy))

    # Save on WANDB: one call per epoch so the three metrics share a single
    # step (each separate wandb.log call would advance the step counter)
    wandb.log({
        'train_loss': mean_train[-1],
        'val_loss': mean_valid[-1],
        'val_accuracy': valid_acc[-1],
    })

### Make prediction
And submit to Kaggle for grading

In [None]:
# Predict a label for every test image and write the Kaggle submission file.
model.eval()
# Device that currently holds the model weights; batches are sent there so the
# forward pass works wherever the dataset left the tensors.
model_device = next(model.parameters()).device

batch_predictions = []
with torch.no_grad():
    for data in test_loader:
        output = model(data.to(model_device))
        # The predicted class is the index of the largest score
        _, predicted = torch.max(output, 1)
        batch_predictions.append(predicted.cpu())

# Concatenate once at the end instead of growing a tensor on every batch
test_pred = torch.cat(batch_predictions, dim=0) if batch_predictions else torch.LongTensor()

# ImageId is 1-based and follows the (unshuffled) order of the test file
out_df = pd.DataFrame({
    'ImageId': np.arange(1, len(test_dataset) + 1),
    'Label': test_pred.numpy(),
})
out_df.to_csv('submission.csv', index=False)

### Top 10 misclassified images by class probability

In [None]:
import matplotlib.pyplot as plt

# Rank training images by how confidently the model got them wrong.
batch_size = 64
# shuffle=False keeps positions in the score tensor aligned with dataset indices
unshuffle_train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                     batch_size=batch_size, shuffle=False)

model.eval()
# Device that currently holds the model weights
model_device = next(model.parameters()).device

batch_scores = []
with torch.no_grad():
    for data, target in unshuffle_train_loader:
        output = model(data.to(model_device))
        # The model returns raw scores; softmax turns them into class
        # probabilities so the ranking really is by probability
        prob, predicted = torch.max(torch.softmax(output, dim=1), 1)
        predicted = predicted.cpu()
        target = target.cpu()
        prob = prob.cpu().double()
        # Correct predictions score 0; wrong ones keep their (positive) probability
        missclassified_prob = torch.where(predicted == target, torch.zeros_like(prob), prob)
        batch_scores.append(missclassified_prob)

# Concatenate once at the end instead of growing a tensor on every batch
missclasified = torch.cat(batch_scores, dim=0) if batch_scores else torch.DoubleTensor()
most_misclassified = torch.argsort(missclasified, descending=True)
# Keep at most ten entries and never include correctly classified samples
num_wrong = int((missclasified > 0).sum())
top_ten_misclassified = most_misclassified[:min(10, num_wrong)]

In [None]:
# Show each selected image with the model's prediction and the true label.
model.eval()  # keep dropout disabled while predicting
for misclassified in top_ten_misclassified:
    # Fetch the sample once; int() turns the 0-dim index tensor into a plain index
    data, target = train_dataset[int(misclassified)]
    plt.imshow(data.cpu().reshape(28, 28))
    with torch.no_grad():
        output = model(data.reshape(1, 1, 28, 28))
        _, predicted = torch.max(output, 1)
    # int(target) prints the digit itself rather than the tensor repr
    plt.title(f'Predicted: {predicted.item()}, Ground truth: {int(target)}')
    plt.show()