# MNIST 2

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from torchvision import transforms
import numpy as np
from IPython import display
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load MNIST from Hugging Face
print("Loading MNIST dataset from Hugging Face...")
dataset = load_dataset('mnist')

# Define transform to convert PIL images to tensors and normalize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

Using device: cuda
Loading MNIST dataset from Hugging Face...


In [2]:
class MNISTDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item['image']
        label = item['label']
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

In [3]:
num_train_samples = 4000  # Use only 5000 training samples instead of 60000

train_subset = dataset['train'].select(range(num_train_samples))
train_dataset = MNISTDataset(train_subset, transform=transform)
test_dataset = MNISTDataset(dataset['test'], transform=transform)

print(f"Using {len(train_dataset)} training samples (full dataset: 60000)")
print(f"Using {len(test_dataset)} test samples")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

Using 4000 training samples (full dataset: 60000)
Using 10000 test samples


- n = 4000
- d = 784
- K = 10 classes
- number of parameters is (d+1)·H +(H +1)·K
- 785H+(H+1)10
- 795H+10
- 40000=795H+10
- H=39990/795=50.3 -> smaller than I thought.

In [4]:
width=50
(784+1)*width+(width+1)*10

39760

- There might be some intersting visualizations of this network's weights we could do!

In [5]:
class SimpleNN(nn.Module):
    def __init__(self, num_hidden_units=128):
        super(SimpleNN, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, num_hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(num_hidden_units, 10)
    
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        # x = self.dropout(x)
        x = self.fc2(x)
        return x

In [6]:
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Statistics
        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    avg_loss = total_loss / len(loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy

In [7]:
def test(model, loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    avg_loss = total_loss / len(loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy

In [8]:
num_epochs = 50
model = SimpleNN(width).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    test_loss, test_acc = test(model, test_loader, criterion)
    
    print(f'Epoch [{epoch+1}/{num_epochs}]')
    print(f'  Train Loss: {train_loss:.5f}, Train Acc: {train_acc:.2f}%')
    print(f'  Test Loss:  {test_loss:.5f}, Test Acc:  {test_acc:.2f}%')

Epoch [1/50]
  Train Loss: 0.94772, Train Acc: 74.12%
  Test Loss:  0.46163, Test Acc:  87.21%
Epoch [2/50]
  Train Loss: 0.36862, Train Acc: 89.60%
  Test Loss:  0.37316, Test Acc:  89.13%
Epoch [3/50]
  Train Loss: 0.28541, Train Acc: 91.85%
  Test Loss:  0.34784, Test Acc:  89.37%
Epoch [4/50]
  Train Loss: 0.23676, Train Acc: 93.47%
  Test Loss:  0.31623, Test Acc:  90.33%
Epoch [5/50]
  Train Loss: 0.20591, Train Acc: 94.35%
  Test Loss:  0.32035, Test Acc:  90.27%
Epoch [6/50]
  Train Loss: 0.17892, Train Acc: 95.22%
  Test Loss:  0.29658, Test Acc:  91.02%
Epoch [7/50]
  Train Loss: 0.15624, Train Acc: 95.75%
  Test Loss:  0.28987, Test Acc:  91.26%
Epoch [8/50]
  Train Loss: 0.13130, Train Acc: 96.60%
  Test Loss:  0.28274, Test Acc:  91.42%
Epoch [9/50]
  Train Loss: 0.11228, Train Acc: 97.25%
  Test Loss:  0.28797, Test Acc:  91.51%
Epoch [10/50]
  Train Loss: 0.09986, Train Acc: 97.50%
  Test Loss:  0.28877, Test Acc:  91.37%
Epoch [11/50]
  Train Loss: 0.08938, Train Acc: 9

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from torchvision import transforms
import numpy as np
from IPython import display
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load MNIST from Hugging Face
print("Loading MNIST dataset from Hugging Face...")
dataset = load_dataset('mnist')

# Define transform to convert PIL images to tensors and normalize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

class MNISTDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item['image']
        label = item['label']
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

num_train_samples = 4000  # Use only 5000 training samples instead of 60000

train_subset = dataset['train'].select(range(num_train_samples))
train_dataset = MNISTDataset(train_subset, transform=transform)
test_dataset = MNISTDataset(dataset['test'], transform=transform)

print(f"Using {len(train_dataset)} training samples (full dataset: 60000)")
print(f"Using {len(test_dataset)} test samples")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

width=50
num_params=(784+1)*width+(width+1)*10

class SimpleNN(nn.Module):
    def __init__(self, num_hidden_units=128):
        super(SimpleNN, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, num_hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(num_hidden_units, 10)
    
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        # x = self.dropout(x)
        x = self.fc2(x)
        return x

def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Statistics
        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    avg_loss = total_loss / len(loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy

def test(model, loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    avg_loss = total_loss / len(loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy

num_epochs = 50
model = SimpleNN(width).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    test_loss, test_acc = test(model, test_loader, criterion)
    
    print(f'Epoch [{epoch+1}/{num_epochs}]')
    print(f'  Train Loss: {train_loss:.5f}, Train Acc: {train_acc:.2f}%')
    print(f'  Test Loss:  {test_loss:.5f}, Test Acc:  {test_acc:.2f}%')

Using device: cuda
Loading MNIST dataset from Hugging Face...
Using 4000 training samples (full dataset: 60000)
Using 10000 test samples


In [9]:
# num_epochs=20
# eval_freq=128 #Batches

# train_loss=[]
# test_loss=[]
# train_accuracy=[]
# test_accuracy=[]
# #Might be cool to track weights norm as we go too?

# model = SimpleNN(width).to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# global_step=0
# total_loss = 0
# correct = 0
# total = 0

# for epoch in tqdm(range(num_epochs)):
#     for images, labels in train_loader:

#         images, labels = images.to(device), labels.to(device)

#         outputs = model(images)
#         loss = criterion(outputs, labels)

#         # Statistics
#         total_loss += loss.item()
#         _, predicted = outputs.max(1)
#         total += labels.size(0)
#         correct += predicted.eq(labels).sum().item()
        
#         if global_step%eval_freq==0:
#             train_accuracy.append(100. * correct / total)
#             train_loss.append(total_loss / eval_freq)
            
#             tl, ta = test(model, test_loader, criterion)
#             test_loss.append(tl); test_accuracy.append(ta)
            
#             plt.clf()
#             fig=plt.figure(0, (12, 8))
#             fig.add_subplot(2,1,1)
#             plt.plot(train_loss, 'b', alpha=0.5)
#             plt.plot(test_loss, 'g', alpha=0.5)
#             plt.title('Train Loss = '+ str(round(train_loss[-1], 5))+ ', Test Loss = '+ str(round(test_loss[-1], 5)))
#             fig.add_subplot(2,1,2)
#             plt.plot(train_accuracy, 'b', alpha=0.5)
#             plt.plot(test_accuracy, 'g', alpha=0.5) 
#             plt.title('Train Acc = '+ str(train_accuracy[-1])+ ', Test Acc = '+ str(test_accuracy[-1]))
#             display.clear_output(wait=True)
#             display.display(plt.gcf())

#             model.train()
#             total_loss = 0
#             correct = 0
#             total = 0

        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()


    
#         global_step+=1

In [10]:
# np.min(test_loss), np.max(test_accuracy), np.min(train_loss), np.max(train_accuracy)

---