In [3]:
import numpy as np
from matplotlib import pyplot as plt
import torch
from torchvision import datasets, transforms

transform = transforms.ToTensor()

train_dataset = datasets.MNIST(root="./data", train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root="./data", train=False, transform=transform, download=True)

# Convert to NumPy once, up front: the hand-written network below is pure
# NumPy (np.dot, np.exp, ...), and handing it torch tensors forces an implicit
# tensor -> ndarray conversion on every single call.
# Pixel values are scaled from [0, 255] to [0, 1].
x_train = train_dataset.data.numpy().astype(np.float32) / 255
y_train = train_dataset.targets.numpy()

x_test = test_dataset.data.numpy().astype(np.float32) / 255
y_test = test_dataset.targets.numpy()

# Seeded generator so that a fresh "Restart & Run All" starts from the same
# initial parameters every time.
SEED = 0
rng = np.random.default_rng(SEED)

# Randomly initialise the weight and bias matrices, uniformly in [-0.5, 0.5),
# with the dimensions the two layers require.
W_1 = rng.random((784, 10)) - 0.5
b_1 = rng.random((10, 1)) - 0.5
W_2 = rng.random((10, 10)) - 0.5
b_2 = rng.random((10, 1)) - 0.5

# One 784x1 column vector per training image (the layer-0 activations).
A_layers = [image.reshape(784, 1) for image in x_train]

def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and `x`."""
    rectified = np.maximum(0, x)
    return rectified

def softmax(Z):
    """Turn the raw output scores `Z` into a discrete probability distribution.

    The global maximum of `Z` is subtracted before exponentiating so that
    np.exp cannot overflow; this shift leaves the result unchanged.
    The normalisation runs over every entry of `Z` (no axis is used).
    """
    exp_shifted = np.exp(Z - np.max(Z))
    return exp_shifted / np.sum(exp_shifted)


def forwardProp(A, W_1, b_1, W_2, b_2):
    """Run one sample through the two-layer network.

    A is the 784x1 input column (layer 0). W_1 is the 784x10 matrix of edge
    weights into the hidden layer and b_1 the 10x1 column of its biases;
    W_2 (10x10) and b_2 (10x1) play the same roles for the output layer.

    Returns (A_1, Z_1, A_2, Z_2): the hidden activations and pre-activations,
    followed by the output probabilities and the output pre-activations.
    """
    # Hidden layer: affine map followed by ReLU.
    Z_1 = np.dot(W_1.T, A) + b_1
    A_1 = ReLU(Z_1)

    # Output layer: affine map followed by softmax.
    Z_2 = np.dot(W_2.T, A_1) + b_2
    A_2 = softmax(Z_2)

    return A_1, Z_1, A_2, Z_2

def d_ReLU(x):
    """Derivative of ReLU as a boolean mask: True where `x` is strictly positive."""
    is_active = x > 0
    return is_active


def backProp(A, A_1, A_2, Z_1, Z_2, W_1, W_2, mean):
    """Compute the parameter gradients for a single training sample.

    A, A_1 and A_2 are the input, hidden and output activations from
    forwardProp, and Z_1 is the hidden pre-activation. `mean` is the index of
    the correct class; it is used to build the one-hot target column.
    Z_2 and W_1 are accepted but not used here.

    Returns (dW1, db1, dW2, db2), shaped like W_1, b_1, W_2 and b_2.
    """
    # One-hot encoding of the correct class (10x1).
    target = np.zeros((10, 1))
    target[mean, 0] = 1

    # Output-layer error signal (10x1).
    # NOTE(review): this is twice the usual softmax + cross-entropy gradient
    # (A_2 - target); the softmax Jacobian is not applied, so it is not the
    # exact gradient of a squared-error loss either. Confirm the intended loss.
    delta_out = 2 * (A_2 - target)
    grad_W2 = np.dot(A_1, delta_out.T)

    # Propagate through W_2 and gate by the ReLU derivative:
    # (10x10) x (10x1), then element-wise with the 10x1 mask.
    delta_hidden = np.dot(W_2, delta_out) * d_ReLU(Z_1)
    grad_W1 = np.dot(A, delta_hidden.T)  # (784x1) x (1x10)

    # The bias gradients are the error signals themselves.
    return grad_W1, delta_hidden, grad_W2, delta_out

    


def gradientDescent(W_1, b_1, W_2, b_2, learning_rate, epochs):
    """Train the network with per-sample gradient descent.

    Reads the training columns and labels from the module-level `A_layers`,
    `x_train` and `y_train`. The parameters are updated after every sample,
    in dataset order. The accuracy printed for an epoch is measured while
    training, i.e. each prediction uses the weights as they were just before
    that sample's update. The learning rate is halved after every fifth epoch.

    Returns (accuracy of the last epoch in percent, W_1, b_1, W_2, b_2).
    """
    accuracy_list = []

    for epoch in range(epochs):
        total_samples = len(x_train)
        hits = 0

        for i in range(total_samples):
            sample, label = A_layers[i], y_train[i]
            A_1, Z_1, A_2, Z_2 = forwardProp(sample, W_1, b_1, W_2, b_2)

            # Score the prediction before this sample's update is applied.
            if np.argmax(A_2) == label:
                hits += 1

            dW_1, db_1, dW_2, db_2 = backProp(sample, A_1, A_2, Z_1, Z_2, W_1, W_2, label)

            # Rebind rather than update in place, so the arrays passed in by
            # the caller are never modified.
            W_1 = W_1 - learning_rate * dW_1
            b_1 = b_1 - learning_rate * db_1
            W_2 = W_2 - learning_rate * dW_2
            b_2 = b_2 - learning_rate * db_2

        accuracy = (hits / total_samples) * 100
        accuracy_list.append(accuracy)
        print(f"Epoch {epoch+1}, Accuracy: {accuracy:.2f}%")

        # Step decay: halve the learning rate after every fifth epoch.
        if (epoch + 1) % 5 == 0:
            learning_rate = learning_rate * 0.5
            print(f"Learning rate decayed to {learning_rate}")

    return accuracy_list[-1], W_1, b_1, W_2, b_2

In [5]:
# Train the NumPy network: initial learning rate 0.01, 8 passes over the data.
accuracy, c_W_1, c_b_1, c_W_2, c_b_2 = gradientDescent(
    W_1, b_1, W_2, b_2, learning_rate=0.01, epochs=8
)

  A_1 = np.dot(W_1.T , A)
  dW1 = np.dot(A, dZ1.T) #A.dot(dZ_1)


Epoch 1, Accuracy: 86.62%
Epoch 2, Accuracy: 90.54%
Epoch 3, Accuracy: 91.16%
Epoch 4, Accuracy: 91.40%
Epoch 5, Accuracy: 91.72%
Learning rate decayed to 0.005
Epoch 6, Accuracy: 93.29%
Epoch 7, Accuracy: 93.59%
Epoch 8, Accuracy: 93.67%


In [None]:

def evaluate_test_accuracy(W_1, b_1, W_2, b_2, x_data, y_data):
    """Measure classification accuracy of the given parameters on a dataset.

    Each entry of `x_data` is reshaped to a 784x1 column and passed through
    forwardProp; the predicted class is the index of the largest output.

    Returns (accuracy in percent, list of indices that were misclassified).
    """
    total_samples = len(x_data)
    wrong_index = []

    for n in range(total_samples):
        column = x_data[n].reshape(784, 1)
        probabilities = forwardProp(column, W_1, b_1, W_2, b_2)[2]

        if np.argmax(probabilities) == y_data[n]:
            continue
        wrong_index.append(n)

    # Every sample is either correct or recorded in wrong_index.
    correct_predictions = total_samples - len(wrong_index)
    accuracy_1 = (correct_predictions / total_samples) * 100

    return accuracy_1, wrong_index


test_accuracy, wrong = evaluate_test_accuracy(c_W_1, c_b_1, c_W_2, c_b_2, x_test, y_test)

# The figure is computed on x_test / y_test, so report it as test accuracy
# (it was previously labelled "Training accuracy").
print(f"Test accuracy: {test_accuracy:.2f}%")
print(len(wrong))  # number of misclassified test images


# Visual spot check: show ten test digits with their label and prediction.
for i in range(3875,3885):
    input_data = x_test[i].reshape(784, 1)
    A_1, Z_1, A_2 , Z_2 = forwardProp(input_data, c_W_1, c_b_1 , c_W_2, c_b_2)
    predicted_label = np.argmax(A_2)
    print("Label:" , y_test[i])
    print("Prediction:", predicted_label)
    plt.imshow(x_test[i], cmap='gray')
    plt.title(f"MNIST Digit: {y_test[i]}") 
    plt.show()



In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Steps shared by every split: convert to a tensor, then normalise with the
# MNIST mean and standard deviation.
_tensor_and_normalise = [
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]

# Training transform: light augmentation (rotation of up to 10 degrees, random
# translation of up to 10% per axis) ahead of the shared steps.
transform_train = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    *_tensor_and_normalise,
])

# Validation / test transform: the shared steps only, no augmentation.
transform_test = transforms.Compose(list(_tensor_and_normalise))

# MNIST splits. The training split is loaded a second time with the
# un-augmented transform so that validation images are never augmented.
train_dataset_full = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform_train
)
_train_split_plain = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform_test
)
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform_test
)

# First 50,000 images for training, the remaining 10,000 for validation.
train_dataset = torch.utils.data.Subset(train_dataset_full, range(50000))
val_dataset = torch.utils.data.Subset(_train_split_plain, range(50000, 60000))

# Data loaders: only the training data is shuffled.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the CNN model with batch normalization and dropout
class AdvancedCNN(nn.Module):
    """Three-block CNN with batch normalisation and dropout, producing 10 logits.

    Every 3x3 convolution uses padding=1 and therefore keeps the spatial size;
    the two 2x2 max-pools halve it twice, so a 28x28 input reaches the
    classifier as 128 feature maps of 7x7.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: one convolution plus one batch-norm per block.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        # Shared 2x2 max-pool (stride 2) used after the first two blocks.
        self.pool = nn.MaxPool2d(2, 2)
        # Classifier head.
        self.fc1 = nn.Linear(128 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        # One dropout module (p=0.5), applied after each hidden FC layer.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class logits for a batch of single-channel images.

        Note the order inside each block: ReLU is applied to the convolution
        output first and batch normalisation second.
        """
        # Block 1: conv -> ReLU -> batch norm -> pool
        x = torch.relu(self.conv1(x))
        x = self.pool(self.bn1(x))

        # Block 2: conv -> ReLU -> batch norm -> pool
        x = torch.relu(self.conv2(x))
        x = self.pool(self.bn2(x))

        # Block 3: conv -> ReLU -> batch norm (no pooling)
        x = torch.relu(self.conv3(x))
        x = self.bn3(x)

        # Flatten to (batch, 128 * 7 * 7) for the fully connected head.
        x = x.view(-1, 128 * 7 * 7)

        # Two hidden layers, each: linear -> ReLU -> dropout.
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.dropout(torch.relu(self.fc2(x)))

        # Final linear layer returns raw logits (no softmax here).
        return self.fc3(x)

# Instantiate the model and move to device
model = AdvancedCNN().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Takes raw logits and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Adam with L2 regularization

# Learning rate scheduler: halve the learning rate once the monitored value
# (a loss, hence mode='min') has not improved for more than `patience` epochs.
# The `verbose` argument is deliberately not passed: it is deprecated in
# current PyTorch. The active learning rate can always be read from
# optimizer.param_groups[0]['lr'].
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Accuracy computation function
def accuracy(output, target):
    """Return the fraction of rows in `output` whose arg-max equals `target`.

    `output` holds one row of class scores per sample and `target` the
    matching class indices; the result is a Python float in [0, 1].
    """
    predicted = torch.max(output, 1).indices
    n_correct = (predicted == target).sum().item()
    n_total = target.size(0)
    return n_correct / n_total

# Training loop with early stopping
num_epochs = 5
best_val_acc = 0.0
# NOTE(review): patience equals num_epochs, so the early-stopping branch below
# can only fire if no epoch ever beats the initial best_val_acc of 0.0.
# Lower patience (or raise num_epochs) if early stopping is meant to be active.
patience = 5
trigger_times = 0


def _evaluate(loader):
    """Return (mean loss, accuracy) of the global `model` over `loader`.

    Switches the model to eval mode and disables gradient tracking. Both
    metrics are weighted by batch size, so a shorter final batch is not
    over-counted.
    """
    model.eval()
    loss_sum = 0.0
    acc_sum = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, labels).item() * inputs.size(0)
            acc_sum += accuracy(outputs, labels) * inputs.size(0)
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, acc_sum / n_samples


for epoch in range(num_epochs):
    # Training phase (re-enables dropout and batch-norm statistics updates)
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        train_acc += accuracy(outputs, labels) * inputs.size(0)

    train_loss /= len(train_loader.dataset)
    train_acc /= len(train_loader.dataset)

    # Validation phase
    val_loss, val_acc = _evaluate(val_loader)

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    # Step the scheduler on the validation loss and report any reduction of
    # the learning rate explicitly.
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f'Learning rate reduced from {lr_before:g} to {lr_after:g}')

    # Early stopping logic: checkpoint on every improvement in validation
    # accuracy, otherwise count the epochs without improvement.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        trigger_times = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered")
            break

# Load the best checkpoint and evaluate on the test set. map_location keeps
# the load working when the checkpoint was written on a different device.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
_, test_acc = _evaluate(test_loader)
print(f'Test Accuracy: {test_acc:.4f}')

Epoch 1/5, Train Loss: 0.2357, Train Acc: 0.9278, Val Loss: 0.0473, Val Acc: 0.9862
Epoch 2/5, Train Loss: 0.1097, Train Acc: 0.9695, Val Loss: 0.0440, Val Acc: 0.9875
Epoch 3/5, Train Loss: 0.0909, Train Acc: 0.9752, Val Loss: 0.0410, Val Acc: 0.9887
Epoch 4/5, Train Loss: 0.0827, Train Acc: 0.9786, Val Loss: 0.0379, Val Acc: 0.9902
Epoch 5/5, Train Loss: 0.0768, Train Acc: 0.9807, Val Loss: 0.0381, Val Acc: 0.9903
Test Accuracy: 0.9915
