In [4]:
!pip install nnfs

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl.metadata (1.7 kB)
Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Installing collected packages: nnfs
Successfully installed nnfs-0.5.1


In [18]:
import numpy as np
import pandas as pd
import nnfs
from nnfs.datasets import vertical_data
from nnfs.datasets import spiral_data
import numpy as np
nnfs.init()
class Dense_Layer:
  def __init__(self , n_inputs , n_neurons):
    self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))

  def forward(self, inputs):
        # Save input for backward pass if needed
        self.inputs = inputs
        # Linear transformation: output = XW + b
        self.output = np.dot(inputs , self.weights) + self.biases

class Activation_Relu:
  def forward(self, inputs):
    self.output = np.maximum(0, inputs)


class Activation_Softmax:
  def forward(self, inputs):
    exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
    probabilities = exp_values / np.sum(exp_values, axis=1,keepdims=True)
    self.output = probabilities

class Loss_CategoricalCrossentropy:
    def forward(self, y_pred, y_true):
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)
        correct_confidences = y_pred_clipped[range(len(y_pred)), y_true]
        negative_log_likelihoods = -np.log(correct_confidences)
        return np.mean(negative_log_likelihoods)



In [6]:
X,y = vertical_data(samples = 1000 , classes=3)
layer1 = Dense_Layer(2,3)
activation1 = Activation_Relu()
layer1.forward(X)
activation1.forward(layer1.output)
layer2 = Dense_Layer(3,3)
activation2 = Activation_Softmax()
layer2.forward(activation1.output)
activation2.forward(layer2.output)
loss = Loss_CategoricalCrossentropy()
loss_value = loss.forward(activation2.output , y)
print(activation2.output)
print(loss_value)

[[0.33334285 0.33331126 0.33334592]
 [0.33334944 0.33331323 0.3333373 ]
 [0.33334312 0.3333162  0.33334067]
 ...
 [0.33331752 0.33329922 0.33338326]
 [0.3333213  0.3333001  0.33337855]
 [0.33332026 0.33330002 0.33337972]]
1.098578


In [7]:
import torch
import torch.nn as nn
from nnfs.datasets import vertical_data
import nnfs

nnfs.init()

# --- Data ---
X, y = vertical_data(samples=1000, classes=3)
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

# --- Model ---
model = nn.Sequential(
    nn.Linear(2, 3),
    nn.ReLU(),
    nn.Linear(3, 3)  # logits
)

# --- Forward pass ---
logits = model(X)  # raw scores
softmax = nn.Softmax(dim=1)
probabilities = softmax(logits)  # convert logits to probabilities

# --- Compute Cross-Entropy manually ---
# Clip probabilities to avoid log(0)
probabilities = torch.clamp(probabilities, 1e-7, 1 - 1e-7)
# Gather the probability of the correct class for each sample
correct_probs = probabilities[range(len(y)), y]
# Negative log likelihood
loss = -torch.log(correct_probs)
# Mean loss over batch
loss = loss.mean()

print("Loss:", loss.item())
print("Probabilities (first 5 samples):")
print(probabilities[:5])

# --- Backward pass ---
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer.zero_grad()
loss.backward()  # PyTorch computes gradients automatically
optimizer.step()


Loss: 1.1502777338027954
Probabilities (first 5 samples):
tensor([[0.4069, 0.2306, 0.3625],
        [0.4023, 0.2285, 0.3691],
        [0.4047, 0.2296, 0.3656],
        [0.4086, 0.2314, 0.3600],
        [0.4073, 0.2308, 0.3618]], grad_fn=<SliceBackward0>)


## Initializing Random Weights

In [17]:
X , y = vertical_data(samples = 100 , classes = 3)

layer1 = Dense_Layer(2,3)
activation1 = Activation_Relu()
layer1.forward(X)
activation1.forward(layer1.output)
layer2 = Dense_Layer(3,3)
activation2 = Activation_Softmax()
layer2.forward(activation1.output)
activation2.forward(layer2.output)
loss = Loss_CategoricalCrossentropy()
loss_value = loss.forward(activation2.output , y)


loswest_loss = 999999
best_dense_weight1 = layer1.weights.copy()
best_dense_bias1 = layer1.biases.copy()
best_dense_weight2 = layer2.weights.copy()
best_dense_bias2 = layer2.biases.copy()

for iteration in range(10000):
  layer1.weights = 0.5 * np.random.randn(2, 3)
  layer1.biases = 0.5 * np.random.rand(1, 3)
  layer2.weights = 0.5 * np.random.randn(3, 3)
  layer2.biases = 0.5 * np.random.rand(1, 3)

  layer1.forward(X)
  activation1.forward(layer1.output)
  layer2.forward(activation1.output)
  activation2.forward(layer2.output)

  loss_value = loss.forward(activation2.output , y)

  prediction = np.argmax(activation2.output , axis = 1)
  accuracy = np.mean(prediction == y)

  if loss_value < loswest_loss:
    print("New set of weight found with the iteration : " , iteration , "Loss :" , loss_value , "Accuracy : " , accuracy)
    best_dense_weight1 = layer1.weights.copy()
    best_dense_bias1 = layer1.biases.copy()
    best_dense_weight2 = layer2.weights.copy()
    best_dense_bias2 = layer2.biases.copy()
    lowest_loss = loss_value
  else:
    layer1.weights = best_dense_weight1.copy()
    layer1.biases = best_dense_bias1.copy()
    layer2.weights = best_dense_weight2.copy()
    layer2.biases = best_dense_bias2

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
New set of weight found with the iteration :  5000 Loss : 1.0645791938381957 Accuracy :  0.63
New set of weight found with the iteration :  5001 Loss : 1.088704616838552 Accuracy :  0.6166666666666667
New set of weight found with the iteration :  5002 Loss : 1.0472428271116654 Accuracy :  0.33666666666666667
New set of weight found with the iteration :  5003 Loss : 1.1692119632740017 Accuracy :  0.23333333333333334
New set of weight found with the iteration :  5004 Loss : 1.1155757601357486 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5005 Loss : 1.164646271005053 Accuracy :  0.32666666666666666
New set of weight found with the iteration :  5006 Loss : 1.1480142039018253 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5007 Loss : 1.1274837797644006 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5008 Loss : 1.210005184346329 Accuracy

## Randomly adjust the weight of the parameters to check whether its works better then the random inputing of weights and biases

In [15]:
X , y = vertical_data(samples = 100 , classes = 3)

layer1 = Dense_Layer(2,3)
activation1 = Activation_Relu()
layer1.forward(X)
activation1.forward(layer1.output)
layer2 = Dense_Layer(3,3)
activation2 = Activation_Softmax()
layer2.forward(activation1.output)
activation2.forward(layer2.output)
loss = Loss_CategoricalCrossentropy()
loss_value = loss.forward(activation2.output , y)


lowest_loss = 999999
best_dense_weight1 = layer1.weights.copy()
best_dense_bias1 = layer1.biases.copy()
best_dense_weight2 = layer2.weights.copy()
best_dense_bias2 = layer2.biases.copy()

# Here in the weightt and biase we are adding 0.5 * np.random.randn(2, 3) to adjust to get the min loss as possible
for iteration in range(10000):
  layer1.weights += 0.5 * np.random.randn(2, 3)
  layer1.biases += 0.5 * np.random.rand(1, 3)
  layer2.weights += 0.5 * np.random.randn(3, 3)
  layer2.biases += 0.5 * np.random.rand(1, 3)

  layer1.forward(X)
  activation1.forward(layer1.output)
  layer2.forward(activation1.output)
  activation2.forward(layer2.output)

  loss_value = loss.forward(activation2.output , y)

  prediction = np.argmax(activation2.output , axis = 1)
  accuracy = np.mean(prediction == y)


  if loss_value < loswest_loss:
    print("New set of weight found with the iteration : " , iteration , "Loss :" , loss_value , "Accuracy : " , accuracy)
    best_dense_weight1 = layer1.weights.copy()
    best_dense_bias1 = layer1.biases.copy()
    best_dense_weight2 = layer2.weights.copy()
    best_dense_bias2 = layer2.biases.copy()
    lowest_loss = loss_value
  else:
    layer1.weights = best_dense_weight1.copy()
    layer1.biases = best_dense_bias1.copy()
    layer2.weights = best_dense_weight2.copy()
    layer2.biases = best_dense_bias2

New set of weight found with the iteration :  0 Loss : 1.1196024 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  19 Loss : 1.1213549 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  26 Loss : 1.11226 Accuracy :  0.3433333333333333
New set of weight found with the iteration :  41 Loss : 1.0419246 Accuracy :  0.37666666666666665
New set of weight found with the iteration :  47 Loss : 1.1226361 Accuracy :  0.43
New set of weight found with the iteration :  56 Loss : 1.1541717 Accuracy :  0.38666666666666666
New set of weight found with the iteration :  65 Loss : 1.1545941 Accuracy :  0.3933333333333333
New set of weight found with the iteration :  72 Loss : 1.0117002 Accuracy :  0.43333333333333335
New set of weight found with the iteration :  73 Loss : 0.8037456 Accuracy :  0.5366666666666666
New set of weight found with the iteration :  74 Loss : 0.797057 Accuracy :  0.7833333333333333
New set of weight found with the iteration : 

Here the loss is reduced and the accuracy is improved then why have to study the gradient descent and the backpropogation i can do it by using this randomly adjusting the weightt and biases right ?

But The answer for this Question = Its not so much robust to all kind of data for example if we take the spiral data and fit the model will it gives the best accuracy like the vertical data . Lets See Below is the code for the spiral data


In [20]:
X ,y = spiral_data(samples = 100 , classes = 3)



layer1 = Dense_Layer(2,3)
activation1 = Activation_Relu()
layer1.forward(X)
activation1.forward(layer1.output)
layer2 = Dense_Layer(3,3)
activation2 = Activation_Softmax()
layer2.forward(activation1.output)
activation2.forward(layer2.output)
loss = Loss_CategoricalCrossentropy()
loss_value = loss.forward(activation2.output , y)


lowest_loss = 999999
best_dense_weight1 = layer1.weights.copy()
best_dense_bias1 = layer1.biases.copy()
best_dense_weight2 = layer2.weights.copy()
best_dense_bias2 = layer2.biases.copy()

# Here in the weightt and biase we are adding 0.5 * np.random.randn(2, 3) to adjust to get the min loss as possible
for iteration in range(10000):
  layer1.weights += 0.5 * np.random.randn(2, 3)
  layer1.biases += 0.5 * np.random.rand(1, 3)
  layer2.weights += 0.5 * np.random.randn(3, 3)
  layer2.biases += 0.5 * np.random.rand(1, 3)

  layer1.forward(X)
  activation1.forward(layer1.output)
  layer2.forward(activation1.output)
  activation2.forward(layer2.output)

  loss_value = loss.forward(activation2.output , y)

  prediction = np.argmax(activation2.output , axis = 1)
  accuracy = np.mean(prediction == y)


  if loss_value < loswest_loss:
    print("New set of weight found with the iteration : " , iteration , "Loss :" , loss_value , "Accuracy : " , accuracy)
    best_dense_weight1 = layer1.weights.copy()
    best_dense_bias1 = layer1.biases.copy()
    best_dense_weight2 = layer2.weights.copy()
    best_dense_bias2 = layer2.biases.copy()
    lowest_loss = loss_value
  else:
    layer1.weights = best_dense_weight1.copy()
    layer1.biases = best_dense_bias1.copy()
    layer2.weights = best_dense_weight2.copy()
    layer2.biases = best_dense_bias2


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
New set of weight found with the iteration :  5000 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5001 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5002 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5003 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5004 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5005 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5006 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5007 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :  5008 Loss : 10.745398 Accuracy :  0.3333333333333333
New set of weight found with the iteration :

## See the above code gives an accuracy of 0.33 and the loss is 10.74 which is relatively huge and it means that its not suitable for all kind of data .


## This kind of strategy works better than the radomly initializing weight and biases , but it fails miserably in complex data