In [88]:
"""
Adam Nurlign 7/2/2025

Hello there! In this notebook I will be implementing a comprehensive Neural Network Deep Learning Model Framework
in which you can build and customise your own neural networks which can be trained and evaluated on datasets of your choosing.

There are many modules in Python such as PyTorch and Scikit-learn that give you access to machine learning frameworks and
allow you to build your own models made up of layers. I thought i would be a good exercise to be able to implement some of these
features from scratch. I hope you enjoy!

Here are some current constraints to my Machine Learning modules:

-Must have a linear, activation, linear, activation .... linear network structure
-The activations must be sigmoid or relu
-Can only perform stochastic gradient descent

"""

'\nAdam Nurlign 7/2/2025\n\nHello there! In this notebook I will be implementing a comprehensive Neural Network Deep Learning Model Framework\nin which you can build and customise your own neural networks which can be trained and evaluated on datasets of your choosing.\n\nThere are many modules in Python such as PyTorch and Scikit-learn that give you access to machine learning frameworks and \nallow you to build your own models made up of layers. I thought i would be a good exercise to be able to implement some of these\nfeatures from scratch. I hope you enjoy!\n\nHere are some current constraints to my Machine Learning modules:\n\n-Must have a linear, activation, linear, activation .... linear network structure\n-The activations must be sigmoid or relu\n-Can only perform stochastic gradient descent\n\n'

In [89]:
import numpy as np

In [90]:
#Layer Superclass
class Layer():
  def __init__(self):
    pass
  def apply(self,x):
    pass

In [91]:
class LinearLayer(Layer):
  def __init__(self,in_dim,out_dim,activation=None):
    super().__init__()
    #We should have a matrix with the dimension out_dim by in_dim which draws each element from the normal distribution with mean 0 and variance 4/(in_dim+out_dim)
    variance = 4/(in_dim+out_dim)
    std_dev=np.sqrt(variance)
    self.weights = np.random.normal(loc=0.0, scale=std_dev, size=(out_dim, in_dim))
    self.bias=np.zeros((out_dim,1))
    #storing the derivative of objective with respect to weight matrix
    self.weightGrad=None
    #storing the deravitive of objective with respect to bias vector
    self.biasGrad=None
    #storing the deravitive of objective with respect to this layers output (linear output z)
    self.outputGrad=None

    #storing the forward pass output value
    self.outputValue=None

    self.activation=None

    if (activation=="sigmoid"):
      self.activation=Sigmoid()
    elif (activation=="relu"):
      self.activation=reLU()
    else:
      pass


  def apply(self,x):
    return self.weights@x+self.bias

In [92]:
class reLU(Layer):
  def __init__(self):
    super().__init__()
    self.outputValue=None
  def apply(self,x):
    return np.maximum(x,0)

In [93]:
class Sigmoid(Layer):
  def __init__(self):
    super().__init__()
    self.outputValue=None
  def apply(self,x):
    return 1/(1+np.exp(-x))

In [94]:
def derivOfActWrtLinearInput(activationType,outputVal):
  if (activationType=="sigmoid"):
    #The deravitive of objective with respect to linear output depends on derivative of activation with respect to linear output which depends on the activation function
    #For the sigmoid function the deravitive with respect to input = input * (1-input)
    fi=np.copy(outputVal)
    oneMinusfi=np.ones(fi.shape)-fi
    derivOfAct=fi*oneMinusfi #this is the deravitive of activation with respect to linera input
    derivOfActMatrix=np.diag(derivOfAct) #has DixDi dimensions where Di is the dimension of input into linear layer
    return derivOfActMatrix
  elif (activationType=="relu"):
    #derivative of relu activation function with respect to input (linear output) is a diagonal matrix of 1's in the diagonal
    #entries where the linear output is >0 and 0 elsewhere
    fi=np.copy(outputVal)
    OnesAndZeroes=(fi>0).astype(int)
    OnesAndZeroes=OnesAndZeroes.flatten()
    OnesAndZeroesMatrix=np.diag(OnesAndZeroes)
    return OnesAndZeroesMatrix
  else:
    pass

In [95]:
class NeuralNetwork():
  def __init__(self,layers):
    self.layers=layers

  def predict(self,x,storeValues=False):
    val=np.copy(x)
    if (val.ndim==1):
      val=val.reshape((val.shape[0],1))

    for layer in self.layers:
      val=layer.apply(val)
      if (storeValues==True):
        layer.outputValue=val
      if (layer.activation!=None):
        activationLayer=layer.activation
        val=activationLayer.apply(val)
        if (storeValues==True):
          activationLayer.outputValue=val
    return val

  #This will specifically use sgd to train the model. The steps of gradient descent in general is to perform
  #the forward pass, then backward pass #1 to obtain the gradient of objective with respect to each linear output, then
  #the backward pass #2 to obtain the gradient of objective with respect to each parameter
  def train_sgd(self,train_x,train_y,num_epochs,learning_rate,batch_size=1):
    for epoch in range(num_epochs):
      perm = np.random.permutation(train_x.shape[0])
      # Apply permutation to both x and y
      train_x_shuffled = train_x[perm]
      train_y_shuffled = train_y[perm]

      for i in range(len(train_x_shuffled)):
        x=train_x_shuffled[i].reshape(-1,1)
        y=train_y_shuffled[i].reshape(-1,1)
        #The following is the forward pass
        yHat=self.predict(x,storeValues=True)
        #While we perform the forward pass in each layer we store the output of the layer so we have access to all the intermediate values
        #Now we will perform the backward pass #1 to get the gradient of objective with respect to each linear output

        #backward pass #1
        #starting from the output layer and working backwards through only the linear layers. This assumes a linear-activation, linear-activation...
        #structure to the nueral networks

        #should only be iterating through linear layers assuming structure and skipping activation layers
        for i in range(len(self.layers)-1,-1,-1):
          layer=self.layers[i]
          #if we are dealing with the last layer=output layer
          if (i==len(self.layers)-1):
            #Assume the loss function is squared error.
            self.layers[i].outputGrad=2*(yHat-y)
            continue

          else:
            term1=None
            if (layer.activation==None):
              term1=np.eye(layer.outputValue.shape[0])

            elif isinstance(layer.activation, Sigmoid):
              term1=derivOfActWrtLinearInput("sigmoid",layer.outputValue.flatten())
            elif isinstance(layer.activation,reLU):
              term1=derivOfActWrtLinearInput("relu",layer.outputValue.flatten())
            else:
              pass

            #obtaining the transpose of weight matrix of next linear layer
            term2=np.transpose(self.layers[i+1].weights)

            #this is the gradient of objective with respect to next linear layers output which we calucated in the previous iteration of this backward pass #1
            term3=self.layers[i+1].outputGrad
            self.layers[i].outputGrad=term1@term2@term3

      #backward pass #2
      #now we calculate the gradient of objective with respect to weights and biases in linear layers
        for i in range(len(self.layers)-1,-1,-1):
          layer=self.layers[i]
          derivObjWrtBias=layer.outputGrad
          layer.bias=layer.bias-learning_rate*derivObjWrtBias

          # Special case for the first linear layer
          derivObjWrtWeights=None
          if i==0:
              derivObjWrtWeights=layer.outputGrad@np.transpose(x)
          else:
              derivObjWrtWeights=layer.outputGrad@np.transpose(self.layers[i-1].activation.outputValue)

          layer.weights=layer.weights-learning_rate*derivObjWrtWeights

In [96]:
def squared_error_loss(yHat,y):
      #in Mathematics the squared error loss function is the sum of the squares of the difference between the two vectors

      difference=yHat-y
      return (np.transpose(difference))@difference

In [97]:
def absolute_error_loss(yHat,y):
  difference=yHat-y
  error_vector=np.abs(difference)
  return np.sum(error_vector)

In [98]:
#Loading and Preparing the Dataset

data=np.loadtxt("sample_data/concrete.csv",delimiter=",",skiprows=1)
np.random.shuffle(data)
splitIndex=int(0.8*len(data))
ConcreteStrengthX=data[:splitIndex,:-1]
ConcreteStrengthY=data[:splitIndex,-1].reshape(-1,1)
print(ConcreteStrengthX.shape)
print(ConcreteStrengthY.shape)
ConcreteStrengthXTest=data[splitIndex:,:-1]
ConcreteStrengthYTest=data[splitIndex:,-1].reshape(-1,1)


(824, 8)
(824, 1)


In [99]:
def standardize_data(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / std, mean, std

# Standardize training set and store stats
ConcreteStrengthX, train_mean, train_std = standardize_data(ConcreteStrengthX)

# Standardize test set using **training mean and std**
WConcreteStrengthXTest = (ConcreteStrengthXTest - train_mean) / train_std


In [100]:
"""
These will be tests for SGD on linear, act, linear, act, linear, act, linear (output) network architecture that was spelled out by the textbook
"""


ListOfLayers=[LinearLayer(8,3,activation="relu"),LinearLayer(3,3,activation="relu"),LinearLayer(3,3,activation="relu"),LinearLayer(3,1)]

network=NeuralNetwork(ListOfLayers)






In [101]:
network.train_sgd(ConcreteStrengthX,ConcreteStrengthY,100,0.02)


In [102]:
#I am giving a 8x1 vector as expected to pass through the neural network. I should literally get a continous value spit out
print(network.predict(np.array([1,1,1,1,1,1,1,1])))

[[32.63243723]]


In [105]:
yHat=np.transpose(np.apply_along_axis(network.predict,0,(np.transpose(ConcreteStrengthXTest))))
yHat=yHat.flatten()
yHat=yHat.reshape(yHat.shape[0],1)

#Number of datapoints in the validation dataset
num_points=yHat.shape[0]
print("The number of validation data points is: "+str(num_points))

se_loss_array=squared_error_loss(yHat,ConcreteStrengthYTest)

se_loss=float(se_loss_array)

mse_loss=se_loss/num_points


ae_loss=absolute_error_loss(yHat,ConcreteStrengthYTest)
mae_loss=ae_loss/num_points


print("Squared error on the validation dataset: " + str(se_loss))
print("Mean squared error on the validation dataset: "+ str(mse_loss))
print("Absolute error on the validation dataset: " + str(ae_loss))
print("Mean absolute error on the validation dataset: "+ str(mae_loss))




The number of validation data points is: 206
Squared error on the validation dataset: 45659.04732781089
Mean squared error on the validation dataset: 221.645860814616
Absolute error on the validation dataset: 2469.145878725384
Mean absolute error on the validation dataset: 11.986145042356233


  se_loss=float(se_loss_array)
