<a href="https://colab.research.google.com/github/AnthonyCampos123/SURP-Neural-Network-Model-Distillation-and-Pruning-for-Pattern-Recognition-and-Recommendations/blob/main/Neural_Network_From_Scratch_Python_SURP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import numpy as np
import matplotlib
#!pip install nnfs
import nnfs
from nnfs.datasets import spiral_data
import math



# neuron in a densely connected feed-forward multilayer perceptron model; each neuron here takes in 4 inputs
# each unique input is going to have a unique weight associated with it
# inputs and weights are arbitrary in this example
# each unique neuron has a unique bias

# fully connected neural network
# selected 3 neurons with 4 inputs each (each neuron will have a unique set of weights for each unique input)
inputs = [1, 2, 3, 2.5]

weights = [
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]

biases = [2, 3, 0.5]

# Forward pass of a single dense layer written in plain Python:
# every neuron's output is the weighted sum of the inputs plus its own bias.
layer_outputs = []  # one value per neuron in the layer
for weight_row, bias in zip(weights, biases):  # pair each weight row with its neuron's bias
  weighted_sum = 0  # running total for the current neuron
  for weight, value in zip(weight_row, inputs):  # pair each weight with its input
    weighted_sum += value * weight
  layer_outputs.append(weighted_sum + bias)

print(layer_outputs)

# shape of arrays:
# given an array, l = [1, 5, 6, 2],
# shape: (4,)
# type: 1d array, vector (in math)
# given an array, list_of_lists = [[1,5,6,7],[3,2,1,3]],
# shape: (2,4)
# type: 2d array, matrix (in math)
# thus, arrays must be homogeneous (at each dimension, must have the same size)
# given an array list_of_lists_lists = [
#                                       [[1,5,6,2],[3,2,1,3]],
#                                       [[5,2,1,2],[6,4,8,4]],
#                                       [[2,8,5,3],[1,1,9,4]]
#                                                            ]
# shape: (3,2,4)
# type: 3d array

# tensors: objects that can be represented as an array (within the context of deep learning)
# dot product, a=[1,2,3], b=[2,3,4], a . b = 1*2 + 2*3 + 3*4

# second example:
# weights form a matrix with one row (vector) per neuron: 3 neurons, 4 weights each
inputs1 = [1, 2, 3, 2.5]
weights1 = [[0.2, 0.8, -0.5, 1.0], [0.5, -0.91, 0.26, -0.5], [-0.26, -0.27, 0.17, 0.87]]
biases1 = [2,3,0.5]

# np.dot(matrix, vector) takes the dot product of every weight row with the
# input vector (so the dot product is done three times); the biases are then
# added element-wise. The weights go first so the result has one entry per neuron.
output1 = np.dot(weights1, inputs1) + biases1
print(output1)

# third example: (batches)
# features are the inputs
# larger batch size, less movement in fitment line
# not all samples passed at once, bad for generalization of out-of-sample data

inputs2 = [[1, 2, 3, 2.5],
           [2.0, 5.0, -1.0,2.0],
           [-1.5, 2.7, 3.3, -0.8]]

weights2 = [[0.2, 0.8, -0.5, 1.0], [0.5, -0.91, 0.26, -0.5], [-0.26, -0.27, 0.17, 0.87]]
biases2 = [2,3,0.5]

# matrix product: inputs2 is (3 samples x 4 features) and weights2 is
# (3 neurons x 4 weights), so the weights are transposed to (4 x 3) to avoid a
# shape error. The result has one row per sample and one column per neuron;
# the biases are broadcast across the rows.
output2 = np.dot(inputs2, np.array(weights2).T) + biases2
print(output2)


# fourth example: two layers, the first layer's output feeds the second layer

biases3 = [-1,2,-0.5]

inputs3 = [[1, 2, 3, 2.5],
           [2.0, 5.0, -1.0,2.0],
           [-1.5, 2.7, 3.3, -0.8]]

weights3 = [[0.1, -0.14, 0.5], [-0.5, 0.12, -0.33], [-0.44, 0.73, -0.13]]


# first layer: reuses the 3-neuron x 4-weight parameters weights2/biases2
# defined for the batch example earlier in the file
layer1_outputs = np.dot(inputs3, np.array(weights2).T) + biases2

# second layer: 3 neurons, each taking the 3 outputs of the first layer
layer2_outputs = np.dot(layer1_outputs, np.array(weights3).T) + biases3

print(layer2_outputs)

# fifth example (object oriented programming)

# seed NumPy's global random generator so the np.random.randn weights drawn
# by Layer_Dense are reproducible from run to run
np.random.seed(0)

# standard in machine learning to denote inputs as X
# NOTE(review): nnfs.init() presumably also seeds NumPy and switches the default
# dtype to float32 (the recorded output values look like float32) -- confirm
# against the nnfs documentation
nnfs.init()

# X = [[1, 2, 3, 2.5],
#            [2.0, 5.0, -1.0,2.0],
#            [-1.5, 2.7, 3.3, -0.8]]

# X, y = spiral_data(100,3)


# spiral dataset
# def create_data(points, classes):
#   X = np.zeros((points*classes, 2))
#   y = np.zeros(points*classes, dtype='uint8')
#   for class_number in range(classes):
#       ix = range(points*class_number, points*(class_number+1))
#       r = np.linspace(0.0,1,points) # radius
#       t = np.linspace(class_number*4, (class_number+1)*4, points) + np.random.randn(points)*0.2
#       X[ix] = np.c_[r*np.sin(t*2.5), r*np.cos(t*2.5)]
#       y[ix] = class_number
#   return X, y

# import matplotlib.pyplot as plt
# print("here")
# X, y = create_data(100,3)
# plt.scatter(X[:,0], X[:,1])
# plt.show()

# plt.scatter(X[:,0], X[:,1],c=y, cmap="brg")
# plt.show()



class Layer_Dense:
  """Fully connected layer: every input is connected to every neuron."""

  def __init__(self, n_inputs, n_neurons):
    """Create the layer's weights and biases.

    n_inputs  -- number of input features per sample
    n_neurons -- number of neurons (outputs) in this layer
    """
    # weights are stored as (n_inputs, n_neurons) so forward() needs no
    # transpose; they start as small Gaussian values scaled by 0.01
    gaussian_draws = np.random.randn(n_inputs, n_neurons)
    self.weights = gaussian_draws * 0.01
    # one bias per neuron, kept as a (1, n_neurons) row, all starting at zero
    self.biases = np.zeros((1, n_neurons))

  def forward(self, inputs):
    """Store inputs . weights + biases (a matrix product) in self.output."""
    weighted_sums = np.dot(inputs, self.weights)
    self.output = weighted_sums + self.biases

class Activation_ReLU:
  """Rectified linear unit activation."""

  def forward(self, inputs):
    """Replace every negative entry with 0, element-wise; store in self.output."""
    rectified = np.maximum(0, inputs)
    self.output = rectified

class Activation_Softmax:
  """Softmax activation: turns each row of scores into a probability distribution."""

  def forward(self, inputs):
    """Exponentiate and normalize every row of inputs; store in self.output."""
    # subtracting each row's maximum makes the largest exponent 0, so np.exp
    # cannot overflow; the normalized result is unchanged by this shift
    row_peaks = np.max(inputs, axis=1, keepdims=True)
    exp_values = np.exp(inputs - row_peaks)
    # divide every row by its own total so that it sums to 1
    row_totals = np.sum(exp_values, axis=1, keepdims=True)
    self.output = exp_values / row_totals

class Loss:
  """Base class for losses; subclasses supply forward(output, y)."""

  def calculate(self, output, y):
    """Return the mean of the per-sample losses computed by self.forward.

    output -- model predictions, passed through to forward()
    y      -- target values, passed through to forward()
    """
    per_sample_losses = self.forward(output, y)
    return np.mean(per_sample_losses)

class Loss_CategoricalCrossEntropy(Loss): # inherits loss class
  """Categorical cross-entropy loss for class-probability predictions."""

  def forward(self, y_pred, y_true):
    """Return the negative log-likelihood of the correct class for every sample.

    y_pred -- predicted class probabilities, one row per sample
    y_true -- either class indices (1-D, one integer per sample) or one-hot
              rows (2-D, same shape as y_pred); plain lists are accepted

    Raises ValueError when y_true is neither 1-D nor 2-D.
    """
    # accept plain Python lists as well as arrays (lists have no .shape)
    y_true = np.asarray(y_true)
    samples = len(y_pred)
    # clip both ends so a confidence of exactly 0 can never reach log(),
    # which would give an infinite loss
    y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

    if y_true.ndim == 1: # passed scalar class indices
      # pick, for every row, the confidence at that row's target index
      correct_confidences = y_pred_clipped[range(samples), y_true]
    elif y_true.ndim == 2: # passed one-hot encoded rows
      # the one-hot mask zeroes out everything except the target confidence
      correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
    else:
      raise ValueError(
          "y_true must be 1-D class indices or 2-D one-hot rows, "
          "got %d dimensions" % y_true.ndim)
    return -np.log(correct_confidences)

# Build a small two-layer classifier and run a single forward pass through it.
# NOTE(review): spiral_data presumably returns 2-feature points X with integer
# class labels y -- confirm against the nnfs documentation
X,y = spiral_data(samples=100, classes=3)
dense1 = Layer_Dense(2,3) # 2 inputs per sample -> 3 neurons
activation1 = Activation_ReLU()
dense2 = Layer_Dense(3, 3) # takes the 3 outputs of dense1 -> 3 neurons
activation2 = Activation_Softmax()

# forward pass: dense1 -> ReLU -> dense2 -> softmax
dense1.forward(X)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
activation2.forward(dense2.output)

# show the softmax output for the first five samples
print(activation2.output[:5])

# mean categorical cross-entropy of the softmax output against the labels y
loss_function = Loss_CategoricalCrossEntropy()
loss = loss_function.calculate(activation2.output, y)

print("Loss:", loss)





# layer1 = Layer_Dense(2,5) # number of inputs/features = 4
# activation1 = Activation_ReLU()

# layer1.forward(X)
# #print(layer1.output)

# activation1.forward(layer1.output)
# print(activation1.output)





# step function, e.g. y = { 1 if x > 0, 0 if x <= 0
# outputs either a 0 or a 1, regardless of inputs, weights, biases
# alternative: Sigmoid activation function, y = 1/(1+e^-x)
# however, it has the issue of the vanishing gradient problem
# rectified linear unit (ReLU) activation function, y = { x if x > 0, 0 if x <= 0
# a linear activation function can only fit linear functions, or, for non-linear data, approximate it with a linear function
# ReLU is close to being linear, but being rectified (clipped at 0) makes it non-linear
# optimizer: Daniel optimizer
# increase the weight and the ReLU slope gets steeper; decrease the weight and it gets shallower, and if the weight goes into the negatives the line slopes down
# individual neurons become responsible for small sections of the overall neural network function

X1 = [[1, 2, 3, 2.5],
      [2.0, 5.0, -1.0, 2.0],
      [-1.5, 2.7, 3.3, -0.8]]

inputs4 = [0, 2, -1, 3.3, -2.7, 1.1, 2.2, -100]

# ReLU by hand: positive values pass through unchanged, everything else becomes 0
output4 = [max(0, value) for value in inputs4]
print(output4)

# softmax function

# the first step is to determine how wrong a model is (its loss, as opposed to its accuracy)
layer_outputs_comparison1 = [4.8, 1.21, 2.385]  # less loss
layer_outputs_comparison2 = [4.8, 4.79, 4.25]

# in the example of image (cat/dog) data in a classification model,
# we want the 2 output values to be a probability distribution that is
# (1) uniform from sample to sample
# (2) normalized from neuron to neuron
# (3) usable to calculate rightness and wrongness
# if everything was perfect, the classification would be a 1.0
# applying the exponentiation function (y=e^x) ensures that no value can be negative
# while retaining the meaning of that negativity (the values stay on a scale)

layer_outputs2 = [
    [4.8, 1.21, 2.385],
    [8.9, -1.81, 0.2],
    [1.41, 1.051, 0.026],
]

#E = math.e # Euler's number

# exponentiate every score, then divide each row by its own total so that
# every row becomes a probability distribution
exp_values = np.exp(layer_outputs2)
norm_values = exp_values / exp_values.sum(axis=1, keepdims=True)

print(norm_values)
# axis=1 adds up the values along each row (one total per sample);
# keepdims=True keeps the totals as a column, one per row
print(np.asarray(layer_outputs2).sum(axis=1, keepdims=True))

# an issue with exponentiation is the explosion of values/growth of the exponentiation function
# (it does not take long to reach an overflow)


# next step is to normalize the values
# use function y = u / (summation n i=1 u_i)
# (a single output neuron's value divided by the sum of
# the values of all the output neurons in that output layer)
# gives a probability distribution

#  norm_base = sum(exp_values)
#  norm_values = []

# for value in exp_values:
#   norm_values.append(value / norm_base)

# print(norm_values)
# print(sum(norm_values))

# we now have the following implemented
# input -> exponentiate -> normalize -> output
# the combination of exponentiation and normalization makes up the Softmax activation function
# this is defined by S_{i,j} = e^{z_{i,j}} / \sum_{l=1}^{L} e^{z_{i,l}}
# we will have a batch of inputs and a batch of outputs


# a neural network outputs a probability distribution, predicts with a certain % confidence
# optimizers make tweaks to weights and biases
# an example of a loss function is mean absolute error (usually regression)
# loss function is thus more useful/informative than accuracy
# we know from training data what was the intended target value
# the loss function of choice is Categorical Cross-Entropy
# L_i = -\sum_{j} y_{i,j} \log(\hat{y}_{i,j})
# L_i = sample loss value, i = i-th sample in a set, j = label/output index, y = target values, \hat{y} = predicted values
# simplifies to L_i = -\log(\hat{y}_{i,k}) where k = target label index, index of the correct class probability
# one-hot encoding, vector is n classes long. vector is filled with zeros except at the index of the target class
# ex. classes: 2, label: 1, one-hot: [0,1]
# natural log, y = log_e x = ln(x)
# for categorical cross entropy, we start by taking the negative sum of the target value times the log of
# the predicted value for each of the values in the distributions


# softmax_output = [0.7, 0.1, 0.2]
# target_output = [1,0,0] #target_class = 0
# loss = -(math.log(softmax_output[0])* target_output[0]+
#          math.log(softmax_output[1])* target_output[1]+
#          math.log(softmax_output[2])* target_output[2])
# print(loss)

# loss = math.log(softmax_output[0])
# print(loss)
# print(math.log(0.7))
# print(math.log(0.5))



softmax_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])
class_targets = [0, 1, 1]

# predicted class = index of the largest confidence in each row
predictions = softmax_outputs.argmax(axis=1)
# accuracy = fraction of samples whose predicted class equals its target
accuracy = (predictions == class_targets).mean()
print('acc:', accuracy)

# confidence the model assigned to the correct class of every sample
print(softmax_outputs[range(len(class_targets)), class_targets])

print("start here")

# optimization, adjustment of weights and biases
# involves calculus components (derivatives/tangent lines)
# adjust parameters by calculating the impact of those
# parameters on the output of the function
# for each sample we do a forward pass, calculating the loss at the end
# then, for each weight and bias, individually, add the small delta (change)
# do another forward pass, calculating the loss again, revert the parameter
# to its original state, choose a new one, add the delta, repeat the forward
# pass and continue for every single weight and bias in the network
# for multivariate functions we can use partial
# derivatives (instead of doing multiple forward passes for the network)





[4.8, 1.21, 2.385]
[4.8   1.21  2.385]
[[ 4.8    1.21   2.385]
 [ 8.9   -1.81   0.2  ]
 [ 1.41   1.051  0.026]]
[[ 0.50310004 -1.04185009 -2.03875005]
 [ 0.24339998 -2.73320007 -5.76329994]
 [-0.99314     1.41254002 -0.35654999]]
[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
Loss: 1.0986104
[0, 2, 0, 3.3, 0, 1.1, 2.2, 0]
[[8.95282664e-01 2.47083068e-02 8.00090293e-02]
 [9.99811129e-01 2.23163963e-05 1.66554348e-04]
 [5.13097164e-01 3.58333899e-01 1.28568936e-01]]
[[8.395]
 [7.29 ]
 [2.487]]
start here
acc: 1.0
[0.7 0.5 0.9]
