<a href="https://colab.research.google.com/github/Gabrielsandbox/AI-ML-Codebase/blob/main/NeuralNetworksFromScratch_Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notes from BloombergGPT talk : https://www.youtube.com/watch?v=m2Scj2SO85Y&ab_channel=TorontoMachineLearningSeries%28TMLS%29

# Model size and data size - two quantities to balance:
# - model size (number of parameters)
# - training data size (number of tokens)
#
# Before the "Chinchilla paper" (March 2022)
# - Most large language models trained on ~300B tokens
# - More compute budget => build a bigger model (same training set)
#
# The Chinchilla paper investigated  the best way to allocate a given compute budget
# between model size and training set size
#
# Conclusion:
# - Most large language model projects should have used smaller models trained on more data
# - Optimal tradeoff : if you double model size, you should double training set size
#
# BloombergGPT : 50B total parameters - 569B data size (in tokens)
# * fewer parameters and more data
#
#
#
#

In [None]:
# EP. 1 https://www.youtube.com/watch?v=Wo5dMEP_BbI&ab_channel=sentdex

In [None]:
#zip
x = [1, 2]
y = [3, 4]

# zip walks both lists in lockstep, yielding the pairs (1, 3) and (2, 4);
# each pair is printed one value per line.
for i, j in zip(x, y):
  print(i, j, sep="\n")

In [None]:
#Low-level working of a neuron in a neural net
#Note: One bias for each different neuron
inputs = [1, 2, 3, 2.5]

# One row of weights per neuron; each row holds one weight per input value.
weights = [[0.2, 0.8, -0.5, 1.0],
           [0.5, -0.91, 0.26, -0.5],
           [-0.26, -0.27, 0.17, 0.87]]

# One bias per neuron.
biases = [2, 3, 0.5]

layer_outputs = []
neuron_count = min(len(weights), len(biases))  # same truncation zip() would apply
for n in range(neuron_count):
  row = weights[n]
  weighted_sum = 0  # running total for neuron n
  for k in range(min(len(inputs), len(row))):
    weighted_sum = weighted_sum + inputs[k] * row[k]
  # The bias is added last, after all input*weight products.
  layer_outputs.append(weighted_sum + biases[n])

# Written out by hand, the same computation for each neuron n is:
#   inputs[0]*weights[n][0] + inputs[1]*weights[n][1]
#     + inputs[2]*weights[n][2] + inputs[3]*weights[n][3] + biases[n]

print(layer_outputs)

In [None]:
#zip
x = [1, 2]
y = [3, 4]

# Pair the lists element by element and print each pair on two lines.
for i, j in zip(x, y):
  print(f"{i}\n{j}")

In [None]:
#The struggle of deep learning is figuring out how to best tune those weights and those biases
#Why weights and biases are necessary?
#A: One cannot directly edit inputs because they usually come from other functions, but the way to do this is through weights and biases,
#by tweaking them we are able to tweak the output of a neuron and the input of the next neuron.

#DL definition question : "How do we adjust weights and biases to get the output values
#                          that we want from the vast array of inputs that we might have?"

In [None]:
#vector = 1D array
#Lists of vectors = 2D array
#Lists of lists of vectors = 3D array
#Tensor = an object that can be represented as an array

#An array with 2 or more dimensions must be homologous in shape, i.e. every row (sub-list) must have the same length.

#dot_product/matrix product
vector1 = [1, 2, 3]
vector2 = [1, 2, 3]

# Multiply the values at matching positions, then add the products together.
dot_product = sum(a * b for a, b in zip(vector1, vector2))
print(dot_product)  # the dot product of two vectors is a single scalar value

In [None]:
import numpy as np

#dot product of one neuron
#Modeling one neuron
inputs = [2, 4, 9]        # three input values feeding the neuron
weights = [0.2, 3, 4.5]   # one weight per input
bias = 2                  # a single neuron has a single bias

# Weighted sum of the inputs first, then shift it by the bias.
weighted_sum = np.dot(weights, inputs)
dot_product = weighted_sum + bias
print(dot_product)

In [None]:
import numpy as np

#dot product of a layer of neurons
#Modeling 3 neurons
inputs = [1, 2, 3, 2.5]  # One sample with four features

# Three neurons: one row of four weights per neuron.
weights = [[0.2, 0.8, -0.5, 1.0],
           [0.5, -0.91, 0.26, -0.5],
           [-0.26, -0.27, 0.17, 0.87]]

biases = [2, 3, 0.5]  # One bias per neuron

# The 2D weights array must come first: np.dot(weights, inputs) dots every
# weight row with the input vector, so the result has one value per neuron.
# Swapping the arguments would raise a shape error, because the 4 inputs do
# not line up with the 3 weight rows.
# Add the per-neuron `biases` vector defined in this cell, not the scalar
# `bias` from the single-neuron example.
dot_product = np.dot(weights, inputs) + biases
print(dot_product)


In [None]:
#Presenting all samples at once tends to cause overfitting, which hurts generalization; training in batches helps avoid this
#Batch size is commonly 32 (64 and 128 are also typical)
#weights and biases are associated with individual/unique neurons

In [None]:
#Introducing batches
import numpy as np

# A batch of three samples, four features each (one sample per row).
inputs = [[1.0, 2.0, 3.0, 2.5],
          [2.0, 5.0, -1.0, 2.0],
          [-1.5, 2.7, 3.3, -0.8]]

# Three neurons, four weights each (one neuron per row).
weights = [[0.2, 0.8, -0.5, 1.0],
           [0.5, -0.91, 0.26, -0.5],
           [-0.26, -0.27, 0.17, 0.87]]

biases = [2, 3, 0.5]

# Transpose the weights so each column belongs to one neuron; the product
# then has one row per sample and one column per neuron, and the biases
# broadcast across the rows.
weights_by_column = np.array(weights).T
dot_product = np.dot(inputs, weights_by_column) + biases
print(dot_product)

[[ 4.8    1.21   2.385]
 [ 8.9   -1.81   0.2  ]
 [ 1.41   1.051  0.026]]


In [None]:
pip install nnfs

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Installing collected packages: nnfs
Successfully installed nnfs-0.5.1


In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

class Layer_Dense:
    """Fully connected layer computing inputs . weights + biases."""

    def __init__(self, n_inputs, n_neurons):
        # randn draws from a Gaussian centred on 0; scaling by 0.10 keeps the
        # starting weights small.
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.10 * np.random.randn(*weight_shape)
        # One bias per neuron, all starting at zero.
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        # Result is stored on self.output; nothing is returned.
        weighted = np.dot(inputs, self.weights)
        self.output = weighted + self.biases

class Activation_ReLU:
  """Rectified linear unit: values below zero become 0, the rest pass through."""

  def forward(self, inputs):
    # Element-wise max against 0; the result is stored on self.output.
    rectified = np.maximum(0, inputs)
    self.output = rectified

class Activation_Softmax:
  """Softmax over axis 1: turns each row of scores into probabilities."""

  def forward(self, inputs):
    # Subtracting each row's own maximum before exponentiating keeps np.exp
    # from overflowing; the normalised result is unchanged by the shift.
    row_max = np.max(inputs, axis=1, keepdims=True)
    exponentials = np.exp(inputs - row_max)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    self.output = exponentials / row_totals

# Dataset from nnfs: requested with 100 samples and 3 classes.
X, y = spiral_data(samples=100, classes=3)

# Network: 2 input features -> 3 neurons (ReLU) -> 3 outputs (softmax).
dense1, activation1 = Layer_Dense(2, 3), Activation_ReLU()
dense2, activation2 = Layer_Dense(3, 3), Activation_Softmax()

# Forward pass: each stage reads the .output left by the stage before it.
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Show only the first five rows of the softmax output.
first_rows = activation2.output[:5]
print(first_rows)




In [None]:
#Ep. 5
#Activation Functions
# Every neuron has an activation function associated with it
# Generally the output layer has a different activation function than the hidden layers
#
#  - Step activation function
#
#            {  1 x > 0  }
#        y = {           }
#            {  0 x <= 0 }
#
#  - Sigmoid activation function (A little more reliable than Step to train neural net due to the granularity of the output)
#    (Has the vanishing gradient problem)
#
#        y = ___1___
#            1 + e^-x
#
#  - Rectified Linear Unit activation function (ReLU) (It's granular so optimizable and it's fast due to simplicity) (Most popular)
#
#            {  x x > 0  }
#        y = {           }
#            {  0 x <= 0 }
#
#
import numpy as np

np.random.seed(0)

X = [[1.0, 2.0, 3.0, 2.5],
     [2.0, 5.0, -1.0, 2.0],
     [-1.5, 2.7, 3.3, -0.8]]

inputs = [0, 2, -1, 3.3, -2.7, 1.1, 2.2, -100]

# ReLU: keep values above zero, replace everything else with 0.
# Spelled out with branches this is:
#   if i > 0: output.append(i)
#   else:     output.append(0)
output = [max(0, i) for i in inputs]

print(output)


[0, 2, 0, 3.3, 0, 1.1, 2.2, 0]


In [None]:
#Function to generate a dataset, it is possible to tweak the values and get many different data shapes and complexity

def spiral_data(points, classes):
    """Generate a 2-D spiral dataset with one arm per class.

    Args:
        points: number of samples generated for each class.
        classes: number of classes (spiral arms).

    Returns:
        (X, y): X is a (points*classes, 2) float array of coordinates and
        y is a uint8 array of length points*classes holding the class index
        of each row.
    """
    total = points * classes
    X = np.zeros((total, 2))
    y = np.zeros(total, dtype='uint8')
    for class_number in range(classes):
        # Rows [start, stop) belong to this class.
        start = points * class_number
        stop = start + points
        radius = np.linspace(0.0, 1, points)
        # Gaussian jitter on the angle so the arm is not a perfect curve.
        jitter = np.random.randn(points) * 0.2
        theta = np.linspace(class_number*4, (class_number+1)*4, points) + jitter
        X[start:stop] = np.column_stack((radius*np.sin(theta*2.5),
                                         radius*np.cos(theta*2.5)))
        y[start:stop] = class_number
    return X, y

In [None]:
#Softmax Activation Function
#Input -> Exponentiate -> Normalize -> Output

#Exponentiation Function
# It's meant to avoid clipping values, so that the meaning of negative values is not lost
import math
import numpy as np

layer_outputs = [4.8, 1.21, 2.385]

E = math.e

# Exponentiation: np.exp is applied to every value in the list at once.
exp_layer = np.exp(layer_outputs)
print("Exponentiated values:", exp_layer, '', sep="\n")

# Normalization
# Turns the exponentiated outputs into a probability distribution:
#    y_i = u_i / (u_1 + u_2 + ... + u_n)
# i.e. each output neuron's value divided by the sum over all output neurons.
norm_values = exp_layer / exp_layer.sum()

# The last number printed is the total of the normalized values.
print("Normalized values:", norm_values, "", sum(norm_values), "", sep="\n")
print("")

#Up until this point:
# Input -> Exponentiate -> Normalize -> Output
# That is equal to:
# Input -> Softmax -> Output






Exponentiated values:
[121.51041752   3.35348465  10.85906266]

Normalized values:
[0.89528266 0.02470831 0.08000903]

0.9999999999999999



In [None]:
#Ep. 6 Calculating loss with categorical cross-entropy

#Example of loss function: Mean absolute error and it is used with regression
#as it gets closer to the value the mean absolute error gets lower and lower

#Categorical cross-entropy
#One-hot encoding

#logarithm (ln / log):
# solving for x in e ** x = b, where (b) is the input to the log and (e) is Euler's number, the base of natural logarithms
'''
import numpy as np
import math

b = 5.2

print(math.e ** np.log(b))
print(np.log(b))
'''

import math

softmax_output = [0.7, 0.1, 0.2]
target_output = [1, 0, 0]
target_class = 0

# Full categorical cross-entropy: add up log(confidence) * target over every
# class, then negate the total.
weighted_logs = 0.0
for confidence, target in zip(softmax_output, target_output):
    weighted_logs += math.log(confidence) * target
loss = -weighted_logs
print(loss)

# With a one-hot target only the target class's term is non-zero, so the
# loss reduces to the negative log of that one confidence.
loss = -math.log(softmax_output[target_class])
print(loss)

#The softmax output represents confidence: by this formula, as confidence rises the loss falls, and as confidence falls the loss rises

In [None]:
pip install nnfs

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Installing collected packages: nnfs
Successfully installed nnfs-0.5.1


In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

class Layer_Dense:
    """Dense layer: forward() stores inputs . weights + biases on self.output."""

    def __init__(self, n_inputs, n_neurons):
        # Gaussian samples centred on 0 (randn), shrunk by 0.10 so the
        # initial weights are small.
        gaussian = np.random.randn(n_inputs, n_neurons)
        self.weights = 0.10 * gaussian
        # Biases start at zero, one per neuron.
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases

class Activation_ReLU:
  """ReLU activation: element-wise max(0, x), stored on self.output."""

  def forward(self, inputs):
    clipped_at_zero = np.maximum(0, inputs)
    self.output = clipped_at_zero

class Activation_Softmax:
  """Row-wise softmax; the probabilities are stored on self.output."""

  def forward(self, inputs):
    # Shift every row by its own maximum so the largest exponent is 0;
    # this prevents overflow in np.exp without changing the result.
    shifted = inputs - np.max(inputs, axis=1, keepdims=True)
    exp_values = np.exp(shifted)
    # Normalise each row so it sums to 1.
    self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

class Loss:
    """Base class for losses; subclasses provide forward(output, y)."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from self.forward."""
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss


class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy: -log of the confidence given to the true class."""

    def forward(self, y_pred, y_true):
        """Compute one loss value per sample.

        Args:
            y_pred: predicted probabilities, one row per sample.
            y_true: either a 1-D sequence of class indices or a 2-D one-hot
                array with the same layout as y_pred. Lists are accepted.

        Returns:
            1-D array of negative log-likelihoods, one per sample.

        Raises:
            ValueError: if y_true is neither 1-D nor 2-D.
        """
        # Accept plain Python lists as well as arrays.
        y_true = np.asarray(y_true)
        samples = len(y_pred)
        # Clip away exact 0 and 1 so np.log never receives 0.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if y_true.ndim == 1:
            # Class indices: pick the predicted confidence of the true class in each row.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot rows: the mask zeroes out every column except the true class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got {y_true.ndim} dimensions"
            )

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods


# Dataset from nnfs: requested with 100 samples and 3 classes.
X, y = spiral_data(samples=100, classes=3)

# Network: 2 input features -> 3 neurons (ReLU) -> 3 outputs (softmax).
dense1, activation1 = Layer_Dense(2, 3), Activation_ReLU()
dense2, activation2 = Layer_Dense(3, 3), Activation_Softmax()

# Forward pass: each stage consumes the .output of the previous one.
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# First five rows of the softmax output.
print(activation2.output[:5])

# Mean categorical cross-entropy of the predictions against the labels y.
loss_function = Loss_CategoricalCrossEntropy()
loss = loss_function.calculate(activation2.output, y)
print('Loss: ', loss)

[[0.33333334 0.33333334 0.33333334]
 [0.33331734 0.3333183  0.33336434]
 [0.3332888  0.33329153 0.33341965]
 [0.33325943 0.33326396 0.33347666]
 [0.33323312 0.33323926 0.33352762]]
Loss:  1.098445
