### We look for the derivative: it tells us how fast the function is changing. The second derivative (the "acceleration") tells us how fast the derivative itself is changing.

In [2]:
import numpy as np
from plotly.graph_objs import *
import random 
import plotly.express as px 

$$z = (x-1)^2 + (y-2)^2$$

In [3]:
# Sample both axes on [-150, 150]; np.linspace uses its default of 50 points.
x, y = np.linspace(-150, 150), np.linspace(-150, 150)

# Build the 2-D grid and evaluate z = (x-1)^2 + (y-2)^2 at every grid node.
xx, yy = np.meshgrid(x, y)
zz = (xx - 1)**2 + (yy - 2)**2

In [5]:
# Draw z over the (x, y) grid as an interactive 3-D surface.
fig = Figure()
fig.add_trace(Surface(x=xx, y=yy, z=zz))
fig.show()

In [6]:
# Same function drawn as contour lines on a 600x600 canvas.
fig = Figure(layout=Layout(width=600, height=600))
fig.add_trace(
    Contour(x=x, y=y, z=zz, colorscale='viridis', contours_coloring='lines')
)
fig.show()

$$z = (x-1)^2 + (y-2)^2$$

$$\frac{\partial z}{\partial x} = 2\cdot(x-1)$$
$$\frac{\partial z}{\partial y} = 2\cdot(y-2)$$


$$x_{n+1} = x_n-\alpha\cdot \frac{\partial z}{\partial x}$$

$$y_{n+1} = y_n-\alpha\cdot \frac{\partial z}{\partial y}$$

where $\alpha$ is the learning rate (the step size).


In [7]:
def gradient_descent(learning_rate=0.1, iterations=100, initial_x=0, initial_y=0):
    """Minimise z = (x-1)^2 + (y-2)^2 with plain gradient descent.

    Parameters
    ----------
    learning_rate : step size multiplying each partial derivative.
    iterations : number of update steps to perform.
    initial_x, initial_y : coordinates of the starting point.

    Returns
    -------
    (x_list, y_list, z_list) : three lists of length ``iterations + 1`` holding
    the visited x values, the visited y values and the value of z at each
    visited point; the starting point is the first entry.

    Note: a later cell of this notebook defines another function with the same
    name (for linear regression), which replaces this one once it is run.
    """
    cur_x, cur_y = initial_x, initial_y
    xs, ys = [cur_x], [cur_y]
    zs = [(cur_x-1)**2 + (cur_y-2)**2]

    for _ in range(iterations):
        # Analytic partial derivatives dz/dx and dz/dy at the current point.
        dz_dx = 2*(cur_x-1)
        dz_dy = 2*(cur_y-2)

        # Move against the gradient.
        cur_x = cur_x - learning_rate*dz_dx
        cur_y = cur_y - learning_rate*dz_dy

        xs.append(cur_x)
        ys.append(cur_y)
        zs.append((cur_x-1)**2 + (cur_y-2)**2)

    return xs, ys, zs

In [8]:
# Run the descent with the defaults: learning rate 0.1, 100 steps, start at (0, 0).
x_gd, y_gd, z_gd = gradient_descent()

In [9]:
# Last x visited by the descent; z = (x-1)^2 + (y-2)^2 is minimised at x = 1.
x_gd[-1]

0.9999999997962964

In [10]:
# Last y visited by the descent; z = (x-1)^2 + (y-2)^2 is minimised at y = 2.
y_gd[-1]

1.9999999995925928

In [11]:
# Re-sample the function on the smaller window [-2, 2] x [-2, 2]
# (this overwrites the earlier x, y, xx, yy and zz).
x = np.linspace(-2, 2)
y = np.linspace(-2, 2)
xx, yy = np.meshgrid(x, y)
zz = (xx-1)**2 + (yy-2)**2

# Semi-transparent surface with the recorded gradient-descent path on top.
fig = Figure(layout=Layout(width=600, height=600))
fig.add_trace(Surface(x=xx, y=yy, z=zz, opacity=0.4))
fig.add_trace(Scatter3d(x=x_gd, y=y_gd, z=z_gd))
fig.show()

### Linear Regression 

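We square the errors because every error term must be non-negative; otherwise positive and negative residuals would cancel out. Either the squared error or the absolute error achieves this, but the absolute value is not differentiable at zero, so we use the square.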

$$\hat y = w_1 + w_2\cdot x $$

Minimize $(y_i-\hat{y_i})^2$ for every $i$

$$L = \frac{1}{n}\sum_{i=1}^n(w_1 + w_2\cdot x_i - y_i)^2$$


$$\frac{\partial L}{\partial w_1} = \frac{2}{n}\sum_{i=1}^n(w_1 + w_2\cdot x_i - y_i)$$

$$\frac{\partial L}{\partial w_2} = \frac{2}{n}\sum_{i=1}^n x_i(w_1 + w_2\cdot x_i - y_i)$$

In [12]:
# Seed NumPy's global generator so the noise below (and every later
# np.random call in the notebook) is identical on each Restart & Run All.
np.random.seed(42)

# Synthetic data: the line y = 2x + 1 plus Gaussian noise scaled by 20.
x = np.linspace(0,100,100)
y = 2*x+1 + np.random.randn(100)*20

px.scatter(x=x,y=y)

Gradients can explode or vanish. Here we want to stop them from exploding, so we clip each gradient to the range $[-1000, 1000]$.

In [17]:
def gradient_descent(x, y, learning_rate=0.0005,
                     iterations=1000,
                     initial_w1=4,
                     initial_w2=4,
                     clip_value=1000):
    """Fit ``y ~ w1 + w2*x`` by full-batch gradient descent on the mean squared error.

    Parameters
    ----------
    x, y : array-likes of the same shape (lists or NumPy arrays); converted to
        float arrays internally.
    learning_rate : step size multiplying each gradient.
    iterations : number of update steps to perform.
    initial_w1, initial_w2 : starting intercept and slope.
    clip_value : each gradient is clipped to ``[-clip_value, clip_value]``
        before the update; pass ``None`` to disable clipping.

    Returns
    -------
    (w1_list, w2_list) : two lists of length ``iterations + 1`` with the value
    of each weight after every step; the initial values are the first entries.

    Raises
    ------
    ValueError : if ``x`` and ``y`` differ in shape or are empty.
    """
    # Accept plain Python sequences as well as arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    n = len(x)
    if n == 0:
        raise ValueError("x and y must not be empty")

    w1 = initial_w1
    w2 = initial_w2
    w1_list, w2_list = [w1], [w2]

    for _ in range(iterations):
        # Partial derivatives of L = (1/n) * sum((w1 + w2*x - y)^2).
        gradient_w1 = (2/n)*np.sum(w1+w2*x-y)
        gradient_w2 = (2/n)*np.sum(x*(w1+w2*x-y))

        # Bound the step size so a large gradient cannot blow the weights up.
        if clip_value is not None:
            gradient_w1 = np.clip(gradient_w1, -clip_value, clip_value)
            gradient_w2 = np.clip(gradient_w2, -clip_value, clip_value)

        w1 = w1 - learning_rate*gradient_w1
        w2 = w2 - learning_rate*gradient_w2

        w1_list.append(w1)
        w2_list.append(w2)

    return w1_list, w2_list

In [18]:
# Fit the line to the noisy data using the default hyper-parameters.
w1_list, w2_list = gradient_descent(x,y)

In [16]:
# NOTE(review): this cell carries execution count 16, lower than the cells
# around it, and duplicates the next cell -- consider removing it.
w1_list[-1]

np.float64(1.1117441074056698)

In [19]:
# Final intercept estimate (w1); the data were generated with intercept 1.
w1_list[-1]

np.float64(1.1117441074056698)

In [20]:
# Final slope estimate (w2); the data were generated with slope 2.
w2_list[-1]

np.float64(2.2542997590061624)

In [21]:
# Scatter of the data with the fitted line w1 + w2*x (final weights) on top.
fig = px.scatter(x=x, y=y).add_trace(
    Scatter(x=x, y=w1_list[-1] + w2_list[-1]*x)
)
fig.show()

With a batch size we need an inner loop over the batches; without one, each iteration (epoch) uses the whole dataset at once. The mini-batch approach avoids using the entire dataset for every update, which reduces memory use.

In [22]:
def mini_batch_gradient_descent(x, y, learning_rate=0.0005,
                                iterations=1000,
                                initial_w1=4,
                                initial_w2=4,
                                batch_size=10,
                                clip_value=1000):
    """Fit ``y ~ w1 + w2*x`` by mini-batch gradient descent on the mean squared error.

    Each of the ``iterations`` passes shuffles the sample indices with
    ``np.random.permutation`` and then performs one weight update per slice of
    ``batch_size`` indices, using only the samples in that slice.

    Parameters
    ----------
    x, y : array-likes of the same shape (lists or NumPy arrays); converted to
        float arrays internally.
    learning_rate : step size multiplying each gradient.
    iterations : number of passes over the shuffled data.
    initial_w1, initial_w2 : starting intercept and slope.
    batch_size : number of samples per update; the last batch of a pass may be
        smaller.
    clip_value : each gradient is clipped to ``[-clip_value, clip_value]``
        before the update; pass ``None`` to disable clipping.

    Returns
    -------
    (w1_list, w2_list) : two lists of length ``iterations + 1`` with the value
    of each weight at the end of every pass; the initial values are the first
    entries.

    Raises
    ------
    ValueError : if ``x`` and ``y`` differ in shape or ``batch_size < 1``.
    """
    # Accept plain Python sequences as well as arrays (fancy indexing needs arrays).
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    w1 = initial_w1
    w2 = initial_w2
    w1_list, w2_list = [w1], [w2]
    n = len(x)

    for _ in range(iterations):
        indices = np.random.permutation(n)
        for start in range(0, n, batch_size):
            batch_indices = indices[start:start + batch_size]
            x_batch = x[batch_indices]
            y_batch = y[batch_indices]
            m = len(x_batch)  # the final batch of a pass can be shorter

            # Gradients of the mean squared error over THIS batch only
            # (the previous version used the full x, y and n here).
            residual = w1 + w2*x_batch - y_batch
            gradient_w1 = (2/m)*np.sum(residual)
            gradient_w2 = (2/m)*np.sum(x_batch*residual)

            # Bound the step size so a large gradient cannot blow the weights up.
            if clip_value is not None:
                gradient_w1 = np.clip(gradient_w1, -clip_value, clip_value)
                gradient_w2 = np.clip(gradient_w2, -clip_value, clip_value)

            w1 = w1 - learning_rate*gradient_w1
            w2 = w2 - learning_rate*gradient_w2

        # Record the weights once per pass over the data.
        w1_list.append(w1)
        w2_list.append(w2)

    return w1_list, w2_list

In [23]:
# Refit with mini-batches (default batch_size of 10); this overwrites the
# w1_list / w2_list produced by the full-batch run.
w1_list, w2_list = mini_batch_gradient_descent(x,y)

In [24]:
# Scatter of the data with the mini-batch fit w1 + w2*x (final weights) on top.
fig = px.scatter(x=x, y=y).add_trace(
    Scatter(x=x, y=w1_list[-1] + w2_list[-1]*x)
)
fig.show()

The least stable method is stochastic gradient descent (one sample per update).

The goal is to find the parameters (the weights and biases) of the linear model.