In [1]:
import numpy as np

In [2]:
## N training examples (rows)
## M features (columns)

# Feature matrix: one row per training example, one column per feature.
X = np.array([[1, 2, 3],
              [2, 3, 4],
              [3, 4, 5],
              [4, 5, 6],
              [5, 6, 7]])

# Targets as a column vector, one row per row of X.
Y = np.array([[6],
              [7],
              [8],
              [9],
              [10]])
print(X)
print(Y) ## Nx1 matrix

[[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 6]
 [5 6 7]]
[[ 6]
 [ 7]
 [ 8]
 [ 9]
 [10]]


###### X is an NxM matrix and Y is an Nx1 matrix

#### Implementation of Linear Regression by minimizing the least-squares error cost

###### We prepend a column X0 of all 1s to X so that the intercept (bias) term is handled by the same matrix multiplication as the other params. This makes X an Nx(M+1) matrix.

In [3]:

# Prepend the intercept column (all ones) to the left of the existing features.
# NOTE: this rebinds X to a wider array, so running the cell a second time
# would prepend yet another column of ones -- re-run the data cell first.
X = np.hstack((np.ones((len(X), 1)), X))
X

array([[1., 1., 2., 3.],
       [1., 2., 3., 4.],
       [1., 3., 4., 5.],
       [1., 4., 5., 6.],
       [1., 5., 6., 7.]])

###### We set our initial learning rate. It controls how much each gradient descent step changes the params. A larger rate makes the descent faster but can overshoot the minimum, making the cost oscillate or even grow, whereas a smaller rate makes each step have very little effect, so convergence is slow.
###### We set the number of iterations as well. We can increase it until the cost stops decreasing noticeably.
###### After adding the X0 column, m = M+1 (the number of columns of X) and n = N (the number of rows of X).

In [4]:
# Gradient descent hyperparameters.
learning_rate = 0.005  # multiplier applied to the gradient in every update
n_iterations = 1000    # how many times the update loops run

# n = number of rows of X, m = number of columns of X (as X stands when this cell runs).
n, m = X.shape

###### Set the initial values of the params. Here they are random; initializing them all to 0 works as well.

In [5]:
## Initialize random weights
# Seed NumPy's global RNG so the starting weights -- and therefore every
# number produced from them -- are the same on each Restart & Run All.
np.random.seed(42)
weights = np.random.rand(m, 1)  # uniform samples in [0, 1), one per column of X
weights.shape ## (m, 1) column vector

(4, 1)

#### Batch Gradient Descent

###### Run batch gradient descent: in each iteration the gradient of the cost is computed over all the examples, and the params are then updated by subtracting the learning rate times that gradient.

In [6]:
## Batch grad descent: every update uses all rows of X at once.
for _ in range(n_iterations):
    residuals = X @ weights - Y   # prediction - y, one row per example
    gradient = X.T @ residuals    # summed over the examples (not divided by n)
    weights = weights - learning_rate * gradient  ## theta = theta - alpha * gradient
weights

array([[ 1.83259682],
       [-0.81277905],
       [ 0.45974968],
       [ 1.35332073]])

###### The predictions are obtained by multiplying X by the params

In [7]:
# Apply the current weights to every row of X to get the predictions.
predictions = np.matmul(X, weights)
predictions

array([[ 5.99927932],
       [ 6.99957067],
       [ 7.99986203],
       [ 9.00015339],
       [10.00044475]])

#### Stochastic Gradient Descent

In [43]:
# Fresh random starting weights for this run. The global RNG is seeded first
# so the starting point is identical on every Restart & Run All.
np.random.seed(42)
weights = np.random.rand(m, 1)  # uniform samples in [0, 1), one per column of X
weights.shape

(4, 1)

In [44]:
# Per-example updates: each pass visits the rows of X in index order and
# updates the weights after every single example.
for _ in range(n_iterations):
    for i in range(n):
        x_row = X[i:i + 1]                    # row i kept 2-D: shape (1, m)
        error = x_row @ weights - Y[i:i + 1]  # (1, 1): prediction - y for row i
        weights = weights - learning_rate * (x_row.T @ error)
weights

array([[ 1.62776816],
       [-0.7983339 ],
       [ 0.22499107],
       [ 1.57343173]])

In [45]:
# Apply the current weights to every row of X to get the predictions.
predictions = np.matmul(X, weights)
predictions

array([[ 5.99971157],
       [ 6.99980046],
       [ 7.99988935],
       [ 8.99997824],
       [10.00006713]])