In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 0. Simulate dataset (*)

Simulate datasets according to these rules:

- set random seed to 42
- (1000,2) samples from $X \sim \mathcal{U}(0,1)$ , i.e. 1000 rows, 2 columns. 
- 1000 samples from $\epsilon \sim \mathcal{N}(0,1)$
- $y = 3x_1 + 5x_2 + 3 + \epsilon$ , where $x_i$ is column $i$ of $X$

Finally add a column of ones for the intercept to $X$.

---

In [84]:
np.random.seed(42)

X = np.random.rand(1000, 2)
X = np.c_[np.ones(len(X)), X]

epsilon = np.random.normal(loc=0, scale=1, size=1000)

y_data = lambda x, epsilon: (3*x[:,1]) + (5*x[:,2]) + 3 + epsilon

y = y_data(x= X, epsilon= epsilon)

In [85]:
X

array([[1.        , 0.37454012, 0.95071431],
       [1.        , 0.73199394, 0.59865848],
       [1.        , 0.15601864, 0.15599452],
       ...,
       [1.        , 0.75137509, 0.65695516],
       [1.        , 0.95661462, 0.06895802],
       [1.        , 0.05705472, 0.28218707]])

## 1. Gradient descent - learning rate (*)

Use gradient descent to calculate $\vec{\theta} = (\theta_0, \theta_1, \theta_2)^T$ 

&nbsp; a) Use $\eta = 0.1$ and calculate $\vec{\theta}$ for each fifth epoch from 1 to 500. So the procedure is as follows:
- calculate $\vec{\theta}$ for epochs = 1
- calculate $\vec{\theta}$ for epochs = 6
- ...
- calculate $\vec{\theta}$ for epochs = 496

Plot these $\vec{\theta}$ values against epochs. (*)

&nbsp; b) Do the same as for a) but with learning rate $\eta = 0.01$, 5000 epochs and for each 20th epoch. What do you notice when changing the learning rate? (*)

&nbsp; c) Experiment with larger and smaller $\eta$ and see what happens.

---

**a)**

In [108]:
def gradient_descent(X, y, learning_rate=0.1, epochs=100):
    
    i = 1

    dict_epochs = {}
    
    m = len(X)
    theta = np.random.randn(X.shape[1], X.shape[0])
    while i <= epochs:

        for _ in range(i):
            gradient = 2 / m * X.T @ (X @ theta - y)
            theta -= learning_rate * gradient

        print(theta)
        #dict_epochs[f"epochs = {i}"] = theta

        i += 5

    return dict_epochs

    
ep = gradient_descent(X, y, learning_rate=0.1, epochs= 500)

[[ 1.51358103  1.33835709  0.41257269 ...  1.40963481  1.10216699
   1.61848479]
 [ 1.78172927  1.61679932  0.84076098 ...  1.13698613 -1.66149731
   0.48225638]
 [ 0.2660586  -0.65138732  1.96689971 ...  0.68419905  1.13485584
   0.0671065 ]]
[[ 4.79861461  4.65598609  1.76340331 ...  6.37162827  3.32321836
   2.11062647]
 [ 3.10979559  2.97680554  1.37556854 ...  3.30149235 -0.47139551
   0.65925247]
 [ 1.8190566   1.00311981  2.43172286 ...  3.00380096  2.09595004
   0.29718506]]
[[ 5.51283246  5.32162355  2.15589331 ...  7.38357016  3.70457911
   2.23021864]
 [ 2.86080723  2.73036177  1.29895549 ...  3.14571949 -0.22395415
   0.59288057]
 [ 1.84231941  1.13835814  2.20947142 ...  2.96618436  1.92270408
   0.30237546]]
[[ 5.93236302  5.66976025  2.46278577 ...  7.92822968  3.85370798
   2.30993632]
 [ 2.32263208  2.19497302  1.10302843 ...  2.58853759 -0.10627348
   0.47835288]
 [ 1.60213897  1.02885122  1.83555288 ...  2.52217291  1.54556169
   0.26771882]]
[[ 6.37083427  6.0312788