In [None]:
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interactive
import ipywidgets as widgets
from ipywidgets import fixed
from mpl_toolkits.mplot3d import Axes3D

def generate_x_widget(name, value):
    """Build the slider used to pick the x-coordinate of a training point.

    Args:
        name: Label for the slider; ': ' is appended to form the description.
        value: Initial position of the slider.

    Returns:
        A ``widgets.FloatSlider`` over [0, 1] with step 0.01 and continuous
        updates switched off.
    """
    slider_options = dict(
        value=value,
        min=0.,
        max=1.,
        step=0.01,
        description=name + ': ',
        continuous_update=False,
    )
    return widgets.FloatSlider(**slider_options)
def generate_y_widget(name):
    """Build the slider used to pick the y-coordinate of a training point.

    Args:
        name: Label for the slider; ': ' is appended to form the description.

    Returns:
        A ``widgets.FloatSlider`` over [0, 1] with step 0.01, starting at 0.5,
        with continuous updates switched off.
    """
    slider_options = dict(
        value=0.5,
        min=0.,
        max=1.,
        step=0.01,
        description=name + ': ',
        continuous_update=False,
    )
    return widgets.FloatSlider(**slider_options)
def generate_w_coord_widget(name):
    """Build the slider used to pick one coordinate of the initial weights.

    Args:
        name: Label for the slider; ': ' is appended to form the description.

    Returns:
        A ``widgets.FloatSlider`` over [-1, 1] with step 0.02, starting at 0,
        with continuous updates switched off.
    """
    slider_options = dict(
        value=0,
        min=-1.,
        max=1.,
        step=0.02,
        description=name + ': ',
        continuous_update=False,
    )
    return widgets.FloatSlider(**slider_options)
def generate_iter_num_widget(max_iter):
    """Build the slider used to pick which SGD iteration to display.

    Args:
        max_iter: Number of iterations; the slider ranges over
            0 .. max_iter - 1 in steps of 2 and starts at max_iter // 10.

    Returns:
        A horizontal ``widgets.IntSlider`` with an integer readout and
        continuous updates switched off.
    """
    first_shown = max_iter // 10
    last_selectable = max_iter - 1
    slider_options = dict(
        value=first_shown,
        min=0,
        max=last_selectable,
        step=2,
        description='iteration: ',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d',
    )
    return widgets.IntSlider(**slider_options)

def generate_parametrization_choice_widget():
    """Build the dropdown that selects between the 'usual' and 'nn' parametrizations.

    Returns:
        An enabled ``widgets.Dropdown`` whose options are 'usual' and 'nn'.
    """
    choices = ['usual', 'nn']
    return widgets.Dropdown(options=choices,
                            description='parametrization: ',
                            disabled=False)


# Introduction

In this notebook we will deal with a toy example of learning an overparametrized model with SGD: we will learn a quadratic function on $[0,1]$ from only two samples. We chose this problem because it is easy to visualize: we can plot in at most 3 dimensions, so we need at most 3 model parameters. On the other hand, we need at least 2 samples for SGD to be meaningful.



We are going to compare the following two parametrizations of quadratic functions:
\begin{align*}
\{w_0x^2 + w_1x + w_2\}_{{\bf w} \in \mathbb{R}^3}\text{ and }\{w_1(w_0x +1)^2 + w_2\}_{{\bf w} \in \mathbb{R}^3}
\end{align*}
in terms of interaction with SGD. The first one corresponds to our usual polynomial features regression, and the second one is more like a "neural net style" parametrization.



# Part 1

So we consider quadratic functions with two parametrizations:
$$
q_{usual}({\bf w}, x) := w_0x^2 + w_1x + w_2
$$
and
$$
q_{nn}({\bf w}, x) := w_1(w_0x +1)^2 + w_2.
$$

In the next cell you will implement the functions that compute the corresponding stochastic gradients. Each of those functions takes 3 inputs: $x_{train} \in \mathbb{R}$, $y_{train} \in \mathbb{R}$ and ${\bf w} = [w_0, w_1, w_2] \in \mathbb{R}^3$. The output should be
$$
\nabla_{{\bf w}}\left(\frac12 \left(q({\bf w}, x_{train}) - y_{train}\right)^2\right),
$$
where $q = q_{usual}$ for the first function, and $q = q_{nn}$ for the second.

**Implement the functions in the cell below.**


In [None]:
def usual_parametrization_grad(x_train, y_train, w):
    """Stochastic gradient of the squared loss for the usual parametrization.

    The model is q_usual(w, x) = w_0 * x^2 + w_1 * x + w_2 and the loss on a
    single sample is 0.5 * (q_usual(w, x_train) - y_train)^2.

    Args:
        x_train: Scalar training input.
        y_train: Scalar training target.
        w: Sequence of the three weights [w_0, w_1, w_2].

    Returns:
        NumPy array of shape (3,) holding the gradient of the loss w.r.t. w.
    """
    ### start usual_grad ###
    w_0, w_1, w_2 = w
    residual = w_0 * x_train**2 + w_1 * x_train + w_2 - y_train
    # Chain rule: grad(0.5 * r^2) = r * dq/dw, with dq/dw = [x^2, x, 1].
    return residual * np.array([x_train**2, x_train, 1.0])
    ### end usual_grad ###

def nn_parametrization_grad(x_train, y_train, w):
    """Stochastic gradient of the squared loss for the "nn style" parametrization.

    The model is q_nn(w, x) = w_1 * (w_0 * x + 1)^2 + w_2 and the loss on a
    single sample is 0.5 * (q_nn(w, x_train) - y_train)^2.

    Args:
        x_train: Scalar training input.
        y_train: Scalar training target.
        w: Sequence of the three weights [w_0, w_1, w_2].

    Returns:
        NumPy array of shape (3,) holding the gradient of the loss w.r.t. w.
    """
    ### start nn_grad ###
    w_0, w_1, w_2 = w
    inner = w_0 * x_train + 1.0
    residual = w_1 * inner**2 + w_2 - y_train
    # Chain rule: grad(0.5 * r^2) = r * dq/dw, with
    # dq/dw_0 = 2 * w_1 * x * (w_0 * x + 1), dq/dw_1 = (w_0 * x + 1)^2, dq/dw_2 = 1.
    return residual * np.array([2.0 * w_1 * x_train * inner, inner**2, 1.0])
    ### end nn_grad ###



# Part 2

In this part we are going to see how SGD interacts with both parametrizations. The following cell gives an implementation of SGD. You don't need to write any code here, just **run the next cell**.


In [None]:
def SGD(one_sample_grad, step_size, n_iter, X_train, y_train, w_initial, rng=None):
    """Run constant-step-size SGD, using one uniformly sampled training point per step.

    Args:
        one_sample_grad: Callable ``(x, y, w) -> gradient`` returning the
            gradient of the one-sample loss at weights ``w``.
        step_size: Step size used at every iteration.
        n_iter: Number of SGD steps to perform.
        X_train: Indexable collection of training inputs.
        y_train: Indexable collection of training targets (same length).
        w_initial: Starting weights; left unmodified.
        rng: Optional ``numpy.random.Generator`` used to sample the point
            indices. When omitted, NumPy's global generator is used.

    Returns:
        List of ``n_iter + 1`` arrays: the initial weights followed by the
        weights after each step.
    """
    # Work on a float copy: an integer-typed start (e.g. [0, 0, 0]) would make
    # the in-place update below fail with a casting error.
    w_current = np.array(w_initial, dtype=float)
    trajectory = [np.copy(w_current)]

    n_samples = len(y_train)
    if rng is None:
        point_indices = np.random.randint(low=0, high=n_samples, size=n_iter)
    else:
        point_indices = rng.integers(low=0, high=n_samples, size=n_iter)

    for current_pt_index in point_indices:
        w_current -= step_size * one_sample_grad(X_train[current_pt_index], y_train[current_pt_index], w_current)
        # Store a snapshot, since w_current keeps being updated in place.
        trajectory.append(np.copy(w_current))

    return trajectory



In the next cell we run SGD for both parametrizations for 3000 steps. We initialize weights at zero and set the step size to be constant $0.05$. After that we visualize the learned functions for different iterations.

**Run the following cell. For each parametrization report after how many iterations SGD converged to the interpolating solution.**


In [None]:
%matplotlib inline

X_train = [0.3, 0.7]
y_train = [0.09, 0.49]
n_iter = 3000
w_initial=np.zeros(3)
step_size = 0.05

usual_traj = SGD(one_sample_grad=usual_parametrization_grad,
               step_size=step_size,
               n_iter=n_iter,
               X_train=X_train,
               y_train=y_train,
               w_initial=w_initial)

nn_traj = SGD(one_sample_grad=nn_parametrization_grad,
               step_size=step_size,
               n_iter=n_iter,
               X_train=X_train,
               y_train=y_train,
               w_initial=w_initial)

def plot_iterations(iter_num):
    """Plot both learned quadratics at SGD iteration ``iter_num`` together with the training points.

    Reads the module-level ``usual_traj``, ``nn_traj``, ``X_train`` and ``y_train``.
    """
    u_0, u_1, u_2 = usual_traj[iter_num]
    v_0, v_1, v_2 = nn_traj[iter_num]

    grid = np.linspace(0, 1, 100)
    usual_curve = u_0 * grid**2 + u_1 * grid + u_2
    nn_curve = v_1 * (v_0 * grid + 1)**2 + v_2

    # Drop the figure left over from the previous slider position before drawing.
    plt.close()
    plt.plot(grid, usual_curve, label='usual')
    plt.plot(grid, nn_curve, label='nn style')
    plt.scatter(X_train, y_train, label='training points')
    plt.legend()
    plt.show()

# Slider over the SGD iterations; each new value re-draws both learned functions.
iteration_slider = generate_iter_num_widget(n_iter)
interactive_plot = interactive(plot_iterations, iter_num=iteration_slider)
interactive_plot


# Part 3

Now we visualize the training process in the weight space. Run the next cell to see how the weights change during SGD. You can choose the parametrization and the initial weights. The 3d plot should be interactive (you should be able to rotate it with your mouse). Darker points correspond to earlier iterations.

- **Explain how you can visually observe the result of part b of the problem by looking at the trajectory for the usual parametrization.**
- **Do you observe anything analogous for the "neural network" style parametrization?**


In [None]:
# Select matplotlib's `notebook` backend for the plot in this cell.
# The magic is deliberately repeated several times because it sometimes doesn't
# work as expected on the first try.
# If the output of this cell still doesn't look right, try rerunning the cell.
%matplotlib notebook
%matplotlib notebook
%matplotlib notebook
%matplotlib notebook
%matplotlib notebook
%matplotlib notebook

def plot_training_trajectory(parametrization, w_0, w_1, w_2):
    """Run SGD from the given initial weights and scatter-plot its trajectory in weight space.

    Args:
        parametrization: 'nn' selects ``nn_parametrization_grad``; any other
            value selects ``usual_parametrization_grad``.
        w_0, w_1, w_2: Coordinates of the initial weight vector.

    Reads the module-level ``step_size``, ``n_iter``, ``X_train`` and ``y_train``.
    """
    if parametrization == 'nn':
        one_sample_grad = nn_parametrization_grad
    else:
        one_sample_grad = usual_parametrization_grad
    traj = SGD(one_sample_grad=one_sample_grad,
               step_size=step_size,
               n_iter=n_iter,
               X_train=X_train,
               y_train=y_train,
               w_initial=np.array([w_0, w_1, w_2]))
    traj = np.array(traj)

    # Close the figure from the previous widget update; otherwise every
    # slider move would leave one more open figure behind.
    plt.close()
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Colour values grow with the iteration index, so colour encodes time.
    ax.scatter(traj[:, 0], traj[:, 1], traj[:, 2], c=np.linspace(0, 10, len(traj))**0.2)
    # Label the axes so the weights stay identifiable after rotating the plot.
    ax.set_xlabel('$w_0$')
    ax.set_ylabel('$w_1$')
    ax.set_zlabel('$w_2$')
    plt.show()

# One dropdown for the parametrization plus one slider per initial-weight coordinate.
trajectory_controls = {'parametrization': generate_parametrization_choice_widget()}
for coord in range(3):
    trajectory_controls['w_%d' % coord] = generate_w_coord_widget('$w_%d$' % coord)

interactive_plot = interactive(plot_training_trajectory, **trajectory_controls)
interactive_plot


# Part 4

Finally, we compare the learned functions that we obtain from the two parametrizations. The next cell lets you choose the training data and runs SGD for both parametrizations for 3000 steps.
- **Observe that SGD doesn't always converge to the interpolating solution. What values of the training data seem to be the hardest for convergence? For which parametrization is it harder to converge?**
- **Now restrict yourself only to regimes where SGD converges. How would you describe the difference between the learned functions?**


In [None]:
%matplotlib inline


n_iter = 3000
w_initial=np.zeros(3)
step_size = 0.05

def plot_learned_functions(x_train_1, x_train_2, y_train_1, y_train_2):
    """Train both parametrizations on two points and plot the final learned functions.

    Args:
        x_train_1, x_train_2: Inputs of the two training points.
        y_train_1, y_train_2: Targets of the two training points.

    Reads the module-level ``step_size``, ``n_iter`` and ``w_initial``.
    """
    X_train = np.array([x_train_1, x_train_2])
    y_train = np.array([y_train_1, y_train_2])

    # Run SGD for the usual parametrization first, then the nn one, keeping
    # only the weights reached after n_iter steps.
    final_weights = {}
    for label, grad_fn in (('usual', usual_parametrization_grad),
                           ('nn', nn_parametrization_grad)):
        trajectory = SGD(one_sample_grad=grad_fn,
                         step_size=step_size,
                         n_iter=n_iter,
                         X_train=X_train,
                         y_train=y_train,
                         w_initial=w_initial)
        final_weights[label] = trajectory[n_iter]

    u_0, u_1, u_2 = final_weights['usual']
    v_0, v_1, v_2 = final_weights['nn']
    grid = np.linspace(0, 1, 100)
    usual_curve = u_0 * grid**2 + u_1 * grid + u_2
    nn_curve = v_1 * (v_0 * grid + 1)**2 + v_2

    # Drop the figure left over from the previous widget update before drawing.
    plt.close()
    plt.ylim(-0.1, 1.1)
    plt.plot(grid, usual_curve, label='usual')
    plt.plot(grid, nn_curve, label='nn style')
    plt.scatter(X_train, y_train, label='training points')
    plt.legend()
    plt.show()

# Sliders for the coordinates of the two training points.
training_point_controls = dict(
    x_train_1=generate_x_widget('$x_1$', 0.3),
    x_train_2=generate_x_widget('$x_2$', 0.7),
    y_train_1=generate_y_widget('$y_1$'),
    y_train_2=generate_y_widget('$y_2$'),
)
interactive_plot = interactive(plot_learned_functions, **training_point_controls)
interactive_plot
