In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Part 1
In this part we study the learning problem from the pdf numerically: our data is one-dimensional $\{x_i\}_{i=1}^n \subseteq \mathbb{R}$ and the labels are $\{y_i = \mathrm{sign}(x_i - \theta)\}_{i=1}^n$, where $\theta$ is some unknown threshold (i.e. we have separable 1d data). We consider the following optimization problem:
$$\min_{m,c}\sum_{i=1}^n \log\left(1 + \exp(-m(x_i - c)y_i)\right).$$

As we saw in the theoretical part, the minimum of this objective is always located infinitely far from the origin, so we cannot take the true minimizer. Therefore, we consider the output of the gradient descent after some finite number of steps.

The next cell contains implementations of the loss, its gradient and the gradient descent with constant step size.


In [None]:
#implementation of the gradient descent
def loss_1d(x, y, m, c):
    """Mean logistic loss of the 1d classifier ``sign(m * (x - c))``.

    Parameters
    ----------
    x : 1d array of data points
    y : 1d array of +/- 1 labels, same length as x
    m, c : real numbers (scale and threshold of the affine function m * (x - c))

    Returns
    -------
    The mean over the data of ``log(1 + exp(-m * (x - c) * y))``.
    """
    # logaddexp(0, t) equals log(1 + exp(t)) but does not overflow to inf for
    # large t and does not round to exactly 0 when exp(t) is tiny.
    return np.mean(np.logaddexp(0, -m * (x - c) * y))

def grad_1d(x, y, m, c):
    """Gradient of ``loss_1d`` with respect to ``(m, c)``.

    Parameters
    ----------
    x : 1d array of data points
    y : 1d array of +/- 1 labels, same length as x
    m, c : real numbers

    Returns
    -------
    Array of length 2: the derivative of the mean loss with respect to m
    (first entry) and with respect to c (second entry).
    """
    margins = m * (x - c) * y
    # row 0: d/dm of the exponent, row 1: d/dc of the exponent (one column per data point)
    inner_derivatives = np.vstack(((x - c) * y, -m * y))
    per_point = -inner_derivatives / (1 + np.exp(margins))
    return np.mean(per_point, axis=1)

def grad_descent_1d(x, y, m_init, c_init, step_size, n_iter):
    """Gradient descent with constant step size on ``loss_1d``.

    Parameters
    ----------
    x : 1d array of data points
    y : 1d array of +/- 1 labels, same length as x
    m_init, c_init, step_size : real numbers
    n_iter : integer number of gradient steps

    Returns
    -------
    (traj, losses): two lists of length ``n_iter + 1``. ``traj[k]`` is the
    array ``[m, c]`` after k steps and ``losses[k]`` is the loss at that point.
    """
    params = np.array([m_init, c_init], dtype=float)
    traj = [params.copy()]
    losses = [loss_1d(x, y, params[0], params[1])]
    for _ in range(n_iter):
        params -= step_size * grad_1d(x, y, params[0], params[1])
        # store a copy: params is updated in place on the next step
        traj.append(params.copy())
        losses.append(loss_1d(x, y, params[0], params[1]))
    return traj, losses


**1) Look at the code and the output of the following cell. What does "threshold with maximum margin" mean in this case?**


In [None]:
#generate 1d data
def gen_1d_data(n=7):
    """Generate separable 1d data together with its max-margin threshold.

    Parameters
    ----------
    n : number of data points, at least 2 so that both classes are present

    Returns
    -------
    x : sorted array of n points drawn uniformly from [0, 1)
    y : array of labels, -1 for the leftmost points and +1 for the rest
    threshold : midpoint between the rightmost -1 point and the leftmost +1 point

    Raises
    ------
    ValueError : if n < 2
    """
    if n < 2:
        # without this check the failure comes from randint with an opaque message
        raise ValueError("gen_1d_data needs n >= 2 to have both classes, got n=%r" % (n,))
    x = np.sort(np.random.rand(n))
    y = np.ones(n)
    # index of the last point labelled -1; at most n - 2, so at least one +1 remains
    threshold_index = np.random.randint(n - 1)
    y[:threshold_index + 1] = -1
    # the real max-margin threshold sits halfway between the two classes
    threshold = (x[threshold_index] + x[threshold_index + 1]) / 2
    return x, y, threshold

# draw a labelled sample together with its max-margin threshold
x, y, threshold = gen_1d_data(n=7)

# show the points on a line (blue: label +1, red: label -1)
plt.scatter(
    x,
    np.zeros(len(x)),
    c=['b' if label == 1 else 'r' for label in y],
)
# short vertical segment marking the threshold
plt.plot([threshold, threshold], [-0.1, 0.1], c='g', label='threshold with maximum margin')
plt.legend()


**2) Run the next cell several times. Don't hesitate to play with the amount of data, step size and number of iterations. What classifier does gradient descent converge to? Does it coincide with the result of the theoretical part? Is the convergence fast?**


In [None]:
# sample fresh data and run gradient descent on the m * (x - c) parametrization
x, y, threshold = gen_1d_data(n=7)
traj_1d, losses_1d = grad_descent_1d(
    x, y, m_init=1, c_init=0, step_size=0.05, n_iter=int(1e4)
)
m_1d, c_1d = traj_1d[-1]
print("true max margin threshold: ", threshold, "\nlearned threshold: ", c_1d )

# plot how c (the second coordinate of every iterate) evolves
plt.plot(np.array(traj_1d)[:, 1], label='c')
plt.plot(np.full(len(traj_1d), threshold), label='max-margin treshold')
plt.xlabel("iteration")
plt.legend()


The next cell plots $m$ against iterations.

**3) Does the plot below support the results from the theoretical part? What do you think is the rate of growth of m?** Don't forget that you can change the step size and the number of iterations above.


In [None]:
# plot how m (the first coordinate of every iterate) evolves
plt.plot(np.array(traj_1d)[:, 0])


# Part 2
In the previous part we looked at learning an affine function parametrized as $m(x-c)$. This is different from the usual parametrization $w_0 x + w_1$. In this part we compare the two.

Note that learning an affine function $f(x) = w_0 x + w_1$ on 1d data $\{x_i\}_{i=1}^n$ is the same as learning a linear function $l({ \bf x}) = w_0 x_0 + w_1 x_1$ on the (lifted)   2d data $\{{\bf x}_i\}_{i=1}^n = \{(x_i, 1)^\top\}_{i=1}^n$. Therefore we can transition to linear classifiers for high-dimensional spaces: our data matrix is ${\bf X} \in \mathbb{R}^{n \times d}$ ($n$ is the number of data points, $d$ is the ambient dimension), and we still assume that we have two linearly separable classes: $y = \mathrm{sign}({\bf X\theta})$ where sign is applied elementwise and ${\bf \theta}\in \mathbb{R}^d$ is some unknown weight vector. To learn the weights we consider the following optimization problem:
$$\min_{{\bf w} \in \mathbb{R}^d} \sum_{i=1}^n \log\left(1 + \exp(-y_i {\bf w}^\top {\bf x}_i)\right).$$

For the same reason as before we consider the output of the gradient descent after some finite number of steps.

The next cell contains implementations of the loss function, its gradient and gradient descent with constant step size.


In [None]:
# implement logistic loss and its gradient
def log_losses(w, X, y):
    """Vector of logistic losses, one per data point.

    Parameters
    ----------
    w : weight vector of length d
    X : n by d data matrix
    y : vector of n labels (+/- 1)

    Returns
    -------
    Array of length n whose i-th entry is ``log(1 + exp(-(X[i] @ w) * y[i]))``.
    """
    # logaddexp(0, t) equals log(1 + exp(t)) without overflowing for large t
    return np.logaddexp(0, -(X @ w) * y)

def log_loss(w, X, y):
    """Mean of the per-point logistic losses returned by ``log_losses``."""
    per_point = log_losses(w, X, y)
    return np.mean(per_point)

def log_loss_gradients(w, X, y):
    """Per-point gradients of the logistic loss with respect to ``w``.

    Returns an n by d matrix: the i-th row is the gradient at the i-th data point.
    """
    margins = (X @ w) * y
    # columns of X.T are data points, so y and the denominator broadcast per point
    scaled = X.T * y / (1 + np.exp(margins))
    return -scaled.T

def log_loss_gradient(w, X, y):
    """Gradient of the mean logistic loss: the average of the per-point gradients."""
    per_point = log_loss_gradients(w, X, y)
    return np.mean(per_point, axis=0)

#implement gradient descent
def grad_descent(X, y, w_init, step_size, n_iter):
    """Gradient descent with constant step size on the mean logistic loss.

    Parameters
    ----------
    X : n by d data matrix
    y : vector of n labels (+/- 1)
    w_init : initial weight vector of length d (not modified)
    step_size : real number
    n_iter : integer number of gradient steps

    Returns
    -------
    (traj, losses): two lists of length ``n_iter + 1`` holding the weight
    vector and the loss after every step, starting with the initial point.
    """
    # Copy as float, like grad_descent_1d does: an integer w_init would make the
    # in-place update below fail with a casting error.
    w = np.array(w_init, dtype=float)
    traj = [np.copy(w)]
    losses = [log_loss(w, X, y)]
    for _ in range(n_iter):
        w -= step_size * log_loss_gradient(w, X, y)
        traj.append(np.copy(w))
        losses.append(log_loss(w, X, y))
    return traj, losses


**1) Run the next cell several times. Don't hesitate to play with the amount of data, step size and number of iterations. Do both methods converge to the same classifier? For which parametrization is convergence faster?**


In [None]:
# generate the data as before
x, y, threshold = gen_1d_data(n=10)

# lift the data into 2d (append a constant coordinate 1) to learn an affine
# function instead of a linear one
X = np.vstack((x, np.ones(len(x)))).T

# learn affine functions with two different parametrizations,
# using the same step size and number of iterations for both
step_size = 0.1
n_iter = int(1e4)
traj_1d, losses_1d = grad_descent_1d(
    x, y, m_init=1, c_init=0, step_size=step_size, n_iter=n_iter
)
traj_gd, losses = grad_descent(
    X, y, w_init=np.array([1., 0.]), step_size=step_size, n_iter=n_iter
)

# compare learned thresholds: part 1 learns c directly, while for part 2 the
# line w[0] * x + w[1] crosses zero at -w[1] / w[0]
print("true max margin threshold: ", threshold,
      "\n part 1 learned threshold: ", traj_1d[-1][1],
      "\n part 2 learned threshold: ", -traj_gd[-1][1]/traj_gd[-1][0],)

# compare evolution of thresholds
plt.plot(np.array(traj_1d)[:, 1], label='part 1 threshold')
plt.plot(-np.array(traj_gd)[:, 1] / np.array(traj_gd)[:, 0], label='part 2 threshold')
plt.plot(np.full(len(traj_gd), threshold), label='max margin threshold')
plt.legend()


The next cell plots the norm of weights against iterations.

**2) What do you think is the rate of growth of the norm of the weights? Is it qualitatively different from the rate of growth of $m$ from part 1?** Don't forget that you can change the step size and the number of iterations above.


In [None]:
# plot the Euclidean norm of the part 2 weight vector at every iteration
plt.plot(list(map(np.linalg.norm, traj_gd)))


From the previous experiments it may seem that both ways of learning an affine function are almost the same and always lead to the same results. However, it is not always the case. In the following cell we only have two data points: $x_1 = -1$, $x_2 = 2$ with labels $y_1 = -1$, $y_2 = 1$.

**3) Run the following cell. Do both descents converge to the same threshold? Can you choose such sample size and step size so that the "part 2" threshold converges to the maximum margin value?**


In [None]:
#sometimes the predictions are different
n = 2
x = np.array([-1, 2])
y = np.array([-1, 1])
threshold = np.mean(x)

#lift the data into 2d to learn affine function instead of linear
X = np.vstack((x, np.ones(n))).T

#learn affine functions with two different parametrizations
n_iter = int(1e4)
traj_1d, losses_1d = grad_descent_1d(x, y, m_init=1, c_init=0, step_size=0.1, n_iter=n_iter)

#start the lifted gradient descent at w = (1, -0.5), whose threshold
#-w[1]/w[0] = 0.5 equals the max-margin threshold of this data
traj_gd, losses_gd = grad_descent(X, y, w_init= np.array([1, -0.5]), step_size=10., n_iter=n_iter)

#compare learned thresholds: the part 1 threshold is c itself (the model is
#m * (x - c)), so it is printed without a sign flip, matching the plot below
print("true max margin threshold: ", threshold,
      "\n part 1 learned threshold: ", traj_1d[-1][1],
      "\n part 2 learned threshold: ", -traj_gd[-1][1]/traj_gd[-1][0])

#compare evolution of thresholds
plt.plot([w[1] for w in traj_1d], label='part 1 threshold')
plt.plot([-w[1]/w[0] for w in traj_gd], label='part 2 threshold')
plt.plot(threshold * np.ones(len(traj_gd)), label='max margin threshold')
plt.legend()


What we've seen in the previous cell happened because the convergence of the gradient descent on lifted data is very slow. To demonstrate that, in the next cell we implement another version of gradient descent: instead of making a step proportional to the gradient, we make a step in the direction of the gradient, but the length of the step is proportional to $1/\sqrt{i}$ where $i$ is the iteration number.

**4) Make sure that your data is x = [-1, 2], y = [-1,1]. Run the code in the following cell. Do both descents converge to the maximum margin threshold now?** Note that since the gradient becomes effectively zero, some numerical issues arise with the rescaled GD after large number of steps.


In [None]:
def grad_descent_rescaled(X, y, w_init, step_size, n_iter):
    """Normalized gradient descent on the mean logistic loss.

    Step number i (counting from 1) moves along the negative gradient by a
    distance ``step_size / sqrt(i)``, regardless of the gradient's magnitude.

    Parameters
    ----------
    X : n by d data matrix
    y : vector of n labels (+/- 1)
    w_init : initial weight vector of length d (not modified)
    step_size : real number
    n_iter : integer number of steps

    Returns
    -------
    (traj, losses): two lists of length ``n_iter + 1`` holding the weight
    vector and the loss after every step, starting with the initial point.
    """
    # Copy as float, like grad_descent_1d does: an integer w_init would make the
    # in-place update below fail with a casting error.
    w = np.array(w_init, dtype=float)
    traj = [np.copy(w)]
    losses = [log_loss(w, X, y)]
    for step_number in range(n_iter):
        grad = log_loss_gradient(w, X, y)
        grad_norm = np.linalg.norm(grad)
        # Once the gradient norm underflows to 0 there is no direction left to
        # normalize; dividing by it would turn every later iterate into inf/NaN,
        # so the weights are kept where they are instead.
        if grad_norm > 0 and np.isfinite(grad_norm):
            w -= step_size * grad / grad_norm / (step_number + 1)**0.5
        traj.append(np.copy(w))
        losses.append(log_loss(w, X, y))
    return traj, losses

# run the rescaled descent on the lifted data X, y defined in an earlier cell
traj_gd_rescaled, losses_rescaled = grad_descent_rescaled(X, y, w_init= np.array([0., 0.]),
                                                          step_size=2.,
                                                          n_iter=n_iter)

plt.plot([w[1] for w in traj_1d], label='part 1 threshold')
plt.plot([-w[1]/w[0] for w in traj_gd_rescaled], label='part 2 threshold')
# size the reference line from the trajectory plotted in this cell rather than
# from traj_gd, which is left over from a different cell
plt.plot(threshold * np.ones(len(traj_gd_rescaled)), label='max margin threshold')
plt.legend()


Now we managed to deal with the slow convergence, but was that the only issue why the predictions were different? The answer turns out to be "no"!

**5) Now repeat the previous two tasks (3 and 4) but take the data to be x = [-1, 4], y = [-1,1]. Can you make the rescaled descent converge to the max margin threshold by tweaking the step size and the number of iterations? What if you set x = [-0.1, 0.4], y = [-1,1]?**


To understand what's going on we can look at the 2d picture. The code below plots the separating line that our GD finds after lifting the data.

**6) Try increasing the coordinate of the second data point (i.e. try x = [-1, 2], [-1, 2.5], [-1, 3] and so on). What happens with separating line when that coordinate becomes larger than 3? In what sense is this a maximum margin classifier?** Note that the scale on both axes is the same for convenience, so the angles are visualized correctly. **Can you explain why the scale of the data was so important in the previous experiment?**


In [None]:
# NOTE(review): seaborn is not referenced below; presumably it is imported for
# its effect on plot styling - confirm before removing
import seaborn
n = 2
x = np.array([-1, 4])
y = np.array([-1, 1])
threshold = np.mean(x)
X = np.vstack((x, np.ones(n))).T

#learn an affine function on the lifted data with the rescaled gradient descent
n_iter = int(1e4)

traj_gd_rescaled, losses_rescaled = grad_descent_rescaled(X, y, w_init= np.array([0., 0.]),
                                                          step_size=2.,
                                                          n_iter=n_iter)

#find the last weight vector which does not contain nan or inf
#(np.isfinite checks every entry; builtin max/min can skip over a nan depending on its position)
w = [w_t for w_t in traj_gd_rescaled if np.all(np.isfinite(w_t))][-1]


fig = plt.figure(figsize=(20,20))
plt.ylim([0., 1.1])
plt.xlim(-2, 7)
plt.scatter(X[:, 0], X[:,1], c = ['b' if label==1 else 'r' for label in y])

plt.plot([threshold] * 2, [0, 1.1], c='g', label='threshold with maximum 1d margin')
#the separating line passes through the origin and is orthogonal to w,
#so it runs through the points (w[1], -w[0]) and (-w[1], w[0])
plt.plot([w[1], -w[1]], [-w[0], w[0]], c='y', label='separating line GD found in 2d')
plt.gca().set_aspect('equal', adjustable='box')

plt.legend()


# Part 3

In this part we only work with high-dimensional linearly separable data. We've seen before that GD on logistic loss converges to the max margin separating solution. Let's compare our gradient descent with Newton's method in terms of the margin that they learn.

The next cell contains the implementation of the Hessian of the loss and of Newton's method for optimization.


In [None]:
#implement hessian and Newton's method

def log_loss_hessian(w, X, y):
    """Hessian of the mean logistic loss with respect to ``w``.

    Parameters
    ----------
    w : weight vector of length d
    X : n by d data matrix
    y : vector of n labels (+/- 1)

    Returns
    -------
    d by d matrix ``X.T @ diag(s) @ X / n`` where, for the margin
    ``z_i = (X[i] @ w) * y[i]``, ``s_i = exp(-z_i) / (1 + exp(-z_i))**2``.
    """
    n, _ = X.shape
    margins = (X @ w) * y
    # s = sigma(z) * sigma(-z), with log(sigma(z)) = -logaddexp(0, -z).
    # The direct formula exp(-z) / (1 + exp(-z))**2 evaluates to inf/inf = nan
    # once exp(-z) overflows, i.e. for strongly misclassified points.
    curvatures = np.exp(-np.logaddexp(0, margins) - np.logaddexp(0, -margins))
    return (X.T * curvatures) @ X / n

def newtons_method(X, y, w_init, n_iter):
    """Damped Newton's method on the mean logistic loss.

    Every step subtracts half of the Newton direction, computed with the
    pseudo-inverse of the Hessian.

    Parameters
    ----------
    X : n by d data matrix
    y : vector of n labels (+/- 1)
    w_init : initial weight vector of length d (not modified)
    n_iter : integer number of steps

    Returns
    -------
    (traj, losses): two lists of length ``n_iter + 1`` holding the weight
    vector and the loss after every step, starting with the initial point.
    """
    # Copy as float, like grad_descent_1d does: an integer w_init would make the
    # in-place update below fail with a casting error.
    w = np.array(w_init, dtype=float)
    traj = [np.copy(w)]
    losses = [log_loss(w, X, y)]
    for _ in range(n_iter):
        hessian_pinv = np.linalg.pinv(log_loss_hessian(w, X, y))
        w -= 0.5 * hessian_pinv @ log_loss_gradient(w, X, y)
        traj.append(np.copy(w))
        losses.append(log_loss(w, X, y))
    return traj, losses


def compute_margin(X, w, y):
    """Smallest signed distance from a data point to the hyperplane with normal ``w``.

    Each row of X is projected onto w / ||w|| and multiplied by its label;
    the minimum over the data points is returned.
    """
    unit_normal = w / np.linalg.norm(w)
    signed_distances = X @ unit_normal * y
    return np.min(signed_distances)


**1) Run the following cell several times, and try different values of d and n. Which method converges faster and which finds solutions with better margins? What happens as d grows?**

**2) When d > n the hessian is degenerate. How does our implementation of Newton's method work in this case?**


In [None]:
#generate separable data
d = 20
n = 15
X = np.random.randn(n, d)
true_weights = np.random.randn(d) #these are NOT MAX MARGIN weights
y = np.sign(X @ true_weights).astype('int')

w_init = np.zeros(d)
n_iter = int(1e3)
traj_gd, losses_gd = grad_descent_rescaled(X, y, w_init, step_size=0.5, n_iter=n_iter)
traj_newton, losses_newton = newtons_method(X, y, w_init, n_iter=n_iter)

#compute the margins and compare
#iteration 0 is left out: log(0) is -inf, and the margin of the all-zero
#initial weights is 0/0, so that point cannot be drawn anyway
plt.plot(np.log(np.arange(1, len(traj_gd))),
         [compute_margin(X, w, y) for w in traj_gd[1:]], label='gd margins')
plt.plot(np.log(np.arange(1, len(traj_newton))),
         [compute_margin(X, w, y) for w in traj_newton[1:]], label='newtons margins')
plt.xlabel('log iterations')
plt.legend()


The following cell plots how far each data point is from the decision boundary found by Newton's method that you ran in the previous cell.

**3) Run the cell below after running Newton's method in the overparametrized regime (d > n). What do you see? How would you describe the solution that Newton's method finds in this regime?**


In [None]:
# signed distance of every data point to the decision boundary given by the
# last Newton iterate (normalized to unit length)
w = np.array(traj_newton[-1])
plt.plot((X @ (w / np.linalg.norm(w))) * y)
