In [24]:
import numpy as np
from numpy import random

In [25]:
# DATA GENERATION
p = 500
N = 10**5
cov = 2 * np.identity(p)
mean = np.ones(p)
X = random.multivariate_normal(mean, cov, size=N)
nnz = int(p/25)
nz_nums = random.multivariate_normal(np.ones(nnz), np.identity(nnz), size=1)
indexes = random.choice(range(p), size=nnz, replace=False, p=None)
ground_beta = np.zeros((p,1))
ground_beta[indexes,0] = nz_nums
from numpy import matmul
y = matmul(X,ground_beta)

### (a) Implement subgradient descent algorithm with your choice of stepsize. Try a fixed stepsize and a decreasing stepsize. Plot the objective function with growing number of iterations for each algorithm.


In [26]:
from numpy.linalg import norm
from pdb import set_trace

lam = 0.1 # this is lambda
thresh = 0.1
mean = np.zeros(p)
cov = np.identity(p)
beta_0 = random.multivariate_normal(mean, cov, size=1).reshape((p,1))

def loss(X, y, beta, lam):
#     set_trace()
    return   (1/N) * 0.5 * norm((matmul(X,beta)-y), ord=2)**2  + lam * norm(beta, ord=1) 

def subgrad(X, y, beta, lam):
    # gradient of the loss function
    return   (1/N) * matmul(np.transpose(X),(matmul(X,beta)-y)) + lam * np.sign(beta) 

In [27]:
# FIXED ALPHA
alpha = 0.001
beta = np.empty_like(beta_0)
beta[:] = beta_0
L = loss(X, y, beta, lam)
L_min = [L,beta]
L1_store = []
c1_store = []
print (loss)
counter = 0
while L > thresh:
    beta_old = beta
    beta -= alpha * subgrad(X, y, beta, lam)
    L = loss(X, y, beta, lam)
    if L < L_min[0]:
        L_min[0] = L
        L_min[1] = beta
    else:
        beta = beta_old
    counter += 1
    if counter % 10 == 0:
        L1_store.append(L)
        c1_store.append(counter)
        print ("loss = {}".format(L))
print("iterations for fixed alpha done") 

<function loss at 0x7ff0a2ad7b70>
loss = 615.8631102163878
loss = 591.6247485947491
loss = 568.3540114001677
loss = 546.0329313392489
loss = 524.5940040589243
loss = 504.0000603057408


KeyboardInterrupt: 

In [28]:
# DECREASING ALPHA
alpha_0 = 0.01
beta = np.empty_like(beta_0)
beta[:] = beta_0
L = loss(X, y, beta, lam)
L_min = [L,beta]
L2_store = []
c2_store = []
counter = 0
while L > thresh:
    counter += 1
    beta_old = beta
    beta -= alpha * subgrad(X, y, beta, lam)
    alpha = alpha_0 / (alpha_0 + counter)
    L = loss(X, y, beta, lam)
    if L < L_min[0]:
        L_min[0] = L
        L_min[1] = beta
    else:
        beta = beta_old
    if counter % 10 == 0:
        L2_store.append(L)
        c2_store.append(counter)
        print ("loss = {}".format(L))
print("iterations for varying alpha done")

loss = 570.0389047756707
loss = 553.8705544369836
loss = 544.7714834450585
loss = 538.4453943878948
loss = 533.6052327430299


KeyboardInterrupt: 

### (b) Implement proximal gradient descent with your choice of stepsize. Plot the objective function with growing number of iterations.


In [12]:
def soft_thresh(lam, alpha, beta):
    m, _ = np.shape(beta)
    s_t = np.zeros((m,1))
    for i in range(m):
        if beta[i] > lam * alpha:
            s_t[i,0] = beta[i] - lam * alpha
        elif beta[i] < - lam * alpha:
            s_t[i,0] = beta[i] + lam * alpha
        else:
            s_t[i,0] = 0
    return s_t

def g(beta):
    # corresponds to the smooth term in the loss
    return   (1/N) * 0.5 * norm((matmul(X,beta)-y), ord=2)**2 

def grad_g(beta):
    # corresponds to the gradient of the smooth term in the loss
    return   (1/N) * matmul(np.transpose(X),(matmul(X,beta)-y)) 


In [13]:
alpha = 0.001
beta = np.empty_like(beta_0)
beta[:] = beta_0
L = loss(X, y, beta, lam)
L3_store = []
c3_store = []
counter = 0
while L > thresh:
    counter += 1
    beta = beta - alpha * grad_g(beta)
    beta = soft_thresh(lam, alpha, beta)       
    L = loss(X, y, beta, lam)
    counter += 1
    if counter % 10 == 0:
        L3_store.append(L)
        c3_store.append(counter)
        print ("loss = {}".format(L))

loss = 492.7994930742372
loss = 482.30580410399915
loss = 472.34709625428366
loss = 462.5927960148957
loss = 453.04110126969573
loss = 443.6840008469633
loss = 434.5175803542272
loss = 425.5380638977054
loss = 416.74154462981966
loss = 408.1243128533027
loss = 399.6827355843921
loss = 391.4132534577697
loss = 383.31238467340853
loss = 375.37676586536656
loss = 367.60464495835635
loss = 359.99697706695633
loss = 352.5446345783274
loss = 345.2442972850416
loss = 338.09284543108856


KeyboardInterrupt: 

### (c) Implement proximal gradient descent with backtracking line search. You can find more about backtracking line search in https://www.robots.ox.ac. uk/~vgg/rg/slides/fgrad.pdf. Plot the objective function with growing number of iterations.


In [22]:
alpha = 1
multiplier = 0.9
beta = np.empty_like(beta_0)
beta[:] = beta_0
L = loss(X, y, beta, lam)
L4_store = []
c4_store = []
counter = 0

while L > thresh:
    counter += 1
    # while loop is backtracking line search to find optimal alpha
    while True:
        beta_plus = beta - alpha * grad_g(beta)
        value = g(beta_plus) <= g(beta) + matmul(np.transpose(grad_g(beta)), (beta_plus - beta)) + (1/(2*alpha)) * norm(beta_plus-beta)**2
        if not value:
            alpha *= multiplier
        else:
            break
    
    beta = beta - alpha * grad_g(beta)
    beta = soft_thresh(lam, alpha, beta)       
    L = loss(X, y, beta, lam)
    counter += 1
    if counter % 10 == 0:
        L4_store.append(L)
        c4_store.append(counter)
        print ("loss = {}".format(L))
        print ("alpha = {}".format(alpha))

loss = 482.31865637570826
alpha = 0.001996678111016037
loss = 462.6192350340075
alpha = 0.001996678111016037
loss = 443.7215069050127
alpha = 0.001996678111016037
loss = 425.5857623191486
alpha = 0.001996678111016037
loss = 408.18127317255926
alpha = 0.001996678111016037
loss = 391.478656325768
alpha = 0.001996678111016037
loss = 375.44986349472424
alpha = 0.001996678111016037
loss = 360.07702168926363
alpha = 0.001996678111016037
loss = 345.3306427136509
alpha = 0.001996678111016037
loss = 331.17924107598367
alpha = 0.001996678111016037
loss = 317.5989734925165
alpha = 0.001996678111016037
loss = 304.56691833174557
alpha = 0.001996678111016037
loss = 292.0622324926676
alpha = 0.001996678111016037
loss = 280.0669819408345
alpha = 0.001996678111016037
loss = 268.5564269592902
alpha = 0.001996678111016037
loss = 257.51115092538123
alpha = 0.001996678111016037
loss = 246.91237200316155
alpha = 0.001996678111016037
loss = 236.74459190482872
alpha = 0.001996678111016037
loss = 226.991543559

loss = 0.4166605927410841
alpha = 0.001996678111016037
loss = 0.3963332598584042
alpha = 0.001996678111016037
loss = 0.37714232732918235
alpha = 0.001996678111016037
loss = 0.3590208610782121
alpha = 0.001996678111016037
loss = 0.3419047705286867
alpha = 0.001996678111016037
loss = 0.32577128211114925
alpha = 0.001996678111016037
loss = 0.3106079580299944
alpha = 0.001996678111016037
loss = 0.29639604727331376
alpha = 0.001996678111016037
loss = 0.28296763619298065
alpha = 0.001996678111016037
loss = 0.2702800386513738
alpha = 0.001996678111016037
loss = 0.25828097176519815
alpha = 0.001996678111016037
loss = 0.24708355381801383
alpha = 0.001996678111016037
loss = 0.23646977328595922
alpha = 0.001996678111016037
loss = 0.2265019096917789
alpha = 0.001996678111016037
loss = 0.21700750619255063
alpha = 0.001996678111016037
loss = 0.2080471461509635
alpha = 0.001996678111016037
loss = 0.1995772023603555
alpha = 0.001996678111016037
loss = 0.19153158579748708
alpha = 0.001996678111016037
l

### (d) Now compare these methods with any publicly available software for lasso, e.g. glmfit or lasso or scikit-learn.

In [None]:
from sklearn.linear_model import Lasso

clf = Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, 
                           copy_X=True, max_iter=1000, tol=0.001, warm_start=False, positive=False, 
                           random_state=None, selection=’cyclic’)
clf.fit(X,y)
y_pred = clf.predict()