# Experiments 1. Logistic Regression

In [1]:
import numpy as np
import os
#import copy
#import time
#from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import jax
import jax.numpy as jnp
from jax.config import config
import timeit


In [2]:
from methods import gradf_inexact
from methods import GradientDescent, parse_logs, AdaptiveNoiseGD
from methods import ConstantStepSize, AdaptiveL, AdaptiveLdelta

In [3]:
# Agg is a non-interactive backend: figures are rendered off-screen.
matplotlib.use("Agg")

# Global plot styling shared by every figure drawn after this cell.
params = {
    # legend
    "legend.fontsize": 20,
    "legend.handlelength": 4,
    # axes and ticks
    "axes.labelsize": 45,
    "axes.titlesize": 30,
    "xtick.labelsize": 25,
    "ytick.labelsize": 25,
    # curves
    "lines.linewidth": 2,
}
matplotlib.rcParams.update(params)

In [4]:
# Switch JAX to 64-bit floats; this has to run before any JAX array is created.
# Going through the top-level `jax` module avoids the `jax.config.config` alias,
# which newer JAX releases no longer provide.
jax.config.update("jax_enable_x64", True)

In [5]:
# Relative directory for pictures — presumably where figures get saved;
# it is not referenced in the cells visible here (TODO confirm it exists).
path_pics = "../pics/"

In [6]:
def f1(w, X, Y, sigma=0):
    """Mean logistic loss of weights ``w`` on (X, Y) plus an L2 penalty.

    Evaluates ``mean_i log(1 + exp(-Y_i * <X_i, w>)) + sigma * ||w||^2``.
    The log-term is split into its non-positive and non-negative parts so
    that ``exp`` is only ever applied to arguments <= 0 and cannot overflow.
    """
    margin = -Y * (X @ w)
    nonpos = jnp.clip(margin, None, 0)   # min(margin, 0)
    nonneg = jnp.clip(margin, 0, None)   # max(margin, 0)
    softplus = jnp.log(jnp.exp(nonpos) + jnp.exp(-nonneg)) + nonneg
    data_term = softplus.sum() / len(Y)
    penalty = sigma * (w ** 2).sum()
    return data_term + penalty


# Gradient with respect to the first argument (the weights) and its compiled form.
gradf = jax.grad(f1)
jit_gradf = jax.jit(gradf)

## 0. Dataset

Create a dataset for which logistic regression without regularization has a finite solution.

In [7]:
# Gaussian samples labelled by a random linear model; the last k rows are
# then repeated with the opposite label, so the data cannot be separated.
np.random.seed(1)
n, m = 100, 200
k = 10
X = np.random.randn(n, m)
X = np.concatenate((X, X[-k:]), axis=0)
w = np.random.randn(m)
Y = np.sign(X @ w)
Y[-k:] = -Y[-k:]
X.shape, Y.shape, w.shape, np.linalg.norm(w)

((110, 200), (110,), (200,), 15.744252812419115)

In [8]:
np.random.seed(1)
n, m = 700, 200
k = 50

# The first k rows of an orthogonal matrix (from QR) form the building blocks.
W, _ = np.linalg.qr(np.random.randn(m, m))
X_base = W[:k]
# Every remaining sample is an integer combination of those k rows.
X_w = np.random.randint(-10, 10, (n - 2 * k, k))
X = np.concatenate((X_base, X_base, X_w @ X_base), axis=0)
# Centre each column, then rescale by the single global standard deviation.
X = (X - X.mean(axis=0)) / X.std()
w = np.random.randn(m)
Y = np.sign(X @ w)
# Rows 0..k-1 duplicate rows k..2k-1; give the first copy the opposite label.
Y[:k] = -Y[:k]
X.shape, Y.shape

((700, 200), (700,))

In [9]:
# Numerical rank of the design matrix X built in the previous cell.
np.linalg.matrix_rank(X)

50

In [10]:
# L = lambda_max(A^T A) / (4 * n_samples); its inverse is used later as the
# constant step size, i.e. L plays the role of the gradient Lipschitz constant.
A = X
eigvals, _ = np.linalg.eigh(A.T @ A)
L = np.real(eigvals.max() / (4 * A.shape[0]))
L

1.5794450694715507

In [11]:
sigma = 0


def gradf(x):
    """Gradient of the objective at ``x`` on (X, Y), returned as a NumPy array."""
    # block_until_ready waits for JAX's asynchronous computation to finish.
    return np.array(jit_gradf(x, X, Y, sigma).block_until_ready())


def f(x):
    """Objective value at ``x`` on (X, Y) with the current ``sigma``."""
    return f1(x, X, Y, sigma)

In [12]:
#w = np.random.randn(X.shape[-1])
# Objective at the current w (the vector the labels were generated from in the
# dataset cell) next to its value at the origin.
f(w).item(), f(np.zeros(m)).item()



(0.1374615491296541, 0.6931471805599454)

## 1. Noise Distributed on the Unit Sphere

The case when $\xi \sim \mathcal{U}(S_1(0))$

In [26]:
def _record_run(store, tag, Delta, n_iters, T, x, x0, with_residual=True):
    """Print one result row and append the run's statistics to ``store``.

    store         -- per-mu dictionary of result lists (one entry of ``res``)
    tag           -- method label used in the keys: "adaptL", "adaptLdelta" or "exact"
    Delta         -- gradient-noise level of the run
    n_iters       -- length of the solver history for the run that produced ``x``
    T             -- mean wall-clock time of one solve, in seconds
    x, x0         -- returned point and starting point
    with_residual -- also store f(x) under "residual_<tag>"
    """
    travelled = np.linalg.norm(x - x0)
    # Exact gradient norm at the returned point, in units of the noise level.
    rel_grad = np.linalg.norm(gradf(x)) / Delta
    print("\t{}\t{}\t{:.2f}\t{:.6f}\t{:.2f}".format(Delta, n_iters, T*1000, travelled, rel_grad))
    store["iters_" + tag].append(n_iters)
    store["time_" + tag].append("{:.2f}".format(T*1000))
    store[tag + ",x0-x*"].append("{:.1f}".format(travelled))
    store["normg_" + tag].append("{:.2f}".format(rel_grad))
    if with_residual:
        store["residual_" + tag].append(f(x))


np.random.seed(1)
mu_list = [0]
res = {mu: {"delta": [],
            "iters_adaptL": [], "time_adaptL": [], "adaptL,x0-x*": [], "normg_adaptL": [], "residual_adaptL": [],
            "iters_exact": [], "time_exact": [], "exact,x0-x*": [], "normg_exact": [], "residual_exact": [],
            "iters_adaptLdelta": [], "time_adaptLdelta": [], "adaptLdelta,x0-x*": [], "normg_adaptLdelta": []}
       for mu in mu_list}
dist = {}
number = 1  # timeit repetitions per configuration

Delta_list = [1e-5, 1e-4, 1e-2]
N = int(2e4)

for mu in mu_list:
    # The two discarded draws below only advance the global RNG so that the
    # noise added inside f2 follows the same stream as in earlier runs of this
    # notebook; the start point and the noise direction are deterministic.
    np.random.randn(m)
    alpha = 1/L
    w = np.ones(m)*0.1
    np.random.randn(*w.shape)
    v = np.ones(w.shape)
    save_iter = int(1)
    Lmin = mu/4
    methods = []
    for Delta in Delta_list:
        eps = Delta**2 / 16
        # Loop variables are bound as defaults so each closure keeps the
        # values of its own iteration.
        f2 = lambda x, eps=eps: f(x) + eps*np.random.uniform(-1, 1)
        res[mu]["delta"].append(int(np.log10(Delta)))
        tol = np.sqrt(6)*Delta
        grad_inexact = lambda w, Delta=Delta: gradf_inexact(w, gradf, Delta, 1, v=v)
        run_name = "GD, Delta={}".format(Delta)

        # --- GradientDescent with the AdaptiveL step rule, noisy objective f2 ---
        method = GradientDescent(AdaptiveL(L0=1, Delta=Delta, Lmin=Lmin, delta=eps),
                                 name=run_name, save_iter=save_iter)
        x = method.solve(w, f2, grad_inexact, tol=tol, max_iter=N)
        n_iters = len(method.history)
        g = lambda: GradientDescent(AdaptiveL(L0=1, Delta=Delta, Lmin=Lmin, delta=eps),
                                    return_history=False).solve(w, f2, grad_inexact, tol=tol, max_iter=N)
        T = timeit.timeit(g, number=number)/number
        _record_run(res[mu], "adaptL", Delta, n_iters, T, x, w)
        methods.append(method)

        # --- AdaptiveNoiseGD with the AdaptiveLdelta step rule, exact objective f ---
        method = AdaptiveNoiseGD(AdaptiveLdelta(L0=1, mindelta=1e-12, Lmin=Lmin, mu=mu, delta_alpha=2),
                                 name=run_name, save_iter=save_iter, alpha=np.sqrt(6))
        x = method.solve(w, f, grad_inexact, max_iter=N)
        # Read the iteration count before the timing call below re-runs solve()
        # on this same object, so the count belongs to the run that produced x.
        n_iters = len(method.history)
        # NOTE(review): unlike the other two solvers, timing reuses `method`
        # (and its step-rule object) instead of building a fresh instance with
        # return_history=False — confirm solve() starts from a clean state.
        g = lambda: method.solve(w, f, grad_inexact, max_iter=N)
        T = timeit.timeit(g, number=number)/number
        _record_run(res[mu], "adaptLdelta", Delta, n_iters, T, x, w, with_residual=False)
        methods.append(method)

        # --- GradientDescent with the constant step alpha = 1/L, exact objective f ---
        method = GradientDescent(ConstantStepSize(alpha), name=run_name, save_iter=save_iter)
        x = method.solve(w, f, grad_inexact, tol=tol, max_iter=N)
        n_iters = len(method.history)
        g = lambda: GradientDescent(ConstantStepSize(alpha),
                                    return_history=False).solve(w, f, grad_inexact, tol=tol, max_iter=N)
        T = timeit.timeit(g, number=number)/number
        _record_run(res[mu], "exact", Delta, n_iters, T, x, w)
        methods.append(method)
        print()

	1e-05	902	2856.98	13.392339	2.22
	1e-05	23	449.68	13.414217	3.37
	1e-05	20002	5604.34	13.342222	3.56

	0.0001	472	1678.02	12.753213	2.16
	0.0001	25	370.09	12.908592	3.62
	0.0001	9700	2605.02	12.655907	2.42

	0.01	17	68.36	3.846313	2.10
	0.01	17	161.27	5.597237	0.84
	0.01	83	49.86	3.468000	2.29



In [28]:
def _latex_cell(values, template="${}$"):
    """Stack ``values`` vertically inside a one-column LaTeX ``tabular``."""
    entries = [template.format(v) for v in values]
    return "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(entries) + " \\end{tabular}"


# Result columns in table order: for each method its iteration count, time and
# gradient norm divided by Delta.
table_keys = ["iters_exact", "time_exact", "normg_exact",
              "iters_adaptL", "time_adaptL", "normg_adaptL",
              "iters_adaptLdelta", "time_adaptLdelta", "normg_adaptLdelta"]

s = ""

for mu in mu_list:
    #s += str(mu) + " & "
    # First column shows the noise levels as powers of ten.
    cells = [_latex_cell(res[mu]["delta"], "$10^{{{}}}$")]
    cells += [_latex_cell(res[mu][key]) for key in table_keys]
    s += "&".join(cells)
    s += "\\\\\n\\hline\n"
print(s)

\begin{tabular}{@{}c@{}} $10^{-5}$ \\ $10^{-4}$ \\ $10^{-2}$ \end{tabular}&\begin{tabular}{@{}c@{}} $20002$ \\ $9700$ \\ $83$ \end{tabular}&\begin{tabular}{@{}c@{}} $5604.34$ \\ $2605.02$ \\ $49.86$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.56$ \\ $2.42$ \\ $2.29$ \end{tabular}&\begin{tabular}{@{}c@{}} $902$ \\ $472$ \\ $17$ \end{tabular}&\begin{tabular}{@{}c@{}} $2856.98$ \\ $1678.02$ \\ $68.36$ \end{tabular}&\begin{tabular}{@{}c@{}} $2.22$ \\ $2.16$ \\ $2.10$ \end{tabular}&\begin{tabular}{@{}c@{}} $23$ \\ $25$ \\ $17$ \end{tabular}&\begin{tabular}{@{}c@{}} $449.68$ \\ $370.09$ \\ $161.27$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.37$ \\ $3.62$ \\ $0.84$ \end{tabular}\\
\hline

