# Experiment 1. Nonlinear System of Equations: Sine

We will minimize

$$f(w_1, w_2) = \sum\limits_{i=1}^d \left(\langle w_1, x_i\rangle + \sin\left(\langle w_2, x_i\rangle\right) - y_i\right)^2$$

for $w_1, w_2\in\mathbb{R}^n$, $x_i \in \mathbb{R}^n$, $d\leq n$ with the condition

$$XX^\top \succeq \mu I_d,$$
where $X = (x_1 \dots x_d)^\top \in \mathbb{R}^{d\times n}$

In [120]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import jax
import jax.numpy as jnp
import timeit
from jax.config import config

In [121]:
from methods import gradf_inexact
from methods import GradientDescent, parse_logs, AdaptiveL, StepSize, AdaptiveNoiseGD
from methods import ConstantStepSize, AdaptiveLdelta

In [122]:
# Select the non-interactive Agg backend before any figure is created.
matplotlib.use("Agg")

# Global plot styling: large fonts and thicker lines.
# NOTE: the name `params` is rebound to a function in a later cell of this notebook.
params = {
    "legend.fontsize": 20,
    "legend.handlelength": 4,
    "axes.labelsize": 45,
    "xtick.labelsize": 25,
    "ytick.labelsize": 25,
    "lines.linewidth": 2,
    "axes.titlesize": 30,
}
matplotlib.rcParams.update(params)

In [123]:
# Switch JAX to 64-bit floats (its default is 32-bit). Going through `jax.config`
# avoids relying on the `config` name from `from jax.config import config`,
# an import path that newer JAX releases no longer provide.
jax.config.update("jax_enable_x64", True)

In [124]:
# Relative directory for picture output (presumably where figures are saved;
# it is not used in the visible part of the notebook).
path_pics = "../pics/"

In [125]:
# Shape check of the data matrix, the stacked product [X, X] @ w, and the targets.
# NOTE(review): in the visible notebook X, w and Y are first assigned in the
# "0. Dataset" cell further down, so on a fresh kernel (Restart & Run All) this
# cell raises NameError; the recorded output comes from earlier kernel state.
X.shape, (np.hstack([X, X])@w).shape, Y.shape

((20, 300), (20,), (20,))

In [126]:
def f1(w, X, Y):
    """Sum of squared residuals for the stacked parameter vector w = (w1, w2).

    Computes sum_i (<w1, x_i> + sin(<w2, x_i>) - y_i)^2, where the rows of X
    are the x_i, w1 is the first X.shape[1] entries of w and w2 is the rest.

    NOTE: a later cell defines f1 again with the signature (w1, w2, X, Y);
    that definition replaces this one once it has run.
    """
    # Split by the number of columns of X instead of a global `n`, so the
    # function does not depend on notebook state.
    n = X.shape[1]
    w1 = w[:n]
    w2 = w[n:]
    # The sine term must use the second block w2 (it previously reused w[:n]).
    G = (X @ w1 + jnp.sin(X @ w2) - Y)**2
    return G.sum()

In [127]:
def f1(w1, w2, X, Y):
    """Objective sum_i (<w1, x_i> + sin(<w2, x_i>) - y_i)^2, rows of X being the x_i."""
    linear_part = X @ w1
    sine_part = jnp.sin(X @ w2)
    residual = linear_part + sine_part - Y
    return jnp.sum(residual ** 2)


# Gradient with respect to both weight blocks (arguments 0 and 1); it returns the
# pair (df/dw1, df/dw2). The jit-compiled version is the one the wrappers below call.
gradf = jax.grad(f1, argnums=(0, 1))
jit_gradf = jax.jit(gradf)

## 0. Dataset

In [144]:
# Synthetic instance: Y is generated from a random ground-truth vector w,
# so the objective is exactly zero at that w.
np.random.seed(0)  # fixed seed so the instance (and mu below) is reproducible
d = 200
n = 300
X = np.random.randn(d, n)
w = np.random.randn(2*n)
Y = X @ w[:n] + np.sin(X@w[n:])
# X X^T is symmetric, so use eigvalsh: real eigenvalues in ascending order
# (the general-purpose `eig` gives no ordering and may return a complex dtype).
eig = np.linalg.eigvalsh(X@X.T)
mu = eig[0]  # smallest eigenvalue, i.e. the mu in X X^T >= mu I_d
eig[0], eig[-1]

(10.847227731959508, 970.5657981735665)

In [146]:
sigma = 0


def gradf(x):
    """Gradient of the objective at the stacked point x = (w1, w2) as one flat NumPy vector."""
    grad_w1, grad_w2 = jit_gradf(x[:n], x[n:], X, Y)
    pieces = [np.array(part.block_until_ready()) for part in (grad_w1, grad_w2)]
    return np.hstack(pieces)


def f(x):
    """Objective value at the stacked point x = (w1, w2) for the global X, Y."""
    return f1(x[:n], x[n:], X, Y)

In [147]:
def params(X, Y):
    """Build the objective and its gradient for one data set (X, Y).

    Both returned callables take a single stacked vector x = (w1, w2), where
    w1 is the first X.shape[1] entries and w2 is the rest.

    Returns:
        (f, gradf): f(x) evaluates f1 on the two halves of x; gradf(x) returns
        the gradient with respect to both halves as one flat NumPy array.
    """
    # Split by the column count of X rather than a global `n`, so the closures
    # stay correct for whatever X they were built with.
    n = X.shape[1]

    def gradf(x):
        z = jit_gradf(x[:n], x[n:], X, Y)
        # Wait for each JAX result, convert it to NumPy, then concatenate.
        g = np.hstack([np.array(i.block_until_ready()) for i in z])
        return g

    def f(x):
        return f1(x[:n], x[n:], X, Y)

    return f, gradf
# Rebind the module-level f / gradf to the closures built for the current (X, Y).
f, gradf = params(X, Y)

In [148]:
# Sanity check: objective at w, objective at the origin, and the gradient's shape.
# NOTE(review): if w is still the vector that generated Y in the dataset cell,
# f(w) should be ~0; a large value here means w was rebound by another cell.
f(w).item(), f(np.zeros(2*n)).item(), gradf(w).shape

(124669.52974630086, 70948.26085081871, (600,))

## 1. Noise Distributed on the Unit Sphere

The case when $\xi \sim \mathcal{U}(S_1(0))$

In [163]:
# Sweep over the number of equations d and the noise level Delta. For each pair,
# run both adaptive methods from the same start point and record iteration counts,
# timings, ||x0 - x*|| and ||grad f(x*)|| / Delta for the LaTeX tables further down.
np.random.seed(1)  # seed before any draw so the whole cell is reproducible

Delta_list = [1e-7, 1e-4, 1e-1]
d_list = [10, 50, 100, 200, 250]
res = {d:{"delta":[],
           "iters_adaptL":[], "time_adaptL":[], "adaptL,x0-x*": [], "normg_adaptL": [],
           "iters_exact":[], "time_exact":[], "exact,x0-x*": [], "normg_exact": [],
          "iters_adaptLdelta":[], "time_adaptLdelta":[], "adaptLdelta,x0-x*": [], "normg_adaptLdelta": []} for d in d_list}
mu_list = {}
number = 10      # timeit repetitions per configuration
save_iter = 1
N = 10000        # iteration cap, passed to solve() as max_iter
methods = []
for d in d_list:
    X = np.random.randn(d, n)
    w = np.random.randn(2*n)
    Y = X @ w[:n] + np.sin(X@w[n:])
    # X X^T is symmetric: eigvalsh returns real eigenvalues in ascending order.
    eig = np.linalg.eigvalsh(X@X.T)
    wsol = w.copy()          # ground-truth vector that generated Y
    w = np.ones(2*n)         # common starting point x0 for every run
    v = np.random.randn(2*n)
    mu = eig[0]
    f, gradf = params(X, Y)
    print(d, mu)
    mu_list[d] = mu
    for Delta in Delta_list:
        # round() guards against log10 landing just off an integer (e.g. -6.999...).
        res[d]["delta"].append(int(round(np.log10(Delta))))
        tol = 2*Delta

        grad_inexact = lambda w: gradf_inexact(w, gradf, Delta, 1, v=v)

        # (result-key suffix, method factory, extra solve() keyword arguments).
        # Each factory call builds a fresh step-size object, as the timing run needs.
        # NOTE(review): both methods are created with the same name "GD, Delta=...".
        variants = [
            ("adaptL",
             lambda **kw: GradientDescent(AdaptiveL(L0=1, Delta=Delta, Lmin=mu/4), **kw),
             {"tol": tol}),
            ("adaptLdelta",
             lambda **kw: AdaptiveNoiseGD(AdaptiveLdelta(L0=1, mindelta=1e-12, Lmin=mu/4, mu=mu),
                                          alpha=np.sqrt(6), **kw),
             {}),
        ]
        for key, make_method, solve_kwargs in variants:
            # Run once with history kept (for the iteration count) ...
            method = make_method(name="GD, Delta={}".format(Delta), save_iter=save_iter)
            x = method.solve(w, f, grad_inexact, max_iter=N, **solve_kwargs)
            # ... then time a history-free run, averaged over `number` repetitions.
            g = lambda: make_method(return_history=False).solve(w, f, grad_inexact, max_iter=N, **solve_kwargs)
            T = timeit.timeit(g, number=number)/number
            dist = np.linalg.norm(x-w)                 # ||x0 - x*||
            rel_grad = np.linalg.norm(gradf(x))/Delta  # exact gradient norm relative to Delta
            print("\t{}\t{}\t{:.2f}\t{:.6f}\t{:.2f}\t{}".format(Delta, len(method.history), T*1000, dist,
                                                    rel_grad, f(x)))
            methods.append(method)
            res[d]["iters_" + key].append(len(method.history))
            res[d]["time_" + key].append("{:.2f}".format(T*1000))
            res[d][key + ",x0-x*"].append("{:.1f}".format(dist))
            res[d]["normg_" + key].append("{:.2f}".format(rel_grad))
        print("\n")

10 221.187991129037
	1e-07	72	69.34	3.514826	0.23	5.1197911038693455e-19
	1e-07	83	219.70	3.514784	1.75	3.2470390687198864e-17


	0.0001	50	48.06	3.514826	0.21	3.918266875115143e-13
	0.0001	85	183.15	3.514784	0.72	5.446867607589956e-12


	0.1	28	29.02	3.514577	1.63	2.7917950111808907e-05
	0.1	56	141.51	3.514683	0.67	4.724898995910441e-06


50 123.26937616329923
	1e-07	84	310.36	10.173008	1.38	2.418284774218323e-17
	1e-07	113	1030.22	10.180285	0.72	9.898540527794811e-18


	0.0001	62	370.77	10.173007	1.40	3.046334490147606e-11
	0.0001	67	873.09	10.180285	1.66	5.210587479637894e-11


	0.1	41	143.23	10.172884	1.33	2.6156457763002835e-05
	0.1	47	550.34	10.180097	1.72	5.438803969449771e-05


100 58.55124414197606
	1e-07	138	587.38	14.834304	1.58	7.96804217246692e-17
	1e-07	158	1301.62	14.837213	2.56	4.7406460302447384e-17


	0.0001	106	335.90	14.834304	1.39	4.488005886621077e-11
	0.0001	112	967.21	14.837212	1.73	1.0116511860403428e-10


	0.1	73	224.40	14.834024	1.61	6.892593180259463e-05
	0.

In [166]:
# LaTeX rows: d, mu, then one stacked cell (entries one above another, one per
# Delta) for each of: Delta, iterations and time of both methods.
rows = []
for d in d_list:
    columns = [["$10^{{{}}}$".format(exponent) for exponent in res[d]["delta"]]]
    for key in ("iters_adaptL", "time_adaptL", "iters_adaptLdelta", "time_adaptLdelta"):
        columns.append(["${}$".format(entry) for entry in res[d][key]])
    cells = ["\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(column) + " \\end{tabular}"
             for column in columns]
    rows.append("{} & {:.1f} & ".format(d, mu_list[d]) + "&".join(cells) + "\\\\\n\\hline\n")
s = "".join(rows)
print(s)

10 & 221.2 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $72$ \\ $50$ \\ $28$ \end{tabular}&\begin{tabular}{@{}c@{}} $69.34$ \\ $48.06$ \\ $29.02$ \end{tabular}&\begin{tabular}{@{}c@{}} $83$ \\ $85$ \\ $56$ \end{tabular}&\begin{tabular}{@{}c@{}} $219.70$ \\ $183.15$ \\ $141.51$ \end{tabular}\\
\hline
50 & 123.3 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $84$ \\ $62$ \\ $41$ \end{tabular}&\begin{tabular}{@{}c@{}} $310.36$ \\ $370.77$ \\ $143.23$ \end{tabular}&\begin{tabular}{@{}c@{}} $113$ \\ $67$ \\ $47$ \end{tabular}&\begin{tabular}{@{}c@{}} $1030.22$ \\ $873.09$ \\ $550.34$ \end{tabular}\\
\hline
100 & 58.6 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $138$ \\ $106$ \\ $73$ \end{tabular}&\begin{tabular}{@{}c@{}} $587.38$ \\ $335.90$ \\ $224.40$ \end{tabular}&\begin{tabular}{@{}c@{}} $158$ \\ $112$ \\ $72$ \end{tabu

In [167]:
# LaTeX rows: d, then one stacked cell (entries one above another, one per Delta)
# for each of: Delta, ||x0 - x*|| and ||grad f|| / Delta of both methods.
rows = []
for d in d_list:
    columns = [["$10^{{{}}}$".format(exponent) for exponent in res[d]["delta"]]]
    for key in ("adaptL,x0-x*", "normg_adaptL", "adaptLdelta,x0-x*", "normg_adaptLdelta"):
        columns.append(["${}$".format(entry) for entry in res[d][key]])
    cells = ["\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(column) + " \\end{tabular}"
             for column in columns]
    rows.append("{} & ".format(d) + "&".join(cells) + "\\\\\n\\hline\n")
s = "".join(rows)
print(s)

10 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.5$ \\ $3.5$ \\ $3.5$ \end{tabular}&\begin{tabular}{@{}c@{}} $0.23$ \\ $0.21$ \\ $1.63$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.5$ \\ $3.5$ \\ $3.5$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.75$ \\ $0.72$ \\ $0.67$ \end{tabular}\\
\hline
50 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $10.2$ \\ $10.2$ \\ $10.2$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.38$ \\ $1.40$ \\ $1.33$ \end{tabular}&\begin{tabular}{@{}c@{}} $10.2$ \\ $10.2$ \\ $10.2$ \end{tabular}&\begin{tabular}{@{}c@{}} $0.72$ \\ $1.66$ \\ $1.72$ \end{tabular}\\
\hline
100 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $14.8$ \\ $14.8$ \\ $14.8$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.58$ \\ $1.39$ \\ $1.61$ \end{tabular}&\begin{tabular}{@{}c@{}} $14.8$ \\ $14.8$ \\ $14.8$ \end{tabular}&\begin{tabular}{@{}c@