# Experiments 1. Non-linear Equation System: sinus

We will minimize

$$f(w_1, w_2) = \sum\limits_{i=1}^d \left(\langle w_1, x_i\rangle + \sin\left(w_2, x_i\right) - y_i\right)^2$$

for $w_1, w_2\in\mathbb{R}^n$, $x_i \in \mathbb{R}^n$, $d\leq n$ with the condition

$$XX^\top \succeq \mu I_d,$$
where $X = (x_1 \dots x_d)^\top \in \mathbb{R}^{d\times n}.$

Now we will use spherical approximation

$$\tilde{\nabla}f(x)=\frac{n}{h}\mathbb{E}\tilde{f}(x+eh)e$$


In [1]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import jax
import jax.numpy as jnp
import timeit
from jax.config import config

In [2]:
from methods import gradf_inexact
from methods import GradientDescent, parse_logs, AdaptiveL, StepSize, AdaptiveNoiseGD
from methods import ConstantStepSize, AdaptiveLdelta

In [3]:
matplotlib.use('Agg')
params = {'legend.fontsize': 20,
          'legend.handlelength': 4,
          "axes.labelsize": 45,
          "xtick.labelsize": 25,
          "ytick.labelsize": 25,
          "lines.linewidth": 2,
           "axes.titlesize":30}
matplotlib.rcParams.update(params)

In [4]:
config.update("jax_enable_x64", True)

In [5]:
path_pics = "../pics/"

In [6]:
def f1(w1, w2, X, Y):
    G = (X @ w1 + jnp.sin(X@w2) - Y)**2
    return G.sum(axis=0)
gradf = jax.grad(f1, argnums=(0, 1), has_aux=False)
jit_gradf = jax.jit(gradf)

In [7]:
np.random.uniform(-1, 1)

0.12308191420612347

## 0. Dataset

In [8]:
d = 20
n = 30
X = np.random.randn(d, n)
w = np.random.randn(2*n)
Y = X @ w[:n] + np.sin(X@w[n:])
eig = np.linalg.eig(X@X.T)[0]
mu = min(eig)
min(eig), max(eig)

(2.932412249490965, 83.66651720351642)

In [9]:
sigma=0
def gradf(x):
    z = jit_gradf(x[:n], x[n:], X, Y)
    g = np.hstack([np.array(i.block_until_ready()) for i in z])
    return g
f = lambda x: f1(x[:n], x[n:], X, Y, delta)

In [10]:
def params(X, Y, delta, h, K):
    f = lambda x: f1(x[:n], x[n:], X, Y)  + delta * np.random.uniform(-1, 1)
    def gradf(x):
        s = np.zeros(2 * X.shape[-1])
        for k in range(K):
            e = np.random.randn(2 * X.shape[-1])
            e /= np.linalg.norm(e)
            s +=  f(x + e * h) * e
        return 2 * X.shape[-1] / h * s/K
    return f, gradf


def params(X, Y, delta, h, K):
    f = lambda x: f1(x[:n], x[n:], X, Y)  + delta * np.random.uniform(-1, 1)
    def gradf(x):
        z = jit_gradf(x[:n], x[n:], X, Y)
        g = np.hstack([np.array(i.block_until_ready()) for i in z])
        return g
    return f, gradf

eps = 1e-10
h = 3e-1 * np.sqrt(eps)
f, gradf = params(X, Y, eps, h, 10000)

In [11]:
f(w).item(), f(w)-fexact(w[:n], w[n:], X, Y), np.linalg.norm(gradf(w))



NameError: name 'fexact' is not defined

## 1. Noise Distributed on the Unit Sphere

The case when $\xi \sim \mathcal{U}(S_1(0))$

In [12]:
eps_list = [1e-8, 1e-6, 1e-4]

n = 30
d_list = [2, 10, 20]

w = np.ones(2*n)
v = np.random.randn(2*n)
res = {d:{"delta":[], 
           "iters_adaptL":[], "time_adaptL":[], "adaptL,x0-x*": [], "normg_adaptL": [],
           "iters_exact":[], "time_exact":[], "exact,x0-x*": [], "normg_exact": [],
          "iters_adaptLdelta":[], "time_adaptLdelta":[], "adaptLdelta,x0-x*": [], "normg_adaptLdelta": []} for d in d_list}
mu_list = {}
number = 10
save_iter = 1
N = 10000
methods = []
np.random.seed(1)
for d in d_list:
    X = np.random.randn(d, n)
    w = np.random.randn(2*n)
    Y = X @ w[:n] + np.sin(X@w[n:])
    eig = np.linalg.eig(X@X.T)[0]
    wsol=w.copy()
    w = np.ones(2*n)
    v = np.random.randn(2*n)
    mu = min(eig)

    mu_list[d] = mu
    print(d, mu)
    for eps in eps_list:
        h = 3e-1 * np.sqrt(eps)
        Delta = np.sqrt(eps)
        f, gradf = params(X, Y, eps, h, 10)
        res[d]["delta"].append(int(np.log10(Delta)))
        tol = 2*Delta

        grad_inexact = gradf
        grad_inexact = lambda w: gradf_inexact(w, gradf, Delta, 1, v=v)
        stepsize = AdaptiveLdelta(L0=1, mindelta=1e-12, Lmin=mu/4, mu=mu, delta_alpha=2.1)
        method = AdaptiveNoiseGD(stepsize, name="GD, Delta={}".format(Delta), save_iter=save_iter, alpha=np.sqrt(6))
        x = method.solve(w, f, grad_inexact, max_iter=N)
        g = lambda: AdaptiveNoiseGD(AdaptiveLdelta(L0=1, mindelta=1e-12, Lmin=mu/4, mu=mu, delta_alpha=2), return_history=False, 
                                    alpha=np.sqrt(6)).solve(w, f, grad_inexact, max_iter=N)
        T = timeit.timeit(g, number=number)/number        
        print("\t{}\t{}\t{:.2f}\t{:.6f}\t{:.2f}\t{}".format(Delta, len(method.history), T*1000, np.linalg.norm(x-w), 
                                                np.linalg.norm(gradf(x))/Delta, f(x)))
        methods.append(method)
        res[d]["iters_adaptLdelta"].append(len(method.history))
        res[d]["time_adaptLdelta"].append("{:.2f}".format(T*1000))
        res[d]["adaptLdelta,x0-x*"].append("{:.1f}".format(np.linalg.norm(x-w)))
        res[d]["normg_adaptLdelta"].append("{:.2f}".format(np.linalg.norm(gradf(x))/Delta))    
    print("\n")

2 18.977936872901132
	0.0001	35	81.74	1.419305	3.76	1.0121319182649616e-08
	0.001	27	63.31	1.419739	19.66	3.905218190818588e-06
	0.01	19	44.94	1.419447	0.61	2.338044129518015e-05


10 6.8029268057078705
	0.0001	52	127.01	5.348248	24.49	2.7406010311450852e-08
	0.001	38	103.05	5.350924	285.78	0.00044947126057567227
	0.01	32	79.38	5.346481	7.82	6.166135357481751e-05


20 0.6445302383410392
	0.0001	108	317.40	4.242053	45.55	8.653600035852517e-07
	0.001	79	219.05	4.240251	30.82	5.9551214634589696e-05
	0.01	45	126.00	4.217464	42.85	0.009108292438334473




In [None]:
s = ""

for d in d_list:
    s += str(d) + " & "
    s += "{:.1f}".format(mu_list[d]) + " & "

    cur_list = ["$10^{{{}}}$".format(i) for i in res[d]["delta"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"

    cur_list = ["${}$".format(i) for i in res[d]["iters_adaptL"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    cur_list = ["${}$".format(i) for i in res[d]["time_adaptL"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    
    cur_list = ["${}$".format(i) for i in res[d]["iters_adaptLdelta"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    cur_list = ["${}$".format(i) for i in res[d]["time_adaptLdelta"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}"

    s+= "\\\\\n\\hline\n"
print(s)

In [167]:
s = ""

for d in d_list:
    s += str(d) + " & "
    cur_list = ["$10^{{{}}}$".format(i) for i in res[d]["delta"]]


    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    cur_list = ["${}$".format(i) for i in res[d]["adaptL,x0-x*"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    cur_list = ["${}$".format(i) for i in res[d]["normg_adaptL"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    
    cur_list = ["${}$".format(i) for i in res[d]["adaptLdelta,x0-x*"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}&"
    cur_list = ["${}$".format(i) for i in res[d]["normg_adaptLdelta"]]
    s+= "\\begin{tabular}{@{}c@{}} " + " \\\\ ".join(cur_list) + " \\end{tabular}"

    s+= "\\\\\n\\hline\n"
print(s)

10 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.5$ \\ $3.5$ \\ $3.5$ \end{tabular}&\begin{tabular}{@{}c@{}} $0.23$ \\ $0.21$ \\ $1.63$ \end{tabular}&\begin{tabular}{@{}c@{}} $3.5$ \\ $3.5$ \\ $3.5$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.75$ \\ $0.72$ \\ $0.67$ \end{tabular}\\
\hline
50 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $10.2$ \\ $10.2$ \\ $10.2$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.38$ \\ $1.40$ \\ $1.33$ \end{tabular}&\begin{tabular}{@{}c@{}} $10.2$ \\ $10.2$ \\ $10.2$ \end{tabular}&\begin{tabular}{@{}c@{}} $0.72$ \\ $1.66$ \\ $1.72$ \end{tabular}\\
\hline
100 & \begin{tabular}{@{}c@{}} $10^{-7}$ \\ $10^{-4}$ \\ $10^{-1}$ \end{tabular}&\begin{tabular}{@{}c@{}} $14.8$ \\ $14.8$ \\ $14.8$ \end{tabular}&\begin{tabular}{@{}c@{}} $1.58$ \\ $1.39$ \\ $1.61$ \end{tabular}&\begin{tabular}{@{}c@{}} $14.8$ \\ $14.8$ \\ $14.8$ \end{tabular}&\begin{tabular}{@{}c@