In this notebook we investigate the effect of normalization on the fitting procedure.

In [1]:
# General imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from deepmod_l1.diff_library import theta_analytical

#Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Remainder imports
from os import listdir, path, getcwd

# Setting cuda
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Settings for reproducibility
np.random.seed(42)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Defining output folder
output_folder = getcwd()

%load_ext autoreload
%autoreload 2

# Making library

In [2]:
D = 0.5
a = 0.25

x = np.linspace(-5, 5, 500, dtype=np.float32)
t = np.linspace(0, 5, 100, dtype=np.float32)
x_grid, t_grid = np.meshgrid(x, t, indexing='ij')
    
# Analytical
time_deriv, theta = theta_analytical(x_grid, t_grid, D, a)

And performing lst-sq we get:

In [38]:
xi_base = np.linalg.lstsq(theta, time_deriv, rcond=None)[0].squeeze()

In [39]:
xi_base

array([-2.4918742e-17,  2.0816682e-16,  5.0000000e-01,  4.4582393e-16,
       -1.2127323e-16, -1.0972126e-16, -3.7296555e-17,  1.0139718e-16,
       -7.5894152e-17], dtype=float32)

# No normalization

In [4]:
X_train = torch.tensor(theta, dtype=torch.float32)
y_train = torch.tensor(time_deriv, dtype=torch.float32)

In [5]:
model = nn.Sequential(*[nn.Linear(X_train.shape[1], 1, bias=False)])

In [6]:
optimizer = torch.optim.Adam(model.parameters())
iterations = 10000

In [7]:
for it in np.arange(iterations):
    prediction = model(X_train)
    loss = torch.mean((prediction - y_train)**2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if it % 1000 == 0:
        print(loss.item())

0.16057567298412323
0.005893857218325138
0.00028583381208591163
0.00013026372471358627
7.630345498910174e-05
3.997812746092677e-05
1.569796950207092e-05
3.4312636216782266e-06
2.764786586340051e-07
4.198431380331158e-09


In [9]:
xi = model[0].weight.detach().numpy().squeeze()

In [27]:
xi

array([ 4.0761206e-06,  3.2607575e-10,  4.9999043e-01, -1.8266408e-05,
        5.0569660e-10,  3.7870883e-05,  5.8061538e-05,  5.9180172e-10,
       -1.8908288e-05], dtype=float32)

In [42]:
np.mean((xi_base - xi)**2)

6.2274047e-10

Which looks okay; now on to standardizing:

# Standardization

In [52]:
a = np.mean(theta, axis=0)
b = np.std(theta, axis=0)

a[0] = 0.0 # for the ones.
b[0] = 1.0

theta_standard = (theta - a)/b

In [53]:
X_train = torch.tensor(theta_standard, dtype=torch.float32)
y_train = torch.tensor(time_deriv, dtype=torch.float32)

In [54]:
model = nn.Sequential(*[nn.Linear(X_train.shape[1], 1, bias=False)])

In [55]:
optimizer = torch.optim.Adam(model.parameters())
iterations = 10000

In [56]:
for it in np.arange(iterations):
    prediction = model(X_train)
    loss = torch.mean((prediction - y_train)**2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if it % 1000 == 0:
        print(loss.item())

0.2830438017845154
0.00010726555046858266
7.093116494161222e-08
4.345527937488214e-09
2.2145265621276167e-09
9.250656907155985e-10
2.1765943214457906e-10
1.9710988050092304e-11
3.7908841647379954e-13
1.5320766666977281e-15


Loss is definitely lower, let's see about the weights:

In [57]:
weights = model[0].weight.detach().numpy().squeeze()
weights

array([-5.2046875e-04, -8.9323121e-10,  2.8207892e-01,  4.9862003e-10,
        2.0298783e-09,  8.4423490e-08, -6.2131666e-10, -3.4909617e-09,
       -6.4450063e-08], dtype=float32)

Let's transform them back:

In [58]:
weights[0] = weights[0] - np.sum(weights[1:] * a[1:])
weights[1:] = weights[1:] / b[1:]

In [59]:
xi_standard = weights

In [60]:
xi_standard

array([-2.2684195e-04, -5.6303091e-09,  4.9999994e-01,  4.3541983e-09,
        1.8576582e-08,  1.2963892e-07, -8.6668486e-09, -3.0171726e-08,
       -6.8665500e-08], dtype=float32)

In [61]:
np.mean((xi_base-xi_standard)**2)

5.717477e-09

So basically same level as the other one; although the MSE is smaller :-)

# Norm

Now let's do the norm over theta

In [73]:
b = np.linalg.norm(theta, axis=0)
b[0] = 1.0
theta_norm = theta / b

In [74]:
X_train = torch.tensor(theta_norm, dtype=torch.float32)
y_train = torch.tensor(time_deriv, dtype=torch.float32)

In [75]:
model = nn.Sequential(*[nn.Linear(X_train.shape[1], 1, bias=False)])

In [89]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
iterations = 10000

In [90]:
for it in np.arange(iterations):
    prediction = model(X_train)
    loss = torch.mean((prediction - y_train)**2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if it % 1000 == 0:
        print(loss.item())

0.02311607450246811
0.00010733594535849988
2.6500165404286236e-05
1.1781864486692939e-05
2.9803682082274463e-06
3.0434139830504137e-07
1.684245454214306e-08
1.633518494081887e-11
4.6763735245258786e-11
4.7436592848226766e-11


In [91]:
weights = model[0].weight.detach().numpy().squeeze()
weights

array([ 1.4915479e-06, -1.4072922e-05,  6.3074940e+01,  1.3593244e-06,
       -1.5191477e-05, -6.4075721e-07,  1.7985615e-06, -1.5493950e-05,
       -1.2513128e-06], dtype=float32)

In [96]:
xi_norm = weights / b
xi_norm

array([ 1.4915479e-06, -3.9670496e-07,  5.0000000e-01,  4.0122760e-08,
       -6.2174291e-07, -4.3958046e-09,  1.0685707e-07, -5.9886946e-07,
       -5.9579546e-09], dtype=float32)

In [97]:
b

array([  1.      ,  35.47453 , 126.14988 ,  33.879135,  24.433697,
       145.76562 ,  16.831469,  25.871998, 210.0239  ], dtype=float32)

In [98]:
np.std(theta, axis=0)

array([0.        , 0.15864693, 0.5641579 , 0.11451477, 0.10927082,
       0.65122026, 0.07168888, 0.11570308, 0.9386091 ], dtype=float32)

In [99]:
np.mean((xi_base-xi_norm)**2)

3.4893133e-13

So xi_norm sucks, but that's probably because the scaling is too high? Update: training another round with high lr fixes it, but good to keep in mind it doesn't converge quickly.

# Min - max

In [132]:
a = np.min(theta, axis=0)
b = np.max(theta, axis=0) - np.min(theta, axis=0)

a[0] = 0.0 # for the ones.
b[0] = 1.0

theta_minmax = (theta - a)/b

In [133]:
X_train = torch.tensor(theta_minmax, dtype=torch.float32)
y_train = torch.tensor(time_deriv, dtype=torch.float32)

In [134]:
model = nn.Sequential(*[nn.Linear(X_train.shape[1], 1, bias=False)])

In [135]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
iterations = 10000

In [136]:
for it in np.arange(iterations):
    prediction = model(X_train)
    loss = torch.mean((prediction - y_train)**2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if it % 1000 == 0:
        print(loss.item())

0.13371394574642181
0.016092004254460335
0.0053431629203259945
0.0018680129433050752
0.001681806636042893
0.00027302891248837113
0.00011851068848045543
5.7303361245431006e-05
3.059543450945057e-05
1.818890632421244e-05


In [137]:
weights = model[0].weight.detach().numpy().squeeze()
weights

array([-12.376632  ,  -0.07785565,  18.173344  ,   0.02831443,
         0.44426605,   1.3437068 ,  -0.11400798,  -0.50503683,
        -1.3617116 ], dtype=float32)

In [138]:
weights[0] = weights[0] - np.sum(weights[1:] * a[1:])
weights[1:] = weights[1:] / b[1:]

In [139]:
xi_minmax = weights

In [140]:
xi_minmax

array([ 4.1567212e+02, -1.0054930e-02,  4.9298874e-01,  1.7757695e-02,
        5.0916437e-02,  2.9132446e-02, -4.4842806e-02, -4.4438783e-02,
       -1.9940261e-02], dtype=float32)

In [141]:
np.mean((xi_base-xi_minmax)**2)

19198.146

Which is baddd...... No idea why though... 

In [146]:
np.linalg.lstsq(theta_minmax, time_deriv, rcond=None)[0].squeeze()[1:] / b[1:]

array([ 6.0760186e-13,  5.0000006e-01,  3.5277884e-07, -2.4898915e-12,
       -8.8995215e-08, -5.4412334e-07,  1.7290317e-12,  3.0291073e-08],
      dtype=float32)

In [147]:
np.linalg.lstsq(theta_minmax, time_deriv, rcond=None)[0].squeeze()[0] - np.sum(np.linalg.lstsq(theta_minmax, time_deriv, rcond=None)[0].squeeze()[1:] * a[1:])

456.73785

it's def not related to our implementation though.... Seems to be numerical errors?

# Conclusion

For fitting there doesn't seem to be much difference; we seems to get same results for standardized and nothing. In future might be important to compare coeffs. Norming doesn't seem great way because the number if too big

**Conclusion**: Standardize theta.