In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
import torch

# Regression example

In [None]:
def func(x):
    """Ground-truth curve for the regression example: x * (x - 1) * (x + 1).

    The cubic has roots at -1, 0 and 1. Only arithmetic operators are used,
    so `x` may be a scalar or a numpy array (evaluated elementwise).
    """
    shifted_down = x - 1
    shifted_up = x + 1
    return x * shifted_down * shifted_up

In [None]:
# Seed the global numpy RNG so that "Restart & Run All" regenerates the same
# samples (and therefore the same fits and plots) on every run.
np.random.seed(0)

# 12 training inputs drawn uniformly from [-1, 1] ...
rxs = np.random.uniform(-1, 1, 12)
# ... and their targets: the true curve plus Gaussian noise with std 0.05.
rys = func(rxs) + np.random.normal(0, 0.05, len(rxs))

This is what the data looks like:

In [None]:
# Draw the noisy training samples as hollow, black-edged markers on a grid.
open_marker = dict(alpha=0.7, color='none', edgecolor="black")
plt.grid()
plt.scatter(rxs, rys, **open_marker);

## Model capacity, underfitting and overfitting

In [None]:
# Least-squares polynomial fits of degree 0..7 to the training data;
# polys[d] holds the coefficient vector returned by np.polyfit for degree d.
polys = [np.polyfit(rxs, rys, degree) for degree in range(8)]

# Training error of each fit: half of the mean squared residual.
train_errors = []
for degree, coefs in enumerate(polys):
    residuals = np.polyval(coefs, rxs) - rys
    half_mse = 0.5 * np.dot(residuals, residuals) / len(rxs)
    train_errors.append(np.array([degree, half_mse]))

# One row per degree: column 0 is the degree, column 1 the training error.
fit_res_train = np.stack(train_errors, axis=0)

And here is how the training error (half the mean squared error) looks as a function of the polynomial degree:

In [None]:
# Training error (column 1) against polynomial degree (column 0).
plt.grid()
plt.scatter(fit_res_train[:, 0], fit_res_train[:, 1])
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel("polynomial degree")
plt.ylabel("training error");

In [None]:
# Fresh validation sample: 8 inputs drawn uniformly from [-1, 1], with targets
# produced the same way as the training targets (true curve + N(0, 0.05) noise).
rxs_valid = np.random.uniform(low=-1, high=1, size=8)
noise_valid = np.random.normal(loc=0, scale=0.05, size=len(rxs_valid))
rys_valid = func(rxs_valid) + noise_valid

Now let's check how the fitted models perform on these new validation points:

In [None]:
# Evaluate every polynomial fitted on the training data against the
# validation sample, using the same error measure (half the mean squared
# residual) as for the training error.
n_valid = len(rxs_valid)
validation_errors = []
for degree in range(8):
    residuals = np.polyval(polys[degree], rxs_valid) - rys_valid
    half_mse = 0.5 * np.dot(residuals, residuals) / n_valid
    validation_errors.append(np.array([degree, half_mse]))

# One row per degree: column 0 is the degree, column 1 the validation error.
fit_res_valid = np.stack(validation_errors, axis=0)

In [None]:
# Boolean mask over the degree column: keep degrees strictly between 0 and 25
# (this drops the constant, degree-0 fit). Named `degree_mask` rather than
# `filter` so the builtin `filter` is not shadowed for the rest of the session.
# The same mask is valid for both arrays because they list the same degrees.
degree_mask = (fit_res_valid[:, 0] > 0) & (fit_res_valid[:, 0] < 25)

plt.scatter(fit_res_train[degree_mask, 0], fit_res_train[degree_mask, 1], label="training")
plt.scatter(fit_res_valid[degree_mask, 0], fit_res_valid[degree_mask, 1], c='red', label="validation")
plt.xlabel("polynomial degree")
plt.ylabel("error")
plt.legend();

We see that this *validation error* initially behaves similarly to the training error: it decreases with increasing degree. But at a certain point it starts to increase, in this case quite dramatically.

Unfortunately, this is also typical. Let's look at what happens:

In [None]:
# Dense grid on [-1, 1] used to draw the fitted curves smoothly.
xs = np.linspace(-1, 1, 500)

plt.scatter(rxs, rys, alpha=0.7, color='none', edgecolor="black", label="training")
plt.scatter(rxs_valid, rys_valid, alpha=0.7, color='none', edgecolor="red", label="validation")

# One curve per fitted polynomial; polys[d] is the degree-d fit, so labelling
# by index lets the legend tell the curves apart.
for degree, coefs in enumerate(polys):
    plt.plot(xs, np.polyval(coefs, xs), label=f"degree {degree}")

plt.legend(ncol=2, fontsize="small");


## Regularization

$$L(\theta|x,y)+\alpha \frac{1}{2}||\theta||^2$$ 

$$\nabla_\theta L(\theta|x,y) +\alpha\cdot  \theta$$

In [None]:
# Feature map expanding a single input column into polynomial terms up to degree 20.
poly = PolynomialFeatures(degree=20)

In [None]:
# scikit-learn expects a 2-D (n_samples, n_features) array, hence the column view.
poly.fit(rxs.reshape(len(rxs), 1))

In [None]:
# Polynomial design matrix of the training inputs (one row per sample).
rxs_column = rxs.reshape(-1, 1)
poly_rxs = poly.fit_transform(rxs_column)

In [None]:
# Convert to float32 tensors (torch.nn.Linear parameters are float32 by default);
# the targets become a single column so they line up with the model output.
poly_rxs_f32 = poly_rxs.astype(np.float32)
rys_column_f32 = rys.astype(np.float32).reshape(-1, 1)
t_rxs = torch.from_numpy(poly_rxs_f32)
t_rys = torch.from_numpy(rys_column_f32)

In [None]:
# Apply the feature map that was already fitted on the training inputs.
# Validation data must only be transformed, never used to (re)fit a
# preprocessing step, so this uses transform() instead of fit_transform().
poly_rxs_valid = poly.transform(rxs_valid.reshape(-1, 1))

# float32 tensors, with the targets as a single column, matching the
# layout of the training tensors.
t_rxs_valid = torch.from_numpy(poly_rxs_valid.astype('float32'))
t_rys_valid = torch.from_numpy(rys_valid.reshape(-1, 1).astype('float32'))

In [None]:
# Sanity check: (number of validation samples, number of polynomial features).
t_rxs_valid.size()

In [None]:
# Seed torch's RNG so the random initial weights, and hence the whole
# training run, are reproducible on "Restart & Run All".
torch.manual_seed(0)

# Linear model on the polynomial features: one weight per feature column and
# no separate bias term.
linear = torch.nn.Linear(in_features=t_rxs.shape[-1], out_features=1, bias=False)
# Replace the default initialisation with weights drawn uniformly from [-1, 1].
torch.nn.init.uniform_(linear.weight, -1.0, 1.0)

In [None]:
# Plain SGD over the linear model's weights. weight_decay is the L2 penalty
# coefficient; 0.0 means no regularization is applied.
learning_rate = 0.01
l2_strength = 0.0
optim = torch.optim.SGD(
    linear.parameters(),
    lr=learning_rate,
    weight_decay=l2_strength,
)

In [None]:
%%time
for e in range(100000):
    optim.zero_grad()
    pred = linear(t_rxs)
    loss = torch.nn.functional.mse_loss(pred, t_rys)
    loss.backward()
    optim.step()
print(loss.item())    
coeffs = linear.weight.data.detach().numpy().ravel();

In [None]:
# Mean squared error of the trained model on the validation set.
valid_pred = linear(t_rxs_valid)
torch.nn.functional.mse_loss(valid_pred, t_rys_valid)

In [None]:
# Expand the plotting grid with the already-fitted feature map; the grid is
# only transformed (transform, not fit_transform), so the transformer is not
# re-fitted on non-training data.
poly_xs = poly.transform(xs.reshape(-1, 1))

In [None]:
# Model prediction on the grid: design matrix times the learned weight vector.
ys = np.matmul(poly_xs, coeffs)

In [None]:
# Fitted curve together with the training and validation samples.
plt.plot(xs, ys, label="fitted model")
plt.scatter(rxs, rys, alpha=0.7, color='none', edgecolor="black", label="training")
plt.scatter(rxs_valid, rys_valid, alpha=0.7, color='none', edgecolor="red", label="validation")
# The labels above are only shown once a legend is requested.
plt.legend();