In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import matplotlib.pyplot as plt
%run ~/Dropbox/NYU/Work/plot_conf.py

In [None]:
# Apply the shared plotting style. plt_style is not defined in this notebook;
# presumably it comes from the plot_conf.py script executed by the %run line
# in the first cell -- TODO confirm.
plt_style()

# Here we will:
* Construct a model $f(x)$ with parameters $\theta$
* Train it to learn a simple function $y = x^2$ by gradient descent
* We do this by minimizing a loss function $\mathcal{L}(\theta) = \frac{1}{N}\sum_i \left(x_i^2 - f(x_i)\right)^2$ (the mean squared error) over many random points $x_i$
* Visualize the function that it learns


Gradient descent works as follows:

* Randomly initialize model parameters $\theta$
* Repeat: $\theta_{t+1} \leftarrow \theta_t - \alpha \frac{\partial \mathcal{L}}{\partial \theta}$, where $\alpha$ is the learning rate







# Initialize the Model

In [None]:
# Width shared by both hidden layers.
n_hidden = 100

# Fully connected network: scalar in -> two ReLU hidden layers -> scalar out.
model = nn.Sequential(
    nn.Linear(1, n_hidden),
    nn.ReLU(),
    nn.Linear(n_hidden, n_hidden),
    nn.ReLU(),
    nn.Linear(n_hidden, 1),
)

# Evaluate the freshly initialised (untrained) network on an evenly spaced
# grid over [-2, 2], shaped as a column so each row is one scalar input.
x = Variable(torch.linspace(-2, 2, 1000).unsqueeze(1))
y = model(x)

# Flatten both tensors to 1-D NumPy arrays for matplotlib.
plt.plot(x.data.numpy().ravel(), y.data.numpy().ravel())
plt.title('f(x) before training')
plt.xlabel('x')
plt.ylabel('y')

# Train the Model

* Sample random points $x_i \in [-2, 2]$
* Minimize the loss $\mathcal{L}(\theta) = \frac{1}{N}\sum_i \left(x_i^2 - f(x_i)\right)^2$ with respect to the network weights $\theta$

In [None]:
# Hyperparameters for plain stochastic gradient descent.
learning_rate = 0.01
batch_size = 1000
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Per-step training loss, stored as plain Python floats so it can be plotted directly.
loss = []
for t in range(1000):
    # Clear the gradients accumulated by the previous step.
    model.zero_grad()
    # Sample a fresh batch uniformly from [-2, 2); torch.rand already returns
    # shape (batch_size, 1), so no reshape or Variable wrapper is needed.
    x = 4 * (torch.rand(batch_size, 1) - 0.5)
    y = model(x)
    # Mean squared error between the network output and the target y = x^2.
    current_loss = F.mse_loss(y, x**2)
    current_loss.backward()
    optimizer.step()
    # The loss is a 0-dim tensor: indexing it with [0] raises IndexError on
    # current PyTorch releases, so extract the scalar with .item() instead.
    loss.append(current_loss.item())

plt.plot(loss)
plt.xlabel('training steps')
plt.ylabel('loss')

# Show the Learned Function

Let's compare the learned function to the true one on the interval $[-2, 2]$:

In [None]:
# Evenly spaced grid over the training interval [-2, 2], shaped as a column
# so that each row is one scalar input to the network.
x = Variable(torch.linspace(-2, 2, 1000).unsqueeze(1))

# Network prediction, taken via .data and flattened to 1-D for plotting.
y = model(x).data.squeeze()
x = x.data.squeeze()

# Ground-truth parabola evaluated on the same grid.
y_true = x**2

# A single plot call draws both curves, in the same order as the legend entries.
plt.plot(x.numpy(), y.numpy(), x.numpy(), y_true.numpy())
plt.xlabel('x')
plt.ylabel('y')
plt.legend(['learned function f(x)', 'true function'])

# What about on a larger interval?

Now let's compare on a larger interval $[-5, 5]$:

In [None]:
# Evenly spaced grid over [-5, 5], which extends beyond the [-2, 2] range the
# training batches were sampled from; shaped as a column of scalar inputs.
x = Variable(torch.linspace(-5, 5, 1000).unsqueeze(1))

# Network prediction, taken via .data and flattened to 1-D for plotting.
y = model(x).data.squeeze()
x = x.data.squeeze()

# Ground-truth parabola evaluated on the same grid.
y_true = x**2

# A single plot call draws both curves, in the same order as the legend entries.
plt.plot(x.numpy(), y.numpy(), x.numpy(), y_true.numpy())
plt.xlabel('x')
plt.ylabel('y')
plt.legend(['learned function f(x)', 'true function'])

* We only trained the function on points $x \in [-2, 2]$. 
* For new points within this interval, the learned function is very close to the true one
* However, for points outside this interval, it is no longer accurate

* Generalization to new domains is a big challenge in machine learning