In [None]:
# %% Deep learning - Section 7.45
#    ANN for regression

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import copy

from google.colab                     import files
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Create data

# Sample size
n = 30

# Predictor drawn from a standard normal; the target is the predictor plus
# standard-normal noise scaled down by a factor of 2
x     = torch.randn(n, 1)
noise = torch.randn(n, 1) / 2
y     = x + noise

# Scatter plot of the raw data (square markers, no connecting line)
fig, ax = plt.subplots()
ax.plot(x, y, 's')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Some data')

# Write the figure to disk before showing it
fig.savefig('figure1_ann_regression.png')

plt.show()

# Hand the saved file over to the browser (Colab helper)
files.download('figure1_ann_regression.png')


In [None]:
# %% Build the model

# Layer widths: one input feature -> one hidden unit -> one output value
n_inputs, n_hidden, n_outputs = 1, 1, 1

# Two linear layers with a ReLU non-linearity in between
ANNreg = nn.Sequential(
    nn.Linear(n_inputs, n_hidden),    # input layer
    nn.ReLU(),                        # activation function
    nn.Linear(n_hidden, n_outputs),   # output layer
)

# Display the architecture as the cell output
ANNreg


In [None]:
# %% Training parameters

# Step size applied by the optimizer at every parameter update
learning_rate = 0.05

# Loss function: mean squared error between predictions and targets
loss_fun = nn.MSELoss()

# Optimizer (i.e. the flavour of gradient descent to use; here stochastic
# gradient descent), acting on all the parameters of the model
optimizer = torch.optim.SGD(params=ANNreg.parameters(), lr=learning_rate)


In [None]:
# %% Train the model

# Parameters
num_epochs = 500
losses = torch.zeros(num_epochs)   # per-epoch training loss, kept only for plotting

## Training
for epoch_i in range(num_epochs):

    # Forward propagation
    yHat = ANNreg(x)

    # Loss
    loss = loss_fun(yHat,y)

    # Store a detached copy of the loss: writing `loss` itself into `losses` is an
    # in-place autograd operation, which makes `losses` require grad and keeps a
    # reference to the graph of every epoch
    losses[epoch_i] = loss.detach()

    # Backpropagation
    optimizer.zero_grad()   # clear the gradients accumulated in the previous epoch
    loss.backward()         # compute the gradients of the loss w.r.t. the parameters
    optimizer.step()        # update the parameters


In [None]:
# %% Show losses

# Final forward pass and final loss (MSE), computed on the same data used for
# training. This is evaluation only, so it runs under no_grad to avoid recording
# an autograd graph that is never used
with torch.no_grad():
    predictions = ANNreg(x)
    testloss    = (predictions-y).pow(2).mean()

# Plotting
plt.plot(losses.detach(),'o',markerfacecolor='w',linewidth=.1)   # loss at each training epoch
plt.plot(num_epochs,testloss.item(),'ro')                        # final loss, drawn just after the last epoch
plt.xlabel('Epoch')
plt.ylabel('Losses')
plt.title(f'Final loss = {round(testloss.item(),4)}')

plt.savefig('figure2_ann_regression.png')

plt.show()

files.download('figure2_ann_regression.png')


In [None]:
# %% Show data

# Correlation between the real targets and the model predictions. The tensors are
# converted to NumPy explicitly and transposed, because np.corrcoef treats each
# row as one variable; element [0,1] of the resulting matrix is the correlation
# between the two
y_np           = y.detach().numpy()
predictions_np = predictions.detach().numpy()
r              = np.corrcoef(y_np.T,predictions_np.T)[0,1]

# Real data and predictions against the same predictor values
plt.plot(x,y,'bo',label='Real data')
plt.plot(x,predictions_np,'rs',label='Predictions')
plt.xlabel('x')
plt.ylabel('y')
plt.title(f'Data-prediction correlation: r = {np.round(r,2)}')
plt.legend()

plt.savefig('figure3_ann_regression.png')

plt.show()

files.download('figure3_ann_regression.png')


In [None]:
# %% Exercise 1
#    How much data is "enough"? Try different values of N and see how low the loss gets.
#    Do you still get low loss ("low" is subjective, but let's say loss<.25) with N=10? N=5?

# In statistics the sample size is always an important factor; in the case of a regression, a low
# sample size increases the risk that the regression coefficient is heavily influenced by noise
# and/or outliers. In deep learning, I guess a similar consideration is also valid, since fewer
# data points make the error landscape more 'rugged'; a further consideration is that DL models
# include some randomness in the computation of the weights, while a linear regression has a
# closed-form solution that guarantees we are getting the best possible solution for the given data.
# Bottom line, more data is better, but given a fortunate starting point and low noise, it's still
# not impossible to get good results even with small samples.


In [None]:
# %% Exercise 2
#    Does your conclusion above depend on the amount of noise in the data? Try changing the noise level
#    by changing the division ("/2") when creating y as x+randn.

# Similar considerations to the point above. More noise is in a certain sense similar to fewer data points,
# in the sense that both these factors tend to increase the variance, and thus the MSE. For example, training
# the same model with n = 30 but with more noise, y = x + torch.randn(n,1), yields worse performance.


In [None]:
# %% Exercise 3
#    Notice that the model doesn't always work well. Put the original code (that is, N=30 and /2 noise)
#    into a function or a for-loop and repeat the training 100 times (each time using a fresh model instance).
#    Then count the number of times the model had a loss>.25.

# Create data (generated once: every repetition is trained on the same dataset,
# so the runs differ only in the random initialisation of the model weights)
n = 30
x = torch.randn(n,1)
y = x + torch.randn(n,1)/2

# Experiment settings
num_repetitions = 100    # number of freshly initialised models to train
loss_threshold  = 0.25   # final losses are compared against this value

# Training parameters (the same for every repetition, so defined once)
learning_rate = 0.05
num_epochs    = 500
loss_fun      = nn.MSELoss()

# Final loss of each repetition
testloss = torch.zeros(num_repetitions)

for i in range(num_repetitions):

    # Build a fresh model, so that each repetition starts from new random weights
    ANNreg = nn.Sequential(
                nn.Linear(1,1),   # input layer (num inputs, num outputs)
                nn.ReLU(),        # activation function
                nn.Linear(1,1)    # output layer (num inputs, num outputs)
                )

    # The optimizer is recreated as well, because it is bound to the parameters
    # of the model it was constructed with
    optimizer = torch.optim.SGD(ANNreg.parameters(),lr=learning_rate)

    # Train the model
    losses = torch.zeros(num_epochs)

    for epoch_i in range(num_epochs):

        # Forward propagation
        yHat = ANNreg(x)

        # Loss; a detached copy is stored so that `losses` stays out of the autograd graph
        loss = loss_fun(yHat,y)
        losses[epoch_i] = loss.detach()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Final forward pass and loss. Done under no_grad: storing a graph-attached
    # value in `testloss` would keep the autograd graph of every repetition alive
    with torch.no_grad():
        predictions = ANNreg(x)
        testloss[i] = (predictions-y).pow(2).mean()

# Number of repetitions that ended below / above the threshold
low_loss_count  = (testloss < loss_threshold).sum().item()
high_loss_count = (testloss > loss_threshold).sum().item()

print(f'Proportion of losses < {loss_threshold}: {low_loss_count/len(testloss)}')
print(f'Number of runs with loss > {loss_threshold}: {high_loss_count} out of {num_repetitions}')

# Indeed, as mentioned above, the stochasticity inherent to DL models does not guarantee
# a good approximation of our data, especially for small datasets where the loss function
# might be affected by the presence of poor local minima (an event whose probability should
# decrease for higher dimensional datasets). This is coherent with the fact that the solution
# is either quite good or totally bad, no middle ground

# See also histogram
plt.hist(testloss.numpy(),bins=100,edgecolor='black',alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of loss values')

plt.savefig('figure13_ann_regression_extra3.png')

plt.show()

files.download('figure13_ann_regression_extra3.png')
