In [None]:
# %% Deep learning - Section 8.68
#    Cross validation on regression

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Data

# Sample size
n = 100

# Predictor: standard-normal draws; outcome: predictor plus a second,
# independent standard-normal draw of the same shape
x = torch.randn(n,1)
y = torch.add(x,torch.randn(n,1))

# Scatter plot of the simulated dataset
fig,ax = plt.subplots()
ax.plot(x,y,'o')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Some correlated data')

# Save the figure to disk before displaying it
fig.savefig('figure11_cross_validation_regression.png')

plt.show()

# Download the saved file through the google.colab helper
files.download('figure11_cross_validation_regression.png')


In [None]:
# %% Model

# Step size used by the optimizer
learn_rate = 0.05

# Two single-unit linear layers with a ReLU in between
ANNreg = nn.Sequential(
    nn.Linear(in_features=1,out_features=1),
    nn.ReLU(),
    nn.Linear(in_features=1,out_features=1),
)

# Mean squared error as the regression objective
loss_fun = nn.MSELoss()

# Plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(params=ANNreg.parameters(),lr=learn_rate)


In [None]:
# %% Select data for training

# Number of training samples (80% of the dataset), then draw that many
# distinct sample indices at random
train_prop = int(.8*len(x))
train_idx  = np.random.choice(range(n),size=train_prop,replace=False)

# Boolean mask over all samples: True where the index was drawn for training
train_bool = np.isin(np.arange(n),train_idx)

# Show sizes
print(f'Training data: {x[train_bool].shape}')
print(f'Test data: {x[~train_bool].shape}')


In [None]:
# %% Train model

num_epochs = 500

# The training subset does not change across epochs, so slice it once
x_train = x[train_bool]
y_train = y[train_bool]

for epochs_i in range(num_epochs):

    # Clear the gradients accumulated by the previous iteration
    optimizer.zero_grad()

    # Predictions and loss on the training samples
    yHat = ANNreg(x_train)
    loss = loss_fun(yHat,y_train)

    # Backpropagate and update the parameters
    loss.backward()
    optimizer.step()


In [None]:
# %% Report losses

# Samples that were not flagged for training
x_test = x[~train_bool]
y_test = y[~train_bool]

# Model pass on the test data and its mean squared error
pred_test = ANNreg(x_test)
test_loss = torch.mean((pred_test-y_test)**2)

# `loss` is the value left over from the last training iteration
print(f'Final train loss: {loss.detach():.2f}')
print(f'Final test loss: {test_loss.detach():.2f}')

# Convert the test predictions to a NumPy array, detached from autograd
pred_test = pred_test.detach().numpy()


In [None]:
# %% Plotting

# Final pass on train data, converted to NumPy for plotting
pred_train = ANNreg(x[train_bool])
pred_train = pred_train.detach().numpy()

# Raw data together with the model predictions for both subsets
fig,ax = plt.subplots()
ax.plot(x,y,'o',label='All data')
ax.plot(x[train_bool],pred_train,'s',label='Training predictions')
ax.plot(x[~train_bool],pred_test,'^',label='Test predictions')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Model performance on training and test data')
ax.legend()

# Save the figure to disk before displaying it
fig.savefig('figure12_cross_validation_regression.png')

plt.show()

# Download the saved file through the google.colab helper
files.download('figure12_cross_validation_regression.png')


In [None]:
# %% Exercise 1
#    The train/test split is currently hard-coded to be 80/20 (note the number "80"). This is bad coding style, because
#    if you change the number of datapoints from N=100 to N=10000, then we're still only training on 80 samples and testing
#    on 10000-80=9920 samples. Change how the variable trainBool is created so that it always trains on 80% of the data,
#    regardless of the dataset size.

# Easy-peasy fix: derive the training-set size from the dataset length

# Number of training samples (80% of the data) and a random draw of that
# many distinct indices
train_prop = int(.8*len(x))
train_idx  = np.random.choice(range(n),size=train_prop,replace=False)

# Boolean mask: True for the samples selected for training
train_bool = np.isin(np.arange(n),train_idx)


In [None]:
# %% Exercise 2
#    Re-write this code to use scikitlearn and/or DataLoader instead of manually separating the data into train/test.

# Train/test proportions; only the training proportion is passed to scikit-learn,
# the remaining samples form the test set
partition = [.8,.2]
train_x,test_x,train_y,test_y = train_test_split(x,y,train_size=partition[0])

# Start from a fresh, untrained model and optimizer. Training the existing
# ANNreg any further would leak information: it may already have been fitted
# on samples that this new random split assigns to the test set.
ANNreg = nn.Sequential(
            nn.Linear(1,1),
            nn.ReLU(),
            nn.Linear(1,1)
            )

loss_fun = nn.MSELoss()

learn_rate = 0.05
optimizer  = torch.optim.SGD(ANNreg.parameters(),lr=learn_rate)

num_epochs = 500

for epochs_i in range(num_epochs):

    # Forward pass
    yHat = ANNreg(train_x)

    # Compute loss
    loss = loss_fun(yHat,train_y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Evaluation only: no gradients are needed for the held-out samples
with torch.no_grad():
    pred_test = ANNreg(test_x)
    test_loss = (pred_test-test_y).pow(2).mean()

# `loss` is the training loss from the last iteration of the loop above
print(f'Final train loss: {loss.detach():.2f}')
print(f'Final test loss: {test_loss.detach():.2f}')


In [None]:
# %% Exercise 3
#    Do we really need 500 epochs to train the model? To find out, add code to the training loop to compute the MSEloss
#    for the train and test data on each iteration during training. Then plot the train and test error as a function of
#    training epoch. What is your evaluation of an appropriate amount of training for this model/dataset?

# Not 100% sure I got this right, I basically computed the model for the test set at each iteration, for
# the purpose of exploring how well it fits those data, even though normally we would pass the test set
# through the model only for a final pass. Now..  Assuming I got this right, 100-150 iterations is quite
# enough to get to a stable loss pattern (Fig. 1), but it's also true that sometimes the test set loss
# bounces back to higher values (Fig. 2). Is this a "signature" of overfitting in the training set? And if
# yes, what is the most likely cause? The stochasticity in the gradient descent algorithm, the random selection
# of training and test samples? Intuitively I'd say at least a mixture of these two, but happy to hear more about it.

# Fresh, untrained model, loss function and optimizer for this experiment
ANNreg = nn.Sequential(
            nn.Linear(1,1),
            nn.ReLU(),
            nn.Linear(1,1)
            )

loss_fun = nn.MSELoss()

learn_rate = 0.05
optimizer  = torch.optim.SGD(ANNreg.parameters(),lr=learn_rate)


# New random 80/20 split, stored as a boolean mask over the samples
train_prop = int(len(x)*.8)
train_idx  = np.random.choice(range(n),train_prop,replace=False)
train_bool = np.zeros(n,dtype=bool)
train_bool[train_idx] = True


# Per-epoch loss histories
num_epochs   = 500
losses_train = torch.zeros(num_epochs)
losses_test  = torch.zeros(num_epochs)

for epochs_i in range(num_epochs):

    # Forward pass and loss on the training samples
    yHat = ANNreg(x[train_bool])
    loss = loss_fun(yHat,y[train_bool])

    # The test samples are only monitored, never trained on, so their
    # forward pass does not need gradient tracking
    with torch.no_grad():
        yHat_test = ANNreg(x[~train_bool])
        test_loss = loss_fun(yHat_test,y[~train_bool])

    # Store plain scalar values: assigning the loss tensors themselves would
    # attach the history buffers to every epoch's computational graph
    losses_train[epochs_i] = loss.item()
    losses_test[epochs_i]  = test_loss.item()

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
# Plotting

# Train and test loss trajectories drawn on the same axes
fig,ax = plt.subplots(figsize=(10,5))

ax.plot(losses_train.detach(),'o-',alpha=.75,label='Train losses')
ax.plot(losses_test.detach(),'o-',alpha=.75,label='Test losses')
ax.set_xlabel('Epochs')
ax.set_ylabel('Losses')
ax.legend()
ax.set_title('Losses over epochs for training and test dataset')

# Save the figure to disk before displaying it
fig.savefig('figure13_cross_validation_dataloader_extra3.png')

plt.show()

# Download the saved file through the google.colab helper
files.download('figure13_cross_validation_dataloader_extra3.png')


In [None]:
# More plotting

# Samples that were not flagged for training
x_test = x[~train_bool]
y_test = y[~train_bool]

# Test-set predictions and their mean squared error
pred_test = ANNreg(x_test)
test_loss = torch.mean((pred_test-y_test)**2)

# `loss` is the value left over from the last training iteration
print(f'Final train loss: {loss.detach():.2f}')
print(f'Final test loss: {test_loss.detach():.2f}')

# NumPy copies of the predictions for plotting
pred_test  = pred_test.detach().numpy()
pred_train = ANNreg(x[train_bool])
pred_train = pred_train.detach().numpy()

# Raw data together with the model predictions for both subsets
fig,ax = plt.subplots()
ax.plot(x,y,'o',label='All data')
ax.plot(x[train_bool],pred_train,'s',label='Training predictions')
ax.plot(x_test,pred_test,'^',label='Test predictions')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Model performance on training and test data')
ax.legend()

# Save the figure to disk before displaying it
fig.savefig('figure19_cross_validation_dataloader_extra3.png')

plt.show()

# Download the saved file through the google.colab helper
files.download('figure19_cross_validation_dataloader_extra3.png')
